Trying to turn this into an automatic web-search engine

But I'm still quite bad at coding, and to be honest I used GPT and other AIs to refine this concept.
It just seems like a very handy tool…
And yes, you can obviously adapt it to the OpenAI API, but honestly… I'm not about to push 20 TB of text through OpenAI API tokens (no offense intended)
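
Roughly what I mean by "automatic web-search": a small loop that fetches pages and pipes them through the script below. Here's a sketch using the run and parse_html_to_markdown functions defined further down; the URL list and the summarize_url helper are just placeholders for whatever search step you end up using:

import requests

def summarize_url(url, out_file):
    # Placeholder helper: fetch a page, strip it to markdown, push it through the local API
    html = requests.get(url, timeout=30).text
    text = parse_html_to_markdown(html)
    run(text, output_file=out_file)

# Placeholder URL list; in practice this would come from your search step
for i, url in enumerate(["https://example.com"]):
    summarize_url(url, f"result_{i}.txt")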


from bs4 import BeautifulSoup
import requests
import argparse
import time
import chardet

HOST = "localhost:5000"
URI = f"http://{HOST}/api/v1/chat"

# Define some constants for the chat API parameters
MODE = 'instruct'
CHARACTER = 'Document AI'
INSTRUCTION_TEMPLATE = 'D44kb0b-v4.1'
YOUR_NAME = 'You'
MAX_NEW_TOKENS = 2000
DO_SAMPLE = True
TEMPERATURE = 1.3
TOP_P = 0.1
TYPICAL_P = 1
EPSILON_CUTOFF = 0
ETA_CUTOFF = 0
TFS = 1
TOP_A = 0
REPETITION_PENALTY = 1.18
TOP_K = 40
MIN_LENGTH = 0
NO_REPEAT_NGRAM_SIZE = 0
NUM_BEAMS = 10
PENALTY_ALPHA = 0
LENGTH_PENALTY = 1
EARLY_STOPPING = False
MIROSTAT_MODE = 0
MIROSTAT_TAU = 5
MIROSTAT_ETA = 0.1
SEED = -1
ADD_BOS_TOKEN = True
TRUNCATION_LENGTH = 7000
BAN_EOS_TOKEN = False
SKIP_SPECIAL_TOKENS = True

def get_chunk_size(input_length, default_chunk_size=MAX_NEW_TOKENS):
    # Cap the chunk size at the default; shorter inputs become a single chunk.
    # Note: this counts characters, while MAX_NEW_TOKENS is a token count, so
    # the two are only loosely related.
    return min(input_length, default_chunk_size)

def detect_file_encoding(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read()
        encoding_result = chardet.detect(raw_data)
        encoding = encoding_result['encoding']
    return encoding

def parse_html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    md_output = ""

    # Find the title element and convert it to a markdown heading
    title = soup.find("title")
    if title:
        md_output += f"# {title.text}\n\n"

    # Find all the h1-h6 elements and convert them to markdown headings
    headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    for heading in headings:
        level = heading.name[1]  # Get the heading level from the tag name
        md_output += f"{'#' * int(level)} {heading.text}\n\n"

    # Find all the p elements and convert them to markdown paragraphs
    paragraphs = soup.find_all("p")
    for paragraph in paragraphs:
        md_output += f"{paragraph.text}\n\n"

    # Find all the a elements and convert them to markdown links
    links = soup.find_all("a")
    for link in links:
        text = link.text
        href = link.get("href")
        md_output += f"[{text}]({href})\n\n"

    # Find all the img elements and convert them to markdown images
    images = soup.find_all("img")
    for image in images:
        alt = image.get("alt")
        src = image.get("src")
        md_output += f"![{alt}]({src})\n\n"

    return md_output
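
# Quick sanity check of parse_html_to_markdown on a toy snippet (made-up HTML,
# just for illustration):
#
#   >>> parse_html_to_markdown('<title>Hi</title><p>Hello world.</p>')
#   '# Hi\n\nHello world.\n\n'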

def run(prompt, file_path=None, chunk_size=None, output_file=None, **kwargs):
    # Work out what text to process; when a file is given, any prompt acts as
    # an instruction prefixed to every chunk
    if file_path:
        encoding = detect_file_encoding(file_path)
        with open(file_path, "r", encoding=encoding) as file:
            content = file.read()
        instruction = prompt or ""
    elif not prompt:
        content = input("Enter the prompt: ")
        instruction = ""
    else:
        content = prompt
        instruction = ""

    input_length = len(content)
    chunk_size = chunk_size or get_chunk_size(input_length)

    chunks = [content[i:i+chunk_size] for i in range(0, input_length, chunk_size)]

    # Initialize history outside the loop
    history = {'internal': [], 'visible': []}

    output_data = []

    for chunk in chunks:
        # Send the instruction plus the current chunk only; history already
        # carries the earlier chunks, so re-appending them would duplicate content
        user_input = instruction + chunk

        # Create the request payload with the default parameters and the optional parameters from kwargs
        request_payload = {
            'user_input': user_input,
            'history': history,
            'mode': MODE,
            'character': CHARACTER,
            'instruction_template': INSTRUCTION_TEMPLATE,
            'your_name': YOUR_NAME,

            'max_new_tokens': MAX_NEW_TOKENS,
            'do_sample': DO_SAMPLE,
            'temperature': TEMPERATURE,
            'top_p': TOP_P,
            'typical_p': TYPICAL_P,
            'epsilon_cutoff': EPSILON_CUTOFF,
            'eta_cutoff': ETA_CUTOFF,
            'tfs': TFS,
            'top_a': TOP_A,
            'repetition_penalty': REPETITION_PENALTY,
            'top_k': TOP_K,
            'min_length': MIN_LENGTH,
            'no_repeat_ngram_size': NO_REPEAT_NGRAM_SIZE,
            'num_beams': NUM_BEAMS,
            'penalty_alpha': PENALTY_ALPHA,
            'length_penalty': LENGTH_PENALTY,
            'early_stopping': EARLY_STOPPING,
            'mirostat_mode': MIROSTAT_MODE,
            'mirostat_tau': MIROSTAT_TAU,
            'mirostat_eta': MIROSTAT_ETA,
            'seed': SEED,
            'add_bos_token': ADD_BOS_TOKEN,
            'truncation_length': TRUNCATION_LENGTH,
            'ban_eos_token': BAN_EOS_TOKEN,
            'skip_special_tokens': SKIP_SPECIAL_TOKENS,

        }

        # Update the request payload with the optional parameters from kwargs if any
        request_payload.update(kwargs)

        response = requests.post(URI, json=request_payload)
        if response.status_code == 200:
            result = response.json()['results'][0]['history']
            visible_output = result['visible'][-1][1]

            # If the source file was HTML, strip any HTML in the reply down to markdown
            if file_path and file_path.lower().endswith('.html'):
                visible_output = parse_html_to_markdown(visible_output)

            # Print the model's reply for this chunk as-is; json.dumps would only
            # wrap the string in quotes and escape its newlines
            print(visible_output)

            history = result

            # Collect output data
            output_data.append(visible_output)
        else:
            print(f"Request failed with status {response.status_code}: {response.text}")

        time.sleep(1)  # Add a delay of 1 second between API requests

    # Write output data to file
    if output_file:
        with open(output_file, 'w') as file:
            file.write('\n'.join(output_data))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a file with AI.")
    parser.add_argument("-f", "--file", nargs="?", help="The path to the file.")
    parser.add_argument("-p", "--prompt", nargs="?", help="The prompt for the AI.")
    parser.add_argument("-c", "--chunk-size", type=int, default=None, help="The size of each processing chunk.")
    parser.add_argument("-o", "--output-file", nargs="?", help="The output file to save the visible output.")
    
    # Add some optional arguments for the chat API parameters
    parser.add_argument("--temperature", type=float, default=None, help="The temperature for sampling.")
    parser.add_argument("--top-p", type=float, default=None, help="The top-p for sampling.")
    parser.add_argument("--top-k", type=int, default=None, help="The top-k for sampling.")
    parser.add_argument("--repetition-penalty", type=float, default=None, help="The repetition penalty for sampling.")
    
    args = parser.parse_args()
    
    # Get the optional arguments from args and store them in a dictionary
    optional_args = {}
    # Use "is not None" so explicit zero values are still passed through
    if args.temperature is not None:
        optional_args['temperature'] = args.temperature
    if args.top_p is not None:
        optional_args['top_p'] = args.top_p
    if args.top_k is not None:
        optional_args['top_k'] = args.top_k
    if args.repetition_penalty is not None:
        optional_args['repetition_penalty'] = args.repetition_penalty
    
    run(args.prompt, args.file, args.chunk_size, args.output_file, **optional_args)
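
Example usage (the script name and all the values here are made up, just to show the flags):

python document_ai.py -f notes.txt -o summary.txt --temperature 0.7

Or straight from Python, if you import run from the script:

run("some text to process here", chunk_size=4000, output_file="out.txt", temperature=0.7)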