But I'm still pretty bad at coding.
And to be honest, I used GPT and other AIs to refine this concept.
It just seems like a very handy tool...
And yeah, you can obviously adapt it to the OpenAI API (there's a rough sketch of that at the end of this post), but honestly, I'm not about to burn OpenAI API tokens testing 20TB of documents (no offense intended).
from bs4 import BeautifulSoup
import requests
import argparse
import json
import time
import chardet
HOST = "localhost:5000"
URI = f"http://{HOST}/api/v1/chat"
# Define some constants for the chat API parameters
MODE = 'instruct'
CHARACTER = 'Document AI'
INSTRUCTION_TEMPLATE = 'D44kb0b-v4.1'
YOUR_NAME = 'You'
MAX_NEW_TOKENS = 2000
DO_SAMPLE = True
TEMPERATURE = 1.3
TOP_P = 0.1
TYPICAL_P = 1
EPSILON_CUTOFF = 0
ETA_CUTOFF = 0
TFS = 1
TOP_A = 0
REPETITION_PENALTY = 1.18
TOP_K = 40
MIN_LENGTH = 0
NO_REPEAT_NGRAM_SIZE = 0
NUM_BEAMS = 10
PENALTY_ALPHA = 0
LENGTH_PENALTY = 1
EARLY_STOPPING = False
MIROSTAT_MODE = 0
MIROSTAT_TAU = 5
MIROSTAT_ETA = 0.1
SEED = -1
ADD_BOS_TOKEN = True
TRUNCATION_LENGTH = 7000
BAN_EOS_TOKEN = False
SKIP_SPECIAL_TOKENS = True
def get_chunk_size(input_length, default_chunk_size=MAX_NEW_TOKENS):
    # Cap the chunk size at the default so long inputs get split into pieces
    if input_length > default_chunk_size:
        return default_chunk_size
    else:
        return input_length
def detect_file_encoding(file_path):
    # Read the raw bytes and let chardet guess the encoding
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    encoding_result = chardet.detect(raw_data)
    encoding = encoding_result['encoding']
    return encoding
def parse_html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    md_output = ""
    # Find the title element and convert it to a markdown heading
    title = soup.find("title")
    if title:
        md_output += f"# {title.text}\n\n"
    # Find all the h1-h6 elements and convert them to markdown headings
    headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
    for heading in headings:
        level = heading.name[1]  # Get the heading level from the tag name
        md_output += f"{'#' * int(level)} {heading.text}\n\n"
    # Find all the p elements and convert them to markdown paragraphs
    paragraphs = soup.find_all("p")
    for paragraph in paragraphs:
        md_output += f"{paragraph.text}\n\n"
    # Find all the a elements and convert them to markdown links
    links = soup.find_all("a")
    for link in links:
        text = link.text
        href = link.get("href")
        md_output += f"[{text}]({href})\n\n"
    # Find all the img elements and convert them to markdown images
    images = soup.find_all("img")
    for image in images:
        alt = image.get("alt")
        src = image.get("src")
        md_output += f"![{alt}]({src})\n\n"
    return md_output
def run(prompt, file_path=None, chunk_size=None, output_file=None, **kwargs):
    if file_path:
        encoding = detect_file_encoding(file_path)
        with open(file_path, "r", encoding=encoding) as file:
            content = file.read()
    elif not prompt:
        content = input("Enter the prompt: ")
    else:
        content = prompt
    prompt = prompt or ""  # Make sure prompt is a string before chunks get appended to it
    input_length = len(content)
    chunk_size = chunk_size or get_chunk_size(input_length)
    chunks = [content[i:i + chunk_size] for i in range(0, input_length, chunk_size)]
    # Initialize history outside the loop so the conversation carries over between chunks
    history = {'internal': [], 'visible': []}
    output_data = []
    for chunk in chunks:
        # Append the next chunk to the prompt; the prompt grows as chunks are processed
        prompt += chunk
        # Create the request payload with the default parameters and the optional parameters from kwargs
        request_payload = {
            'user_input': prompt,
            'history': history,
            'mode': MODE,
            'character': CHARACTER,
            'instruction_template': INSTRUCTION_TEMPLATE,
            'your_name': YOUR_NAME,
            'max_new_tokens': MAX_NEW_TOKENS,
            'do_sample': DO_SAMPLE,
            'temperature': TEMPERATURE,
            'top_p': TOP_P,
            'typical_p': TYPICAL_P,
            'epsilon_cutoff': EPSILON_CUTOFF,
            'eta_cutoff': ETA_CUTOFF,
            'tfs': TFS,
            'top_a': TOP_A,
            'repetition_penalty': REPETITION_PENALTY,
            'top_k': TOP_K,
            'min_length': MIN_LENGTH,
            'no_repeat_ngram_size': NO_REPEAT_NGRAM_SIZE,
            'num_beams': NUM_BEAMS,
            'penalty_alpha': PENALTY_ALPHA,
            'length_penalty': LENGTH_PENALTY,
            'early_stopping': EARLY_STOPPING,
            'mirostat_mode': MIROSTAT_MODE,
            'mirostat_tau': MIROSTAT_TAU,
            'mirostat_eta': MIROSTAT_ETA,
            'seed': SEED,
            'add_bos_token': ADD_BOS_TOKEN,
            'truncation_length': TRUNCATION_LENGTH,
            'ban_eos_token': BAN_EOS_TOKEN,
            'skip_special_tokens': SKIP_SPECIAL_TOKENS,
        }
        # Override the defaults with any optional parameters passed via kwargs
        request_payload.update(kwargs)
        response = requests.post(URI, json=request_payload)
        if response.status_code == 200:
            result = response.json()['results'][0]['history']
            visible_output = result['visible'][-1][1]
            # If the input file was HTML, convert the model's output to markdown
            if file_path and file_path.lower().endswith('.html'):
                visible_output = parse_html_to_markdown(visible_output)
            # Convert the output to a formatted JSON string
            formatted_output = json.dumps(visible_output, indent=4, separators=(',', ': '))
            # Print the formatted output
            print(formatted_output)
            history = result
            # Collect output data
            output_data.append(formatted_output)
        time.sleep(1)  # Add a delay of 1 second between API requests
    # Write output data to file
    if output_file:
        with open(output_file, 'w') as file:
            file.write('\n'.join(output_data))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a file with AI.")
    parser.add_argument("-f", "--file", nargs="?", help="The path to the file.")
    parser.add_argument("-p", "--prompt", nargs="?", help="The prompt for the AI.")
    parser.add_argument("-c", "--chunk-size", type=int, default=None, help="The size of each processing chunk.")
    parser.add_argument("-o", "--output-file", nargs="?", help="The output file to save the visible output.")
    # Add some optional arguments for the chat API parameters
    parser.add_argument("--temperature", type=float, default=None, help="The temperature for sampling.")
    parser.add_argument("--top-p", type=float, default=None, help="The top-p for sampling.")
    parser.add_argument("--top-k", type=int, default=None, help="The top-k for sampling.")
    parser.add_argument("--repetition-penalty", type=float, default=None, help="The repetition penalty for sampling.")
    args = parser.parse_args()
    # Collect the optional sampling arguments that were actually provided
    optional_args = {}
    if args.temperature is not None:
        optional_args['temperature'] = args.temperature
    if args.top_p is not None:
        optional_args['top_p'] = args.top_p
    if args.top_k is not None:
        optional_args['top_k'] = args.top_k
    if args.repetition_penalty is not None:
        optional_args['repetition_penalty'] = args.repetition_penalty
    run(args.prompt, args.file, args.chunk_size, args.output_file, **optional_args)
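If you want to try it, this is roughly how I run it (the filename document_ai.py is just an example, save it as whatever you like):

python document_ai.py -f some_document.html -o output.md --temperature 0.7

or with a plain prompt instead of a file:

python document_ai.py -p "Summarize the main points of this text" -c 1500

The -c flag controls the chunk size in characters; if you leave it out, the script picks one based on the input length and MAX_NEW_TOKENS.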
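On the "adapt it to the OpenAI API" point from the top of the post: here's a minimal sketch of what swapping the backend might look like, using requests against the standard /v1/chat/completions endpoint. It's meant to sit alongside the constants above; the model name and the OPENAI_API_KEY environment variable are just assumptions, and only the parameters with direct equivalents (temperature, top_p, max_tokens) carry over, since the oobabooga-specific ones (mirostat, tfs, etc.) don't exist there.

import os
import requests

OPENAI_URI = "https://api.openai.com/v1/chat/completions"
API_KEY = os.environ.get("OPENAI_API_KEY")  # assumed to be set in your environment

def run_chunk_openai(prompt_with_chunk):
    # Same idea as request_payload above, but in OpenAI's chat format
    payload = {
        "model": "gpt-3.5-turbo",  # example model name, use whatever you have access to
        "messages": [{"role": "user", "content": prompt_with_chunk}],
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "max_tokens": MAX_NEW_TOKENS,
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}
    response = requests.post(OPENAI_URI, json=payload, headers=headers)
    response.raise_for_status()
    # The reply text lives in choices[0].message.content
    return response.json()["choices"][0]["message"]["content"]

Note that the stateful history dict from the local API has no equivalent here; with OpenAI you'd keep appending to the messages list instead if you want the earlier chunks remembered.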