Here’s my code. If you take the time to look at this… thank you.
from openai import OpenAI
import os
# Create the OpenAI client used for every request in this script.
# The key is taken from the OPENAI_API_KEY environment variable when it is
# set, so a real key never has to be saved in this file. If the variable is
# not set, the placeholder below is used; replace it with your own key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "MY_API_KEY"))
# Token budget for one request. Adjust depending on your model
# (GPT-3.5, GPT-4, etc.). It is the default budget used when splitting text.
MAX_TOKENS = 4000


# Function to split the text into manageable chunks
def split_text_into_chunks(text: str, max_tokens: int = MAX_TOKENS) -> list:
    """Split *text* into chunks that each hold a bounded number of words.

    Words are found with ``str.split()``, a simple whitespace split that only
    approximates real model tokens (a real tokenizer would be more accurate).
    Every chunk holds at most ``max_tokens // 3`` words, which leaves the rest
    of the budget free for the model's output. Because the words are re-joined
    with single spaces, the original line breaks and repeated whitespace are
    not preserved.

    Args:
        text: The full text to split.
        max_tokens: Token budget the chunk size is derived from.

    Returns:
        A list of chunk strings in their original order. The list is empty
        when *text* contains no words.
    """
    words = text.split()
    # Never let the chunk size drop below one word: a size of zero (any
    # max_tokens under 3) would otherwise produce an empty leading chunk.
    words_per_chunk = max(1, max_tokens // 3)
    return [
        " ".join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]
# Instructions sent as the system message with every chunk. The wording is
# kept exactly as written, including the line break before "You are not
# allowed to review content"; only the layout of the literal has changed.
_SYSTEM_PROMPT = (
    "You have a singular automated task and purpose: to improve the quality of text "
    "transcripts. "
    "You do this by first taking the entire transcript and making the text into a "
    "single paragraph. "
    "Then, given the context, you make one new paragraph for each speaker in the "
    "audio, with **Speaker 1:** at the start of each paragraph. "
    "You may use speaker names if they are presented in the text, but you may also "
    "just number the speakers sequentially. "
    "Next, in each paragraph, you perform a revision according to the following "
    "instructions: "
    "You are not allowed to remove any words of content, such that when comparing "
    "the output back to the input, there should be no loss of meaning. "
    "You are not allowed to correct grammar or substitute incorrect word usage "
    "(such as replacing mute point with moot point if the speaker uses the common "
    "incorrect word), or add words for clarity. "
    "You are only allowed to remove stutters or repeated words (except when the "
    "repeated words are for emphasis, such as saying it was very, very, difficult), "
    "you may also remove a thought fragment (such as a speaker saying I was--He told "
    "me something...) when the speaker changes thought entirely midsentence. "
    "You may also remove an interjection from another speaker that disrupts a "
    "thought from the current speaker and does not add content or new information, "
    "especially single word interjections like wow! or really? (but not necessarily "
    "limited to single word interjections). "
    "In this case, the interjection speaker paragraph should be removed entirely and "
    "the current speakers dialogue should continue uninterrupted in a single "
    "paragraph. "
    "You may also remove filler words or speakers ticks, such as repeated usage of "
    "um like you know, but you should not remove natural lead ins or transitive "
    "words that naturally break up dialogue, such as now or then. \n"
    "You are not allowed to review content for correctness, in fact, other than "
    "performing your removal of filler words, you do not care about context at all. "
    "You succeed when the output has the maximum possible similarity with the input "
    "in content and the minimum possible extraneous words according to the above "
    "guidelines, nothing was summarized in the output, no words are present in the "
    "output text that was not in the input text aside from the new paragraph headers "
    "(this task is purely reductive in nature), and each speaker has a single "
    "paragraph of text per speaking engagement with a paragraph header to identify "
    "the speaker."
)

# Reminder placed directly ahead of the transcript text in the user message.
_USER_PROMPT_PREFIX = (
    "Here is the current section of the transcript for you to revise according to "
    "your instructions. "
    "Remember, do not summarize anything or add to the text except for the "
    "paragraph headers. "
    "Focus only on your instructions. "
)


def process_chunk(chunk):
    """Send one transcript chunk to the chat model and return the revised text.

    Args:
        chunk: A section of the transcript as plain text.

    Returns:
        The model's reply with leading and trailing whitespace removed. An
        empty string is returned, without calling the API, when *chunk* is
        empty or only whitespace. An empty string is also returned when the
        reply carries no text content.
    """
    # Nothing to revise: skip the request instead of asking the model to
    # rewrite an empty transcript.
    if not chunk.strip():
        return ""

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",  # Use a newer model (or gpt-4 if you have access)
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": f"{_USER_PROMPT_PREFIX}{chunk}"},
        ],
        max_tokens=MAX_TOKENS,
        # temperature and top_p are both 0 to keep the rewrite as repeatable
        # as possible.
        temperature=0,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # The API types the message content as optional, so guard against None
    # before stripping.
    content = response.choices[0].message.content
    return (content or "").strip()
# Function to edit a large transcript
def edit_large_transcript(input_file, output_file):
    """Revise a transcript file chunk by chunk and save the result.

    The input file is read as UTF-8, split with ``split_text_into_chunks``,
    and each chunk is passed through ``process_chunk``. The revised chunks
    are written to *output_file* as UTF-8, each followed by a newline, and a
    confirmation line is printed.

    Args:
        input_file: Path of the transcript to read.
        output_file: Path the revised transcript is written to. Its parent
            folder is created if it does not exist yet.
    """
    with open(input_file, "r", encoding="utf-8") as source:
        text = source.read()

    # Create the destination folder before any chunk is processed, so a
    # missing folder cannot discard the results after all the API calls ran.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    chunks = split_text_into_chunks(text)
    # Build the result with one join rather than repeated string +=.
    edited_text = "".join(process_chunk(chunk) + "\n" for chunk in chunks)

    with open(output_file, "w", encoding="utf-8") as destination:
        destination.write(edited_text)
    print(f"Editing completed. Results saved to {output_file}")
# Example usage:
#   python this_script.py [input_file [output_file]]
# Either path may be omitted; the defaults below are used in that case.
if __name__ == "__main__":
    import sys

    default_input_file = r"C:\Users\mcmas\Desktop\TriDot Podcast\Output Files\TDP - 253 - Swim Straight! Your Guide to Open-Water Success.txt"  # Input file path
    default_output_file = r"C:\Users\mcmas\Desktop\TriDot Podcast\Revised Output Files\TDP - 253 - Swim Straight! Your Guide to Open-Water Success.txt"  # Output file path

    cli_args = sys.argv[1:]
    input_file = cli_args[0] if len(cli_args) > 0 else default_input_file
    output_file = cli_args[1] if len(cli_args) > 1 else default_output_file
    edit_large_transcript(input_file, output_file)