I want to correct the English text inside some HTML tags. I have this prompt:
You are an expert English grammar corrector. Please follow these rules strictly:
Correct the following text, focusing on grammatical and lexical corrections.
Do not add or modify punctuation unless necessary.
Preserve all HTML tags (e.g., <em>, <strong>). Do not add any html tags.
Return only the corrected text, without explanations or additional text.
It corrects the text very well, but I want to add these extra rules:
Preserve all original punctuation marks, including quotation marks, commas, and periods.
Maintain original quotation marks and punctuation unless there's a clear grammatical error.
Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.
With these rules added, the correction is no longer as good: even though the sentences are 99% the same, many of them are clearly not corrected as well as they were before I added the extra rules.
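Concretely, the extra rules are simply appended to the same system prompt, so the instruction ends up roughly like this (a sketch; extra_rules is just an illustrative name):

extra_rules = """
Preserve all original punctuation marks, including quotation marks, commas, and periods.
Maintain original quotation marks and punctuation unless there's a clear grammatical error.
Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.
"""
instruction = instruction + extra_rules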
Let's also say that I have this case:
<p class="text_obisnuit">After having successfully eluded the authorities for years, Hannibal peacefully lives in Italy in disfrazado de erudito en arte. Trouble strikes again when he is discovered leaving a deserving few dead in the process. </p>
You can see that the words “disfrazado de erudito en arte” are in Spanish.
I have already added the rule “Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.”, but ChatGPT still translates those words into English. Why?
Python code:
import os
import re
from openai import OpenAI
import html
# Initialize the OpenAI client
client = OpenAI(api_key="YOUR-API-KEY")
# Source and output directories
source_dir = r"d:\3"
output_dir = os.path.join(source_dir, "Output")
# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)
def grammar_check(text):
try:
instruction = """
You are an expert English grammar corrector. Please follow these rules strictly:
Correct the following text, focusing on grammatical and lexical corrections.
Do not add or modify punctuation unless necessary.
Preserve all HTML tags (e.g., <em>, <strong>). Do not add any html tags.
Return only the corrected text, without explanations or additional text.
"""
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": instruction},
{"role": "user", "content": text}
]
)
corrected = response.choices[0].message.content.strip()
        # Remove any extra text the model might have added
corrected = re.sub(r'^(Corrected text:?\s*)', '', corrected, flags=re.IGNORECASE)
return corrected
except Exception as e:
print(f"Error in grammatical correction: {e}")
return text
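# Illustrative call (the exact output depends on the model and can vary between runs):
#   grammar_check('She <em>go</em> to school every days.')
#   might return something like 'She <em>goes</em> to school every day.'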
def separate_non_english(text):
    # Regex that matches runs of non-ASCII characters; those runs are treated as non-English text
non_english_pattern = r'[^\x00-\x7F]+'
matches = re.finditer(non_english_pattern, text)
parts = []
last_end = 0
for match in matches:
start, end = match.span()
if start > last_end:
parts.append(('english', text[last_end:start])) # English text before the non-English part
parts.append(('non-english', match.group(0))) # Non-English part
last_end = end
if last_end < len(text):
parts.append(('english', text[last_end:])) # Remaining English text
return parts
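# Note: the pattern above only flags runs of non-ASCII characters, e.g.
#   separate_non_english("crème") -> [('english', 'cr'), ('non-english', 'è'), ('english', 'me')]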
def process_html_content(content):
def replace_content(match):
full_match = match.group(0)
tag_content = match.group(2)
print(f"Processing tag content: {tag_content[:30]}...")
# Separate content into English and non-English parts
separated_content = separate_non_english(html.unescape(tag_content))
corrected_content = ""
for part_type, part_text in separated_content:
if part_type == 'english':
corrected_content += grammar_check(part_text)
else:
corrected_content += part_text # Keep non-English text unchanged
return full_match.replace(tag_content, corrected_content)
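    # Each pattern captures the opening tag (group 1), the inner text (group 2) and the closing tag (group 3).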
patterns = [
r'(<p class="text_obisnuit2">)(.*?)(</p>)',
r'(<p class="text_obisnuit">)(.*?)(</p>)'
]
for pattern in patterns:
print(f"Processing pattern: {pattern}")
content = re.sub(pattern, replace_content, content, flags=re.DOTALL)
return content
def process_html_file(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
processed_content = process_html_content(content)
output_path = os.path.join(output_dir, os.path.basename(file_path))
with open(output_path, 'w', encoding='utf-8') as file:
file.write(processed_content)
print(f"File processed and saved: {output_path}")
# Process all HTML files in the directory
for filename in os.listdir(source_dir):
if filename.endswith('.html'):
file_path = os.path.join(source_dir, filename)
print(f"Processing file: {filename}")
process_html_file(file_path)
print("Processing of all files has been completed.")