Prompt problems! ChatGPT doesn't follow the prompts!

I want to correct the English text inside some HTML tags. I have this prompt:

You are an expert English grammar corrector. Please follow these rules strictly:
Correct the following text, focusing on grammatical and lexical corrections.
Do not add or modify punctuation unless necessary.
Preserve all HTML tags (e.g., <em>, <strong>). Do not add any html tags.
Return only the corrected text, without explanations or additional text.

It corrects the text very well. But I want to add two other prompts:

Preserve all original punctuation marks, including quotation marks, commas, and periods.
Maintain original quotation marks and punctuation unless there's a clear grammatical error.
Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.

In this case, the correction is no longer as good: even though the sentences are 99% the same, many of them are no longer corrected as well as they were before I added these prompts.

Let's also say that I have this case:

<p class="text_obisnuit">After having successfully eluded the authorities for years, Hannibal peacefully lives in Italy in disfrazado de erudito en arte. Trouble strikes again when he is discovered leaving a deserving few dead in the process. </p>

You can see that the words “disfrazado de erudito en arte” are in Spanish.

I have already written the prompt: “Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.” but ChatGPT still translates those words into English. Why?

Python code:

import os
import re
from openai import OpenAI
import html

# Initialize the OpenAI client
client = OpenAI(api_key="YOUR-API-KEY")

# Source and destination directories
source_dir = r"d:\3"
output_dir = os.path.join(source_dir, "Output")

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

def grammar_check(text):
    """Ask the chat model to correct the grammar of ``text``.

    The fragment is sent as the user message together with a fixed system
    instruction. Whitespace-only input is returned untouched without an API
    call, and the leading/trailing whitespace of ``text`` is re-attached to
    the model's reply so that a caller concatenating several fragments does
    not lose the spaces between them.

    Args:
        text: Fragment to correct (may contain HTML tags).

    Returns:
        The corrected fragment, or ``text`` unchanged when it is blank or the
        request fails.
    """
    if not text.strip():
        # Nothing to correct; avoid a pointless API round trip.
        return text

    # Remember the surrounding whitespace: the reply is stripped below.
    leading = text[:len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]

    instruction = """
You are an expert English grammar corrector. Please follow these rules strictly:
Correct the following text, focusing on grammatical and lexical corrections.
Do not add or modify punctuation unless necessary.
Preserve all HTML tags (e.g., <em>, <strong>). Do not add any html tags.


Return only the corrected text, without explanations or additional text.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": instruction},
                {"role": "user", "content": text}
            ]
        )
        corrected = response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort boundary: any failure leaves the fragment as it was.
        print(f"Error in grammatical correction: {e}")
        return text

    # Remove any extra label the model may have put in front of its answer
    corrected = re.sub(r'^(Corrected text:?\s*)', '', corrected, flags=re.IGNORECASE)
    return f"{leading}{corrected}{trailing}"

def separate_non_english(text):
    """Split ``text`` into alternating ASCII and non-ASCII segments.

    Every maximal run of characters outside the ASCII range is labelled
    ``'non-english'``; the text between such runs is labelled ``'english'``.
    Words from other languages that are spelled with plain ASCII letters
    therefore end up in the ``'english'`` segments.

    Returns:
        A list of ``(label, segment)`` tuples in document order; empty for
        empty input.
    """
    segments = []
    cursor = 0

    for hit in re.finditer(r'[^\x00-\x7F]+', text):
        begin, stop = hit.span()
        # ASCII text sitting between the previous run and this one
        if cursor < begin:
            segments.append(('english', text[cursor:begin]))
        segments.append(('non-english', hit.group(0)))
        cursor = stop

    # Whatever ASCII text follows the last non-ASCII run
    tail = text[cursor:]
    if tail:
        segments.append(('english', tail))

    return segments

def process_html_content(content):
    """Grammar-correct the text inside the targeted ``<p>`` tags of ``content``.

    Only paragraphs whose opening tag is exactly ``<p class="text_obisnuit2">``
    or ``<p class="text_obisnuit">`` are processed. The inner text of each is
    split by ``separate_non_english``; the ``'english'`` segments go through
    ``grammar_check`` and the other segments are kept verbatim.

    Args:
        content: Full HTML document as a string.

    Returns:
        The document with the matched paragraphs rewritten.
    """
    def replace_content(match):
        opening, tag_content, closing = match.group(1), match.group(2), match.group(3)
        print(f"Processing tag content: {tag_content[:30]}...")

        # NOTE: entities are decoded here and not re-encoded afterwards, so
        # e.g. "&amp;" comes back as a bare "&" in the output.
        pieces = []
        for part_type, part_text in separate_non_english(html.unescape(tag_content)):
            if part_type == 'english':
                pieces.append(grammar_check(part_text))
            else:
                pieces.append(part_text)  # Keep non-English text unchanged

        # Rebuild from the captured groups. Using str.replace on the whole
        # match would also rewrite any copy of the inner text that happens to
        # occur inside the opening tag.
        return opening + "".join(pieces) + closing

    patterns = [
        r'(<p class="text_obisnuit2">)(.*?)(</p>)',
        r'(<p class="text_obisnuit">)(.*?)(</p>)'
    ]

    for pattern in patterns:
        print(f"Processing pattern: {pattern}")
        content = re.sub(pattern, replace_content, content, flags=re.DOTALL)

    return content

def process_html_file(file_path):
    """Correct one HTML file and save the result in ``output_dir``.

    The file is read as UTF-8, passed through ``process_html_content`` and
    written, again as UTF-8, under the same base name in ``output_dir``.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        original_html = source.read()

    destination = os.path.join(output_dir, os.path.basename(file_path))
    corrected_html = process_html_content(original_html)

    with open(destination, 'w', encoding='utf-8') as target:
        target.write(corrected_html)

    print(f"File processed and saved: {destination}")

# Process every HTML file found directly in the source directory
for filename in os.listdir(source_dir):
    if not filename.endswith('.html'):
        continue
    file_path = os.path.join(source_dir, filename)
    print(f"Processing file: {filename}")
    process_html_file(file_path)

print("Processing of all files has been completed.")

I tried 3 other combinations of prompts, which take into account the formula “Do not alter, correct, or translate any non-English text. It should remain exactly as it is”, but none of them are good.

Likewise, the only problem is that it translates words from other languages. In other words, it clearly recognizes the foreign words, because it translates them. But I don’t want it to translate them; I want it to leave them exactly as they are in that HTML tag.

Option 1: Separate the Constraints into Specific Steps

You are an expert English grammar corrector. Please follow these rules strictly:
1. Correct the following text, focusing on grammatical and lexical corrections, while preserving the original structure.
2. Do not add or modify punctuation unless necessary.
3. Preserve all original HTML tags (e.g., <em>, <strong>) and do not add any new HTML tags.
4. Maintain all original punctuation marks, including quotation marks, commas, and periods, unless there's a clear grammatical error.
5. Do not alter, correct, or translate any non-English text. Leave it exactly as it is, even if it appears within the same HTML tag as English text.

Return only the corrected text, without explanations or additional content.

Option 2: Emphasize the Primary Task with Conditional Clauses

You are an expert English grammar corrector. Correct the following text, focusing primarily on grammatical and lexical improvements. However, adhere to the following conditions:
- Preserve all original HTML tags (e.g., <em>, <strong>) without adding new ones.
- Do not alter or remove any original punctuation marks unless they cause a clear grammatical error.
- Leave all non-English text exactly as it is, even if it appears within the same HTML tag as English text.

Return only the corrected text, ensuring all conditions are met.

Option 3: Use a Prioritized List of Rules

You are an expert English grammar corrector. Your primary task is to correct the text for grammar and vocabulary, while adhering to the following prioritized rules:
1. Correct grammar and vocabulary, ensuring clarity and correctness.
2. Do not add or modify punctuation unless it corrects a clear grammatical mistake.
3. Preserve all HTML tags and do not add any new HTML tags.
4. Retain all original punctuation marks, including quotation marks, commas, and periods, unless necessary for grammatical accuracy.
5. Do not alter, correct, or translate any non-English text. It should remain exactly as it is.

Return only the corrected text, strictly following these rules.

Hi @oanaaaa08 :wave:

You may try this:

You are 🐢Polepole🐢 English Grammar Corrector-TEST. Please follow these instructions carefully and strictly:

1. **Primary Task:**
   - Correct the following text, focusing only on grammatical and lexical corrections within the English language portions of the text.
   - Ensure clarity and correctness of English grammar and vocabulary without altering the original meaning.

2. **Preservation of Punctuation:**
   - Do not add, remove, or modify any punctuation marks (including quotation marks, commas, periods, etc.) unless there is a clear grammatical error that necessitates a change.

3. **HTML Tags:**
   - Preserve all original HTML tags (e.g., `<em>`, `<strong>`, `<p>`) exactly as they appear in the text. Do not add, remove, or alter any HTML tags.

4. **Non-English Text:**
   - **Critical Instruction:** Do not alter, correct, or translate any non-English text. Leave any non-English text exactly as it is, even if it appears within the same HTML tag as English text.
   - Treat all non-English content as untouchable and maintain it in its original form within the text.

5. **Output Formatting:**
   - Return only the corrected text, without any explanations, additional content, or changes to the original HTML structure.
   - Ensure that the output preserves the HTML tags and non-English text as specified.

---

### Example Scenario:

- **Input:**
  
  <p class="text_obisnuit">After having successfully eluded the authorities for years, Hannibal peacefully lives in Italy in disfrazado de erudito en arte. Trouble strikes again when he is discovered leaving a deserving few dead in the process. </p>
  
- **Expected Output:**
  
  <p class="text_obisnuit">After successfully eluding the authorities for years, Hannibal peacefully lives in Italy in disfrazado de erudito en arte. Trouble strikes again when he is discovered, leaving a deserving few dead in the process. </p>
  
In this example, the non-English text "disfrazado de erudito en arte" remains untouched, and HTML tags are preserved while English grammar is corrected.

### Additional Clarifications:
- Under no circumstances should the non-English text be altered or translated.
- The output must strictly adhere to the original structure, with only the necessary grammatical corrections applied to the English text.

thank you

Nope, I just tested your prompts by putting them into my Python code. Same problem. Yes, it corrects my text, but it also translates words from other languages, and ChatGPT should skip those words.

I do not know which model you are using, but interestingly it works in a regular custom GPT, and custom GPTs run on GPT-4o.

I tried 6 times, and it gave the correct response:


Yes, it works in a regular custom GPT, and custom GPTs run on GPT-4o. But it doesn’t work with the API key from the Python code!

I have to correct the mistakes in the English language in over 2000 html files. If I use the API KEY, everything will work for me while I’m doing something else. I can’t use MyGPT for this, I can’t select file by file, because it would take a very long time.

That’s why I use API KEY and Python

Hi @oanaaaa08 , I used API KEY and Python

Model GPT-4

Model GPT-4o

Model GPT-4o mini

my_project_folder/

├── .env------------------------> Environment file containing your API key
├── grammar_corrector.py----> The main Python script
├── requirements.txt----------> List of required Python packages
├── .gitignore------------------> (Optional) Ignore list for version control
└── home.html ----------------> The input HTML file


.env file

OPENAI_API_KEY=your-openai-api-key

grammar_corrector.py

The last line should contain your expected file names:

correct_html_file('home.html', 'home_corrected.html')

By default the code uses GPT-4o mini; you can change it:

import openai
from dotenv import load_dotenv
import os
import re
from bs4 import BeautifulSoup

# Load environment variables from the .env file
load_dotenv()

# Set your OpenAI API key from the environment variable
# (os.getenv returns None when OPENAI_API_KEY is not defined)
openai.api_key = os.getenv('OPENAI_API_KEY')

def mask_non_english(text):
    """Mask runs of non-ASCII characters with numbered placeholders.

    Each maximal run of characters outside the ASCII range is replaced, in a
    single left-to-right pass, by ``<MASKED_i>`` where ``i`` is the run's
    index in the returned list. Only non-ASCII characters are matched, so
    foreign words spelled with plain ASCII letters stay in the text.

    Args:
        text: Text to mask.

    Returns:
        A ``(masked_text, non_english_parts)`` tuple; ``non_english_parts[i]``
        is the original run behind ``<MASKED_i>``.
    """
    non_english_parts = []

    def _placeholder(match):
        non_english_parts.append(match.group(0))
        return f"<MASKED_{len(non_english_parts) - 1}>"

    # One pass over the text: every run gets its own placeholder, so a run
    # that is a substring of a later run can no longer leave stray non-ASCII
    # characters behind in the masked text.
    text = re.sub(r'[^\x00-\x7F]+', _placeholder, text)
    return text, non_english_parts

def unmask_non_english(text, non_english_parts):
    """Put the original non-ASCII runs back in place of their placeholders.

    Every occurrence of ``<MASKED_i>`` in ``text`` is replaced by
    ``non_english_parts[i]``; placeholders with no matching index are left
    as they are.
    """
    restored = text
    for index, original in enumerate(non_english_parts):
        placeholder = f"<MASKED_{index}>"
        restored = restored.replace(placeholder, original)
    return restored

def correct_html_file(input_file, output_file):
    """Correct the English grammar of the ``<p>`` elements of an HTML file.

    For each paragraph that consists of a single text node, the non-ASCII
    runs are masked, the text is sent to the chat model, the placeholders are
    restored and the text node is replaced by the result. The prettified
    document is then written to ``output_file``.

    Args:
        input_file: Path of the HTML file to read (UTF-8).
        output_file: Path of the corrected HTML file to write (UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    for paragraph in soup.find_all('p'):
        # Tag.string is None when the paragraph does not boil down to one
        # text node (e.g. text mixed with inline tags); calling replace_with
        # on it would raise AttributeError, so such paragraphs are skipped.
        text_node = paragraph.string
        if text_node is None:
            print("Skipping a paragraph without a single text node.")
            continue
        if not text_node.strip():
            continue

        original_text = paragraph.get_text()
        masked_text, non_english_parts = mask_non_english(original_text)

        # Simplified prompt
        prompt = f"Correct the grammar without changing HTML tags or non-English text: {masked_text}"

        # Call OpenAI's GPT-4o mini model to correct grammar. The old
        # openai.ChatCompletion entry point was removed in openai>=1.0.
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )

        corrected_text = unmask_non_english(response.choices[0].message.content, non_english_parts)
        text_node.replace_with(corrected_text)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

# Example usage: read home.html and write the corrected copy to home_corrected.html
correct_html_file('home.html', 'home_corrected.html')

requirements.txt File

openai
beautifulsoup4
python-dotenv

.gitignore File (Optional)

.env

home.html

INSERT YOUR TEXT HERE YOU WANT TO CORRECT ITS GRAMMAR. 

Run the Script

python grammar_corrector.py

home_corrected.html

YOUR CORRECTED TEXT WILL BE DISPLAYED HERE

And as default, it will save the corrected file to home_corrected.html.
You may change their names in grammar_corrector.py

Test 1 - translate the content of html tags:

The correction is good, but it also translates non-English words.

Test 1 - translate simple text lines:

Doesn’t do anything.

Your code gives this error, so I updated your code:

*** Remote Interpreter Reinitialized ***
Traceback (most recent call last):
  File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\CORRECT ENILISH\module1 - tipul.py", line 51, in <module>
    correct_html_file('home.html', 'home_corrected.html')
  File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\CORRECT ENILISH\module1 - tipul.py", line 39, in correct_html_file
    response = openai.ChatCompletion.create(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\necul\AppData\Local\Programs\Python\Python312\Lib\site-packages\openai\lib\_old_api.py", line 39, in __call__
    raise APIRemovedInV1(symbol=self._symbol)
openai.lib._old_api.APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

Your code (A little bit updated)

import os
from openai import OpenAI
from dotenv import load_dotenv
import re
from bs4 import BeautifulSoup

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def mask_non_english(text):
    """Mask runs of non-ASCII characters with numbered placeholders.

    Each maximal run of characters outside the ASCII range is replaced, in a
    single left-to-right pass, by ``<MASKED_i>`` where ``i`` is the run's
    index in the returned list. Only non-ASCII characters are matched, so
    foreign words spelled with plain ASCII letters stay in the text.

    Returns:
        A ``(masked_text, non_english_parts)`` tuple; ``non_english_parts[i]``
        is the original run behind ``<MASKED_i>``.
    """
    non_english_parts = []

    def _placeholder(match):
        non_english_parts.append(match.group(0))
        return f"<MASKED_{len(non_english_parts) - 1}>"

    # A single substitution pass gives every run its own placeholder, so a
    # run contained in a later run cannot leave non-ASCII text unmasked.
    text = re.sub(r'[^\x00-\x7F]+', _placeholder, text)
    return text, non_english_parts

def unmask_non_english(text, non_english_parts):
    """Replace the placeholders with the original non-ASCII text.

    ``<MASKED_i>`` becomes ``non_english_parts[i]`` wherever it occurs in
    ``text``.
    """
    position = 0
    for original in non_english_parts:
        text = text.replace(f"<MASKED_{position}>", original)
        position += 1
    return text

def correct_html_file(input_file, output_file):
    """Correct the English grammar of the ``<p>`` elements of an HTML file.

    For each paragraph that consists of a single text node, the non-ASCII
    runs are masked, the text is sent to the chat model, the placeholders are
    restored and the text node is replaced by the result. The prettified
    document is then written to ``output_file``.

    Args:
        input_file: Path of the HTML file to read (UTF-8).
        output_file: Path of the corrected HTML file to write (UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    for paragraph in soup.find_all('p'):
        # Tag.string is None when the paragraph does not boil down to one
        # text node (e.g. text mixed with inline tags); calling replace_with
        # on it would raise AttributeError, so such paragraphs are skipped.
        text_node = paragraph.string
        if text_node is None:
            print("Skipping a paragraph without a single text node.")
            continue
        if not text_node.strip():
            continue

        original_text = paragraph.get_text()
        masked_text, non_english_parts = mask_non_english(original_text)

        # Simplified prompt
        prompt = f"Correct the grammar without changing HTML tags or non-English text: {masked_text}"

        # Call OpenAI's GPT-4-turbo model (or another available model) to correct the grammar
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",  # Or another available model
            messages=[{"role": "user", "content": prompt}]
        )

        corrected_text = unmask_non_english(response.choices[0].message.content, non_english_parts)
        text_node.replace_with(corrected_text)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

# Example usage:
correct_html_file('home.html', 'home_corrected.html')

print("Processing of the file has been completed.")

This Python code does exactly what those 2 screenshots show.

You may try this, but it consumes more tokens.
Can you show me which words are translated to English? I could not spot them.
Also, can you provide your original text AS IS, so I can try it in my environment:

import openai
from dotenv import load_dotenv
import os
import re
from bs4 import BeautifulSoup

# Load environment variables from the .env file
load_dotenv()

# Set your OpenAI API key from the environment variable
# (os.getenv returns None when OPENAI_API_KEY is not defined)
openai.api_key = os.getenv('OPENAI_API_KEY')

def mask_non_english(text):
    """Mask runs of non-ASCII characters with numbered placeholders.

    Each maximal run of characters outside the ASCII range is replaced, in a
    single left-to-right pass, by ``<MASKED_i>`` where ``i`` is the run's
    index in the returned list. Only non-ASCII characters are matched, so
    foreign words spelled with plain ASCII letters stay in the text.

    Returns:
        A ``(masked_text, non_english_parts)`` tuple; ``non_english_parts[i]``
        is the original run behind ``<MASKED_i>``.
    """
    non_english_parts = []

    def _placeholder(match):
        non_english_parts.append(match.group(0))
        return f"<MASKED_{len(non_english_parts) - 1}>"

    # A single substitution pass gives every run its own placeholder, so a
    # run contained in a later run cannot leave non-ASCII text unmasked.
    masked_text = re.sub(r'[^\x00-\x7F]+', _placeholder, text)
    return masked_text, non_english_parts

def unmask_non_english(masked_text, non_english_parts):
    """Swap each ``<MASKED_i>`` placeholder back to ``non_english_parts[i]``.

    All occurrences of a placeholder are replaced; the input string itself is
    not modified.
    """
    result = masked_text
    for number, original in enumerate(non_english_parts):
        token = f"<MASKED_{number}>"
        result = result.replace(token, original)
    return result

def correct_html_file(input_file, output_file):
    """Correct the English grammar of the ``<p>`` elements of an HTML file.

    For each paragraph that consists of a single text node, the non-ASCII
    runs are masked, the text is sent to the chat model together with the
    detailed instruction prompt, the placeholders are restored and the text
    node is replaced by the result. The prettified document is then written
    to ``output_file``.

    Args:
        input_file: Path of the HTML file to read (UTF-8).
        output_file: Path of the corrected HTML file to write (UTF-8).
    """
    # Read HTML content from the input file
    with open(input_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all paragraph tags
    paragraphs = soup.find_all('p')

    for paragraph in paragraphs:
        # Tag.string is None when the paragraph does not boil down to one
        # text node (e.g. text mixed with inline tags); calling replace_with
        # on it would raise AttributeError, so such paragraphs are skipped.
        text_node = paragraph.string
        if text_node is None:
            print("Skipping a paragraph without a single text node.")
            continue
        if not text_node.strip():
            continue

        original_text = paragraph.get_text()

        # Mask non-English text automatically
        masked_text, non_english_parts = mask_non_english(original_text)

        # Create the instruction prompt
        prompt = f"""
        You are 🐢Polepole🐢 English Grammar Corrector-TEST. Please follow these instructions carefully and strictly:
        
        1. Primary Task:
        - Correct the following text, focusing only on grammatical and lexical corrections within the English language portions of the text.
        - Ensure clarity and correctness of English grammar and vocabulary without altering the original meaning.

        2. Preservation of Punctuation:
        - Do not add, remove, or modify any punctuation marks (including quotation marks, commas, periods, etc.) unless there is a clear grammatical error that necessitates a change.

        3. HTML Tags:
        - Preserve all original HTML tags (e.g., <em>, <strong>, <p>) exactly as they appear in the text. Do not add, remove, or alter any HTML tags.

        4. Non-English Text:
        - Critical Instruction: Do not alter, correct, or translate any non-English text. Leave any non-English text exactly as it is, even if it appears within the same HTML tag as English text.
        - Treat all non-English content as untouchable and maintain it in its original form within the text.

        5. Output Formatting:
        - Return only the corrected text, without any explanations, additional content, or changes to the original HTML structure.
        - Ensure that the output preserves the HTML tags and non-English text as specified.

        Here is the text for correction:
        {masked_text}
        """

        # Call OpenAI's GPT-4o model to correct grammar. The old
        # openai.ChatCompletion entry point was removed in openai>=1.0.
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in English grammar correction."},
                {"role": "user", "content": prompt}
            ]
        )

        # Unmask the non-English text
        corrected_text = unmask_non_english(response.choices[0].message.content, non_english_parts)

        # Update the paragraph text in the soup
        text_node.replace_with(corrected_text)

    # Save the corrected HTML to a new file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

# Example usage: read home.html and write the corrected copy to home_corrected.html
correct_html_file('home.html', 'home_corrected.html')

This code gave this output, left is original, right is return:

Please test this example and compare the result with it:

         <p class="text_obisnuit">I always associate the concept of leadership to life events or situations, a algunas historias o personajes, moments, actions or experiences once lived by other people. Hence it is that, wanting to expand my knowledge and understanding of what it means to be a leader, I reread yesterday the biography of the famous Irish writer Oscar Wilde, whose workse traveledt around the world.</p>																											
         <p class="text_obisnuit2"><em>Wilde cieszył się dużym szacunkiem w wyższych sferach, a person whose influence was exerted on all those whom he came in contact with, winning their sympathy with his finet manners and his elegant personalityt. His works, an incredible "Ethisrr Sioreff" have been admired, he won many literary awards.</em></p>
		 
		 
		 I always associate the concept of leadership to life events or situations, a algunas historias o personajes, moments, actions or experiences once lived by other people. Hence it is that, wanting to expand my knowledge and understanding of what it means to be a leader, I reread yesterday the biography of the famous Irish writer Oscar Wilde, whose workse traveledt around the world.
		 
         Wilde cieszył się dużym szacunkiem w wyższych sferach, a person whose influence was exerted on all those whom he came in contact with, winning their sympathy with his finet manners and his elegant personalityt. His works, an incredible "Ethisrr Sioreff" have been admired, he won many literary awards.

Your code gives this error, again:

*** Remote Interpreter Reinitialized ***
Traceback (most recent call last):
  File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\CORRECT ENILISH\Test 4.py", line 90, in <module>
    correct_html_file('home.html', 'home_corrected.html')
  File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\CORRECT ENILISH\Test 4.py", line 71, in correct_html_file
    response = openai.ChatCompletion.create(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\necul\AppData\Local\Programs\Python\Python312\Lib\site-packages\openai\lib\_old_api.py", line 39, in __call__
    raise APIRemovedInV1(symbol=self._symbol)
openai.lib._old_api.APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

>>>

EUREKA!! This seems to work great.

I tested the new Python code below on this example, in home.html

         <p class="text_obisnuit">I always associate the concept of leadership to life events or situations, a algunas historias o personajes, moments, actions or experiences once lived by other people. Hence it is that, wanting to expand my knowledge and understanding of what it means to be a leader, I reread yesterday the biography of the famous Irish writer Oscar Wilde, whose workse traveledt around the world.</p>																											
         <p class="text_obisnuit2"><em>Wilde cieszył się dużym szacunkiem w wyższych sferach, a person whose influence was exerted on all those whom he came in contact with, winning their sympathy with his finet manners and his elegant personalityt. His works, an incredible "Ethisrr Sioreff" have been admired, he won many literary awards.</em></p>
		 
		 
		 I always associate the concept of leadership to life events or situations, a algunas historias o personajes, moments, actions or experiences once lived by other people. Hence it is that, wanting to expand my knowledge and understanding of what it means to be a leader, I reread yesterday the biography of the famous Irish writer Oscar Wilde, whose workse traveledt around the world.
		 
         Wilde cieszył się dużym szacunkiem w wyższych sferach, a person whose influence was exerted on all those whom he came in contact with, winning their sympathy with his finet manners and his elegant personalityt. His works, an incredible "Ethisrr Sioreff" have been admired, او برنده جوایز ادبی بسیاری شد.

Python Code:

import os
from openai import OpenAI
from dotenv import load_dotenv
import re
import logging

# Set up logging (DEBUG level, timestamped messages)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI client
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # No key in the environment: fall back to the string below, which is a
    # placeholder (presumably meant to be replaced with a real key)
    api_key = "YOUR-API-KEY"
    logging.warning("Using hardcoded API key. It's recommended to use environment variables for security.")

client = OpenAI(api_key=api_key)

def mask_non_english(text):
    """Mask runs of non-ASCII characters with numbered placeholders.

    Each maximal run of characters outside the ASCII range is replaced, in a
    single left-to-right pass, by ``<MASKED_i>`` where ``i`` is the run's
    index in the returned list. Only non-ASCII characters are matched, so
    foreign words spelled with plain ASCII letters stay in the text.

    Returns:
        A ``(masked_text, non_english_parts)`` tuple; ``non_english_parts[i]``
        is the original run behind ``<MASKED_i>``.
    """
    non_english_parts = []

    def _placeholder(match):
        non_english_parts.append(match.group(0))
        return f"<MASKED_{len(non_english_parts) - 1}>"

    # A single substitution pass gives every run its own placeholder, so a
    # run contained in a later run cannot leave non-ASCII text unmasked.
    masked_text = re.sub(r'[^\x00-\x7F]+', _placeholder, text)
    return masked_text, non_english_parts

def unmask_non_english(masked_text, non_english_parts):
    """Restore the masked non-ASCII runs.

    Each ``<MASKED_i>`` placeholder found in ``masked_text`` is replaced by
    ``non_english_parts[i]``; the restored string is returned.
    """
    output = masked_text
    for slot, original in enumerate(non_english_parts):
        marker = f"<MASKED_{slot}>"
        output = output.replace(marker, original)
    return output

def _correct_block(block, block_number):
    """Return the grammar-corrected version of one text block.

    Non-ASCII runs are masked before the request and restored afterwards.
    The block is returned unchanged when it is blank, when the API call
    fails, or when the model returns no content.
    """
    if not block.strip():
        return block

    masked_text, non_english_parts = mask_non_english(block)

    prompt = f"""
        You are 🐢Polepole🐢 English Grammar Corrector-TEST. Please follow these instructions carefully and strictly:
        1. Primary Task:
        - Correct the following text, focusing only on grammatical and lexical corrections within the English language portions of the text.
        - Ensure clarity and correctness of English grammar and vocabulary without altering the original meaning.
        2. Preservation of Punctuation and Structure:
        - Do not add, remove, or modify any punctuation marks or HTML tags unless there is a clear grammatical error that necessitates a change.
        3. Non-English Text:
        - Do not alter, correct, or translate any non-English text. Leave any non-English text exactly as it is.
        4. Output Formatting:
        - Return only the corrected text, without any explanations or additional content.
        Here is the text for correction:
        {masked_text}
        """

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert in English grammar correction."},
                {"role": "user", "content": prompt}
            ]
        )
    except Exception as e:
        # Best-effort: a failed request leaves this block as it was.
        logging.error(f"Error calling OpenAI API: {e}")
        return block
    logging.debug(f"Successfully received response for block {block_number}")

    reply = response.choices[0].message.content
    if not reply:
        # An empty/None reply must not wipe out the original text.
        return block
    return unmask_non_english(reply, non_english_parts)


def correct_html_file(input_file, output_file):
    """Correct the English grammar of an HTML file and write the result.

    The inner text of every bare ``<p>...</p>`` element is corrected
    separately. When the file contains no such element, the entire content is
    sent as one block. Text outside the corrected blocks is copied through
    unchanged. Read and write errors are logged rather than raised.

    Args:
        input_file: Path of the HTML file to read (UTF-8).
        output_file: Path of the corrected file to write (UTF-8).
    """
    logging.info(f"Starting correction of {input_file}")

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except (OSError, UnicodeError) as e:
        logging.error(f"Error reading input file: {e}")
        return
    logging.debug(f"Successfully read {input_file}")
    logging.debug(f"HTML content length: {len(html_content)} characters")
    logging.debug(f"First 200 characters of HTML content: {html_content[:200]}")

    # Look for any HTML tags
    all_tags = re.findall(r'<[^>]+>', html_content)
    logging.info(f"Found {len(all_tags)} HTML tags in the file")
    if all_tags:
        logging.debug(f"First 5 tags found: {all_tags[:5]}")
    else:
        logging.warning("No HTML tags found in the file. Is this a valid HTML file?")

    # Work with the positions of the paragraph bodies instead of their text,
    # so each correction lands exactly where its paragraph was found.
    spans = [match.span(1) for match in re.finditer(r'<p>(.*?)</p>', html_content, re.DOTALL)]
    logging.info(f"Found {len(spans)} paragraphs to process")

    if not spans:
        logging.warning("No paragraphs found. Attempting to process entire content as one block.")
        spans = [(0, len(html_content))]

    pieces = []
    cursor = 0
    for i, (start, end) in enumerate(spans):
        paragraph = html_content[start:end]
        logging.debug(f"Processing block {i+1}, length: {len(paragraph)} characters")

        corrected_text = _correct_block(paragraph, i + 1)

        # Keep everything between the previous block and this one untouched;
        # a file with a single <p> therefore keeps its surrounding markup.
        pieces.append(html_content[cursor:start])
        pieces.append(corrected_text)
        cursor = end

        logging.debug(f"Corrected block {i+1}, new length: {len(corrected_text)} characters")

    pieces.append(html_content[cursor:])
    html_content = "".join(pieces)

    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(html_content)
        logging.info(f"Successfully wrote corrected content to {output_file}")
    except (OSError, UnicodeError) as e:
        logging.error(f"Error writing output file: {e}")

# Run the correction only when the file is executed as a script
if __name__ == "__main__":
    correct_html_file('home.html', 'home_corrected.html')
    logging.info("Script execution completed")
1 Like

@polepole FINAL SOLUTION ! Works PERFECT ! Thanks for your help.

import os
from openai import OpenAI
from dotenv import load_dotenv
import re
import logging

# Set up logging (DEBUG level, timestamped messages)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Load environment variables from the .env file
load_dotenv()

# Initialize the OpenAI client
api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # No key in the environment: fall back to the string below, which is a
    # placeholder (presumably meant to be replaced with a real key)
    api_key = "your-api-key-here"
    logging.warning("Using hardcoded API key. It's recommended to use environment variables for security.")

client = OpenAI(api_key=api_key)

def mask_non_english(text):
    """Mask runs of non-ASCII characters with numbered placeholders.

    Each maximal run of characters outside the ASCII range is replaced, in a
    single left-to-right pass, by ``<MASKED_i>`` where ``i`` is the run's
    index in the returned list. Only non-ASCII characters are matched, so
    foreign words spelled with plain ASCII letters stay in the text.

    Returns:
        A ``(masked_text, non_english_parts)`` tuple; ``non_english_parts[i]``
        is the original run behind ``<MASKED_i>``.
    """
    non_english_parts = []

    def _placeholder(match):
        non_english_parts.append(match.group(0))
        return f"<MASKED_{len(non_english_parts) - 1}>"

    # A single substitution pass gives every run its own placeholder, so a
    # run contained in a later run cannot leave non-ASCII text unmasked.
    masked_text = re.sub(r'[^\x00-\x7F]+', _placeholder, text)
    return masked_text, non_english_parts

def unmask_non_english(masked_text, non_english_parts):
    """Put the original character runs back in place of their placeholders.

    For each index ``i`` of ``non_english_parts``, every occurrence of
    ``<MASKED_i>`` in ``masked_text`` is replaced by ``non_english_parts[i]``.
    Placeholders with no matching index are left untouched, and parts whose
    placeholder is absent from the text are simply not inserted.

    Returns the restored string.
    """
    restored = masked_text
    for slot, original_run in enumerate(non_english_parts):
        token = f"<MASKED_{slot}>"
        restored = restored.replace(token, original_run)
    return restored

def correct_punctuation(text):
    """Normalise comma/period placement next to a closing double quote.

    Two substitutions are applied in order:

    1. ``word,"`` followed by one whitespace character becomes ``word", ``
       (the comma is moved outside the quotation mark).
    2. ``word."`` followed by one whitespace character becomes ``word." ``
       (the period stays inside; only the whitespace character is replaced
       by a plain space).

    Returns the adjusted string.
    """
    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r'(\w+),"\s', r'\1", '),
        (r'(\w+)\."\s', r'\1." '),
    )

    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def correct_html_file(input_file, output_file):
    """Grammar-correct the paragraphs of an HTML file and write the result.

    The file is read as UTF-8, the inner text of every ``<p ...>...</p>``
    element is sent (one request per paragraph) to the chat-completions API
    through the module-level ``client``, and the document with the corrected
    paragraphs spliced back in is written to ``output_file``. When the file
    contains no ``<p>`` element, the whole content is processed as one block.

    Before each request, non-ASCII runs are hidden with ``mask_non_english``;
    afterwards they are restored with ``unmask_non_english`` and the reply is
    passed through ``correct_punctuation``.

    Args:
        input_file: Path of the HTML file to read.
        output_file: Path the corrected HTML is written to.

    Returns:
        None. Read, API and write failures are logged rather than raised; a
        block whose request fails (or comes back empty) is kept unchanged.
    """
    logging.info(f"Starting correction of {input_file}")

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            html_content = file.read()
        logging.debug(f"Successfully read {input_file}")
        logging.debug(f"HTML content length: {len(html_content)} characters")
    except Exception as e:
        logging.error(f"Error reading input file: {e}")
        return

    # Look for any HTML tags (diagnostic only)
    all_tags = re.findall(r'<[^>]+>', html_content)
    logging.info(f"Found {len(all_tags)} HTML tags in the file")

    # Match <p> with or without attributes (e.g. <p class="...">) and record
    # the position of each paragraph body, so the replacement is spliced in
    # by offset instead of by searching for the text again.
    paragraph_pattern = re.compile(r'<p(?:\s[^>]*)?>(.*?)</p>', re.DOTALL | re.IGNORECASE)
    spans = [match.span(1) for match in paragraph_pattern.finditer(html_content)]
    logging.info(f"Found {len(spans)} paragraphs to process")

    if not spans:
        logging.warning("No paragraphs found. Attempting to process entire content as one block.")
        spans = [(0, len(html_content))]

    pieces = []  # untouched text and per-block results, in document order
    cursor = 0   # end offset of the last block already emitted
    for i, (start, end) in enumerate(spans):
        paragraph = html_content[start:end]
        pieces.append(html_content[cursor:start])
        cursor = end
        logging.debug(f"Processing block {i+1}, length: {len(paragraph)} characters")

        # Nothing to correct in an empty/whitespace-only block; skip the API call.
        if not paragraph.strip():
            logging.debug(f"Skipping empty block {i+1}")
            pieces.append(paragraph)
            continue

        masked_text, non_english_parts = mask_non_english(paragraph)

        prompt = f"""
        You are 🐢Polepole🐢 English Grammar Corrector-TEST. Please follow these instructions carefully and strictly:
        1. Primary Task:
        - Correct the following text, focusing only on grammatical and lexical corrections within the English language portions of the text.
        - Ensure clarity and correctness of English grammar and vocabulary without altering the original meaning.
        2. Preservation of Punctuation and Structure:
        - Do not add, remove, or modify any punctuation marks or HTML tags unless there is a clear grammatical error that necessitates a change.
        - Pay special attention to punctuation around quotation marks. Ensure commas and periods are placed correctly in relation to closing quotation marks.
        3. Non-English Text:
        - Do not alter, correct, or translate any non-English text. Leave any non-English text exactly as it is.
        4. Output Formatting:
        - Return only the corrected text, without any explanations or additional content.
        Here is the text for correction:
        {masked_text}
        """

        try:
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are an expert in English grammar correction."},
                    {"role": "user", "content": prompt}
                ]
            )
            logging.debug(f"Successfully received response for block {i+1}")
        except Exception as e:
            logging.error(f"Error calling OpenAI API: {e}")
            pieces.append(paragraph)  # keep the original text for this block
            continue

        reply = response.choices[0].message.content
        if reply is None:
            # The API can return a message without text; keep the original block.
            logging.error(f"Empty response for block {i+1}; leaving it unchanged")
            pieces.append(paragraph)
            continue

        corrected_text = unmask_non_english(reply, non_english_parts)
        corrected_text = correct_punctuation(corrected_text)
        pieces.append(corrected_text)

        logging.debug(f"Corrected block {i+1}, new length: {len(corrected_text)} characters")

    # Everything after the last processed block is copied through unchanged.
    pieces.append(html_content[cursor:])
    html_content = "".join(pieces)

    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(html_content)
        logging.info(f"Successfully wrote corrected content to {output_file}")
    except Exception as e:
        logging.error(f"Error writing output file: {e}")

# Script entry point: the body runs only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Input and output file names are hard-coded relative paths.
    correct_html_file('home.html', 'home_corrected.html')
    logging.info("Script execution completed")
1 Like

This is plain text:

This is with tags:

See my last post. I solved it!

1 Like