Need help with the 'job_description' part of a job scraper

import os
import time
import json
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Union
from pydantic import BaseModel, Field, HttpUrl, field_validator
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from openai import OpenAI
import tiktoken
import openai

# Load environment variables
load_dotenv()

# Set up the Chrome WebDriver options
def setup_selenium():
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    service = Service(r"C:\Users\ajoyd\Downloads\chromedriver-win64\chromedriver.exe")  
    driver = webdriver.Chrome(service=service, options=options)
    return driver

# Fetch HTML using Selenium
def fetch_html_selenium(url: str) -> str:
    driver = setup_selenium()
    try:
        driver.get(url)
        time.sleep(5)  # Simulate user interaction delay
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        html = driver.page_source
        return html
    finally:
        driver.quit()

# Clean HTML content using BeautifulSoup
def clean_html(html_content: str) -> str:
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove irrelevant sections (style, footer, header, script)
    for element in soup.find_all(['style', 'footer', 'header', 'script', 'meta']):
        element.decompose()
    
    # Remove all class attributes except for those containing 'description' as part of the class name
    for tag in soup.find_all(True):
        if 'class' in tag.attrs:
            tag_classes = tag.attrs['class']
            tag.attrs['class'] = [cls for cls in tag_classes if 'description' in cls]
            if not tag.attrs['class']:
                tag.attrs.pop('class', None)
    
    return str(soup)

# Define the pricing for models
pricing = {
    "gpt-4o-mini": {
        "input": 0.15 / 1_000_000,  # $0.150 per 1M input tokens
        "output": 0.60 / 1_000_000, # $0.600 per 1M output tokens
    },
    "gpt-4o-mini-2024-07-18": {
        "input": 0.15 / 1_000_000,  # $0.150 per 1M input tokens
        "output": 0.60 / 1_000_000, # $0.600 per 1M output tokens
    },
    "babbage-002": {
        "input": 0.4 / 1_000_000,  # $0.40 per 1M input tokens
        "output": 0.4 / 1_000_000,  # $0.40 per 1M output tokens
    },
    "gpt-3.5-turbo-0125": {
        "input": 0.5 / 1_000_000,  # $0.50 per 1M input tokens
        "output": 1.5 / 1_000_000,  # $1.50 per 1M output tokens
    },
    "gpt-3.5-turbo-1106": {
        "input": 1 / 1_000_000,  # $1 per 1M input tokens
        "output": 2 / 1_000_000,  # $2 per 1M output tokens
    },
}

model_used = "gpt-4o-mini"

# Save raw HTML data to a file
def save_raw_data(raw_data: str, timestamp: str, output_folder: str = 'output') -> str:
    os.makedirs(output_folder, exist_ok=True)
    raw_output_path = os.path.join(output_folder, f'rawData_{timestamp}.html')
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        f.write(raw_data)
    print(f"Raw HTML data saved to {raw_output_path}")
    return raw_output_path

# Pydantic models for job postings
class Salary(BaseModel):
    min: Optional[int] = None
    max: Optional[int] = None
    currency: str = "USD"  
    period: str = "yearly"  

    @field_validator('min', 'max', mode='before')
    def validate_salary(cls, value):
        if value is not None and value < 0:
            raise ValueError('Salary cannot be negative')
        return value
    # Field validator to handle None values for currency and period
    @field_validator('currency', 'period', mode='before')
    def set_defaults(cls, value, info):
        if value is None:
            if info.field_name == 'currency':
                return 'USD'
            elif info.field_name == 'period':
                return 'yearly'
        return value

from dateutil import parser

def parse_date(date_str: str) -> Optional[str]:
    """
    Tries to parse various date formats and returns a date in 'YYYY-MM-DD' format.
    If the date cannot be parsed, returns None.
    """
    try:
        # Attempt to parse the date
        parsed_date = parser.parse(date_str).strftime("%Y-%m-%d")
        return parsed_date
    except (ValueError, OverflowError):
        if "ongoing" in date_str.lower():
            return None  # Handle special cases like 'Ongoing'
        return None  # Return None if the format is unrecognized

class Location(BaseModel):
    city: Optional[str] = None
    state: Optional[str] = None
    country: Optional[str] = "Unknown"  

class EducationalQualification(BaseModel):
    degree: Optional[str] = "Unspecified"
    field_of_study: Optional[str] = "General"  

class JobPosting(BaseModel):
    job_title: str
    company_name: str
    locations: List[Location]
    job_tags: List[str] = Field(default_factory=list)
    employment_type: str
    salary: Salary
    job_description: str
    responsibilities: List[str] = Field(default_factory=list)
    requirements: List[str] = Field(default_factory=list)
    skills: List[str] = Field(default_factory=list)
    educational_qualifications: List[EducationalQualification] = Field(default_factory=list)
    date_posted: Optional[str] = None
    application_deadline: Optional[str] = None
    application_link: str 

    @field_validator('job_title', 'company_name', mode='before')
    def validate_mandatory_fields(cls, value, info):
        if not value or value.strip() == "":
            raise ValueError(f"{info.field_name} is a required field and cannot be empty")
        return value

    @field_validator('date_posted', 'application_deadline', mode='before')
    def validate_dates(cls, value):
        return parse_date(value) if value else value

class JobPostingsContainer(BaseModel):
    job_postings: List[JobPosting]
    metadata: Dict[str, Union[str, int]]

    @field_validator('job_postings', mode='before')
    def validate_job_postings(cls, value):
        if not value or len(value) == 0:
            raise ValueError("Job postings list cannot be empty")
        return value

# Function to convert relative date strings to absolute dates
def convert_relative_to_absolute(date_str: str) -> str:
    if "day" in date_str:
        days_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(days=days_ago)).strftime("%Y-%m-%d")
    elif "week" in date_str:
        weeks_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(weeks=weeks_ago)).strftime("%Y-%m-%d")
    elif "month" in date_str:
        months_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(days=months_ago * 30)).strftime("%Y-%m-%d")
    
    try:
        return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        raise ValueError("Date should be in YYYY-MM-DD format")

# Extract and format data using OpenAI API
def format_data(data: str, model: str = "gpt-4o-mini") -> Optional[JobPostingsContainer]:
    openai.api_key = os.getenv('OPENAI_API_KEY')
    
    system_message = """
        You are an intelligent data extraction assistant. Your task is to extract structured job posting data from HTML content and convert it into JSON format.

        Output must match the following structure:
        {
            "job_postings": [
                {
                    "job_title": "string",
                    "company_name": "string",
                    "locations": [
                        {
                            "city": "string",
                            "state": "string",
                            "country": "string"
                        }
                    ],
                    "job_tags": ["string"],  // Extract all the job tags correctly. Most job tags are present inside <a> tags.
                    "employment_type": "string",  // Parse for employment types such as Full-Time, Part-Time, Contract, etc.
                    "salary": {
                        "min": "integer",
                        "max": "integer",
                        "currency": "string",
                        "period": "string"  // e.g., "annual", "hourly", "monthly". If not provided, leave null.
                    },
                    "job_description": "string",  extract all text content from the <div> tag with a class name containing "description
                    "responsibilities": ["string"],  // Extract from list items within sections labeled "Responsibilities".
                    "requirements": ["string"],  // Extract from list items within sections labeled "Requirements" or "Qualifications".
                    "skills": ["string"],  // Extract skills from whole content like Python, JavaScript, SQL, etc.
                    "educational_qualifications": [
                        {
                            "degree": "string",  // e.g., Bachelor's, Master's. Leave null if not provided.
                            "field_of_study": "string"  // e.g., Computer Science. Leave null if not provided.
                        }
                    ],
                    "date_posted": "string",  // Extract exact posting date in YYYY-MM-DD format
                    "application_deadline": "string",  // Extract exact deadline in YYYY-MM-DD format
                    "application_link": "string"  // Extract URL link to apply
                }
            ],
            "metadata": {
                "scraping_timestamp": "string",  // Time when the data is scraped.
                "scraped_from": "string",  // URL from which the job posting is scraped.
                "source_type": "string",  // The type of website or platform (e.g., "job board", "company website").
                "scraper_version": "string",  // Version of the scraper.
                "data_format_version": "string",  // Version of the data format.
                "total_job_postings": "integer"  // The number of job postings extracted from the HTML.
            }
        }

        Additional Guidelines:
        - For "job_description", **capture all text from the <div> tag with a class name containing 'description'**. 
              - Ensure you account for deeply nested elements and extract all textual content, regardless of the depth of nesting.
              - If there are multiple paragraphs, bullet points, or lists, ensure they are all included without skipping any content.
              - The final output should be a clear, structured plain text representation, with appropriate formatting such as line breaks and indentation to maintain readability.
              - **Do not truncate or cut off** any part of the description, regardless of how long it is.
              - **Strip out all HTML tags** and provide only the pure textual information in a coherent format.
        - For "responsibilities" and "requirements", extract the text from list items under headings containing keywords such as "Responsibilities" or "Requirements".
        - For "job_tags", extract tags from <a> tags or any section labeled with "tags".
    """

    user_message = f"Extract the following information from the provided HTML content:\n\n{data}"

    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            max_tokens=4096  # Maximize tokens to handle large descriptions
        )
        
        response_content = completion.choices[0].message.content.strip()

        # Clean the response content
        if response_content.startswith('```json'):
            response_content = response_content.replace('```json', '').replace('```', '').strip()
        
        if response_content.startswith('{'):
            formatted_data = JobPostingsContainer.parse_raw(response_content)
            return formatted_data
        else:
            print(f"Unexpected response format: {response_content}")
            return None

    except Exception as e:
        print(f"Error during API call: {e}")
        return None

# Save formatted data to JSON
def save_formatted_data(formatted_data, timestamp, output_folder='output'):
    os.makedirs(output_folder, exist_ok=True)
    formatted_data_dict = formatted_data.dict() if formatted_data else {}
    json_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.json')
    
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(formatted_data_dict, f, indent=4)
    print(f"Formatted data saved to JSON at {json_output_path}")
    if formatted_data:
        print("Formatted data:", json.dumps(formatted_data_dict, indent=4))

# Calculate price based on input and output tokens
def calculate_price(input_text: str, output_text: str, model: str = "gpt-4o-mini") -> tuple:
    encoder = tiktoken.encoding_for_model(model)
    
    input_token_count = len(encoder.encode(input_text))
    output_token_count = len(encoder.encode(output_text))
    
    input_cost = input_token_count * pricing[model]["input"]
    output_cost = output_token_count * pricing[model]["output"]
    total_cost = input_cost + output_cost
    
    return input_token_count, output_token_count, total_cost

# Main function to execute the scraper
if __name__ == "__main__":
    # url = 'https://aijobs.ai/job/senior-software-engineer-semantic-scholar'  # Example URL
    # url = 'https://aijobs.ai/job/machine-learning-engineer-voice-cloning-and-speech-synthesis'  # Example URL
    # url = 'https://aijobs.ai/job/software-engineer-aiadas'
    # url = 'https://aijobs.ai/job/software-engineer-applied-engineering'
    # url = 'https://aijobs.ai/job/senior-software-engineer-ai-platform-6'
    url = 'https://aijobs.ai/job/lead-software-engineer-prog-data-scientist'
    
    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        raw_html = fetch_html_selenium(url)
        cleaned_html = clean_html(raw_html)  # Cleaned HTML, no markdown conversion
        # print(cleaned_html)
        
        raw_file_path = save_raw_data(cleaned_html, timestamp)
        
        formatted_data = format_data(cleaned_html)  # Pass cleaned HTML directly to the LLM
        save_formatted_data(formatted_data, timestamp)
        
        if formatted_data:  # Only calculate price if formatted_data is not None
            formatted_data_text = json.dumps(formatted_data.dict()) 
            
            input_tokens, output_tokens, total_cost = calculate_price(cleaned_html, formatted_data_text, model=model_used)
            print(f"Input token count: {input_tokens}")
            print(f"Output token count: {output_tokens}")
            print(f"Estimated total cost: ${total_cost:.4f}")
        else:
            print("No formatted data to calculate cost.")
            
    except Exception as e:
        print(f"An error occurred: {e}")

Every output is fine except job_description: whenever the job description is long, I don't get the full description back. Could anyone help me figure out what the issue is here?

The issue you're facing with the incomplete job descriptions likely stems from the token limit of the model you're using. Long job descriptions can be truncated when the token count of the input is too high, resulting in partial extraction.

1. Chunk the Input Data

  • Break down the long job descriptions into smaller chunks before sending them to the OpenAI API. This will help work around the token limit issue.

  • For example, you can extract text in sections (such as paragraphs or lists) and then process each chunk separately.

  • Here’s how you can modify your script to chunk the job description:

def chunk_text(text: str, max_tokens: int, model: str = "gpt-4o-mini") -> List[str]:
    """
    Splits text into smaller chunks that do not exceed the token limit of the model.
    """
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    chunks = []

    while tokens:
        chunk = tokens[:max_tokens]
        tokens = tokens[max_tokens:]
        chunks.append(encoder.decode(chunk))

    return chunks

You can then modify the format_data function to process these chunks:

def format_data(data: str, model: str = "gpt-4o-mini") -> List[str]:
    openai.api_key = os.getenv('OPENAI_API_KEY')

    system_message = "Your system message with instructions..."

    chunks = chunk_text(data, max_tokens=3000)  # Limit to prevent truncation

    all_responses = []

    for chunk in chunks:
        user_message = f"Extract information from the following HTML content:\n\n{chunk}"

        try:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=4096  # Max tokens for response
            )
            response_content = completion.choices[0].message.content.strip()

            if response_content.startswith('```json'):
                response_content = response_content.replace('```json', '').replace('```', '').strip()

            if response_content.startswith('{'):
                all_responses.append(response_content)
            else:
                print(f"Unexpected response format: {response_content}")

        except Exception as e:
            print(f"Error during API call: {e}")

    # Combine all responses if necessary, depending on the structure you need
    return all_responses

2. Ensure Maximum Tokens Are Allocated

Make sure the max_tokens parameter in ChatCompletion.create is set to its maximum (4096 tokens for GPT-3.5, higher for GPT-4). However, if the combined token count of the input and the desired output exceeds the model's context window, the response will still be truncated, so chunking remains necessary in such cases.
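If you want to confirm whether truncation is happening on the model's side, you can inspect the usage block and finish_reason on the response. Here is a rough sketch (report_completion_usage is an illustrative helper name, not something from your script) that works with the legacy openai.ChatCompletion response object your code already uses; finish_reason == "length" means the reply was cut off at max_tokens:

def report_completion_usage(completion) -> None:
    """Print token usage and flag truncation for a legacy
    openai.ChatCompletion response (as returned in format_data above)."""
    usage = completion["usage"]
    finish_reason = completion["choices"][0]["finish_reason"]
    print(f"prompt_tokens:     {usage['prompt_tokens']}")
    print(f"completion_tokens: {usage['completion_tokens']}")
    print(f"finish_reason:     {finish_reason}")
    if finish_reason == "length":
        print("Warning: the output hit max_tokens and was truncated.")

Calling report_completion_usage(completion) right after the ChatCompletion.create call in format_data tells you whether the description is being cut by max_tokens or was never generated in full.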

3. Post-processing the Responses

After breaking the content into smaller chunks and processing them, you can stitch the chunks back together to form a complete job description. Ensure that you reassemble the sections correctly, keeping all paragraphs and bullet points.
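For example, here is a rough sketch of such a stitching step, assuming each chunk's response has already been parsed into a JobPostingsContainer (merge_descriptions is a hypothetical helper, not part of your script):

def merge_descriptions(containers: List[JobPostingsContainer]) -> str:
    """Hypothetical helper: stitch the partial job_description fields returned
    for each chunk back into one complete description, in chunk order."""
    parts = []
    for container in containers:
        for posting in container.job_postings:
            if posting.job_description:
                parts.append(posting.job_description.strip())
    return "\n\n".join(parts)  # keep paragraph breaks between the pieces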

This approach should help avoid truncation of long job descriptions. Let me know if you need help refining this further!

import os
import time
import json
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Union
from pydantic import BaseModel, Field, HttpUrl, field_validator
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from openai import OpenAI
import tiktoken
import openai

# Load environment variables
load_dotenv()

# Set up the Chrome WebDriver options
def setup_selenium():
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    service = Service(r"C:\Users\ajoyd\Downloads\chromedriver-win64\chromedriver.exe")  
    driver = webdriver.Chrome(service=service, options=options)
    return driver

# Fetch HTML using Selenium
def fetch_html_selenium(url: str) -> str:
    driver = setup_selenium()
    try:
        driver.get(url)
        time.sleep(5)  # Simulate user interaction delay
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        html = driver.page_source
        return html
    finally:
        driver.quit()

# Clean HTML content using BeautifulSoup
def clean_html(html_content: str) -> str:
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove irrelevant sections (style, footer, header, script)
    for element in soup.find_all(['style', 'footer', 'header', 'script', 'meta']):
        element.decompose()
    
    # Remove all class attributes except for those containing 'description' as part of the class name
    for tag in soup.find_all(True):
        if 'class' in tag.attrs:
            tag_classes = tag.attrs['class']
            tag.attrs['class'] = [cls for cls in tag_classes if 'description' in cls]
            if not tag.attrs['class']:
                tag.attrs.pop('class', None)
    
    return str(soup)

# Define the pricing for models
pricing = {
    "gpt-4o-mini": {
        "input": 0.15 / 1_000_000,  # $0.150 per 1M input tokens
        "output": 0.60 / 1_000_000, # $0.600 per 1M output tokens
    },
    "gpt-4o-mini-2024-07-18": {
        "input": 0.15 / 1_000_000,  # $0.150 per 1M input tokens
        "output": 0.60 / 1_000_000, # $0.600 per 1M output tokens
    },
    "babbage-002": {
        "input": 0.4 / 1_000_000,  # $0.40 per 1M input tokens
        "output": 0.4 / 1_000_000,  # $0.40 per 1M output tokens
    },
    "gpt-3.5-turbo-0125": {
        "input": 0.5 / 1_000_000,  # $0.50 per 1M input tokens
        "output": 1.5 / 1_000_000,  # $1.50 per 1M output tokens
    },
    "gpt-3.5-turbo-1106": {
        "input": 1 / 1_000_000,  # $1 per 1M input tokens
        "output": 2 / 1_000_000,  # $2 per 1M output tokens
    },
}

model_used = "gpt-4o-mini"

# Save raw HTML data to a file
def save_raw_data(raw_data: str, timestamp: str, output_folder: str = 'output') -> str:
    os.makedirs(output_folder, exist_ok=True)
    raw_output_path = os.path.join(output_folder, f'rawData_{timestamp}.html')
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        f.write(raw_data)
    print(f"Raw HTML data saved to {raw_output_path}")
    return raw_output_path

# Pydantic models for job postings
class Salary(BaseModel):
    min: Optional[int] = None
    max: Optional[int] = None
    currency: str = "USD"  
    period: str = "yearly"  

    @field_validator('min', 'max', mode='before')
    def validate_salary(cls, value):
        if value is not None and value < 0:
            raise ValueError('Salary cannot be negative')
        return value
    # Field validator to handle None values for currency and period
    @field_validator('currency', 'period', mode='before')
    def set_defaults(cls, value, info):
        if value is None:
            if info.field_name == 'currency':
                return 'USD'
            elif info.field_name == 'period':
                return 'yearly'
        return value

from dateutil import parser

def parse_date(date_str: str) -> Optional[str]:
    """
    Tries to parse various date formats and returns a date in 'YYYY-MM-DD' format.
    If the date cannot be parsed, returns None.
    """
    try:
        # Attempt to parse the date
        parsed_date = parser.parse(date_str).strftime("%Y-%m-%d")
        return parsed_date
    except (ValueError, OverflowError):
        if "ongoing" in date_str.lower():
            return None  # Handle special cases like 'Ongoing'
        return None  # Return None if the format is unrecognized

class Location(BaseModel):
    city: Optional[str] = None
    state: Optional[str] = None
    country: Optional[str] = "Unknown"  

class EducationalQualification(BaseModel):
    degree: Optional[str] = "Unspecified"
    field_of_study: Optional[str] = "General"  

class JobPosting(BaseModel):
    job_title: str
    company_name: str
    locations: List[Location]
    job_tags: List[str] = Field(default_factory=list)
    employment_type: str
    salary: Salary
    job_description: str
    responsibilities: List[str] = Field(default_factory=list)
    requirements: List[str] = Field(default_factory=list)
    skills: List[str] = Field(default_factory=list)
    educational_qualifications: List[EducationalQualification] = Field(default_factory=list)
    date_posted: Optional[str] = None
    application_deadline: Optional[str] = None
    application_link: str 

    @field_validator('job_title', 'company_name', mode='before')
    def validate_mandatory_fields(cls, value, info):
        if not value or value.strip() == "":
            raise ValueError(f"{info.field_name} is a required field and cannot be empty")
        return value

    @field_validator('date_posted', 'application_deadline', mode='before')
    def validate_dates(cls, value):
        return parse_date(value) if value else value

class JobPostingsContainer(BaseModel):
    job_postings: List[JobPosting]
    metadata: Dict[str, Union[str, int]]

    @field_validator('job_postings', mode='before')
    def validate_job_postings(cls, value):
        if not value or len(value) == 0:
            raise ValueError("Job postings list cannot be empty")
        return value

# Function to convert relative date strings to absolute dates
def convert_relative_to_absolute(date_str: str) -> str:
    if "day" in date_str:
        days_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(days=days_ago)).strftime("%Y-%m-%d")
    elif "week" in date_str:
        weeks_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(weeks=weeks_ago)).strftime("%Y-%m-%d")
    elif "month" in date_str:
        months_ago = int(date_str.split()[0])
        return (datetime.now() - timedelta(days=months_ago * 30)).strftime("%Y-%m-%d")
    
    try:
        return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        raise ValueError("Date should be in YYYY-MM-DD format")

# Helper function to chunk the text
def chunk_text(text: str, max_tokens: int, model: str = "gpt-4o-mini") -> List[str]:
    """
    Splits text into smaller chunks that do not exceed the token limit of the model.
    """
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    chunks = []
    
    while tokens:
        chunk = tokens[:max_tokens]
        tokens = tokens[max_tokens:]
        chunks.append(encoder.decode(chunk))

    return chunks

# Updated format_data function: chunks the input and parses each JSON response into a JobPostingsContainer
def format_data(data: str, model: str = "gpt-4o-mini") -> Optional[List[JobPostingsContainer]]:
    openai.api_key = os.getenv('OPENAI_API_KEY')

    system_message = """
    You are an intelligent data extraction assistant. Your task is to extract structured job posting data from HTML content and convert it into JSON format.
    
    Output must match the following structure:
    {
        "job_postings": [
            {
                "job_title": "string",
                "company_name": "string",
                "locations": [
                    {
                        "city": "string",
                        "state": "string",
                        "country": "string"
                    }
                ],
                "job_tags": ["string"],
                "employment_type": "string",
                "salary": {
                    "min": "integer",
                    "max": "integer",
                    "currency": "string",
                    "period": "string"
                },
                "job_description": "string",
                "responsibilities": ["string"],
                "requirements": ["string"],
                "skills": ["string"],
                "educational_qualifications": [
                    {
                        "degree": "string",
                        "field_of_study": "string"
                    }
                ],
                "date_posted": "string",
                "application_deadline": "string",
                "application_link": "string"
            }
        ],
        "metadata": {
            "scraping_timestamp": "string",
            "scraped_from": "string",
            "source_type": "string",
            "scraper_version": "string",
            "data_format_version": "string",
            "total_job_postings": "integer"
        }
    }
    """

    # Split the data into manageable chunks
    chunks = chunk_text(data, max_tokens=3000, model=model)  # Adjust max_tokens if needed
    all_responses = []

    # Process each chunk separately
    for chunk in chunks:
        user_message = f"Extract information from the following HTML content:\n\n{chunk}"
        
        try:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=4096  # Max tokens for response
            )
            
            response_content = completion.choices[0].message.content.strip()
            
            # Clean the response content
            if response_content.startswith('```json'):
                response_content = response_content.replace('```json', '').replace('```', '').strip()

            if response_content.startswith('{'):
                # Parse the response string into JobPostingsContainer
                try:
                    job_posting_container = JobPostingsContainer.parse_raw(response_content)
                    all_responses.append(job_posting_container)
                except Exception as e:
                    print(f"Error parsing JSON into Pydantic model: {e}")
            else:
                print(f"Unexpected response format: {response_content}")

        except Exception as e:
            print(f"Error during API call: {e}")

    # Return the list of parsed Pydantic objects
    return all_responses if all_responses else None

def save_formatted_data(formatted_data: List[JobPostingsContainer], timestamp, output_folder='output'):
    os.makedirs(output_folder, exist_ok=True)
    json_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.json')

    # Convert Pydantic objects to a serializable format
    combined_data = {
        "job_postings": [job.dict() for container in formatted_data for job in container.job_postings],
        "metadata": {
            "scraping_timestamp": timestamp,
            "total_job_postings": sum(len(container.job_postings) for container in formatted_data),
            "source_type": "web scraping",
            "scraper_version": "1.0",
            "data_format_version": "1.0"
        }
    }

    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=4)
    print(f"Formatted data saved to JSON at {json_output_path}")

# Calculate price based on input and output tokens
def calculate_price(input_text: str, output_text: str, model: str = "gpt-4o-mini") -> tuple:
    encoder = tiktoken.encoding_for_model(model)
    
    input_token_count = len(encoder.encode(input_text))
    output_token_count = len(encoder.encode(output_text))
    
    input_cost = input_token_count * pricing[model]["input"]
    output_cost = output_token_count * pricing[model]["output"]
    total_cost = input_cost + output_cost
    
    return input_token_count, output_token_count, total_cost

# Main function to execute the scraper
if __name__ == "__main__":
    # url = 'https://aijobs.ai/job/senior-software-engineer-semantic-scholar'  # Example URL
    # url = 'https://aijobs.ai/job/machine-learning-engineer-voice-cloning-and-speech-synthesis'  # Example URL
    url = 'https://aijobs.ai/job/software-engineer-aiadas'
    # url = 'https://aijobs.ai/job/software-engineer-applied-engineering'
    # url = 'https://aijobs.ai/job/senior-software-engineer-ai-platform-6'
    # url = 'https://aijobs.ai/job/lead-software-engineer-prog-data-scientist'
    
    try:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        raw_html = fetch_html_selenium(url)
        cleaned_html = clean_html(raw_html)  # Cleaned HTML, no markdown conversion
        # print(cleaned_html)
        
        raw_file_path = save_raw_data(cleaned_html, timestamp)
        
        formatted_data = format_data(cleaned_html)  # Pass cleaned HTML directly to the LLM
        save_formatted_data(formatted_data, timestamp)
        
        if formatted_data:  # Only calculate price if formatted_data is not None
            # Convert list of Pydantic objects to serializable list of dictionaries
            formatted_data_list = [container.dict() for container in formatted_data]
            formatted_data_text = json.dumps(formatted_data_list)   
            input_tokens, output_tokens, total_cost = calculate_price(cleaned_html, formatted_data_text, model=model_used)
            print(f"Input token count: {input_tokens}")
            print(f"Output token count: {output_tokens}")
            print(f"Estimated total cost: ${total_cost:.4f}")
        else:
            print("No formatted data to calculate cost.")
            
    except Exception as e:
        print(f"An error occurred: {e}")

This code is now giving an error. Could you help me figure out what’s wrong?

Can you share the actual error? Is it an exception you are catching and rendering as something generic, or is it a real error from an API call, etc.? If it's your own code catching the exception, I would recommend you not catch it, get the raw exception, and share it with me.
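For example, you could temporarily change the except block in your __main__ section to something like this (or just remove the try/except entirely) so the raw traceback is printed instead of the generic message:

import traceback

try:
    formatted_data = format_data(cleaned_html)
    save_formatted_data(formatted_data, timestamp)
except Exception:
    traceback.print_exc()  # print the full stack trace
    raise                  # re-raise so the script fails with the real error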

Thanks

It is working, but it's not giving the correct answer, bro. Could you help me?

The bot-generated answer above has no idea how to interact with the API, so I would revert your code. Ban bots.

Here are issues that can cause truncation:

  • max_tokens is not sufficiently high. You can set it to 16000+, or look at the API usage report on the response and check the value of completion tokens.
  • The AI doesn't want to write output as long as advertised. You could add system prompt lines to the effect that the output length is extended and enhanced, that the model has new capabilities, whatever is needed to discourage an early stop token, but the model will still have enforced cutoffs below the advertised output length.
  • The input data itself is incomplete.

For a reproduction task like this, I would reduce top_p to 0.01. Otherwise, at every token in the output, you are gambling on the AI's probability of emitting a "stop" token.
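Roughly like this with the legacy openai.ChatCompletion call your script already uses (system_message and user_message are the same prompts you build in format_data; the exact max_tokens ceiling depends on the model):

completion = openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    max_tokens=16000,  # gpt-4o-mini allows far more than 4096 completion tokens
    top_p=0.01,        # near-greedy sampling; fewer random early "stop" tokens
)
print(completion["usage"])  # compare completion_tokens against the description length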

If the description is too much to write alongside all the other fields, you might run a second API call just to get that section.
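A rough sketch of that second pass (extract_description_only is a hypothetical helper, not something in your script), asking the model for nothing but the plain-text description so the whole output budget goes to it:

def extract_description_only(cleaned_html: str, model: str = "gpt-4o-mini") -> str:
    """Second pass: return only the full plain-text job description."""
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": (
                "Return only the complete plain-text job description found in the HTML "
                "(the <div> whose class contains 'description'). No JSON, no commentary."
            )},
            {"role": "user", "content": cleaned_html},
        ],
        max_tokens=16000,
        top_p=0.01,
    )
    return completion.choices[0].message.content.strip()

You could then overwrite the field on the posting you already extracted, e.g. container.job_postings[0].job_description = extract_description_only(cleaned_html).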

The approach I gave for breaking the input into chunks seems to have helped you get around the initial issue, but your code still needs a little massaging, possibly in the helper functions. If you want me to continue helping, let me know, or you can follow the arrogant jellyfish who just claimed the answer was bot-generated. Remember, there is no single solution to an API issue; there are many, and people's arrogance will lead them to attack others, but it's up to you to decide what route is best for you. Thanks