Structured Outputs Deep-dive

Yes, it is possible. By the way, I’m open-sourcing a pydantic wrapper I’m developing that extends it for use with LLMs. It’s not ready for PyPI yet, but you can check it out here: GitHub - nicholishen/tooldantic

! pip install -U git+https://github.com/nicholishen/tooldantic.git

Here is a snippet:

import asyncio
import json

import httpx
import openai
from bs4 import BeautifulSoup
from tooldantic import OpenAiResponseFormatBaseModel as BaseModel

# Module-level async OpenAI client, shared by the whole script.
client = openai.AsyncOpenAI()

class ArticleExtractor(BaseModel):
    """Use this tool to extract information from the user's articles"""

    # These fields define the structured-output schema sent to the model
    # via model_json_schema() in the batch request body below.
    headline: str
    summary: str


# Sample article URLs whose contents are fetched and batched for extraction.
urls = [
    "https://www.cnn.com/2019/08/29/us/new-hampshire-vanity-license-plate-trnd/index.html",
    "https://www.cnn.com/2024/08/02/tech/google-olympics-ai-ad-artificial-intelligence/index.html",
]



async def get_url_content(url: str) -> str:
    """Fetch *url* and return the text of its h1/h2/h3/p tags, space-joined.

    Raises httpx.HTTPStatusError on a non-2xx response. Note that text is
    grouped tag-by-tag (all h1s, then h2s, ...), not in document order.
    """
    async with httpx.AsyncClient() as http:
        response = await http.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        texts = [
            element.get_text()
            for tag in ("h1", "h2", "h3", "p")
            for element in soup.find_all(tag)
        ]
        return " ".join(texts)



async def prepare_jsonl():
    """Fetch every URL in `urls` concurrently and write requests.jsonl.

    Each output line is one OpenAI Batch API request: the article text as a
    single user message, with ArticleExtractor's schema as the response_format.
    """
    articles = await asyncio.gather(*(get_url_content(u) for u in urls))
    with open("requests.jsonl", "w") as f:
        for i, article in enumerate(articles, start=1):
            request = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "max_tokens": 1000,
                    "messages": [{"role": "user", "content": article}],
                    "response_format": ArticleExtractor.model_json_schema(),
                },
            }
            f.write(json.dumps(request) + "\n")

# A bare top-level `await` is only valid in notebooks/REPLs; asyncio.run()
# makes this work as a plain script too.
asyncio.run(prepare_jsonl())
        

EDIT:

tooldantic can also create dynamic pydantic models from arbitrary data sources.

import json  # needed for json.dumps below; missing in the original snippet

import tooldantic

# Example payload whose keys/value types define the dynamic model's fields.
some_existing_data = {
    "headline": "The headline of the article",
    "summary": "The summary of the article",
}

# Build a pydantic model at runtime from the dict's shape.
MyDataModel = tooldantic.ModelBuilder(
    base_model=tooldantic.OpenAiResponseFormatBaseModel
).model_from_dict(
    some_existing_data,
    model_name="MyDataModel",
    model_description="This is a custom data model from arbitrary data",
)

print(json.dumps(MyDataModel.model_json_schema(), indent=2))
# Round-trip check: the dynamic model validates and reproduces the source data.
assert MyDataModel(**some_existing_data).model_dump() == some_existing_data

# {
#   "type": "json_schema",
#   "json_schema": {
#     "name": "MyDataModel",
#     "description": "This is a custom data model from arbitrary data",
#     "strict": true,
#     "schema": {
#       "type": "object",
#       "properties": {
#         "headline": {
#           "type": "string"
#         },
#         "summary": {
#           "type": "string"
#         }
#       },
#       "required": [
#         "headline",
#         "summary"
#       ],
#       "additionalProperties": false
#     }
#   }
# }
2 Likes