Getting “peer closed connection” for requests exceeding 1 hour

Hey, I was doing some benchmarking with the gpt-5.2 xhigh variant, and I am continuously getting:
“peer closed connection without sending complete message body (incomplete chunked read)”

The tasks are reasoning-heavy and can require a very large number of tokens, hence the long wait. I am using normal inference via the Responses API with streaming enabled.

All requests that finish in under 1 hour complete successfully; anything longer terminates with the error above.

I am not sure whether this is an issue on my end or on OpenAI's, but I could not resolve it from my side despite multiple tries. I am including the code snippet below. (I have set the timeout on my end to 7200 s, so that is definitely not the issue.)

code:

import os
from typing import Optional, Tuple

import httpx
import openai
from openai import OpenAI
from openai.types.responses import ResponseCompletedEvent

from api_interface import LLMInterface

class OpenAI_Class(LLMInterface):
    def __init__(
        self,
        model: str = "gpt-5.2",
        api_key: Optional[str] = None,
        instruction: str = "You are a competitive programming expert. Provide optimized working solutions to the problems presented to you in C++.",
        temperature: float = 0.5,
        max_completion_tokens: int = 65_536 * 2,
        timeout: float = 7200.0,  # 2 hour timeout
        reasoning_effort: str = "high"
    ):
        super().__init__()
    
        api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("Missing OPENAI_API_KEY (set env var or pass api_key=...).")

        # --- CUSTOM HTTP CLIENT ---
        # Configures httpx to hold connections open longer and withstand 
        # long pauses without dropping the socket.
        # http_client = httpx.Client(
        #     timeout=timeout,
        #     limits=httpx.Limits(
        #         max_keepalive_connections=20, 
        #         max_connections=100, 
        #         keepalive_expiry=timeout
        #     ),
        #     transport=httpx.HTTPTransport(local_address="0.0.0.0")
        # )
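        # Note: keepalive_expiry only controls how long an idle pooled
        # connection is kept for reuse between requests; it does not extend
        # the read timeout within a single streaming response.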

        self.client = OpenAI(
            api_key=api_key, 
            timeout=timeout,
            # http_client=http_client
        )
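        # Passing a float timeout should apply to connect/read/write/pool
        # via httpx for every request made through this client.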
        
        self.model = model
        self.reasoning_effort = reasoning_effort
        self.instruction = instruction
        self.temperature = temperature
        self.max_completion_tokens = max_completion_tokens

    def call_llm(self, user_prompt: str) -> Tuple[str, str, str]:
        """
        Returns: (response_text, metadata, thinking)
        """
        try:
            response = self.client.responses.create(
                model=self.model,
                instructions=self.instruction,
                input=user_prompt,
                temperature=self.temperature if not ("gpt-5" in self.model or self.model.startswith("o")) else None,
                # Responses API: max_output_tokens bounds visible output plus reasoning tokens.
                max_output_tokens=self.max_completion_tokens,
                timeout=7200.0,
                stream=True,
                reasoning={
                    "effort": self.reasoning_effort,
                    "summary": "detailed",
                }
            )
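            # With stream=True the SDK iterates server-sent events over a
            # chunked HTTP response; "incomplete chunked read" is httpx
            # reporting that the socket closed before the final chunk arrived.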

            # Drain the stream; on a successful run the final event is a
            # ResponseCompletedEvent carrying the full Response object.
            last_event = None
            for event in response:
                last_event = event

            if not isinstance(last_event, ResponseCompletedEvent):
                raise RuntimeError(f"Unexpected final stream event: {type(last_event)}")
            resp = last_event.response
            
            thinking = "<think>"
            text = None
            count = 0

            for out in resp.output:
                if out.id.startswith("rs"):
                    # Reasoning items (ids starting with "rs") carry the
                    # summary; anything else is the output message.
                    for summ in out.summary:
                        thinking += "\n" + summ.text + "\n"
                else:
                    if count > 0:
                        raise RuntimeError("Multiple output message chunks received from LLM; expected only one.")
                    text = out.content[0].text
                    count += 1
            
            if text is None:
                print("WARNING: No text generated by LLM.")
                text = ""

            thinking += "</think>"
            metadata = str(resp)

            # print(metadata)

            return text, metadata, thinking

        except openai.RateLimitError as e:
            raise RuntimeError(f"OpenAI rate limit hit: {e}") from e
        except openai.APIConnectionError as e:
            raise RuntimeError(f"OpenAI connection error: {e}") from e
        except openai.APIError as e:
            raise RuntimeError(f"OpenAI API error: {e}") from e
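
For reference, here is a stripped-down version of the same call, in case the wrapper class is suspected. As far as I can tell the drop surfaces mid-iteration as an httpx.RemoteProtocolError (the quoted message is httpx's wording), possibly wrapped in openai.APIConnectionError; the model and prompt below are placeholders:

import os
import httpx
import openai
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], timeout=7200.0)

try:
    stream = client.responses.create(
        model="gpt-5.2",
        input="<a prompt that forces 1h+ of reasoning>",  # placeholder
        reasoning={"effort": "high"},
        stream=True,
    )
    for event in stream:  # the failure occurs mid-iteration, not at create()
        pass
except (httpx.RemoteProtocolError, openai.APIConnectionError) as e:
    # the "incomplete chunked read" message lands here after ~1 hour
    print(f"stream died after a partial read: {e}")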