Currently, when I send off my data to be scored by OpenAI, I continuously poll to check whether the results are back. This is fine if the data returns quickly, but given the 24-hour turnaround for the Batch API, this can take a while.
I'm wondering if there is a workaround for this — e.g., send me an email when the processing is done so I can download the results from an OpenAI link. My code is shown below:
# Split the DataFrame into chunks and submit each chunk as a separate
# OpenAI batch job, polling each job until it finishes.
chunk_frames = []  # accumulate per-chunk results instead of overwriting
for chunk_idx, df_chunk in enumerate(np.array_split(df, num_chunks)):
    requests, requests_customids = create_batch_requests(
        df_chunk, gpt4o_version, temperature, image_detail
    )
    input_path = f"batch_input_chunk_{chunk_idx + 1}.jsonl"
    save_jsonl(requests, input_path)
    batch_job_id = run_batch_job(openai, input_path)

    # Poll until the job reaches a terminal state. Checking for failure
    # states avoids spinning forever on a job that will never produce
    # an output file.
    while True:
        batch_status = check_batch_status(openai, batch_job_id)
        if batch_status.status in ("failed", "expired", "cancelled"):
            raise RuntimeError(
                f"Batch job {batch_job_id} ended with status {batch_status.status!r}"
            )
        if batch_status.output_file_id:
            output_file_id = batch_status.output_file_id
            break
        sleep(60)  # check every minute

    batch_data = extract_data(openai, output_file_id)
    chunk_frames.append(pd.DataFrame(batch_data))

# Combine every chunk's results; previously only one chunk's output
# survived because df_batch_output was rebuilt each iteration.
df_batch_output = pd.concat(chunk_frames, ignore_index=True)
where:
def run_batch_job(client: openai, input_file: str) -> str:
batch_input_file = client.files.create(file=open(input_file, "rb"), purpose="batch")
batch_input_file_id = batch_input_file.id
# create batch job (this step costs money!!!)
batch_job = client.batches.create(
input_file_id=batch_input_file_id, endpoint="/v1/chat/completions", completion_window="24h"
)
return batch_job.id
def check_batch_status(client: openai, job_id: str):
# check the status of batch job to see if it's completed
retrieve_batch = client.batches.retrieve(job_id)
return retrieve_batch