Generating Response from Each Chunk Azure Open AI

Hi All,

I am trying to extract responses from over 1000 invoices. I need the information from each invoice separately and want to store each response. What I did was create multiple chunks and then run through one chunk at a time, but for some reason it is returning a response from only one chunk — I need a response from each chunk. I have attached the code below. ANY HELP WOULD BE HIGHLY APPRECIATED!

Define the function to process the output of the query for each document or chunk

def process_output(output, doc_index, output_dir=None):
    """Write the model's answer for one document/chunk to its own text file.

    Args:
        output: The answer returned for this chunk; stored via ``str()``.
        doc_index: Zero-based chunk index (file names are 1-based).
        output_dir: Directory to write into. Defaults to the module-level
            ``output_directory`` when None, preserving the original behavior.
    """
    if output_dir is None:
        output_dir = output_directory  # module-level default

    # 1-based name so "output_document_1.txt" matches the first chunk
    output_file_name = f"output_document_{doc_index + 1}.txt"
    output_file_path = os.path.join(output_dir, output_file_name)

    # Explicit encoding so non-ASCII invoice text does not crash on Windows
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(str(output))

    print(f"Processed document {doc_index + 1}")

Define the directory containing the files

# Input directory containing the .docx invoices.
# NOTE(review): r'\folder' is a drive-relative path on Windows — confirm it
# should not be an absolute path (e.g. r'C:\folder') or a UNC share.
directory_path = r'\folder'

Define the output directory

# Directory where the per-chunk and combined answer files are written.
# NOTE(review): drive-relative path — confirm an absolute path isn't intended.
output_directory = r'\folder2'

Define the loader to load documents from the directory

# Recursively pick up every .docx file under the input directory
loader = DirectoryLoader(directory_path, glob="**/*.docx")

Load the documents

# Load every matched .docx into memory (may take a while for 1000+ invoices)
docs = loader.load()

print(f"Total number of documents: {len(docs)}")

Initialize text splitter

# Token-aware splitter: ~2000 tokens per chunk, no overlap so chunks are disjoint.
# NOTE(review): a single invoice longer than 2000 tokens will be split across
# chunks and answered piecemeal — confirm invoices fit in one chunk.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000, chunk_overlap=0)

Split documents into smaller chunks

# Each resulting chunk is treated as one standalone "document" in the loop below
split_docs = text_splitter.split_documents(docs)

Initialize embeddings

# Azure OpenAI embeddings client, created once here (the deployment name must
# match a deployment that exists in your Azure OpenAI resource)
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-ada-002")

Define an empty list to store all outputs from each chunk

# One answer string per successfully processed chunk (the paste dropped the
# list literal — without it the script is a syntax error)
all_outputs = []

Iterate through each chunk

# Query each chunk in complete isolation so every invoice gets its own answer
for doc_index, doc in enumerate(split_docs):
    try:
        print(f"Processing chunk {doc_index + 1} out of {len(split_docs)}")

        # Build a vector store holding ONLY this chunk.
        # BUG being fixed: Chroma.from_documents without a unique
        # collection_name keeps appending every chunk to the same default
        # collection, so later queries retrieve earlier chunks — the likely
        # reason answers came from the wrong invoice. A per-chunk collection
        # name keeps each iteration isolated.
        doc_search = Chroma.from_documents(
            [doc],
            embeddings,  # reuse the client created above instead of a new one per chunk
            collection_name=f"chunk_{doc_index}",
        )

        # k=1 is sufficient: the collection contains exactly one document
        chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="refine",
            retriever=doc_search.as_retriever(search_kwargs={"k": 1}),
        )

        query = 'Create a table with the Name of Insured/ Insured Name, Location,Risk Occupation/Operation value, Sum Insured, Deductibles, Premium and Loss data. If any value is not present write Null and write separate rows for Sum Insured values?'

        output = chain.run(query)
        all_outputs.append(output)

        # Persist this chunk's answer immediately to its own file
        # (process_output was defined above but never called in the original)
        process_output(output, doc_index)

        # Drop the per-chunk collection so nothing leaks into the next pass
        doc_search.delete_collection()

    except Exception as e:
        # Best-effort: log the failure and continue with the remaining chunks
        print(f"Error processing chunk {doc_index + 1}: {str(e)}")

Process all outputs together

# One answer per line; str() guards against non-string chain outputs
combined_output = '\n'.join(map(str, all_outputs))

Write the combined output to a file

# Write the combined report next to the per-chunk files
output_file_path = os.path.join(output_directory, "combined_output.txt")
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(combined_output)

print("Combined output written to file.")