Hi, I am currently running a RAG application (an FAQ chatbot) that consists of two UIs: one where files are uploaded and their embeddings stored in a Pinecone vector store, and another where the embeddings are retrieved from a selected index into the RAG chatbot. I am using gpt-4o on a paid tier-1 account (30,000 tokens per minute) as my primary LLM, and AzureAIDocumentIntelligenceLoader to load my PDF files asynchronously (via its aload() function) so that I can load a 272-page PDF and chat with it.

Even when I just type 'hi', the API responds with:

'message': 'Request too large for gpt-4o in organization org-wOFxlX2RaRVsbRdbSuZ5iBGM on tokens per min (TPM): Limit 30000, Requested 49634. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'

Chatting works fine when the same PDF is loaded with PyPDFium2Loader.

My first doubt is how the request reached almost 50,000 tokens when all I typed was 'hi'. My second doubt is why I am still getting error code 429 even though I made the PDF loader function async and added a time delay while retrieving responses.
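To see how large the request actually gets, I am thinking of counting the tokens in the documents the retriever returns before they are stuffed into the prompt. A rough sketch of that check (assuming tiktoken is installed; cl100k_base is only an approximation of the gpt-4o tokenizer, so the count is an estimate):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # approximate tokenizer, good enough for a rough estimate

def count_context_tokens(retriever, query):
    # The same documents the retrieval chain stuffs into the prompt
    docs = retriever.invoke(query)
    return sum(len(enc.encode(d.page_content)) for d in docs)

My code is below.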
# Import paths assume recent langchain, langchain-community, langchain-openai and langchain-pinecone packages
import time
import streamlit as st
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

async def extract_embeddings_upload_index(pdf_path, index_name):
    print(f"Loading PDF from path: {pdf_path}")

    # Load the PDF asynchronously with Azure AI Document Intelligence
    async def load_pdf(path):
        return await AzureAIDocumentIntelligenceLoader(
            api_key="167f20e5ce49431aad891c46e2268696",
            file_path=path,
            api_endpoint="https://rx11.cognitiveservices.azure.com/",
            api_model="prebuilt-layout",
            mode="single",
        ).aload()

    txt_docs = await load_pdf(pdf_path)
    # txt_docs = PyPDFium2Loader(pdf_path).load()  # chatting works when loaded this way

    # Split the loaded documents into chunks
    print("Splitting documents...")
    splt_docs = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    docs = splt_docs.split_documents(txt_docs)
    print(f"Split into {len(docs)} chunks")

    # Initialize OpenAI embeddings
    print("Initializing OpenAI embeddings...")
    embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

    # Upload the chunks to the Pinecone index
    print("Initializing Pinecone Vector Store...")
    dbx = PineconeVectorStore.from_documents(documents=docs, index_name=index_name, embedding=embeddings)
    print(f"Uploaded {len(docs)} documents to Pinecone index '{index_name}'")
def initialize(index_name):
    # ini_embed() and ini_prompt() are helpers defined elsewhere in the app
    embeddings = ini_embed()
    dbx = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)
    llm = ChatOpenAI(model='gpt-4o', temperature=0.5, max_tokens=3000)
    prompt = ini_prompt()

    # Stuff the retrieved documents into the prompt and answer with the LLM
    doc_chain = create_stuff_documents_chain(llm, prompt)
    retriever = dbx.as_retriever()
    ans_retrieval = create_retrieval_chain(retriever, doc_chain)

    # Wrap the retrieval chain with per-session chat history
    conversational_ans_retrieval = RunnableWithMessageHistory(
        ans_retrieval,
        lambda session_id: StreamlitChatMessageHistory(key=session_id),
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )
    return conversational_ans_retrieval
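Since each retrieved chunk can be up to 10,000 characters, I suspect the chunks the default retriever returns already make the stuffed prompt very large. A variant I am experimenting with, limiting how many chunks come back (the value 2 is just an example):

    retriever = dbx.as_retriever(search_kwargs={"k": 2})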
def run_query(retrieval_chain, input_text):
    st.write('run query')
    try:
        # Delay added to try to stay under the TPM limit before each request
        time.sleep(60)
        # session_id is a module-level value set elsewhere in the Streamlit app
        response = retrieval_chain.invoke(
            {"input": input_text},
            config={"configurable": {"session_id": f'{session_id}'}}
        )
        return response['answer']
    except KeyError as e:
        st.error(f"KeyError occurred: {e}. Check the response structure.")
        return None
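The time.sleep(60) above is the delay I mentioned. As an alternative I am considering a simple exponential backoff around the query, retrying only when the OpenAI SDK raises its rate-limit error (a sketch, assuming the openai package is importable and that the 429 propagates out of the chain):

import openai

def run_query_with_backoff(retrieval_chain, input_text, max_retries=3):
    for attempt in range(max_retries):
        try:
            return run_query(retrieval_chain, input_text)
        except openai.RateLimitError:
            # Wait 10s, 20s, 40s before retrying
            time.sleep(10 * (2 ** attempt))
    return None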