Since last week, when trying to embed our notes with text-embedding-3-large and upsert them into Pinecone, we keep getting the following error message:
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}
We initially thought that increasing the TPM limit might help, so we raised it from 30K to 100K, but to no avail; we are still getting the same error.
To give you some background: we managed to embed about 60K rows under the lower 30K rate limit with no issues, so seeing this error for 10K rows with a higher limit is very strange.
Also, FYI, we do the embedding in batches and chunks, so it shouldn't be hitting the rate limit (code below).
I also used the tiktoken package to get exact token counts and check whether we were crossing the limit, and the total token count was indeed about 730K, with chunk 1 alone at about 359K. Once again, I find this very strange for 10,000 rows: all we did was add one new column to our database, and embedding 60K rows without the new column last month was absolutely fine.
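For reference, the per-chunk numbers came from a check along these lines (a sketch; count_tokens and get_chunks are the same helpers used in the pipeline code below):

# Sketch: log the token count of each 5000-row chunk before embedding
for i, doc_chunk in enumerate(get_chunks(doc, batch_size=5000)):
    chunk_texts = list(doc_chunk)
    chunk_tokens = sum(count_tokens(text) for text in chunk_texts)
    LOGGER.debug(f"Chunk {i}: {len(chunk_texts)} rows, {chunk_tokens} tokens")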
This is just the embedding part of my pipeline; the model referred to is text-embedding-3-large.
import os
import time

import tiktoken
from langchain_openai import AzureOpenAIEmbeddings  # adjust the import path to your LangChain version

# NOTE: LOGGER, pdf (a pandas DataFrame), index (the Pinecone index),
# pinecone_namespace, and the get_chunks helper are defined elsewhere in
# the pipeline; only the embedding step is shown here.

# Load the tokenizer for text-embedding-3-large
# (cl100k_base is the encoding used by the text-embedding-3-* models)
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count the number of tokens in a given text."""
    return len(tokenizer.encode(text))

def calculate_total_token_count(docs):
    """Calculate the total token count for a collection of documents."""
    return sum(count_tokens(text) for text in docs)
# Loading OpenAI parameters
embedding_model = AzureOpenAIEmbeddings(
    deployment=os.environ["OPENAI_EMBEDDING_MODEL"],
    model=os.environ["OPENAI_EMBEDDING_MODEL"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_type=os.environ["OPENAI_API_TYPE"],
)
# Separate the IDs from the text to embed
ids = pdf["ID_column"].tolist()
pdf.drop(columns=["ID_column"], inplace=True)
doc = pdf["TEXT"].astype(str)

# Calculate the total token count before starting the embedding process
total_token_count = calculate_total_token_count(doc)
LOGGER.debug(f"Total token count for all documents: {total_token_count}")

# Optionally, set a threshold to avoid exceeding rate limits
MAX_TOKEN_LIMIT = 100000  # Adjust based on your API rate limit or batch constraints
if total_token_count > MAX_TOKEN_LIMIT:
    raise ValueError(
        f"Total token count {total_token_count} exceeds the maximum allowed limit of {MAX_TOKEN_LIMIT}."
    )
# Proceed with the embedding process if within limits
length_doc = len(doc)
LOGGER.debug("Start data embedding.")
LOGGER.debug(f"Number of notes to embed: {length_doc}.")

doc_embed = []
for i, doc_chunk in enumerate(get_chunks(doc, batch_size=5000)):
    LOGGER.debug(f"Embedding chunk: {i}.")
    chunk_embed = embedding_model.embed_documents(list(doc_chunk))
    doc_embed.extend(chunk_embed)
    time.sleep(360)  # Pause between chunks to stay under the rate limit
LOGGER.debug("End data embedding.")
meta_data = pdf.to_dict("records")
vector_count = len(doc_embed)
LOGGER.debug(f"Number of embedding vectors: {vector_count}.")

# Sanity checks: vectors, metadata, and IDs must line up one-to-one
if vector_count != len(meta_data):
    raise ValueError("The number of vectors and metadata records does not match.")
if vector_count != len(ids):
    raise ValueError("The number of vectors and IDs does not match.")

data_generator = list(zip(ids, doc_embed, meta_data))
length_data_generator = len(data_generator)
LOGGER.debug("Start data indexing.")
LOGGER.debug(f"Number of vectors to index: {length_data_generator}.")
for ids_vectors_chunk in get_chunks(data_generator, batch_size=10):
    index.upsert(vectors=ids_vectors_chunk, namespace=pinecone_namespace)
LOGGER.debug("End data indexing.")

stats_index = index.describe_index_stats()
size_namespace = stats_index["namespaces"][pinecone_namespace]["vector_count"]
LOGGER.debug(f"Namespace {pinecone_namespace} upserted successfully with {size_namespace} vectors.")