Below is the chunking code from the OpenAI site. I need to store the chunked values in a CSV that also has a column for the document id (d1, d2, ...) and a chunk id (c1, c2, ...) for each document's chunks. The chunk counter should reset to 0 after each document, i.e. restart at c1 for every new document.
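For concreteness, the target CSV might look like this (the doc_id/chunk_id column names are my assumption):

doc_id,chunk_id,text
d1,c1,"first chunk of document 1"
d1,c2,"second chunk of document 1"
d2,c1,"first chunk of document 2"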
import tiktoken

# Tokenizer used to count tokens per sentence (the OpenAI tutorial uses cl100k_base)
tokenizer = tiktoken.get_encoding("cl100k_base")

max_tokens = 500

# Split a text into chunks of at most max_tokens tokens
def split_into_many(text, max_tokens=max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and token counts joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current
        # sentence exceeds the max number of tokens, add the chunk to the list of
        # chunks and reset the chunk and the running token count
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If a single sentence alone exceeds the max number of tokens, skip it
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and its tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add any remaining sentences as the final chunk so it is not dropped
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
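As a quick sanity check, the function returns a plain list of chunk strings; a hedged usage example (illustrative text, and the exact split depends on the tokenizer's token counts):

sample_text = "First sentence. Second sentence. Third sentence. Fourth sentence."
for c in split_into_many(sample_text, max_tokens=8):
    print(repr(c))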
shortened = []

for _, row in df.iterrows():

    # If the text is None, go to the next row
    if row['text'] is None:
        continue

    # If the number of tokens exceeds the max, split the text into chunks
    if row['n_tokens'] > max_tokens:
        shortened += split_into_many(row['text'])

    # Otherwise, add the text to the list of shortened texts as-is
    else:
        shortened.append(row['text'])
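To get the CSV described above, one option is to assign the document and chunk ids while iterating, instead of collecting everything into the flat shortened list. A minimal sketch, assuming df has one document per row with 'text' and 'n_tokens' columns; the output file name and the doc_id/chunk_id column names are my choices, not from the original code:

import csv

with open("chunks.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["doc_id", "chunk_id", "text"])

    # doc_index counts documents: d1, d2, ...
    for doc_index, (_, row) in enumerate(df.iterrows(), start=1):
        if row['text'] is None:
            continue

        # Chunk only when the document exceeds the token budget
        if row['n_tokens'] > max_tokens:
            chunks = split_into_many(row['text'])
        else:
            chunks = [row['text']]

        # chunk_index counts chunks within one document: c1, c2, ...
        for chunk_index, chunk in enumerate(chunks, start=1):
            writer.writerow([f"d{doc_index}", f"c{chunk_index}", chunk])

Restarting enumerate(chunks, start=1) inside the document loop is what makes the chunk counter go back to c1 for every new document.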