Hi, trying to create PDF chunking and embedding into openAi
const fs = require(‘fs’);
const pdf = require(‘pdf-parse’);
require(‘dotenv’).config(); // Load environment variables from .env file
const { Configuration, OpenAIApi } = require(‘openai’);
// Helper function to count words in a string, excluding whitespace
function countWords(text) {
return text.trim().split(/\s+/).length;
}
// Function to chunk the PDF text into passages
function chunkText(text, maxWords) {
const paragraphs = text.split(‘\n\n’); // Split by paragraphs
const chunks = ;
let currentChunk = ‘’;
paragraphs.forEach(paragraph => {
const wordsInParagraph = countWords(paragraph);
if (wordsInParagraph === 0) {
return;
}
if (countWords(currentChunk) + wordsInParagraph > maxWords) {
chunks.push(currentChunk.trim());
currentChunk = paragraph;
} else {
currentChunk += `\n\n${paragraph}`;
}
});
if (currentChunk) {
chunks.push(currentChunk.trim());
}
return chunks;
}
// Function to get embeddings from OpenAI API
async function getEmbeddings(text) {
const configuration = new Configuration({
apiKey: process.env.OPENAI_API_KEY, // Ensure your API key is set as an environment variable
});
const openai = new OpenAIApi(configuration);
const response = await openai.createEmbedding({
model: ‘text-embedding-ada-002’, // Example model, you may choose a different one
input: text,
});
return response.data.data[0].embedding;
}
// Main function to process the PDF
async function processPDF(inputFilePath, outputFilePath, maxWords) {
const dataBuffer = fs.readFileSync(inputFilePath);
const pdfData = await pdf(dataBuffer);
const text = pdfData.text;
const chunks = chunkText(text, maxWords);
let outputContent = Number of chunks: ${chunks.length}\n\n
;
for (const [index, chunk] of chunks.entries()) {
const embedding = await getEmbeddings(chunk);
outputContent += Chunk ${index + 1}:\n${chunk}\n\n
;
outputContent += Embedding ${index + 1}:\n${JSON.stringify(embedding)}\n\n
;
}
fs.writeFileSync(outputFilePath, outputContent);
console.log(PDF has been chunked into ${chunks.length} passages and saved to ${outputFilePath}
);
}
// Execute the function with the desired file path and word limit per chunk
const inputFilePath = ‘sample.pdf’; // Replace with your PDF file path
const outputFilePath = ‘output_chunks.txt’; // Replace with your desired output file path
const maxWords = 500;
processPDF(inputFilePath, outputFilePath, maxWords).catch(console.error);
Created .env file to store API key
but error :
(node: 27816) [DEP0040] DeprecationWarning: The punycode
module is deprecated. Please use a userland alternative instead.
(Use node --trace deprecation …
to show where the warning was created)
TypeError: Configuration is not a constructor
at getEmbeddings (C: \Users \5. roopesh\React\pdf-chunker \chunk-pdf.js:42:25).
at processPDF (C: \Users\5. roopesh\React\pdf-chunker \chunk-pdf.js:65:29).