PDF chunking and OpenAI Embedding in nodeJS

Hi, I'm trying to chunk a PDF and generate embeddings for each chunk with the OpenAI API in Node.js:

const fs = require('fs');
const pdf = require('pdf-parse');
require('dotenv').config(); // Load environment variables from .env file
const { Configuration, OpenAIApi } = require('openai');

// Helper function to count words in a string, excluding whitespace
// Count the whitespace-separated words in a string.
// Returns 0 for empty or whitespace-only input — a bare
// `''.trim().split(/\s+/)` yields [''] (length 1), which would
// silently inflate every chunk's word budget by one.
function countWords(text) {
  const trimmed = text.trim();
  return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}

// Function to chunk the PDF text into passages
// Split `text` into chunks of at most `maxWords` words, keeping
// paragraph (blank-line-separated) boundaries intact. A single
// paragraph longer than maxWords becomes its own chunk.
// Returns an array of trimmed, non-empty chunk strings.
function chunkText(text, maxWords) {
  const paragraphs = text.split('\n\n'); // Split by paragraphs
  const chunks = []; // was `const chunks = ;` — lost in the paste, a syntax error
  let currentChunk = '';

  paragraphs.forEach((paragraph) => {
    const wordsInParagraph = countWords(paragraph);

    // Skip empty / whitespace-only paragraphs.
    if (wordsInParagraph === 0) {
      return;
    }

    if (countWords(currentChunk) + wordsInParagraph > maxWords) {
      // Flush the accumulated chunk. Guard against pushing an empty
      // string when the very first paragraph already exceeds maxWords.
      if (currentChunk.trim() !== '') {
        chunks.push(currentChunk.trim());
      }
      currentChunk = paragraph;
    } else {
      currentChunk += `\n\n${paragraph}`;
    }
  });

  // Flush the final partial chunk, if any.
  if (currentChunk.trim() !== '') {
    chunks.push(currentChunk.trim());
  }

  return chunks;
}

// Function to get embeddings from OpenAI API
// Request an embedding vector for `text` from the OpenAI API.
//
// Uses the openai v4 SDK. The v3 `Configuration` / `OpenAIApi` classes
// were removed in v4, which is exactly why the original code throws
// "TypeError: Configuration is not a constructor".
//
// @param {string} text - The text to embed.
// @returns {Promise<number[]>} The embedding vector.
async function getEmbeddings(text) {
  // Local require so this function works regardless of the (v3-style)
  // destructured import at the top of the file.
  const OpenAI = require('openai');
  const openai = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY, // Ensure your API key is set in .env
  });

  const response = await openai.embeddings.create({
    model: 'text-embedding-ada-002', // Example model, you may choose a different one
    input: text,
  });

  // v4 returns the payload directly ({ data: [{ embedding: [...] }] }),
  // not wrapped in an Axios response — hence response.data[0], not
  // response.data.data[0].
  return response.data[0].embedding;
}

// Main function to process the PDF
async function processPDF(inputFilePath, outputFilePath, maxWords) {
const dataBuffer = fs.readFileSync(inputFilePath);
const pdfData = await pdf(dataBuffer);
const text = pdfData.text;
const chunks = chunkText(text, maxWords);

let outputContent = Number of chunks: ${chunks.length}\n\n;

for (const [index, chunk] of chunks.entries()) {
const embedding = await getEmbeddings(chunk);
outputContent += Chunk ${index + 1}:\n${chunk}\n\n;
outputContent += Embedding ${index + 1}:\n${JSON.stringify(embedding)}\n\n;
}

fs.writeFileSync(outputFilePath, outputContent);
console.log(PDF has been chunked into ${chunks.length} passages and saved to ${outputFilePath});
}

// Execute the function with the desired file path and word limit per chunk
const inputFilePath = ‘sample.pdf’; // Replace with your PDF file path
const outputFilePath = ‘output_chunks.txt’; // Replace with your desired output file path
const maxWords = 500;

processPDF(inputFilePath, outputFilePath, maxWords).catch(console.error);

I created a .env file to store the API key,

but I get this error:
(node: 27816) [DEP0040] DeprecationWarning: The punycode

module is deprecated. Please use a userland alternative instead.

(Use node --trace deprecation …

to show where the warning was created)

TypeError: Configuration is not a constructor

at getEmbeddings (C: \Users \5. roopesh\React\pdf-chunker \chunk-pdf.js:42:25).
at processPDF (C: \Users\5. roopesh\React\pdf-chunker \chunk-pdf.js:65:29).

1 Like

GPT4o on your issue:

Fix for the Error

The error occurs because version 4 of the openai npm package removed the `Configuration` and `OpenAIApi` classes entirely; your code uses the v3 API, so `new Configuration(...)` fails with "Configuration is not a constructor".

Update the import and initialization of the OpenAI client as follows:

  1. Corrected Imports and Initialization:

javascript

Copy code

const OpenAI = require('openai'); // v4: the default export is the client class
  1. Fixing the Initialization in the Function:

javascript

Copy code

// Corrected v4 implementation: instantiate the client directly and use
// openai.embeddings.create. (The previously suggested fix still called
// `new Configuration(...)`, which no longer exists in v4 and would
// reproduce the same TypeError.)
async function getEmbeddings(text) {
  const OpenAI = require('openai'); // v4 default export is the client class
  const openai = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
  });

  const response = await openai.embeddings.create({
    model: 'text-embedding-ada-002',
    input: text,
  });

  // v4 responses are plain objects, not Axios responses:
  // { data: [{ embedding: [...] }] }
  return response.data[0].embedding;
}

This should resolve the TypeError: Configuration is not a constructor issue.

Summary

The code reads a PDF, chunks its text into smaller passages, and then gets embeddings for each chunk using the OpenAI API. The error you encountered is due to incorrect usage of the Configuration class from the OpenAI module, which can be resolved by adjusting the import and initialization as shown above.