PDF chunking and OpenAI Embedding in nodeJS

sroopesh234 · May 28, 2024, 8:02am

Hi, trying to create PDF chunking and embedding into openAi

const fs = require(‘fs’);
const pdf = require(‘pdf-parse’);
require(‘dotenv’).config(); // Load environment variables from .env file
const { Configuration, OpenAIApi } = require(‘openai’);

// Helper function to count words in a string, excluding whitespace
function countWords(text) {
return text.trim().split(/\s+/).length;
}

// Function to chunk the PDF text into passages
function chunkText(text, maxWords) {
const paragraphs = text.split(‘\n\n’); // Split by paragraphs
const chunks = ;
let currentChunk = ‘’;

paragraphs.forEach(paragraph => {
const wordsInParagraph = countWords(paragraph);

if (wordsInParagraph === 0) {
  return;
}

if (countWords(currentChunk) + wordsInParagraph > maxWords) {
  chunks.push(currentChunk.trim());
  currentChunk = paragraph;
} else {
  currentChunk += `\n\n${paragraph}`;
}

});

if (currentChunk) {
chunks.push(currentChunk.trim());
}

return chunks;
}

// Function to get embeddings from OpenAI API
async function getEmbeddings(text) {
const configuration = new Configuration({
apiKey: process.env.OPENAI_API_KEY, // Ensure your API key is set as an environment variable
});
const openai = new OpenAIApi(configuration);

const response = await openai.createEmbedding({
model: ‘text-embedding-ada-002’, // Example model, you may choose a different one
input: text,
});

return response.data.data[0].embedding;
}

// Main function to process the PDF
async function processPDF(inputFilePath, outputFilePath, maxWords) {
const dataBuffer = fs.readFileSync(inputFilePath);
const pdfData = await pdf(dataBuffer);
const text = pdfData.text;
const chunks = chunkText(text, maxWords);

let outputContent = Number of chunks: ${chunks.length}\n\n;

for (const [index, chunk] of chunks.entries()) {
const embedding = await getEmbeddings(chunk);
outputContent += Chunk ${index + 1}:\n${chunk}\n\n;
outputContent += Embedding ${index + 1}:\n${JSON.stringify(embedding)}\n\n;
}

fs.writeFileSync(outputFilePath, outputContent);
console.log(PDF has been chunked into ${chunks.length} passages and saved to ${outputFilePath});
}

// Execute the function with the desired file path and word limit per chunk
const inputFilePath = ‘sample.pdf’; // Replace with your PDF file path
const outputFilePath = ‘output_chunks.txt’; // Replace with your desired output file path
const maxWords = 500;

processPDF(inputFilePath, outputFilePath, maxWords).catch(console.error);

Created .env file to store API key

but error :
(node: 27816) [DEP0040] DeprecationWarning: The punycode

module is deprecated. Please use a userland alternative instead.

(Use node --trace deprecation …

to show where the warning was created)

TypeError: Configuration is not a constructor

at getEmbeddings (C: \Users \5. roopesh\React\pdf-chunker \chunk-pdf.js:42:25).
at processPDF (C: \Users\5. roopesh\React\pdf-chunker \chunk-pdf.js:65:29).

loucas · July 8, 2024, 7:25pm

GPT4o on your issue:

Fix for the Error

The error you’re encountering is due to how the Configuration class from the openai module is being used. The error message indicates that Configuration is not a constructor.

Update the import and initialization of the OpenAI client as follows:

Corrected Imports and Initialization:

javascript

Copy code

const { OpenAIApi } = require('openai');
const { Configuration } = require('openai').default;

Fixing the Initialization in the Function:

javascript

Copy code

async function getEmbeddings(text) {
  const configuration = new Configuration({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const openai = new OpenAIApi(configuration);

  const response = await openai.createEmbedding({
    model: 'text-embedding-ada-002',
    input: text,
  });

  return response.data.data[0].embedding;
}

This should resolve the TypeError: Configuration is not a constructor issue.

Summary

The code reads a PDF, chunks its text into smaller passages, and then gets embeddings for each chunk using the OpenAI API. The error you encountered is due to incorrect usage of the Configuration class from the OpenAI module, which can be resolved by adjusting the import and initialization as shown above.

Topic		Replies	Views
Is the OpenAI Embedding working well in the NodeJS? API embeddings	11	4116	March 6, 2024
I 'm using Openai api in nodeJS but throwing error API api	10	47998	December 14, 2023
OpenAI Embeddings - Search through ~1000 PDFs API embeddings	3	3421	August 28, 2024
Stumble with openai key in nodejs API chatgpt	2	1172	December 17, 2023
Is there any sample code to split a json file into smaller chunks? API	11	13916	October 26, 2023

PDF chunking and OpenAI Embedding in nodeJS

Fix for the Error

Summary

Related topics