PDF chunking and OpenAI Embedding in nodeJS

Hi, trying to create PDF chunking and embedding into openAi

const fs = require(‘fs’);
const pdf = require(‘pdf-parse’);
require(‘dotenv’).config(); // Load environment variables from .env file
const { Configuration, OpenAIApi } = require(‘openai’);

// Helper function to count words in a string, excluding whitespace
function countWords(text) {
return text.trim().split(/\s+/).length;

// Function to chunk the PDF text into passages
function chunkText(text, maxWords) {
const paragraphs = text.split(‘\n\n’); // Split by paragraphs
const chunks = ;
let currentChunk = ‘’;

paragraphs.forEach(paragraph => {
const wordsInParagraph = countWords(paragraph);

if (wordsInParagraph === 0) {

if (countWords(currentChunk) + wordsInParagraph > maxWords) {
  currentChunk = paragraph;
} else {
  currentChunk += `\n\n${paragraph}`;


if (currentChunk) {

return chunks;

// Function to get embeddings from OpenAI API
async function getEmbeddings(text) {
const configuration = new Configuration({
apiKey: process.env.OPENAI_API_KEY, // Ensure your API key is set as an environment variable
const openai = new OpenAIApi(configuration);

const response = await openai.createEmbedding({
model: ‘text-embedding-ada-002’, // Example model, you may choose a different one
input: text,

return response.data.data[0].embedding;

// Main function to process the PDF
async function processPDF(inputFilePath, outputFilePath, maxWords) {
const dataBuffer = fs.readFileSync(inputFilePath);
const pdfData = await pdf(dataBuffer);
const text = pdfData.text;
const chunks = chunkText(text, maxWords);

let outputContent = Number of chunks: ${chunks.length}\n\n;

for (const [index, chunk] of chunks.entries()) {
const embedding = await getEmbeddings(chunk);
outputContent += Chunk ${index + 1}:\n${chunk}\n\n;
outputContent += Embedding ${index + 1}:\n${JSON.stringify(embedding)}\n\n;

fs.writeFileSync(outputFilePath, outputContent);
console.log(PDF has been chunked into ${chunks.length} passages and saved to ${outputFilePath});

// Execute the function with the desired file path and word limit per chunk
const inputFilePath = ‘sample.pdf’; // Replace with your PDF file path
const outputFilePath = ‘output_chunks.txt’; // Replace with your desired output file path
const maxWords = 500;

processPDF(inputFilePath, outputFilePath, maxWords).catch(console.error);

Created .env file to store API key

but error :
(node: 27816) [DEP0040] DeprecationWarning: The punycode

module is deprecated. Please use a userland alternative instead.

(Use node --trace deprecation …

to show where the warning was created)

TypeError: Configuration is not a constructor

at getEmbeddings (C: \Users \5. roopesh\React\pdf-chunker \chunk-pdf.js:42:25).
at processPDF (C: \Users\5. roopesh\React\pdf-chunker \chunk-pdf.js:65:29).