"The file type is not supported." - PDF uploaded to vector store

Adding a PDF file to a vector store via the API results in an "indexing failed" status: "The file type is not supported." This worked before.

But it does work via playground!

Others too?

This file type is supported.

The API endpoint doesn’t trust your file extension and thus does some deep inspection of the content; that check normally rejects text files, not binaries like PDF.

It’s been 12 hours, but you also just gave your message a bump…

My Results:

Uploaded file: 'hp-printers.pdf' -> ID: file-6qwvVLgqLr1AQQxsj6sw6N
Uploaded files; IDs returned:
  1. hp-printers.pdf (hp-printers.pdf) -> file-6qwvVLgqLr1AQQxsj6sw6N
Created Vector Store: "Printer_search" -> ID: vs_689ff4b97c908191bbb195f7f0cbf5f1
Deleted Vector Store: "Printer_search" -> ID: vs_689ff4b97c908191bbb195f7f0cbf5f1
Deleted file: 'hp-printers.pdf' -> ID: file-6qwvVLgqLr1AQQxsj6sw6N

Produced by this Python test code (put your PDF's filename in the code's `file_paths` list and the file itself in the working directory).

import openai
from pathlib import Path

# Name given to the vector store that the script creates.
db_name = "Printer_search"  # vector store name
# Local files to upload; relative paths are resolved against the current
# working directory.
file_paths = [
    "hp-printers.pdf",
    # "text1.txt"
]

# Run state, also read by the cleanup code at the end of the script. Each value
# stays None/empty until the step that produces it has succeeded.
client = None
file_records = []  # [{path, id, filename}]
vector_store = None

try:
    # Single API client for the whole run; retries are disabled so the first
    # failure is reported as-is.
    client = openai.OpenAI(timeout=90.0, max_retries=0)  # or openai.AsyncOpenAI

    # Upload every listed file that exists on disk. A failed upload is reported
    # and the loop moves on to the next file.
    for candidate in file_paths:
        source = Path(candidate)
        if source.exists():
            try:
                with source.open("rb") as stream:
                    uploaded = client.files.create(file=stream, purpose="assistants")
                # Fall back to the local name if the response has no filename.
                display_name = getattr(uploaded, "filename", source.name)
                file_records.append(
                    {"path": str(source), "id": uploaded.id, "filename": display_name}
                )
                print(f"Uploaded file: '{display_name}' -> ID: {uploaded.id}")
            except Exception as upload_error:
                print(f"Upload failed for '{source}': {upload_error}")
        else:
            print(f"SKIP: File not found -> {source}")

    if not file_records:
        print("No files uploaded successfully; skipping vector store creation.")
    else:
        print("Uploaded files; IDs returned:")
        for position, record in enumerate(file_records, start=1):
            print(f"  {position}. {record['filename']} ({record['path']}) -> {record['id']}")

        # Attach all uploaded files to a new store with fixed-size chunking.
        static_chunking = {"max_chunk_size_tokens": 600, "chunk_overlap_tokens": 200}
        vector_store = client.vector_stores.create(
            name=db_name,
            file_ids=[record["id"] for record in file_records],
            expires_after={"anchor": "last_active_at", "days": 7},
            chunking_strategy={"type": "static", "static": static_chunking},
        )
        print(f'Created Vector Store: "{vector_store.name}" -> ID: {vector_store.id}')

except Exception as failure:
    print(f"code failed\n{failure}")

# then search or integrate with AI tool
pass

# End: delete resources (delete vector store first, then files)
try:
    # Nothing to delete unless both the client and the store were created.
    if client is None or vector_store is None:
        print("Vector Store cleanup skipped; never created.")
    else:
        store_id = vector_store.id
        client.vector_stores.delete(store_id)
        print(f'Deleted Vector Store: "{vector_store.name}" -> ID: {store_id}')
except Exception as cleanup_error:
    print("Vector Store cleanup fail; never created?")
    print(cleanup_error)

# Delete every uploaded file. Each delete is attempted on its own so that one
# failure does not abort the loop and leave the remaining files in storage.
if client is not None and file_records:
    for rec in file_records:
        try:
            client.files.delete(rec["id"])
            print(f"Deleted file: '{rec['filename']}' -> ID: {rec['id']}")
        except Exception as e:
            # Report which file could not be removed, then continue.
            print("Files storage cleanup fail; never created?")
            print(f"  '{rec['filename']}' -> ID: {rec['id']}: {e}")
else:
    print("Files storage cleanup skipped; none uploaded.")

Same issue here, but I decided to convert the PDFs to text before uploading them as TXT files. Works perfectly.

<?php
// URL of the PDF that the script downloads and converts to text.
$pdfUrl = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf";

/**
 * Download a PDF from a URL into a temporary file.
 *
 * @param string $url Address of the PDF to fetch.
 * @return string Path of the temporary file; the caller is responsible for
 *                deleting it.
 * @throws RuntimeException When the temporary file cannot be created or
 *                          opened, the transfer fails (including HTTP status
 *                          >= 400), or the result is smaller than 100 bytes.
 */
function download_pdf(string $url): string {
    $tmpFile = tempnam(sys_get_temp_dir(), 'pdf_');
    if ($tmpFile === false) {
        throw new RuntimeException("Download mislukt: kan geen tijdelijk bestand aanmaken.");
    }

    $fp = fopen($tmpFile, 'wb');
    if ($fp === false) {
        @unlink($tmpFile);
        throw new RuntimeException("Download mislukt: kan tijdelijk bestand niet openen.");
    }

    $ch = curl_init($url);
    if ($ch === false) {
        fclose($fp);
        @unlink($tmpFile);
        throw new RuntimeException("Download mislukt: cURL kon niet worden gestart.");
    }

    curl_setopt_array($ch, [
        CURLOPT_FILE => $fp,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 60,
        CURLOPT_SSL_VERIFYPEER => true,
        // Without this an HTTP error page (404, 500, ...) would be written to
        // the file and treated as a successfully downloaded PDF.
        CURLOPT_FAILONERROR => true,
    ]);

    $ok  = curl_exec($ch);
    $err = curl_error($ch);
    curl_close($ch);
    fclose($fp);

    if (!$ok) {
        unlink($tmpFile);
        throw new RuntimeException("Download mislukt: $err");
    }

    // Make sure filesize() reports the size after the write, not a cached value.
    clearstatcache(true, $tmpFile);
    if (filesize($tmpFile) < 100) {
        unlink($tmpFile);
        throw new RuntimeException("PDF is leeg of te klein.");
    }
    return $tmpFile;
}

/**
 * Extract text from a PDF by running the `pdftotext` command-line tool.
 *
 * @param string $pdfPath Path of the PDF file.
 * @return string Trimmed tool output, or '' when the command produced none.
 */
function pdf_to_text_pdftotext(string $pdfPath): string {
    // "-" sends the text to stdout; stderr is discarded.
    $command = 'pdftotext -layout -enc UTF-8 ' . escapeshellarg($pdfPath) . ' - 2>/dev/null';
    $output  = shell_exec($command);

    if (is_string($output)) {
        return trim($output);
    }
    return '';
}

/**
 * Extract text from a PDF with the smalot/pdfparser library, if it is loaded.
 *
 * @param string $pdfPath Path of the PDF file.
 * @return string Trimmed document text, or '' when the parser class is not
 *                available.
 */
function pdf_to_text_purephp(string $pdfPath): string {
    $parserClass = \Smalot\PdfParser\Parser::class;

    if (class_exists($parserClass)) {
        $document = (new $parserClass())->parseFile($pdfPath);
        return trim($document->getText());
    }

    return '';
}

/**
 * Extract text from a PDF by rendering its pages to PNG with `pdftoppm` and
 * running `tesseract` on each image.
 *
 * @param string $pdfPath Path of the PDF file.
 * @param string $lang    Language code passed to tesseract via `-l`.
 * @return string Trimmed text of all pages joined by newlines, or '' when the
 *                working directory cannot be created or no text was produced.
 */
function pdf_to_text_ocr(string $pdfPath, string $lang = 'eng'): string {
    $tmpDir = sys_get_temp_dir() . '/ocr_' . bin2hex(random_bytes(4));
    // 0700 keeps the rendered page images private inside the shared temp dir.
    if (!@mkdir($tmpDir, 0700)) {
        return '';
    }

    $text = '';
    try {
        $ppmPrefix = $tmpDir . '/page';
        shell_exec(sprintf('pdftoppm -r 300 -png %s %s',
            escapeshellarg($pdfPath),
            escapeshellarg($ppmPrefix)
        ) . ' 2>/dev/null');

        // glob() can return false on error; treat that as "no pages".
        foreach (glob($ppmPrefix . '-*.png') ?: [] as $img) {
            // The image path is also passed as the output base, so the text
            // is expected at "<image>.txt".
            $outTxt = $img . '.txt';
            shell_exec(sprintf(
                'tesseract %s %s -l %s --psm 1 quiet',
                escapeshellarg($img),
                escapeshellarg($img),
                escapeshellarg($lang)
            ) . ' 2>/dev/null');
            if (is_file($outTxt)) {
                $text .= file_get_contents($outTxt) . "\n";
            }
        }
    } finally {
        // Always remove the page images and text files, even on an exception.
        array_map('unlink', glob($tmpDir . '/*') ?: []);
        @rmdir($tmpDir);
    }

    return trim($text);
}

/**
 * Extract text from a PDF, trying progressively heavier methods: the
 * `pdftotext` tool, then the pure-PHP parser, then OCR.
 *
 * A result is accepted from the first two methods only when it is longer than
 * 50 bytes; otherwise the next method is tried.
 *
 * @param string $pdfPath Path of the PDF file.
 * @param string $lang    Language code used by the OCR fallback.
 * @return string Extracted text; may be '' when every method fails.
 */
function pdf_to_text_all(string $pdfPath, string $lang = 'eng'): string {
    $text = pdf_to_text_pdftotext($pdfPath);
    if (strlen($text) > 50) return $text;

    try {
        $text = pdf_to_text_purephp($pdfPath);
        if (strlen($text) > 50) return $text;
    } catch (\Throwable $e) {
        // Best effort: record why the parser failed, then fall through to OCR.
        error_log('pdf_to_text_purephp failed: ' . $e->getMessage());
    }

    return pdf_to_text_ocr($pdfPath, $lang);
}

// -------------------- EXECUTION --------------------
// The Composer autoloader is only needed for the optional smalot/pdfparser
// fallback, so load it only when it is installed instead of failing fatally.
$autoload = __DIR__ . '/vendor/autoload.php';
if (is_file($autoload)) {
    require_once $autoload;
}

$localPdf = null;
try {
    $localPdf = download_pdf($pdfUrl);
    $text = pdf_to_text_all($localPdf);
} catch (Exception $e) {
    echo "<strong>Fout:</strong> " . htmlspecialchars($e->getMessage());
    return;
} finally {
    // Remove the temporary download even when extraction throws.
    if ($localPdf !== null && is_file($localPdf)) {
        unlink($localPdf);
    }
}

if ($text === '') {
    echo "<strong>Geen tekst gevonden.</strong>";
    return;
}

// Escape the extracted text so it is shown verbatim rather than parsed as HTML.
echo "<pre>" . htmlspecialchars($text) . "</pre>";
1 Like