The file type is not supported. - Uploaded PDF to vector file

wouter.steenkist · August 15, 2025, 2:34pm

Adding a PDF file to a vector store via the API results in an "indexing failed" status: "The file type is not supported." This worked before.

But it does work via playground!

Others too?

_j · August 16, 2025, 3:05am

This file type is supported.

The API endpoint doesn’t trust your file extension and thus does some deep inspection of the content; that check normally rejects text files, not binaries like PDF.

It’s been 12 hours, but you also just gave your message a bump…

My Results:

Uploaded file: 'hp-printers.pdf' -> ID: file-6qwvVLgqLr1AQQxsj6sw6N
Uploaded files; IDs returned:
  1. hp-printers.pdf (hp-printers.pdf) -> file-6qwvVLgqLr1AQQxsj6sw6N
Created Vector Store: "Printer_search" -> ID: vs_689ff4b97c908191bbb195f7f0cbf5f1
Deleted Vector Store: "Printer_search" -> ID: vs_689ff4b97c908191bbb195f7f0cbf5f1
Deleted file: 'hp-printers.pdf' -> ID: file-6qwvVLgqLr1AQQxsj6sw6N

These results were produced by the Python test code below (put your PDF's file name in the code and the file itself in the working directory).

import openai
from pathlib import Path

# --- Configuration ---------------------------------------------------------
db_name = "Printer_search"  # name given to the vector store
file_paths = [
    "hp-printers.pdf",
    # "text1.txt"
]

# --- Shared state ------------------------------------------------------------
# These three names are read again by the cleanup section at the end of the
# script, so they are initialised before anything that can fail.
client = None
file_records = []  # one dict per uploaded file: {path, id, filename}
vector_store = None

try:
    client = openai.OpenAI(timeout=90.0, max_retries=0)  # or openai.AsyncOpenAI

    # Step 1: upload every file that exists locally; a failed upload is
    # reported and skipped so the remaining files are still attempted.
    for candidate in map(Path, file_paths):
        if not candidate.exists():
            print(f"SKIP: File not found -> {candidate}")
            continue

        try:
            with candidate.open("rb") as handle:
                uploaded = client.files.create(file=handle, purpose="assistants")
            # Fall back to the local name if the response carries no filename.
            shown_name = getattr(uploaded, "filename", candidate.name)
            file_records.append(
                {"path": str(candidate), "id": uploaded.id, "filename": shown_name}
            )
            print(f"Uploaded file: '{shown_name}' -> ID: {uploaded.id}")
        except Exception as upload_error:
            print(f"Upload failed for '{candidate}': {upload_error}")

    # Step 2: create the vector store only when at least one upload succeeded.
    if not file_records:
        print("No files uploaded successfully; skipping vector store creation.")
    else:
        print("Uploaded files; IDs returned:")
        for position, record in enumerate(file_records, start=1):
            print(
                f"  {position}. {record['filename']} ({record['path']}) -> {record['id']}"
            )

        uploaded_ids = [record["id"] for record in file_records]
        expiry_policy = {"anchor": "last_active_at", "days": 7}
        chunking = {
            "type": "static",
            "static": {"max_chunk_size_tokens": 600, "chunk_overlap_tokens": 200},
        }
        vector_store = client.vector_stores.create(
            name=db_name,
            file_ids=uploaded_ids,
            expires_after=expiry_policy,
            chunking_strategy=chunking,
        )
        print(f'Created Vector Store: "{vector_store.name}" -> ID: {vector_store.id}')

except Exception as setup_error:
    # Top-level boundary: report and fall through so cleanup still runs.
    print(f"code failed\n{setup_error}")

# Placeholder: search the store or attach it to an AI tool here.
pass

# End: delete resources (delete vector store first, then files).
# Each deletion is attempted independently so that one failure does not leave
# the remaining resources behind on the account.
try:
    if client is not None and vector_store is not None:
        client.vector_stores.delete(vector_store.id)
        print(f'Deleted Vector Store: "{vector_store.name}" -> ID: {vector_store.id}')
    else:
        print("Vector Store cleanup skipped; never created.")
except Exception as e:
    # Only reachable when a store exists, so name it for manual removal.
    print(f"Vector Store cleanup fail; may still exist -> ID: {vector_store.id}")
    print(e)

if client is not None and file_records:
    for rec in file_records:
        # Per-file try: a failed delete must not skip the files after it.
        try:
            client.files.delete(rec["id"])
            print(f"Deleted file: '{rec['filename']}' -> ID: {rec['id']}")
        except Exception as e:
            print(
                f"Files storage cleanup fail; not deleted: "
                f"'{rec['filename']}' -> ID: {rec['id']}"
            )
            print(e)
else:
    print("Files storage cleanup skipped; none uploaded.")

wouter.steenkist · August 22, 2025, 4:54am

Same issue here, but I decided to convert the PDFs to text before uploading them as TXT. Works perfectly.

<?php
// Source PDF to download and convert to text; here a sample file hosted on w3.org.
$pdfUrl = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf";

/**
 * Download the document at $url into a newly created temporary file.
 *
 * @param string $url Location of the PDF to fetch.
 * @return string Path of the temporary file; the caller must delete it.
 * @throws RuntimeException When the temporary file or cURL handle cannot be
 *                          created, the transfer fails (including an HTTP
 *                          status of 400 or higher), or fewer than 100 bytes
 *                          were received.
 */
function download_pdf(string $url): string {
    $tmpFile = tempnam(sys_get_temp_dir(), 'pdf_');
    if ($tmpFile === false) {
        throw new RuntimeException("Download mislukt: tijdelijk bestand kon niet worden aangemaakt.");
    }

    // Binary mode so the PDF bytes are never newline-translated.
    $fp = fopen($tmpFile, 'wb');
    if ($fp === false) {
        unlink($tmpFile);
        throw new RuntimeException("Download mislukt: tijdelijk bestand kon niet worden geopend.");
    }

    $ch = curl_init($url);
    if ($ch === false) {
        fclose($fp);
        unlink($tmpFile);
        throw new RuntimeException("Download mislukt: cURL kon niet worden gestart.");
    }

    curl_setopt_array($ch, [
        CURLOPT_FILE => $fp,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => 60,
        CURLOPT_SSL_VERIFYPEER => true,
        // Treat HTTP >= 400 as a failure; otherwise a 404/500 error page
        // would be written to disk and passed on as if it were the PDF.
        CURLOPT_FAILONERROR => true,
    ]);

    $ok  = curl_exec($ch);
    $err = curl_error($ch);
    curl_close($ch);
    fclose($fp);

    if (!$ok) {
        unlink($tmpFile);
        throw new RuntimeException("Download mislukt: $err");
    }

    // Drop any cached stat data so filesize() reflects what was just written.
    clearstatcache(true, $tmpFile);
    if (filesize($tmpFile) < 100) {
        unlink($tmpFile);
        throw new RuntimeException("PDF is leeg of te klein.");
    }
    return $tmpFile;
}

/**
 * Extract text from a PDF by running the external `pdftotext` command.
 *
 * @param string $pdfPath Path of the PDF file to convert.
 * @return string Trimmed text written to stdout by the command, or '' when
 *                the command produced no output or could not be run.
 */
function pdf_to_text_pdftotext(string $pdfPath): string {
    // The trailing "-" makes pdftotext write to stdout; stderr is discarded.
    $cmd = sprintf('pdftotext -layout -enc UTF-8 %s -', escapeshellarg($pdfPath));
    $out = shell_exec($cmd . ' 2>/dev/null');

    // shell_exec() yields null OR false on failure; only a string is usable.
    return is_string($out) ? trim($out) : '';
}

/**
 * Extract text from a PDF with the smalot/pdfparser library, if it is loaded.
 *
 * @param string $pdfPath Path of the PDF file to parse.
 * @return string Trimmed text of the document, or '' when the parser class
 *                is not available.
 */
function pdf_to_text_purephp(string $pdfPath): string {
    $parserClass = \Smalot\PdfParser\Parser::class;

    if (class_exists($parserClass)) {
        $document = (new $parserClass())->parseFile($pdfPath);
        return trim($document->getText());
    }

    // Library not installed or not autoloadable: report "no text".
    return '';
}

/**
 * Extract text from a PDF by rendering its pages to PNG with `pdftoppm` and
 * running `tesseract` on each image.
 *
 * @param string $pdfPath Path of the PDF file to process.
 * @param string $lang    Language code passed to tesseract's -l option.
 * @return string Trimmed concatenation of the per-page text, or '' when the
 *                working directory cannot be created or nothing was recognised.
 */
function pdf_to_text_ocr(string $pdfPath, string $lang = 'eng'): string {
    $tmpDir = sys_get_temp_dir() . '/ocr_' . bin2hex(random_bytes(4));
    // Owner-only directory: it temporarily holds the document's page images.
    if (!mkdir($tmpDir, 0700) && !is_dir($tmpDir)) {
        return '';
    }

    try {
        $ppmPrefix = $tmpDir . '/page';
        shell_exec(sprintf('pdftoppm -r 300 -png %s %s',
            escapeshellarg($pdfPath),
            escapeshellarg($ppmPrefix)
        ) . ' 2>/dev/null');

        // glob() returns false on error; normalise to an empty list.
        $pages = glob($ppmPrefix . '-*.png') ?: [];
        // Natural order keeps page-2 before page-10 whatever the padding.
        sort($pages, SORT_NATURAL);

        $text = '';
        foreach ($pages as $img) {
            // tesseract appends ".txt" to the output base name it is given.
            $outTxt = $img . '.txt';
            shell_exec(sprintf(
                'tesseract %s %s -l %s --psm 1 quiet',
                escapeshellarg($img),
                escapeshellarg($img),
                escapeshellarg($lang)
            ) . ' 2>/dev/null');
            if (is_file($outTxt)) {
                $text .= (string) file_get_contents($outTxt) . "\n";
            }
        }

        return trim($text);
    } finally {
        // Always remove the working directory, even if a step above threw.
        foreach (glob($tmpDir . '/*') ?: [] as $leftover) {
            @unlink($leftover);
        }
        @rmdir($tmpDir);
    }
}

/**
 * Extract text from a PDF, trying each strategy in turn.
 *
 * Order: the pdftotext command, then the pure-PHP parser, then OCR. A result
 * from the first two is accepted only when it is longer than 50 bytes.
 *
 * @param string $pdfPath Path of the PDF file to convert.
 * @return string Extracted text; may be '' if OCR also finds nothing.
 */
function pdf_to_text_all(string $pdfPath): string {
    $extracted = pdf_to_text_pdftotext($pdfPath);
    if (strlen($extracted) > 50) {
        return $extracted;
    }

    try {
        $parsed = pdf_to_text_purephp($pdfPath);
        if (strlen($parsed) > 50) {
            return $parsed;
        }
    } catch (\Throwable $ignored) {
        // Deliberate best effort: any parser failure falls through to OCR.
    }

    return pdf_to_text_ocr($pdfPath, 'eng');
}

// -------------------- EXECUTION --------------------
// Composer autoloader, needed only for smalot/pdfparser. It is optional, so
// load it only when present instead of aborting with a fatal error.
$autoload = __DIR__ . '/vendor/autoload.php';
if (is_file($autoload)) {
    require_once $autoload;
}

$localPdf = null;
try {
    $localPdf = download_pdf($pdfUrl);
    $text = pdf_to_text_all($localPdf);
} catch (Exception $e) {
    echo "<strong>Fout:</strong> " . htmlspecialchars($e->getMessage());
    return;
} finally {
    // Runs on success and on failure, so the downloaded temp file never leaks.
    if ($localPdf !== null && is_file($localPdf)) {
        unlink($localPdf);
    }
}

if ($text === '') {
    echo "<strong>Geen tekst gevonden.</strong>";
    return;
}

// Escape the extracted text: it is untrusted content rendered into HTML.
echo "<pre>" . htmlspecialchars($text) . "</pre>";

Topic		Replies	Views
Invalid input: Expected file type to be a supported format: .pdf but got .docx API	8	3390	June 30, 2025
Can Not Add Files To Vector Store API assistants-api , vector-store	20	6885	December 10, 2024
Md and txt file not uploading Bugs	16	946	March 18, 2025
"unsupported_file" error when creating vector store with certain plain-text/markdown files Bugs	2	185	September 17, 2025
PHP text files are not supported for vector storage, but the documentation lists it as supported Documentation php	3	365	June 3, 2024

The file type is not supported. - Uploaded PDF to vector file

Related topics