|
|
const fs = require("fs").promises;
/**
 * Joins the text fragments of a single PDF page into one string.
 *
 * Entries without a `str` property are skipped. A line break is inserted
 * before a fragment whenever its `transform[5]` value (used here as the
 * fragment's vertical position) differs from that of the previous fragment.
 *
 * @param {Array<object>} items - The `items` array of a page's text content.
 * @returns {string} The concatenated page text (not trimmed).
 */
function joinPdfTextItems(items) {
  const parts = [];
  let lastY;
  for (const item of items) {
    if (!("str" in item)) {
      continue;
    }
    const y = item.transform[5];
    // Compare against `undefined` explicitly: a plain truthiness check would
    // treat a previous position of exactly 0 as "no previous fragment" and
    // drop the line break that should follow it.
    const staysOnLine = lastY === undefined || lastY === y;
    parts.push(staysOnLine ? item.str : `\n${item.str}`);
    lastY = y;
  }
  return parts.join("");
}

/**
 * Loads the text of a PDF file as one document per page, or as a single
 * merged document.
 */
class PDFLoader {
  /**
   * @param {string} filePath - Path of the PDF file to read.
   * @param {object} [options]
   * @param {boolean} [options.splitPages=true] - When true, `load()` returns
   *   one document per page; otherwise a single document joining all pages.
   */
  constructor(filePath, { splitPages = true } = {}) {
    this.filePath = filePath;
    this.splitPages = splitPages;
  }

  /**
   * Reads the file, extracts the text of every page and wraps it in
   * `{ pageContent, metadata }` objects.
   *
   * Pages whose text content has no items at all are skipped.
   *
   * @returns {Promise<Array<{pageContent: string, metadata: object}>>}
   *   Per-page documents (with `metadata.loc.pageNumber`) when `splitPages`
   *   is true; otherwise an array holding one merged document, or an empty
   *   array when no page produced a document.
   * @throws {Error} If the file cannot be read, the PDF library cannot be
   *   loaded, or the PDF cannot be parsed.
   */
  async load() {
    const buffer = await fs.readFile(this.filePath);
    const { getDocument, version } = await this.getPdfJS();

    const pdf = await getDocument({
      data: new Uint8Array(buffer),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
    }).promise;

    // Metadata is best-effort: a failure here must not abort text extraction.
    const meta = await pdf.getMetadata().catch(() => null);

    // Returns a fresh object on every call so documents never share state.
    const describePdf = () => ({
      version,
      info: meta?.info,
      metadata: meta?.metadata,
      totalPages: pdf.numPages,
    });

    const documents = [];
    // Pages are requested by 1-based page number.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      if (content.items.length === 0) {
        continue;
      }

      documents.push({
        pageContent: joinPdfTextItems(content.items).trim(),
        metadata: {
          source: this.filePath,
          pdf: describePdf(),
          loc: { pageNumber },
        },
      });
    }

    if (this.splitPages) {
      return documents;
    }
    if (documents.length === 0) {
      return [];
    }

    return [
      {
        pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
        metadata: {
          source: this.filePath,
          pdf: describePdf(),
        },
      },
    ];
  }

  /**
   * Lazily imports the pdf.js build bundled with `pdf-parse`.
   *
   * @returns {Promise<{getDocument: Function, version: string}>} The pdf.js
   *   entry point and its reported version.
   * @throws {Error} If the module cannot be imported; the underlying failure
   *   is attached as `cause`.
   */
  async getPdfJS() {
    try {
      const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
      return { getDocument: pdfjs.getDocument, version: pdfjs.version };
    } catch (e) {
      console.error(e);
      throw new Error(
        "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.",
        { cause: e }
      );
    }
  }
}
// Expose the loader class as this module's CommonJS export.
module.exports = PDFLoader;
|