const fs = require("fs"); const os = require("os"); const path = require("path"); class OCRLoader { constructor() { this.cacheDir = path.resolve( process.env.STORAGE_DIR ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`) : path.resolve(__dirname, `../../../server/storage/models/tesseract`) ); } log(text, ...args) { console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args); } /** * Loads a PDF file and returns an array of documents. * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata. */ async ocrPDF( filePath, { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {} ) { if ( !filePath || !fs.existsSync(filePath) || !fs.statSync(filePath).isFile() ) { this.log(`File ${filePath} does not exist. Skipping OCR.`); return []; } const documentTitle = path.basename(filePath); this.log(`Starting OCR of ${documentTitle}`); const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js"); let buffer = fs.readFileSync(filePath); const pdfDocument = await pdfjs.getDocument({ data: buffer }); const documents = []; const meta = await pdfDocument.getMetadata().catch(() => null); const metadata = { source: filePath, pdf: { version: "v2.0.550", info: meta?.info, metadata: meta?.metadata, totalPages: pdfDocument.numPages, }, }; const pdfSharp = new PDFSharp({ validOps: [ pdfjs.OPS.paintJpegXObject, pdfjs.OPS.paintImageXObject, pdfjs.OPS.paintInlineImageXObject, ], }); await pdfSharp.init(); const { createWorker, OEM } = require("tesseract.js"); const BATCH_SIZE = batchSize; const MAX_EXECUTION_TIME = maxExecutionTime; const NUM_WORKERS = maxWorkers ?? Math.min(os.cpus().length, 4); const totalPages = pdfDocument.numPages; const workerPool = await Promise.all( Array(NUM_WORKERS) .fill(0) .map(() => createWorker("eng", OEM.LSTM_ONLY, { cachePath: this.cacheDir, }) ) ); const startTime = Date.now(); try { this.log("Bootstrapping OCR completed successfully!", { MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME, BATCH_SIZE, MAX_CONCURRENT_WORKERS: NUM_WORKERS, TOTAL_PAGES: totalPages, }); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => { reject( new Error( `OCR job took too long to complete (${ MAX_EXECUTION_TIME / 1000 } seconds)` ) ); }, MAX_EXECUTION_TIME); }); const processPages = async () => { for ( let startPage = 1; startPage <= totalPages; startPage += BATCH_SIZE ) { const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages); const pageNumbers = Array.from( { length: endPage - startPage + 1 }, (_, i) => startPage + i ); this.log(`Working on pages ${startPage} - ${endPage}`); const pageQueue = [...pageNumbers]; const results = []; const workerPromises = workerPool.map(async (worker, workerIndex) => { while (pageQueue.length > 0) { const pageNum = pageQueue.shift(); this.log( `\x1b[34m[Worker ${ workerIndex + 1 }]\x1b[0m assigned pg${pageNum}` ); const page = await pdfDocument.getPage(pageNum); const imageBuffer = await pdfSharp.pageToBuffer({ page }); if (!imageBuffer) continue; const { data } = await worker.recognize(imageBuffer, {}, "text"); this.log( `✅ \x1b[34m[Worker ${ workerIndex + 1 }]\x1b[0m completed pg${pageNum}` ); results.push({ pageContent: data.text, metadata: { ...metadata, loc: { pageNumber: pageNum }, }, }); } }); await Promise.all(workerPromises); documents.push( ...results.sort( (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber ) ); } return documents; }; await Promise.race([timeoutPromise, processPages()]); } catch (e) { this.log(`Error: ${e.message}`, e.stack); } finally { global.Image = undefined; await Promise.all(workerPool.map((worker) => worker.terminate())); } this.log(`Completed OCR of ${documentTitle}!`, { documentsParsed: documents.length, totalPages: totalPages, executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`, }); return documents; } /** * Loads an image file and returns the OCRed text. * @param {string} filePath - The path to the image file. * @param {Object} options - The options for the OCR. * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds. * @returns {Promise} The OCRed text. */ async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) { let content = ""; let worker = null; if ( !filePath || !fs.existsSync(filePath) || !fs.statSync(filePath).isFile() ) { this.log(`File ${filePath} does not exist. Skipping OCR.`); return null; } const documentTitle = path.basename(filePath); try { this.log(`Starting OCR of ${documentTitle}`); const startTime = Date.now(); const { createWorker, OEM } = require("tesseract.js"); worker = await createWorker("eng", OEM.LSTM_ONLY, { cachePath: this.cacheDir, }); // Race the timeout with the OCR const timeoutPromise = new Promise((_, reject) => { setTimeout(() => { reject( new Error( `OCR job took too long to complete (${ maxExecutionTime / 1000 } seconds)` ) ); }, maxExecutionTime); }); const processImage = async () => { const { data } = await worker.recognize(filePath, {}, "text"); content = data.text; }; await Promise.race([timeoutPromise, processImage()]); this.log(`Completed OCR of ${documentTitle}!`, { executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`, }); return content; } catch (e) { this.log(`Error: ${e.message}`); return null; } finally { if (!worker) return; await worker.terminate(); } } } /** * Converts a PDF page to a buffer using Sharp. * @param {Object} options - The options for the Sharp PDF page object. * @param {Object} options.page - The PDFJS page proxy object. * @returns {Promise} The buffer of the page. */ class PDFSharp { constructor({ validOps = [] } = {}) { this.sharp = null; this.validOps = validOps; } log(text, ...args) { console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args); } async init() { this.sharp = (await import("sharp")).default; } /** * Converts a PDF page to a buffer. * @param {Object} options - The options for the Sharp PDF page object. * @param {Object} options.page - The PDFJS page proxy object. * @returns {Promise} The buffer of the page. */ async pageToBuffer({ page }) { if (!this.sharp) await this.init(); try { this.log(`Converting page ${page.pageNumber} to image...`); const ops = await page.getOperatorList(); const pageImages = ops.fnArray.length; for (let i = 0; i < pageImages; i++) { try { if (!this.validOps.includes(ops.fnArray[i])) continue; const name = ops.argsArray[i][0]; const img = await page.objs.get(name); const { width, height } = img; const size = img.data.length; const channels = size / width / height; const targetDPI = 70; const targetWidth = Math.floor(width * (targetDPI / 72)); const targetHeight = Math.floor(height * (targetDPI / 72)); const image = this.sharp(img.data, { raw: { width, height, channels }, density: targetDPI, }) .resize({ width: targetWidth, height: targetHeight, fit: "fill", }) .withMetadata({ density: targetDPI, resolution: targetDPI, }) .png(); // For debugging purposes // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`)); return await image.toBuffer(); } catch (error) { this.log(`Iteration error: ${error.message}`, error.stack); continue; } } this.log(`No valid images found on page ${page.pageNumber}`); return null; } catch (error) { this.log(`Error: ${error.message}`, error.stack); return null; } } } module.exports = OCRLoader;