You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

97 lines
2.3 KiB

11 months ago
  1. const fs = require("fs").promises;
  /**
   * Reads a PDF file from disk and extracts its text through pdf.js.
   *
   * Each page that yields text content becomes a document of the shape
   * `{ pageContent, metadata }`. Depending on `splitPages`, `load()` returns
   * either one document per page or a single document for the whole file.
   */
  class PDFLoader {
    /**
     * @param {string} filePath - Path of the PDF file; passed to `fs.readFile`
     *   and reported as `metadata.source`.
     * @param {{ splitPages?: boolean }} [options]
     * @param {boolean} [options.splitPages=true] - When true, `load()` returns
     *   one document per page; when false, a single merged document.
     */
    constructor(filePath, { splitPages = true } = {}) {
      this.filePath = filePath;
      this.splitPages = splitPages;
    }

    /**
     * Loads the file and extracts the text of every page.
     *
     * @returns {Promise<Array<{ pageContent: string, metadata: object }>>}
     *   One document per non-empty page (with `metadata.loc.pageNumber`) when
     *   `splitPages` is true; otherwise a single document whose page texts are
     *   joined by a blank line, or an empty array when no page produced content.
     * @throws {Error} When the file cannot be read, pdf.js cannot be loaded, or
     *   pdf.js fails to open the document or a page.
     */
    async load() {
      const buffer = await fs.readFile(this.filePath);
      const { getDocument, version } = await this.getPdfJS();
      const pdf = await getDocument({
        data: new Uint8Array(buffer),
        useWorkerFetch: false,
        isEvalSupported: false,
        useSystemFonts: true,
      }).promise;

      // Metadata is best-effort: a failure here must not abort text extraction.
      const meta = await pdf.getMetadata().catch(() => null);

      // Builds a fresh metadata object per document so documents never share
      // (and accidentally co-mutate) the same nested objects.
      const buildMetadata = (extra = {}) => ({
        source: this.filePath,
        pdf: {
          version,
          info: meta?.info,
          metadata: meta?.metadata,
          totalPages: pdf.numPages,
        },
        ...extra,
      });

      const documents = [];
      // pdf.js pages are addressed starting at 1.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
        const page = await pdf.getPage(pageNumber);
        const content = await page.getTextContent();
        if (content.items.length === 0) {
          continue;
        }

        let lastY;
        const textItems = [];
        for (const item of content.items) {
          // Items without a `str` property carry no text and are skipped.
          if (!("str" in item)) {
            continue;
          }
          // The sixth transform entry is treated as the item's Y position; a
          // change between consecutive items starts a new line. Compare against
          // `undefined` explicitly: a Y of 0 is a valid position and must not
          // be mistaken for "no previous item".
          const y = item.transform[5];
          const startsNewLine = lastY !== undefined && lastY !== y;
          textItems.push(startsNewLine ? `\n${item.str}` : item.str);
          lastY = y;
        }

        documents.push({
          pageContent: textItems.join("").trim(),
          metadata: buildMetadata({ loc: { pageNumber } }),
        });
      }

      if (this.splitPages) {
        return documents;
      }
      if (documents.length === 0) {
        return [];
      }
      return [
        {
          pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
          metadata: buildMetadata(),
        },
      ];
    }

    /**
     * Lazily imports the pdf.js build that ships inside `pdf-parse`.
     *
     * @returns {Promise<{ getDocument: Function, version: string }>}
     * @throws {Error} When the module cannot be imported; the underlying error
     *   is logged and attached as `cause`.
     */
    async getPdfJS() {
      try {
        const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
        // A CommonJS module loaded through import() always exposes its exports
        // on `default`; named exports are only present when Node can detect
        // them statically, so fall back to `default` if they are missing.
        const lib = pdfjs.getDocument ? pdfjs : pdfjs.default ?? pdfjs;
        return { getDocument: lib.getDocument, version: lib.version };
      } catch (e) {
        console.error(e);
        throw new Error(
          "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.",
          { cause: e }
        );
      }
    }
  }
  84. module.exports = PDFLoader;