You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

48 lines
1.4 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const { tokenizeString } = require("../../utils/tokenizer");
  3. const {
  4. createdDate,
  5. trashFile,
  6. writeToServerDocuments,
  7. } = require("../../utils/files");
  8. const OCRLoader = require("../../utils/OCRLoader");
  9. const { default: slugify } = require("slugify");
  10. async function asImage({ fullFilePath = "", filename = "" }) {
  11. let content = await new OCRLoader().ocrImage(fullFilePath);
  12. if (!content?.length) {
  13. console.error(`Resulting text content was empty for ${filename}.`);
  14. trashFile(fullFilePath);
  15. return {
  16. success: false,
  17. reason: `No text content found in ${filename}.`,
  18. documents: [],
  19. };
  20. }
  21. console.log(`-- Working ${filename} --`);
  22. const data = {
  23. id: v4(),
  24. url: "file://" + fullFilePath,
  25. title: filename,
  26. docAuthor: "Unknown", // TODO: Find a better author
  27. description: "Unknown", // TODO: Find a better description
  28. docSource: "a text file uploaded by the user.",
  29. chunkSource: "",
  30. published: createdDate(fullFilePath),
  31. wordCount: content.split(" ").length,
  32. pageContent: content,
  33. token_count_estimate: tokenizeString(content),
  34. };
  35. const document = writeToServerDocuments(
  36. data,
  37. `${slugify(filename)}-${data.id}`
  38. );
  39. trashFile(fullFilePath);
  40. console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  41. return { success: true, reason: null, documents: [document] };
  42. }
  // CommonJS export: the asImage converter is this module's entire export value.
  43. module.exports = asImage;