You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

113 lines
3.1 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const xlsx = require("node-xlsx").default;
  3. const path = require("path");
  4. const fs = require("fs");
  5. const {
  6. createdDate,
  7. trashFile,
  8. writeToServerDocuments,
  9. } = require("../../utils/files");
  10. const { tokenizeString } = require("../../utils/tokenizer");
  11. const { default: slugify } = require("slugify");
  /**
   * Renders a single cell value as one CSV field.
   *
   * `null` and `undefined` become an empty field. Every other value is
   * stringified; if the text contains a comma, a double quote, or a line break
   * it is wrapped in double quotes and embedded quotes are doubled, so the
   * field cannot be confused with a delimiter or a row boundary.
   *
   * @param {*} cell - Raw cell value.
   * @returns {string} The CSV-safe field text.
   */
  function escapeCsvCell(cell) {
    if (cell === null || cell === undefined) return "";
    const text = typeof cell === "string" ? cell : String(cell);
    // Quoting only when needed keeps plain values byte-identical to the input.
    if (!/[",\r\n]/.test(text)) return text;
    return `"${text.replaceAll('"', '""')}"`;
  }

  /**
   * Serializes a two-dimensional array of cell values into CSV text.
   *
   * Fields are joined with "," and rows with "\n". No trailing newline is
   * appended, so an empty `data` array yields an empty string.
   *
   * @param {Array<Array<*>>} data - Rows of cell values.
   * @returns {string} The CSV representation of `data`.
   */
  function convertToCSV(data) {
    return data
      .map((row) => row.map((cell) => escapeCsvCell(cell)).join(","))
      .join("\n");
  }
  /**
   * Converts each non-empty sheet of an xlsx workbook into a CSV-text document
   * and hands it to `writeToServerDocuments`, collecting the returned documents.
   *
   * A sheet that fails to convert is logged and skipped; the remaining sheets
   * are still processed. The uploaded file at `fullFilePath` is always passed
   * to `trashFile` once parsing has been attempted, whether or not it succeeded.
   *
   * @param {Object} params
   * @param {string} [params.fullFilePath=""] - Path of the workbook to parse.
   * @param {string} [params.filename=""] - Name used for the output folder
   *   slug, the document titles and the log messages.
   * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
   *   `success` is false (with a `reason`) when the workbook cannot be parsed
   *   or when no sheet produced a document.
   */
  async function asXlsx({ fullFilePath = "", filename = "" }) {
    const documents = [];
    // A short random suffix is appended to the slugged file name for the
    // per-upload output folder.
    const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
      lower: true,
      trim: true,
    });
    const outFolderPath =
      process.env.NODE_ENV === "development"
        ? path.resolve(
            __dirname,
            `../../../server/storage/documents/${folderName}`
          )
        : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

    try {
      const workSheetsFromFile = xlsx.parse(fullFilePath);
      if (!fs.existsSync(outFolderPath))
        fs.mkdirSync(outFolderPath, { recursive: true });

      for (const sheet of workSheetsFromFile) {
        // Bound outside the `try` so the `catch` below can reference it: a
        // `const` declared inside a `try` block is not in scope in its `catch`,
        // and reading it there would throw and abort the whole workbook.
        const name = sheet?.name;
        try {
          const content = convertToCSV(sheet.data);

          if (!content?.length) {
            console.warn(`Sheet "${name}" is empty. Skipping.`);
            continue;
          }

          console.log(`-- Processing sheet: ${name} --`);
          const sheetData = {
            id: v4(),
            url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
            title: `${filename} - Sheet:${name}`,
            docAuthor: "Unknown",
            description: `Spreadsheet data from sheet: ${name}`,
            docSource: "an xlsx file uploaded by the user.",
            chunkSource: "",
            published: createdDate(fullFilePath),
            wordCount: content.split(/\s+/).length,
            pageContent: content,
            token_count_estimate: tokenizeString(content),
          };

          // NOTE(review): sheets whose names slugify to the same value share a
          // target name here — confirm how writeToServerDocuments handles that.
          const document = writeToServerDocuments(
            sheetData,
            `sheet-${slugify(name)}`,
            outFolderPath
          );
          documents.push(document);
          console.log(
            `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
          );
        } catch (err) {
          // Best effort: one bad sheet must not discard the others.
          console.error(`Error processing sheet "${name}":`, err);
        }
      }
    } catch (err) {
      console.error("Could not process xlsx file!", err);
      return {
        success: false,
        reason: `Error processing ${filename}: ${err.message}`,
        documents: [],
      };
    } finally {
      // Runs on success and on failure, including the early return above.
      trashFile(fullFilePath);
    }

    if (documents.length === 0) {
      console.error(`No valid sheets found in ${filename}.`);
      return {
        success: false,
        reason: `No valid sheets found in ${filename}.`,
        documents: [],
      };
    }

    console.log(
      `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
    );
    return { success: true, reason: null, documents };
  }
  102. module.exports = asXlsx;