You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

192 lines
5.9 KiB

11 months ago
11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { MimeDetector } = require("./mime");
  4. /**
  5. * Checks if a file is text by checking the mime type and then falling back to buffer inspection.
  6. * This way we can capture all the cases where the mime type is not known but still parseable as text
  7. * without having to constantly add new mime type overrides.
  8. * @param {string} filepath - The path to the file.
  9. * @returns {boolean} - Returns true if the file is text, false otherwise.
  10. */
  11. function isTextType(filepath) {
  12. if (!fs.existsSync(filepath)) return false;
  13. const result = isKnownTextMime(filepath);
  14. if (result.valid) return true; // Known text type - return true.
  15. if (result.reason !== "generic") return false; // If any other reason than generic - return false.
  16. return parseableAsText(filepath); // Fallback to parsing as text via buffer inspection.
  17. }
  18. /**
  19. * Checks if a file is known to be text by checking the mime type.
  20. * @param {string} filepath - The path to the file.
  21. * @returns {boolean} - Returns true if the file is known to be text, false otherwise.
  22. */
  23. function isKnownTextMime(filepath) {
  24. try {
  25. const mimeLib = new MimeDetector();
  26. const mime = mimeLib.getType(filepath);
  27. if (mimeLib.badMimes.includes(mime))
  28. return { valid: false, reason: "bad_mime" };
  29. const type = mime.split("/")[0];
  30. if (mimeLib.nonTextTypes.includes(type))
  31. return { valid: false, reason: "non_text_mime" };
  32. return { valid: true, reason: "valid_mime" };
  33. } catch (e) {
  34. return { valid: false, reason: "generic" };
  35. }
  36. }
  37. /**
  38. * Checks if a file is parseable as text by forcing it to be read as text in utf8 encoding.
  39. * If the file looks too much like a binary file, it will return false.
  40. * @param {string} filepath - The path to the file.
  41. * @returns {boolean} - Returns true if the file is parseable as text, false otherwise.
  42. */
  43. function parseableAsText(filepath) {
  44. try {
  45. const fd = fs.openSync(filepath, "r");
  46. const buffer = Buffer.alloc(1024); // Read first 1KB of the file synchronously
  47. const bytesRead = fs.readSync(fd, buffer, 0, 1024, 0);
  48. fs.closeSync(fd);
  49. const content = buffer.subarray(0, bytesRead).toString("utf8");
  50. const nullCount = (content.match(/\0/g) || []).length;
  51. const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
  52. .length;
  53. const threshold = bytesRead * 0.1;
  54. return nullCount + controlCount < threshold;
  55. } catch {
  56. return false;
  57. }
  58. }
  59. function trashFile(filepath) {
  60. if (!fs.existsSync(filepath)) return;
  61. try {
  62. const isDir = fs.lstatSync(filepath).isDirectory();
  63. if (isDir) return;
  64. } catch {
  65. return;
  66. }
  67. console.log("=====:::::", filepath);
  68. // fs.rmSync(filepath);
  69. return;
  70. }
  71. function createdDate(filepath) {
  72. try {
  73. const { birthtimeMs, birthtime } = fs.statSync(filepath);
  74. if (birthtimeMs === 0) throw new Error("Invalid stat for file!");
  75. return birthtime.toLocaleString();
  76. } catch {
  77. return "unknown";
  78. }
  79. }
  80. function writeToServerDocuments(
  81. data = {},
  82. filename,
  83. destinationOverride = null
  84. ) {
  85. const destination = destinationOverride
  86. ? path.resolve(destinationOverride)
  87. : path.resolve(
  88. __dirname,
  89. "../../../server/storage/documents/custom-documents"
  90. );
  91. if (!fs.existsSync(destination))
  92. fs.mkdirSync(destination, { recursive: true });
  93. const destinationFilePath = path.resolve(destination, filename) + ".json";
  94. fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
  95. encoding: "utf-8",
  96. });
  97. return {
  98. ...data,
  99. // relative location string that can be passed into the /update-embeddings api
  100. // that will work since we know the location exists and since we only allow
  101. // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube.
  102. location: destinationFilePath.split("/").slice(-2).join("/"),
  103. };
  104. }
  105. // When required we can wipe the entire collector hotdir and tmp storage in case
  106. // there were some large file failures that we unable to be removed a reboot will
  107. // force remove them.
  108. async function wipeCollectorStorage() {
  109. const cleanHotDir = new Promise((resolve) => {
  110. const directory = path.resolve(__dirname, "../../hotdir");
  111. fs.readdir(directory, (err, files) => {
  112. if (err) resolve();
  113. for (const file of files) {
  114. if (file === "__HOTDIR__.md") continue;
  115. try {
  116. fs.rmSync(path.join(directory, file));
  117. } catch {}
  118. }
  119. resolve();
  120. });
  121. });
  122. const cleanTmpDir = new Promise((resolve) => {
  123. const directory = path.resolve(__dirname, "../../storage/tmp");
  124. fs.readdir(directory, (err, files) => {
  125. if (err) resolve();
  126. for (const file of files) {
  127. if (file === ".placeholder") continue;
  128. try {
  129. fs.rmSync(path.join(directory, file));
  130. } catch {}
  131. }
  132. resolve();
  133. });
  134. });
  135. await Promise.all([cleanHotDir, cleanTmpDir]);
  136. console.log(`Collector hot directory and tmp storage wiped!`);
  137. return;
  138. }
  139. /**
  140. * Checks if a given path is within another path.
  141. * @param {string} outer - The outer path (should be resolved).
  142. * @param {string} inner - The inner path (should be resolved).
  143. * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
  144. */
  145. function isWithin(outer, inner) {
  146. if (outer === inner) return false;
  147. const rel = path.relative(outer, inner);
  148. return !rel.startsWith("../") && rel !== "..";
  149. }
  150. function normalizePath(filepath = "") {
  151. const result = path
  152. .normalize(filepath.trim())
  153. .replace(/^(\.\.(\/|\\|$))+/, "")
  154. .trim();
  155. if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  156. return result;
  157. }
  158. function sanitizeFileName(fileName) {
  159. if (!fileName) return fileName;
  160. return fileName.replace(/[<>:"\/\\|?*]/g, "");
  161. }
  162. module.exports = {
  163. trashFile,
  164. isTextType,
  165. createdDate,
  166. writeToServerDocuments,
  167. wipeCollectorStorage,
  168. normalizePath,
  169. isWithin,
  170. sanitizeFileName,
  171. };