You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

307 lines
10 KiB

11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { v5: uuidv5 } = require("uuid");
  4. const { Document } = require("../../models/documents");
  5. const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
  // Absolute locations of the parsed source documents and of the cached
  // embedding results. Development runs use the repository-local storage
  // folder; every other environment resolves them beneath STORAGE_DIR.
  const isDevelopmentEnv = process.env.NODE_ENV === "development";
  const documentsPath = isDevelopmentEnv
    ? path.resolve(__dirname, "../../storage/documents")
    : path.resolve(process.env.STORAGE_DIR, "documents");
  const vectorCachePath = isDevelopmentEnv
    ? path.resolve(__dirname, "../../storage/vector-cache")
    : path.resolve(process.env.STORAGE_DIR, "vector-cache");
  /**
   * Reads and parses a stored document from the documents folder.
   * The path should point at a file inside a subfolder of documents,
   * eg: youtube-subject/video-123.json
   * @param {string|null} filePath - location of the document relative to the documents folder
   * @returns {Promise<any|null>} the parsed JSON content, or null when the file
   * does not exist or resolves to a location outside of the documents folder
   * @throws {Error} when no path is provided
   */
  async function fileData(filePath = null) {
    if (!filePath) throw new Error("No docPath provided in request");

    const target = path.resolve(documentsPath, normalizePath(filePath));
    const isReadable = fs.existsSync(target) && isWithin(documentsPath, target);
    if (!isReadable) return null;

    return JSON.parse(fs.readFileSync(target, "utf8"));
  }
  /**
   * Builds the picker entry for a single folder inside the documents folder.
   * Only `.json` files are listed; their `pageContent` is dropped so the entry
   * carries metadata only.
   * @param {string} folderName - name of the folder inside the documents folder
   * @param {string} folderPath - absolute path of that folder
   * @param {boolean} liveSyncAvailable - result of DocumentSyncQueue.enabled()
   * @returns {Promise<{name: string, type: string, items: object[]}>}
   */
  async function listFolderDocuments(folderName, folderPath, liveSyncAvailable) {
    const folder = { name: folderName, type: "folder", items: [] };
    // Maps the `<folder>/<file>` docpath used for database lookups back to the
    // bare filename that identifies the item in this listing.
    const filenames = {};

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const rawData = fs.readFileSync(path.join(folderPath, subfile), "utf8");
      const cachefilename = `${folderName}/${subfile}`;
      const { pageContent, ...metadata } = JSON.parse(rawData);

      folder.items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
      });
      filenames[cachefilename] = subfile;
    }

    // Grab the pinned workspaces and watched documents for the whole folder at
    // once so the database is not re-queried for each file. The two lookups
    // are independent of each other, so they run concurrently.
    const [pinnedWorkspacesByDocument, watchedDocumentsFilenames] =
      await Promise.all([
        getPinnedWorkspacesByDocument(filenames),
        getWatchedDocumentFilenames(filenames),
      ]);

    for (const item of folder.items) {
      item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
      item.watched = Object.prototype.hasOwnProperty.call(
        watchedDocumentsFilenames,
        item.name
      );
    }
    return folder;
  }

  /**
   * Lists every folder of the documents folder together with the metadata of
   * the `.json` documents it contains. Files placed directly in the documents
   * folder are not listed.
   * The documents folder (including missing parent folders) is created when it
   * does not exist yet.
   * @returns {Promise<{name: string, type: string, items: object[]}>} the
   * directory tree, with the `custom-documents` folder always listed first
   */
  async function viewLocalFiles() {
    // `recursive` makes this a no-op for an existing folder and also creates
    // missing parents instead of throwing ENOENT on a fresh storage location.
    fs.mkdirSync(documentsPath, { recursive: true });
    const liveSyncAvailable = await DocumentSyncQueue.enabled();
    const directory = { name: "documents", type: "folder", items: [] };

    for (const file of fs.readdirSync(documentsPath)) {
      if (path.extname(file) === ".md") continue;
      const folderPath = path.resolve(documentsPath, file);
      if (!fs.lstatSync(folderPath).isDirectory()) continue;

      directory.items.push(
        await listFolderDocuments(file, folderPath, liveSyncAvailable)
      );
    }

    // Make sure custom-documents is always the first folder in picker
    const isCustomDocuments = (folder) => folder.name === "custom-documents";
    directory.items = [
      ...directory.items.filter(isCustomDocuments),
      ...directory.items.filter((folder) => !isCustomDocuments(folder)),
    ];
    return directory;
  }
  /**
   * Looks in the vector-cache folder for previously embedded chunks of a
   * document so it can be pushed to the vector db without re-embedding.
   * The cache file name is derived from the filename via a v5 UUID digest.
   * @param {string|null} filename - the filename to check for cached vector information
   * @param {boolean} checkOnly - if true, only report whether a cache file exists
   * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} - a boolean
   * when `checkOnly` is set, otherwise an object with the existence flag and
   * the cached chunks (empty when nothing is cached)
   */
  async function cachedVectorInformation(filename = null, checkOnly = false) {
    const cacheMiss = { exists: false, chunks: [] };
    if (!filename) return checkOnly ? false : cacheMiss;

    const cacheFile = path.resolve(
      vectorCachePath,
      `${uuidv5(filename, uuidv5.URL)}.json`
    );
    const exists = fs.existsSync(cacheFile);
    if (checkOnly) return exists;
    if (!exists) return cacheMiss;

    console.log(
      `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
    );
    const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
    return { exists: true, chunks };
  }
  /**
   * Writes the vectorized chunks of a document to the vector-cache folder so
   * the same document does not need to be embedded again.
   * The cache folder (including missing parent folders) is created on demand.
   * @param {any[]} vectorData - pre-chunked vectorized data for the file; it is
   * serialized as JSON as-is
   * @param {string|null} filename - the full docpath of the document; the cache
   * file name is a v5 UUID digest of it, so lookups must use the same value
   * @returns {Promise<void>} resolves without writing anything when no filename is given
   */
  async function storeVectorResult(vectorData = [], filename = null) {
    if (!filename) return;
    console.log(
      `Caching vectorized results of ${filename} to prevent duplicated embedding.`
    );

    // `recursive` is a no-op for an existing folder and avoids ENOENT when the
    // parent storage folder has not been created yet.
    fs.mkdirSync(vectorCachePath, { recursive: true });

    const digest = uuidv5(filename, uuidv5.URL);
    const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
    fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  }
  /**
   * Removes a single source document from the documents/ folder.
   * Nothing happens when no filename is given, or when the resolved target is
   * missing, lies outside of the documents folder, or is not a regular file.
   * @param {string|null} filename - location of the document relative to the documents folder
   * @returns {Promise<void>}
   */
  async function purgeSourceDocument(filename = null) {
    if (!filename) return;

    const target = path.resolve(documentsPath, normalizePath(filename));
    const isPurgeable =
      fs.existsSync(target) &&
      isWithin(documentsPath, target) &&
      fs.lstatSync(target).isFile();
    if (!isPurgeable) return;

    console.log(`Purging source document of ${filename}.`);
    fs.rmSync(target);
  }
  /**
   * Removes the cached vector file of a document from the vector-cache/ folder.
   * The cache file is located through the v5 UUID digest of the filename.
   * Nothing happens when no filename is given or when no regular file exists
   * at the expected cache location.
   * @param {string|null} filename - the docpath the cache entry was stored under
   * @returns {Promise<void>}
   */
  async function purgeVectorCache(filename = null) {
    if (!filename) return;

    const cacheFile = path.resolve(
      vectorCachePath,
      `${uuidv5(filename, uuidv5.URL)}.json`
    );
    const isPurgeable =
      fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
    if (!isPurgeable) return;

    console.log(`Purging vector-cache of ${filename}.`);
    fs.rmSync(cacheFile);
  }
  /**
   * Searches for a document by its unique name across the entire `documents`
   * folder by visiting every folder and checking if the expected file exists.
   * The first match wins.
   * @param {string|null} documentName - the file name of the document to find
   * @returns {Promise<object|null>} the document metadata (without
   * `pageContent`) plus `name`, `type` and `cached`, or null when no folder
   * contains the document
   */
  async function findDocumentInDocuments(documentName = null) {
    if (!documentName) return null;

    for (const folder of fs.readdirSync(documentsPath)) {
      const folderStats = fs.lstatSync(path.join(documentsPath, folder));
      if (!folderStats.isDirectory()) continue;

      const targetFilename = normalizePath(documentName);
      const candidate = path.join(documentsPath, folder, targetFilename);
      const isMatch =
        fs.existsSync(candidate) && isWithin(documentsPath, candidate);
      if (!isMatch) continue;

      const rawDocument = fs.readFileSync(candidate, "utf8");
      const { pageContent, ...metadata } = JSON.parse(rawDocument);
      return {
        name: targetFilename,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(
          `${folder}/${targetFilename}`,
          true
        ),
      };
    }

    return null;
  }
  /**
   * Checks if a given path is strictly within another path.
   * @param {string} outer - The outer path (should be resolved).
   * @param {string} inner - The inner path (should be resolved).
   * @returns {boolean} - Returns true if the inner path is within the outer
   * path, false otherwise. The outer path itself is not considered to be
   * within itself.
   */
  function isWithin(outer, inner) {
    if (outer === inner) return false;

    const rel = path.relative(outer, inner);
    // An empty relative path means both resolve to the same location (e.g. a
    // trailing separator on `inner`). An absolute one means `inner` cannot be
    // expressed relative to `outer` at all (e.g. another drive on Windows).
    if (rel === "" || path.isAbsolute(rel)) return false;

    // Use the platform separator so `..\` is recognised on Windows, while a
    // name that merely begins with two dots (`..hidden`) is still accepted.
    return rel !== ".." && !rel.startsWith(`..${path.sep}`);
  }
  /**
   * Normalizes a user supplied path and strips any leading parent-directory
   * (`../` or `..\`) segments from it.
   * @param {string} filepath - the path to clean up
   * @returns {string} the normalized path
   * @throws {Error} when the cleaned path is exactly `..`, `.` or `/`
   */
  function normalizePath(filepath = "") {
    const leadingTraversal = /^(\.\.(\/|\\|$))+/;
    const normalized = path.normalize(filepath.trim());
    const result = normalized.replace(leadingTraversal, "").trim();

    switch (result) {
      case "..":
      case ".":
      case "/":
        throw new Error("Invalid path.");
      default:
        return result;
    }
  }
  /**
   * Reports whether the vector-cache folder holds at least one `.json` file.
   * Useful when the user is changing embedders, as that breaks the previous
   * cache.
   * @returns {boolean} false when there are no cached files or the folder
   * cannot be read
   */
  function hasVectorCachedFiles() {
    try {
      const entries = fs.readdirSync(vectorCachePath);
      return entries.some((name) => name.endsWith(".json"));
    } catch {
      // Best effort: an unreadable or missing folder counts as "no cache".
      return false;
    }
  }
  /**
   * Collects, per document, the workspaces that have the document pinned.
   * @param {Record<string, string>} filenames - record whose keys are the
   * docpaths to look up and whose values are the filenames used as keys of the
   * result
   * @returns {Promise<Record<string, string[]>>} - a record of filenames and
   * their corresponding (de-duplicated) workspaceIds; documents without a
   * pinned workspace are absent from the record
   */
  async function getPinnedWorkspacesByDocument(filenames = {}) {
    const docpaths = Object.keys(filenames);
    // Nothing to look up, so skip the database round trip entirely.
    if (docpaths.length === 0) return {};

    const pinnedDocuments = await Document.where(
      {
        docpath: { in: docpaths },
        pinned: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const result = {};
    for (const { workspaceId, docpath } of pinnedDocuments) {
      const filename = filenames[docpath];
      if (!result[filename]) result[filename] = [];
      if (!result[filename].includes(workspaceId))
        result[filename].push(workspaceId);
    }
    return result;
  }
  /**
   * Get a record of the filenames that are watched in a workspace. It is used
   * to determine if a document should be flagged as watched.
   * @param {Record<string, string>} filenames - record whose keys are the
   * docpaths to look up and whose values are the filenames used as keys of the
   * result
   * @returns {Promise<Record<string, any>>} - a record keyed by the filename of
   * every watched document; the value is the workspaceId of the last matching
   * row, so callers should rely on key presence only
   */
  async function getWatchedDocumentFilenames(filenames = {}) {
    const docpaths = Object.keys(filenames);
    // Nothing to look up, so skip the database round trip entirely.
    if (docpaths.length === 0) return {};

    const watchedDocuments = await Document.where(
      {
        docpath: { in: docpaths },
        watched: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const result = {};
    for (const { workspaceId, docpath } of watchedDocuments) {
      result[filenames[docpath]] = workspaceId;
    }
    return result;
  }
  /**
   * Purges the entire vector-cache folder and recreates it empty.
   * Missing parent folders are created as well, so this also works before the
   * storage location has been initialised.
   * @returns {void}
   */
  function purgeEntireVectorCache() {
    // `force` keeps this from throwing when the folder does not exist yet.
    fs.rmSync(vectorCachePath, { recursive: true, force: true });
    // `recursive` avoids ENOENT when the parent folder is missing.
    fs.mkdirSync(vectorCachePath, { recursive: true });
  }
  262. module.exports = {
  263. findDocumentInDocuments,
  264. cachedVectorInformation,
  265. viewLocalFiles,
  266. purgeSourceDocument,
  267. purgeVectorCache,
  268. storeVectorResult,
  269. fileData,
  270. normalizePath,
  271. isWithin,
  272. documentsPath,
  273. hasVectorCachedFiles,
  274. purgeEntireVectorCache,
  275. };