You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

487 lines
16 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { v5: uuidv5 } = require("uuid");
  4. const { Document } = require("../../models/documents");
  5. const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
  6. const { userFromSession } = require("../http");
  7. const { DeptDocument } = require("../../models/deptDocument");
  // Storage root: development runs use the repository-local `storage` folder,
  // every other environment uses the folder named by STORAGE_DIR.
  const storageRoot =
    process.env.NODE_ENV === "development"
      ? path.resolve(__dirname, "../../storage")
      : process.env.STORAGE_DIR;

  // Folder holding the parsed document JSON files.
  const documentsPath = path.resolve(storageRoot, "documents");

  // Folder holding cached embedding results, one JSON file per document.
  const vectorCachePath = path.resolve(storageRoot, "vector-cache");
  /**
   * Reads and parses a stored document JSON file.
   * The path is interpreted relative to the documents folder,
   * eg: youtube-subject/video-123.json
   * @param {string|null} filePath - document path relative to the documents folder
   * @returns {Promise<any|null>} the parsed JSON contents, or null when the file
   * does not exist or resolves to a location outside of the documents folder
   * @throws {Error} when no path is provided; errors from normalizePath and
   * JSON.parse are not caught here
   */
  async function fileData(filePath = null) {
    if (!filePath) throw new Error("No docPath provided in request");

    const absolutePath = path.resolve(documentsPath, normalizePath(filePath));
    const isReadable =
      fs.existsSync(absolutePath) && isWithin(documentsPath, absolutePath);
    if (!isReadable) return null;

    return JSON.parse(fs.readFileSync(absolutePath, "utf8"));
  }
  26. // async function viewLocalFiles() {
  27. // if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  28. // const liveSyncAvailable = await DocumentSyncQueue.enabled();
  29. // const directory = {
  30. // name: "documents",
  31. // type: "folder",
  32. // items: [],
  33. // };
  34. // for (const file of fs.readdirSync(documentsPath)) {
  35. // // console.log("file:", file);
  36. // if (path.extname(file) === ".md") continue;
  37. // const folderPath = path.resolve(documentsPath, file);
  38. // const isFolder = fs.lstatSync(folderPath).isDirectory();
  39. // if (isFolder) {
  40. // const subdocs = {
  41. // name: file,
  42. // type: "folder",
  43. // items: [],
  44. // };
  45. // const subfiles = fs.readdirSync(folderPath);
  46. // const filenames = {};
  47. // for (const subfile of subfiles) {
  48. // if (path.extname(subfile) !== ".json") continue;
  49. // const filePath = path.join(folderPath, subfile);
  50. // const rawData = fs.readFileSync(filePath, "utf8");
  51. // // console.log("rawData:", rawData);
  52. // const cachefilename = `${file}/${subfile}`;
  53. // const { pageContent, ...metadata } = JSON.parse(rawData);
  54. // subdocs.items.push({
  55. // name: subfile,
  56. // type: "file",
  57. // ...metadata,
  58. // cached: await cachedVectorInformation(cachefilename, true),
  59. // canWatch: liveSyncAvailable
  60. // ? DocumentSyncQueue.canWatch(metadata)
  61. // : false,
  62. // // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
  63. // // watched: false, // boolean to indicate if this document is watched in ANY workspace
  64. // });
  65. // filenames[cachefilename] = subfile;
  66. // }
  67. //
  68. // // Grab the pinned workspaces and watched documents for this folder's documents
  69. // // at the time of the query so we don't have to re-query the database for each file
  70. // const pinnedWorkspacesByDocument =
  71. // await getPinnedWorkspacesByDocument(filenames);
  72. // const watchedDocumentsFilenames =
  73. // await getWatchedDocumentFilenames(filenames);
  74. // for (const item of subdocs.items) {
  75. // item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
  76. // item.watched =
  77. // watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
  78. // }
  79. //
  80. // directory.items.push(subdocs);
  81. // }
  82. // }
  83. //
  84. // // Make sure custom-documents is always the first folder in picker
  85. // directory.items = [
  86. // directory.items.find((folder) => folder.name === "custom-documents"),
  87. // ...directory.items.filter((folder) => folder.name !== "custom-documents"),
  88. // ].filter((i) => !!i);
  89. //
  90. // return directory;
  91. // }
  92. // async function viewLocalFiles(deptId) {
  93. // const directory = {
  94. // name: "documents",
  95. // type: "folder",
  96. // items: [],
  97. // };
  98. // if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  99. // const liveSyncAvailable = await DocumentSyncQueue.enabled();
  100. //
  101. // // 查询 deptDocuments
  102. // const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
  103. // if (!deptDocuments || deptDocuments.length === 0) {
  104. // return directory;
  105. // }
  106. //
  107. // // 初始化分类对象
  108. // const publicd = {
  109. // name: "公开",
  110. // type: "folder",
  111. // items: [],
  112. // };
  113. // const privated = {
  114. // name: "私有",
  115. // type: "folder",
  116. // items: [],
  117. // };
  118. // const temp = {
  119. // name: "临时",
  120. // type: "folder",
  121. // items: [],
  122. // };
  123. //
  124. // // 遍历 deptDocuments
  125. // for (const doc of deptDocuments) {
  126. // try {
  127. // const filePath = doc.parsedFilePath; // 获取文件路径
  128. // if (!fs.existsSync(filePath)) continue; // 如果文件不存在,跳过
  129. //
  130. // // 读取文件内容
  131. // const rawData = fs.readFileSync(filePath, 'utf8');
  132. // const { pageContent, ...metadata } = JSON.parse(rawData);
  133. //
  134. // // 构造文件信息对象(保持与原方法一致的字段)
  135. // const fileInfo = {
  136. // name: path.basename(filePath), // 文件名
  137. // type: "file",
  138. // ...metadata,
  139. // cached: await cachedVectorInformation(filePath, true),
  140. // canWatch: liveSyncAvailable
  141. // ? DocumentSyncQueue.canWatch(metadata)
  142. // : false,
  143. // pinnedWorkspaces: [], // 初始化为空数组
  144. // watched: false, // 初始化为 false
  145. // };
  146. //
  147. // // 根据 isPublic 属性分类
  148. // if (doc.isPublic === 0) {
  149. // publicd.items.push(fileInfo);
  150. // } else if (doc.isPublic === 1) {
  151. // privated.items.push(fileInfo);
  152. // } else {
  153. // temp.items.push(fileInfo);
  154. // }
  155. // } catch (error) {
  156. // console.error(`Error processing file ${doc.parsedFilePath}:`, error);
  157. // }
  158. // }
  159. //
  160. // directory.items = [publicd, privated, temp];
  161. // // 返回嵌套结构
  162. // return directory;
  163. // }
  /**
   * Builds the document picker tree for one department.
   *
   * Every non-deleted DeptDocument row of the department whose parsed file exists
   * on disk becomes a file entry inside one of three folders, chosen by the
   * row's `isPublic` value.
   * @param {*} deptId - department identifier used to filter DeptDocument rows
   * @returns {Promise<{name: string, type: string, items: object[]}>} the root
   * "documents" folder; `items` is empty when the department has no documents,
   * otherwise it holds the three category folders
   */
  async function viewLocalFiles(deptId) {
    const directory = {
      name: "documents",
      type: "folder",
      items: [],
    };
    // recursive: true also creates missing parent folders and does not throw
    // when the folder already exists.
    fs.mkdirSync(documentsPath, { recursive: true });
    const liveSyncAvailable = await DocumentSyncQueue.enabled();

    // Query the department's documents that are not flagged as deleted.
    const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
    if (!deptDocuments || deptDocuments.length === 0) {
      return directory;
    }

    // Category folders shown in the picker.
    const publicd = {
      name: "公开",
      type: "folder",
      items: [],
    };
    const privated = {
      name: "私有",
      type: "folder",
      items: [],
    };
    const temp = {
      name: "临时",
      type: "folder",
      items: [],
    };

    // Relative paths of every listed file, collected so pinned/watched state
    // can be resolved with one query each instead of one query per file.
    const docpaths = {};

    for (const doc of deptDocuments) {
      try {
        const filePath = doc.parsedFilePath;
        if (!fs.existsSync(filePath)) continue; // skip rows whose file is gone

        const rawData = fs.readFileSync(filePath, "utf8");
        // pageContent is dropped so only the metadata is returned to the caller.
        const { pageContent, ...metadata } = JSON.parse(rawData);

        // Path relative to the documents folder, always with `/` separators.
        const relativePath = path
          .relative(documentsPath, filePath)
          .replace(/\\/g, "/");

        const fileInfo = {
          name: path.basename(filePath),
          type: "file",
          ...metadata,
          // Keyed by the documents-relative path, the same key shape that
          // findDocumentInDocuments uses for its cache lookup.
          cached: await cachedVectorInformation(relativePath, true),
          canWatch: liveSyncAvailable
            ? DocumentSyncQueue.canWatch(metadata)
            : false,
          pinnedWorkspaces: [], // filled in below once all files are known
          watched: false, // filled in below once all files are known
          relativePath: relativePath,
        };
        docpaths[relativePath] = relativePath;

        // NOTE(review): 0 -> "公开", 1 -> "私有", anything else -> "临时" is the
        // existing mapping; confirm it against the DeptDocument schema.
        if (doc.isPublic === 0) {
          publicd.items.push(fileInfo);
        } else if (doc.isPublic === 1) {
          privated.items.push(fileInfo);
        } else {
          temp.items.push(fileInfo);
        }
      } catch (error) {
        console.error(`Error processing file ${doc.parsedFilePath}:`, error);
      }
    }

    directory.items = [publicd, privated, temp];

    // Resolve pinned/watched state for all listed files at once. This is
    // best-effort: on failure the entries keep their empty defaults.
    if (Object.keys(docpaths).length > 0) {
      try {
        const pinnedByDocpath = await getPinnedWorkspacesByDocument(docpaths);
        const watchedByDocpath = await getWatchedDocumentFilenames(docpaths);
        for (const folder of directory.items) {
          for (const item of folder.items) {
            item.pinnedWorkspaces = pinnedByDocpath[item.relativePath] || [];
            item.watched = Object.prototype.hasOwnProperty.call(
              watchedByDocpath,
              item.relativePath
            );
          }
        }
      } catch (error) {
        console.error("Error resolving pinned/watched document state:", error);
      }
    }

    return directory;
  }
  /**
   * Looks in the vector-cache folder for previously stored embedding results so
   * a document does not have to be re-embedded and can be pushed directly to the
   * vector db. The cache file name is the uuidv5 digest (URL namespace) of
   * `filename`.
   * @param {string|null} filename - the filename used as the cache key
   * @param {boolean} checkOnly - when true, only report whether a cache file exists
   * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} a boolean when
   * `checkOnly` is true, otherwise the existence flag together with the parsed
   * cached chunks (an empty array when nothing is cached)
   */
  async function cachedVectorInformation(filename = null, checkOnly = false) {
    if (!filename) {
      if (checkOnly) return false;
      return { exists: false, chunks: [] };
    }

    const cacheKey = uuidv5(filename, uuidv5.URL);
    const cacheFile = path.resolve(vectorCachePath, `${cacheKey}.json`);
    const isCached = fs.existsSync(cacheFile);

    if (checkOnly) return isCached;
    if (!isCached) return { exists: isCached, chunks: [] };

    console.log(
      `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
    );
    const cachedChunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
    return { exists: true, chunks: cachedChunks };
  }
  /**
   * Writes vectorized results for a document into the vector-cache folder so the
   * same document is not embedded twice. The cache file name is the uuidv5
   * digest (URL namespace) of `filename`, which is the same derivation
   * cachedVectorInformation uses for its lookup.
   * @param {any[]} vectorData - pre-chunked vectorized data for the document;
   * it is serialized as JSON without modification
   * @param {string|null} filename - the document name used as the cache key;
   * nothing is written when it is empty
   * @returns {Promise<void>}
   */
  async function storeVectorResult(vectorData = [], filename = null) {
    if (!filename) return;
    console.log(
      `Caching vectorized results of ${filename} to prevent duplicated embedding.`
    );

    // recursive: true creates missing parent folders and does not throw when
    // the cache folder already exists, so no separate existence check is needed.
    fs.mkdirSync(vectorCachePath, { recursive: true });

    const digest = uuidv5(filename, uuidv5.URL);
    const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
    fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
    return;
  }
  /**
   * Removes a single file from the documents/ folder.
   * Nothing happens when no filename is given, when the resolved path does not
   * exist, when it lies outside of the documents folder, or when it is not a
   * regular file.
   * @param {string|null} filename - path of the file relative to the documents folder
   * @returns {Promise<void>}
   */
  async function purgeSourceDocument(filename = null) {
    if (!filename) return;

    const target = path.resolve(documentsPath, normalizePath(filename));
    const isPurgeable =
      fs.existsSync(target) &&
      isWithin(documentsPath, target) &&
      fs.lstatSync(target).isFile();
    if (!isPurgeable) return;

    console.log(`Purging source document of ${filename}.`);
    fs.rmSync(target);
  }
  /**
   * Removes the cache file of one document from the vector-cache/ folder.
   * The cache file name is the uuidv5 digest (URL namespace) of `filename`.
   * Nothing happens when no filename is given or when no regular file exists
   * at the derived location.
   * @param {string|null} filename - the document name used as the cache key
   * @returns {Promise<void>}
   */
  async function purgeVectorCache(filename = null) {
    if (!filename) return;

    const cacheKey = uuidv5(filename, uuidv5.URL);
    const cacheFile = path.resolve(vectorCachePath, `${cacheKey}.json`);
    const isCacheFile =
      fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
    if (!isCacheFile) return;

    console.log(`Purging vector-cache of ${filename}.`);
    fs.rmSync(cacheFile);
  }
  /**
   * Looks for a document by name in every direct subfolder of the documents
   * folder and returns the first match.
   * @param {string|null} documentName - file name of the document to look for
   * @returns {Promise<object|null>} the document metadata (everything stored in
   * the file except `pageContent`) plus `name`, `type` and a `cached` flag, or
   * null when no name is given or no subfolder contains the file
   */
  async function findDocumentInDocuments(documentName = null) {
    if (!documentName) return null;

    for (const entry of fs.readdirSync(documentsPath)) {
      const entryStats = fs.lstatSync(path.join(documentsPath, entry));
      if (!entryStats.isDirectory()) continue;

      const targetFilename = normalizePath(documentName);
      const candidate = path.join(documentsPath, entry, targetFilename);
      const isUsable =
        fs.existsSync(candidate) && isWithin(documentsPath, candidate);
      if (!isUsable) continue;

      const rawContents = fs.readFileSync(candidate, "utf8");
      // pageContent is dropped; only the metadata is returned.
      const { pageContent, ...metadata } = JSON.parse(rawContents);
      const cacheKey = `${entry}/${targetFilename}`;

      return {
        name: targetFilename,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cacheKey, true),
      };
    }

    return null;
  }
  /**
   * Checks if a given path is strictly inside another path.
   * The comparison is purely lexical; the filesystem is not consulted.
   * @param {string} outer - The outer path (should be resolved).
   * @param {string} inner - The inner path (should be resolved).
   * @returns {boolean} - Returns true if the inner path is within the outer path,
   * false otherwise (including when both refer to the same location).
   */
  function isWithin(outer, inner) {
    if (outer === inner) return false;
    const rel = path.relative(outer, inner);

    // An empty result means both paths name the same location, for example when
    // one of them carries a trailing separator.
    if (rel === "") return false;

    // path.relative returns an absolute path when there is no relative route
    // between the two (different drives on Windows), so inner cannot be inside.
    if (path.isAbsolute(rel)) return false;

    // Use the platform separator: on Windows the result starts with `..\`,
    // which a hard-coded "../" prefix check would miss.
    return rel !== ".." && !rel.startsWith(`..${path.sep}`);
  }
  /**
   * Normalizes a user supplied relative path and strips every leading
   * parent-directory reference so the result cannot start by climbing upwards.
   * @param {string} filepath - the path to clean up
   * @returns {string} the normalized path without leading `..` segments
   * @throws {Error} "Invalid path." when the cleaned result is "..", "." or "/"
   */
  function normalizePath(filepath = "") {
    // One or more leading `..` segments, each followed by a separator or the end.
    const leadingParentRefs = /^(\.\.(\/|\\|$))+/;

    const collapsed = path.normalize(filepath.trim());
    const cleaned = collapsed.replace(leadingParentRefs, "").trim();

    switch (cleaned) {
      case "..":
      case ".":
      case "/":
        throw new Error("Invalid path.");
      default:
        return cleaned;
    }
  }
  /**
   * Reports whether the vector-cache folder contains at least one `.json` file.
   * Useful when the user is changing embedders, as that breaks the previous cache.
   * @returns {boolean} true when a cached file exists; false when there is none
   * or the folder cannot be read
   */
  function hasVectorCachedFiles() {
    try {
      const entries = fs.readdirSync(vectorCachePath);
      return entries.some((name) => name.endsWith(".json"));
    } catch {
      // Deliberately ignored: an unreadable or missing folder counts as "no cache".
    }
    return false;
  }
  /**
   * Collects, per document, the ids of the workspaces that have pinned it.
   * @param {Record<string, string>} filenames - lookup whose keys are docpaths and
   * whose values are the filenames to report under. The default is an empty
   * array, but the body only reads the argument's keys and indexes it by docpath.
   * @returns {Promise<Record<string, any[]>>} filename -> list of distinct
   * workspaceIds that pinned the document; documents without pins have no entry
   */
  async function getPinnedWorkspacesByDocument(filenames = []) {
    const pinnedRows = await Document.where(
      { docpath: { in: Object.keys(filenames) }, pinned: true },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const workspacesByFilename = {};
    for (const { workspaceId, docpath } of pinnedRows) {
      const filename = filenames[docpath];
      if (!workspacesByFilename[filename]) workspacesByFilename[filename] = [];

      const workspaceIds = workspacesByFilename[filename];
      if (!workspaceIds.includes(workspaceId)) workspaceIds.push(workspaceId);
    }
    return workspacesByFilename;
  }
  /**
   * Builds a record of the documents that are watched, used to decide whether a
   * document should be displayed as watched.
   * @param {Record<string, string>} filenames - lookup whose keys are docpaths and
   * whose values are the filenames to report under. The default is an empty
   * array, but the body only reads the argument's keys and indexes it by docpath.
   * @returns {Promise<Record<string, any>>} filename -> workspaceId of a watched
   * row for that document; when several rows match, the last one returned by the
   * query wins. Unwatched documents have no entry.
   */
  async function getWatchedDocumentFilenames(filenames = []) {
    const watchedRows = await Document.where(
      { docpath: { in: Object.keys(filenames) }, watched: true },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const workspaceByFilename = {};
    for (const { workspaceId, docpath } of watchedRows) {
      workspaceByFilename[filenames[docpath]] = workspaceId;
    }
    return workspaceByFilename;
  }
  /**
   * Purges the entire vector-cache folder and recreates it empty.
   * @returns {void}
   */
  function purgeEntireVectorCache() {
    // force: true keeps this from throwing when the folder does not exist yet.
    fs.rmSync(vectorCachePath, { recursive: true, force: true });
    // recursive: true also creates missing parent folders, so the purge works
    // even when the storage root itself has not been created.
    fs.mkdirSync(vectorCachePath, { recursive: true });
    return;
  }
  /**
   * Moves a file into a target directory under a new name.
   * The target directory is created (including parents) when it does not exist.
   * When source and target live on different filesystems, where a plain rename
   * fails with EXDEV, the file is copied and the source is removed afterwards.
   * @param {string} sourceFilePath - path of the file to move
   * @param {string} targetDirectory - directory the file is moved into
   * @param {string} newFileName - file name to use inside the target directory
   * @returns {string} the full path of the moved file
   * @throws {Error} when the source file does not exist; filesystem errors other
   * than EXDEV are rethrown unchanged
   */
  function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
    // 1. The source must exist.
    if (!fs.existsSync(sourceFilePath)) {
      throw new Error(`源文件不存在: ${sourceFilePath}`);
    }

    // 2. Make sure the target directory exists. recursive: true creates nested
    //    folders and does not throw when the directory is already there.
    fs.mkdirSync(targetDirectory, { recursive: true });

    // 3. Full destination path using the new file name.
    const targetFilePath = path.join(targetDirectory, newFileName);

    // 4. Move the file. rename cannot cross filesystem boundaries, so fall back
    //    to copy + delete for that one error and rethrow everything else.
    try {
      fs.renameSync(sourceFilePath, targetFilePath);
    } catch (error) {
      if (!error || error.code !== "EXDEV") throw error;
      fs.copyFileSync(sourceFilePath, targetFilePath);
      fs.unlinkSync(sourceFilePath);
    }

    console.log(`文件已移动到: ${targetFilePath}`);
    return targetFilePath;
  }
  431. module.exports = {
  432. findDocumentInDocuments,
  433. cachedVectorInformation,
  434. viewLocalFiles,
  435. purgeSourceDocument,
  436. purgeVectorCache,
  437. storeVectorResult,
  438. fileData,
  439. normalizePath,
  440. isWithin,
  441. documentsPath,
  442. hasVectorCachedFiles,
  443. purgeEntireVectorCache,
  444. moveAndRenameFile,
  445. };