You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

491 lines
16 KiB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
  1. const fs = require("fs");
  2. const path = require("path");
  3. const { v5: uuidv5 } = require("uuid");
  4. const { Document } = require("../../models/documents");
  5. const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
  6. const { userFromSession } = require("../http");
  7. const { DeptDocument } = require("../../models/deptDocument");
  // Storage locations used by this module. When NODE_ENV is "development" they
  // are resolved relative to this file (`../../storage/...`); in every other
  // environment they are resolved under the STORAGE_DIR environment variable.
  const isDevelopment = process.env.NODE_ENV === "development";

  // Folder that holds the parsed source documents.
  const documentsPath = isDevelopment
    ? path.resolve(__dirname, `../../storage/documents`)
    : path.resolve(process.env.STORAGE_DIR, `documents`);

  // Folder that holds cached embedding results.
  const vectorCachePath = isDevelopment
    ? path.resolve(__dirname, `../../storage/vector-cache`)
    : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
  /**
   * Reads and parses a stored document JSON file.
   * The path should point at a file inside a subfolder of the documents folder,
   * eg: youtube-subject/video-123.json
   * @param {string|null} filePath - document path, resolved against the documents folder
   * @returns {Promise<object|null>} the parsed JSON content, or null when the file
   * does not exist or resolves to a location outside the documents folder
   * @throws {Error} when no path is provided
   */
  async function fileData(filePath = null) {
    if (!filePath) throw new Error("No docPath provided in request");

    const target = path.resolve(documentsPath, normalizePath(filePath));
    const isReadable = fs.existsSync(target) && isWithin(documentsPath, target);
    if (!isReadable) return null;

    return JSON.parse(fs.readFileSync(target, "utf8"));
  }
  26. // async function viewLocalFiles() {
  27. // if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  28. // const liveSyncAvailable = await DocumentSyncQueue.enabled();
  29. // const directory = {
  30. // name: "documents",
  31. // type: "folder",
  32. // items: [],
  33. // };
  34. // for (const file of fs.readdirSync(documentsPath)) {
  35. // // console.log("file:", file);
  36. // if (path.extname(file) === ".md") continue;
  37. // const folderPath = path.resolve(documentsPath, file);
  38. // const isFolder = fs.lstatSync(folderPath).isDirectory();
  39. // if (isFolder) {
  40. // const subdocs = {
  41. // name: file,
  42. // type: "folder",
  43. // items: [],
  44. // };
  45. // const subfiles = fs.readdirSync(folderPath);
  46. // const filenames = {};
  47. // for (const subfile of subfiles) {
  48. // if (path.extname(subfile) !== ".json") continue;
  49. // const filePath = path.join(folderPath, subfile);
  50. // const rawData = fs.readFileSync(filePath, "utf8");
  51. // // console.log("rawData:", rawData);
  52. // const cachefilename = `${file}/${subfile}`;
  53. // const { pageContent, ...metadata } = JSON.parse(rawData);
  54. // subdocs.items.push({
  55. // name: subfile,
  56. // type: "file",
  57. // ...metadata,
  58. // cached: await cachedVectorInformation(cachefilename, true),
  59. // canWatch: liveSyncAvailable
  60. // ? DocumentSyncQueue.canWatch(metadata)
  61. // : false,
  62. // // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
  63. // // watched: false, // boolean to indicate if this document is watched in ANY workspace
  64. // });
  65. // filenames[cachefilename] = subfile;
  66. // }
  67. //
  68. // // Grab the pinned workspaces and watched documents for this folder's documents
  69. // // at the time of the query so we don't have to re-query the database for each file
  70. // const pinnedWorkspacesByDocument =
  71. // await getPinnedWorkspacesByDocument(filenames);
  72. // const watchedDocumentsFilenames =
  73. // await getWatchedDocumentFilenames(filenames);
  74. // for (const item of subdocs.items) {
  75. // item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
  76. // item.watched =
  77. // watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
  78. // }
  79. //
  80. // directory.items.push(subdocs);
  81. // }
  82. // }
  83. //
  84. // // Make sure custom-documents is always the first folder in picker
  85. // directory.items = [
  86. // directory.items.find((folder) => folder.name === "custom-documents"),
  87. // ...directory.items.filter((folder) => folder.name !== "custom-documents"),
  88. // ].filter((i) => !!i);
  89. //
  90. // return directory;
  91. // }
  92. // async function viewLocalFiles(deptId) {
  93. // const directory = {
  94. // name: "documents",
  95. // type: "folder",
  96. // items: [],
  97. // };
  98. // if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  99. // const liveSyncAvailable = await DocumentSyncQueue.enabled();
  100. //
  101. // // 查询 deptDocuments
  102. // const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
  103. // if (!deptDocuments || deptDocuments.length === 0) {
  104. // return directory;
  105. // }
  106. //
  107. // // 初始化分类对象
  108. // const publicd = {
  109. // name: "公开",
  110. // type: "folder",
  111. // items: [],
  112. // };
  113. // const privated = {
  114. // name: "私有",
  115. // type: "folder",
  116. // items: [],
  117. // };
  118. // const temp = {
  119. // name: "临时",
  120. // type: "folder",
  121. // items: [],
  122. // };
  123. //
  124. // // 遍历 deptDocuments
  125. // for (const doc of deptDocuments) {
  126. // try {
  127. // const filePath = doc.parsedFilePath; // 获取文件路径
  128. // if (!fs.existsSync(filePath)) continue; // 如果文件不存在,跳过
  129. //
  130. // // 读取文件内容
  131. // const rawData = fs.readFileSync(filePath, 'utf8');
  132. // const { pageContent, ...metadata } = JSON.parse(rawData);
  133. //
  134. // // 构造文件信息对象(保持与原方法一致的字段)
  135. // const fileInfo = {
  136. // name: path.basename(filePath), // 文件名
  137. // type: "file",
  138. // ...metadata,
  139. // cached: await cachedVectorInformation(filePath, true),
  140. // canWatch: liveSyncAvailable
  141. // ? DocumentSyncQueue.canWatch(metadata)
  142. // : false,
  143. // pinnedWorkspaces: [], // 初始化为空数组
  144. // watched: false, // 初始化为 false
  145. // };
  146. //
  147. // // 根据 isPublic 属性分类
  148. // if (doc.isPublic === 0) {
  149. // publicd.items.push(fileInfo);
  150. // } else if (doc.isPublic === 1) {
  151. // privated.items.push(fileInfo);
  152. // } else {
  153. // temp.items.push(fileInfo);
  154. // }
  155. // } catch (error) {
  156. // console.error(`Error processing file ${doc.parsedFilePath}:`, error);
  157. // }
  158. // }
  159. //
  160. // directory.items = [publicd, privated, temp];
  161. // // 返回嵌套结构
  162. // return directory;
  163. // }
  /**
   * Builds the document picker tree for a department.
   *
   * Department documents flagged as not deleted are loaded from disk and grouped
   * into three folders according to their `isPublic` value:
   * 0 -> "公开", 1 -> "私有", anything else -> "临时".
   * Documents whose file cannot be read or parsed are logged and skipped.
   *
   * @param {*} deptId - department id used to look up DeptDocument records
   * @returns {Promise<{name: string, type: string, items: object[]}>} the root
   * "documents" folder. `items` is empty when the department has no documents,
   * otherwise it always holds the three category folders (possibly empty).
   */
  async function viewLocalFiles(deptId) {
    const directory = {
      name: "documents",
      type: "folder",
      items: [],
    };

    // `recursive` keeps this from throwing when the parent storage folder is
    // missing as well.
    if (!fs.existsSync(documentsPath))
      fs.mkdirSync(documentsPath, { recursive: true });
    const liveSyncAvailable = await DocumentSyncQueue.enabled();

    // Look up the department's documents that are not flagged as deleted.
    const deptDocuments = await DeptDocument.where({ deptId, delTag: false });
    if (!deptDocuments || deptDocuments.length === 0) return directory;

    // Category folders shown in the picker.
    const publicd = { name: "公开", type: "folder", items: [] };
    const privated = { name: "私有", type: "folder", items: [] };
    const temp = { name: "临时", type: "folder", items: [] };

    for (const doc of deptDocuments) {
      try {
        // The stored path is used as-is when it exists; otherwise it is treated
        // as relative to the documents folder (in every environment).
        let filePath = doc.parsedFilePath;
        if (!fs.existsSync(filePath))
          filePath = path.resolve(documentsPath, filePath);

        const rawData = fs.readFileSync(filePath, "utf8");
        // pageContent is dropped on purpose; only the metadata is listed.
        const { pageContent, ...metadata } = JSON.parse(rawData);

        // Path relative to the documents folder, always using `/` separators.
        const relativePath = path
          .relative(documentsPath, filePath)
          .replace(/\\/g, "/");

        const fileInfo = {
          name: path.basename(filePath),
          type: "file",
          ...metadata,
          // The cache is looked up with the `folder/file` docpath, the same key
          // shape findDocumentInDocuments uses, instead of the file-system path.
          // NOTE(review): confirm the embedding flow stores its cache under this key.
          cached: await cachedVectorInformation(relativePath, true),
          canWatch: liveSyncAvailable
            ? DocumentSyncQueue.canWatch(metadata)
            : false,
          pinnedWorkspaces: [], // always empty in this listing
          watched: false, // always false in this listing
          relativePath,
        };

        // Sort into a category folder by the record's isPublic value.
        if (doc.isPublic === 0) {
          publicd.items.push(fileInfo);
        } else if (doc.isPublic === 1) {
          privated.items.push(fileInfo);
        } else {
          temp.items.push(fileInfo);
        }
      } catch (error) {
        // Best effort: one unreadable document must not break the whole listing.
        console.error(`Error processing file ${doc.parsedFilePath}:`, error);
      }
    }

    directory.items = [publicd, privated, temp];
    return directory;
  }
  /**
   * Looks in the vector-cache folder for previously stored embedding results so a
   * document does not have to be embedded again and can be pushed to the vector
   * db directly.
   * @param {string|null} filename - the name the cache entry was stored under
   * @param {boolean} checkOnly - when true, only report whether a cache entry exists
   * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} a boolean when
   * `checkOnly` is true; otherwise an object with the existence flag and the
   * cached chunks (empty when nothing is cached)
   */
  async function cachedVectorInformation(filename = null, checkOnly = false) {
    if (!filename) {
      if (checkOnly) return false;
      return { exists: false, chunks: [] };
    }

    // Cache files are named after a UUIDv5 digest of the filename.
    const cacheFile = path.resolve(
      vectorCachePath,
      `${uuidv5(filename, uuidv5.URL)}.json`
    );
    const exists = fs.existsSync(cacheFile);
    if (checkOnly) return exists;
    if (!exists) return { exists, chunks: [] };

    console.log(
      `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
    );
    const chunks = JSON.parse(fs.readFileSync(cacheFile, "utf8"));
    return { exists: true, chunks };
  }
  /**
   * Writes vectorized results to the vector-cache folder so the same document is
   * not embedded twice.
   * @param {any[]} vectorData - pre-chunked vectorized data for a file, serialized as JSON
   * @param {string|null} filename - name used to derive the cache file name; the same
   * value must be used later to find the cached entry. Nothing is written when empty.
   * @returns {Promise<void>}
   */
  async function storeVectorResult(vectorData = [], filename = null) {
    if (!filename) return;
    console.log(
      `Caching vectorized results of ${filename} to prevent duplicated embedding.`
    );

    // `recursive` creates missing parent folders too and is a no-op when the
    // cache folder already exists.
    fs.mkdirSync(vectorCachePath, { recursive: true });

    // Cache files are named after a UUIDv5 digest of the filename.
    const digest = uuidv5(filename, uuidv5.URL);
    const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
    fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  }
  /**
   * Purges a file from the documents/ folder.
   * Does nothing when the path is empty, does not exist, resolves outside the
   * documents folder, or is not a regular file.
   * @param {string|null} filename - document path, resolved against the documents folder
   * @returns {Promise<void>}
   */
  async function purgeSourceDocument(filename = null) {
    if (!filename) return;

    const target = path.resolve(documentsPath, normalizePath(filename));
    const isPurgeable =
      fs.existsSync(target) &&
      isWithin(documentsPath, target) &&
      fs.lstatSync(target).isFile();
    if (!isPurgeable) return;

    console.log(`Purging source document of ${filename}.`);
    fs.rmSync(target);
  }
  /**
   * Purges a vector-cache file from the vector-cache/ folder.
   * Does nothing when the name is empty or no regular cache file exists for it.
   * @param {string|null} filename - the name the cache entry was stored under
   * @returns {Promise<void>}
   */
  async function purgeVectorCache(filename = null) {
    if (!filename) return;

    // Cache files are named after a UUIDv5 digest of the filename.
    const cacheFile = path.resolve(
      vectorCachePath,
      `${uuidv5(filename, uuidv5.URL)}.json`
    );
    const isPurgeable =
      fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
    if (!isPurgeable) return;

    console.log(`Purging vector-cache of ${filename}.`);
    fs.rmSync(cacheFile);
  }
  /**
   * Searches for a document by name across the `documents` folder by visiting
   * each of its subfolders and checking whether the expected file exists there.
   * The first match wins.
   * @param {string|null} documentName - file name to look for inside each subfolder
   * @returns {Promise<object|null>} the document metadata (without pageContent)
   * plus `name`, `type` and `cached`, or null when no subfolder contains the file
   */
  async function findDocumentInDocuments(documentName = null) {
    if (!documentName) return null;

    for (const folder of fs.readdirSync(documentsPath)) {
      const folderStats = fs.lstatSync(path.join(documentsPath, folder));
      if (!folderStats.isDirectory()) continue;

      const targetFilename = normalizePath(documentName);
      const candidate = path.join(documentsPath, folder, targetFilename);
      const isUsable =
        fs.existsSync(candidate) && isWithin(documentsPath, candidate);
      if (!isUsable) continue;

      // pageContent is dropped on purpose; only the metadata is returned.
      const { pageContent, ...metadata } = JSON.parse(
        fs.readFileSync(candidate, "utf8")
      );
      const cached = await cachedVectorInformation(
        `${folder}/${targetFilename}`,
        true
      );
      return { name: targetFilename, type: "file", ...metadata, cached };
    }

    return null;
  }
  /**
   * Checks if a given path is strictly inside another path.
   * @param {string} outer - The outer path (should be resolved).
   * @param {string} inner - The inner path (should be resolved).
   * @returns {boolean} - true only when `inner` is located below `outer`; false
   * when both refer to the same location or `inner` lies outside of `outer`.
   */
  function isWithin(outer, inner) {
    if (outer === inner) return false;
    const rel = path.relative(outer, inner);

    // An empty relative path means both point at the same location even though
    // the strings differ (e.g. a trailing separator on one of them).
    if (rel === "") return false;

    // path.relative returns an absolute path when there is no relative route,
    // e.g. across Windows drives, so `inner` cannot be inside `outer`.
    if (path.isAbsolute(rel)) return false;

    // Use the platform separator so `..\` on Windows is recognized as well.
    return rel !== ".." && !rel.startsWith(`..${path.sep}`);
  }
  /**
   * Normalizes a user supplied path and strips any leading parent-directory
   * (`..`) segments from it.
   * @param {string} filepath - the raw path
   * @returns {string} the trimmed, normalized path without leading `..` segments
   * @throws {Error} when the result is exactly "..", "." or "/"
   */
  function normalizePath(filepath = "") {
    const normalized = path.normalize(filepath.trim());
    const result = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();

    const isInvalid = result === ".." || result === "." || result === "/";
    if (isInvalid) throw new Error("Invalid path.");

    return result;
  }
  /**
   * Reports whether the vector-cache folder currently holds any `.json` file.
   * Useful when the user is changing embedders, as that breaks the previous cache.
   * @returns {boolean} true when at least one `.json` entry exists; false when
   * there is none or the folder cannot be read
   */
  function hasVectorCachedFiles() {
    try {
      const entries = fs.readdirSync(vectorCachePath);
      return entries.some((name) => name.endsWith(".json"));
    } catch {
      // A folder that cannot be read is treated the same as an empty cache.
      return false;
    }
  }
  /**
   * Collects, per document, the workspaces that have the document pinned.
   * @param {Record<string, string>} filenames - lookup whose keys are docpaths to
   * query and whose values are the names used as keys of the result
   * @returns {Promise<Record<string, any[]>>} - result keyed by the looked-up
   * name; each value is the de-duplicated list of workspaceIds
   */
  async function getPinnedWorkspacesByDocument(filenames = []) {
    const pinnedDocuments = await Document.where(
      {
        docpath: { in: Object.keys(filenames) },
        pinned: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const result = {};
    for (const { workspaceId, docpath } of pinnedDocuments) {
      const filename = filenames[docpath];
      if (!result[filename]) result[filename] = [];
      if (!result[filename].includes(workspaceId))
        result[filename].push(workspaceId);
    }
    return result;
  }
  /**
   * Builds a record of the documents that are watched, which can be used to decide
   * whether a document should be shown as watched.
   * @param {Record<string, string>} filenames - lookup whose keys are docpaths to
   * query and whose values are the names used as keys of the result
   * @returns {Promise<Record<string, any>>} - result keyed by the looked-up name;
   * each value is a single workspaceId (the last matching record wins)
   */
  async function getWatchedDocumentFilenames(filenames = []) {
    const watchedDocuments = await Document.where(
      {
        docpath: { in: Object.keys(filenames) },
        watched: true,
      },
      null,
      null,
      null,
      { workspaceId: true, docpath: true }
    );

    const result = {};
    for (const { workspaceId, docpath } of watchedDocuments) {
      result[filenames[docpath]] = workspaceId;
    }
    return result;
  }
  /**
   * Removes the whole vector-cache folder, including its content, and creates it
   * again as an empty folder.
   * @returns {void}
   */
  function purgeEntireVectorCache() {
    // `force` keeps the removal from failing when the folder does not exist.
    const removalOptions = { recursive: true, force: true };
    fs.rmSync(vectorCachePath, removalOptions);
    fs.mkdirSync(vectorCachePath);
  }
  /**
   * Moves a file into a target directory under a new name.
   * The target directory is created (including missing parents) when needed.
   * @param {string} sourceFilePath - path of the file to move
   * @param {string} targetDirectory - directory the file is moved into
   * @param {string} newFileName - file name to use inside the target directory
   * @returns {void}
   * @throws {Error} when the source file does not exist, or when the move fails
   */
  function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
    // 1. The source file must exist.
    if (!fs.existsSync(sourceFilePath)) {
      throw new Error(`源文件不存在: ${sourceFilePath}`);
    }

    // 2. Create the target directory, including missing parents, when absent.
    if (!fs.existsSync(targetDirectory)) {
      fs.mkdirSync(targetDirectory, { recursive: true });
    }

    // 3. Full destination path using the new file name.
    const targetFilePath = path.join(targetDirectory, newFileName);

    // 4. Move the file. A rename cannot cross file systems (EXDEV), which
    // happens e.g. when the source sits in a temp folder on another volume;
    // in that case copy the file and remove the source instead.
    try {
      fs.renameSync(sourceFilePath, targetFilePath);
    } catch (error) {
      if (error.code !== "EXDEV") throw error;
      fs.copyFileSync(sourceFilePath, targetFilePath);
      fs.unlinkSync(sourceFilePath);
    }

    console.log(`文件已移动到: ${targetFilePath}`);
  }
  435. module.exports = {
  436. findDocumentInDocuments,
  437. cachedVectorInformation,
  438. viewLocalFiles,
  439. purgeSourceDocument,
  440. purgeVectorCache,
  441. storeVectorResult,
  442. fileData,
  443. normalizePath,
  444. isWithin,
  445. documentsPath,
  446. hasVectorCachedFiles,
  447. purgeEntireVectorCache,
  448. moveAndRenameFile,
  449. };