|
|
const fs = require("fs");const path = require("path");const { v5: uuidv5 } = require("uuid");const { Document } = require("../../models/documents");const { DocumentSyncQueue } = require("../../models/documentSyncQueue");const { userFromSession } = require("../http");const { DeptDocument } = require("../../models/deptDocument");const documentsPath = process.env.NODE_ENV === "development" ? path.resolve(__dirname, `../../storage/documents`) : path.resolve(process.env.STORAGE_DIR, `documents`);const vectorCachePath = process.env.NODE_ENV === "development" ? path.resolve(__dirname, `../../storage/vector-cache`) : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
// Should take in a folder that is a subfolder of documents
// eg: youtube-subject/video-123.json
/**
 * Reads and parses a stored document JSON file by its documents-relative path.
 * @param {string} filePath - path relative to the documents root
 * @returns {Promise<object|null>} parsed document, or null when the file is
 *   missing or resolves outside of documentsPath
 * @throws {Error} when no path is provided
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const resolvedPath = path.resolve(documentsPath, normalizePath(filePath));
  const isReadable =
    fs.existsSync(resolvedPath) && isWithin(documentsPath, resolvedPath);
  if (!isReadable) return null;

  return JSON.parse(fs.readFileSync(resolvedPath, "utf8"));
}
// async function viewLocalFiles() {
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// for (const file of fs.readdirSync(documentsPath)) {
// // console.log("file:", file);
// if (path.extname(file) === ".md") continue;
// const folderPath = path.resolve(documentsPath, file);
// const isFolder = fs.lstatSync(folderPath).isDirectory();
// if (isFolder) {
// const subdocs = {
// name: file,
// type: "folder",
// items: [],
// };
// const subfiles = fs.readdirSync(folderPath);
// const filenames = {};
// for (const subfile of subfiles) {
// if (path.extname(subfile) !== ".json") continue;
// const filePath = path.join(folderPath, subfile);
// const rawData = fs.readFileSync(filePath, "utf8");
// // console.log("rawData:", rawData);
// const cachefilename = `${file}/${subfile}`;
// const { pageContent, ...metadata } = JSON.parse(rawData);
// subdocs.items.push({
// name: subfile,
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(cachefilename, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// // watched: false, // boolean to indicate if this document is watched in ANY workspace
// });
// filenames[cachefilename] = subfile;
// }
//
// // Grab the pinned workspaces and watched documents for this folder's documents
// // at the time of the query so we don't have to re-query the database for each file
// const pinnedWorkspacesByDocument =
// await getPinnedWorkspacesByDocument(filenames);
// const watchedDocumentsFilenames =
// await getWatchedDocumentFilenames(filenames);
// for (const item of subdocs.items) {
// item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
// item.watched =
// watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
// }
//
// directory.items.push(subdocs);
// }
// }
//
// // Make sure custom-documents is always the first folder in picker
// directory.items = [
// directory.items.find((folder) => folder.name === "custom-documents"),
// ...directory.items.filter((folder) => folder.name !== "custom-documents"),
// ].filter((i) => !!i);
//
// return directory;
// }
// async function viewLocalFiles(deptId) {
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
//
// // 查询 deptDocuments
// const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
// if (!deptDocuments || deptDocuments.length === 0) {
// return directory;
// }
//
// // 初始化分类对象
// const publicd = {
// name: "公开",
// type: "folder",
// items: [],
// };
// const privated = {
// name: "私有",
// type: "folder",
// items: [],
// };
// const temp = {
// name: "临时",
// type: "folder",
// items: [],
// };
//
// // 遍历 deptDocuments
// for (const doc of deptDocuments) {
// try {
// const filePath = doc.parsedFilePath; // 获取文件路径
// if (!fs.existsSync(filePath)) continue; // 如果文件不存在,跳过
//
// // 读取文件内容
// const rawData = fs.readFileSync(filePath, 'utf8');
// const { pageContent, ...metadata } = JSON.parse(rawData);
//
// // 构造文件信息对象(保持与原方法一致的字段)
// const fileInfo = {
// name: path.basename(filePath), // 文件名
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(filePath, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// pinnedWorkspaces: [], // 初始化为空数组
// watched: false, // 初始化为 false
// };
//
// // 根据 isPublic 属性分类
// if (doc.isPublic === 0) {
// publicd.items.push(fileInfo);
// } else if (doc.isPublic === 1) {
// privated.items.push(fileInfo);
// } else {
// temp.items.push(fileInfo);
// }
// } catch (error) {
// console.error(`Error processing file ${doc.parsedFilePath}:`, error);
// }
// }
//
// directory.items = [publicd, privated, temp];
// // 返回嵌套结构
// return directory;
// }
/**
 * Builds the document-picker tree for a department. Files registered in
 * DeptDocument are grouped into three fixed folders by their `isPublic` flag.
 * @param {string|number} deptId - department identifier to filter records by
 * @returns {Promise<{name: string, type: string, items: object[]}>}
 */
async function viewLocalFiles(deptId) {
  const directory = { name: "documents", type: "folder", items: [] };
  if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  const liveSyncAvailable = await DocumentSyncQueue.enabled();

  // Fetch the department's non-deleted document records.
  const deptDocuments = await DeptDocument.where({
    deptId: deptId,
    delTag: false,
  });
  if (!deptDocuments || deptDocuments.length === 0) return directory;

  // Category buckets: public (公开) / private (私有) / temporary (临时).
  const publicd = { name: "公开", type: "folder", items: [] };
  const privated = { name: "私有", type: "folder", items: [] };
  const temp = { name: "临时", type: "folder", items: [] };

  for (const doc of deptDocuments) {
    try {
      const filePath = doc.parsedFilePath;
      if (!fs.existsSync(filePath)) continue; // skip records whose file is gone

      const rawData = fs.readFileSync(filePath, "utf8");
      const { pageContent, ...metadata } = JSON.parse(rawData);
      // Path relative to the documents root, normalized to `/` separators.
      const relativePath = path
        .relative(documentsPath, filePath)
        .replace(/\\/g, "/");

      const fileInfo = {
        name: path.basename(filePath),
        type: "file",
        ...metadata,
        // NOTE(review): cache lookup keys on the absolute filePath here —
        // confirm this matches the key used when the vectors were cached.
        cached: await cachedVectorInformation(filePath, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
        pinnedWorkspaces: [],
        watched: false,
        relativePath: relativePath,
      };

      if (doc.isPublic === 0) {
        publicd.items.push(fileInfo);
      } else if (doc.isPublic === 1) {
        privated.items.push(fileInfo);
      } else {
        temp.items.push(fileInfo);
      }
    } catch (error) {
      console.error(`Error processing file ${doc.parsedFilePath}:`, error);
    }
  }

  directory.items = [publicd, privated, temp];
  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we don't have to re-embed a
 * document and can instead push directly to vector db.
 * @param {string} filename - the filename to check for cached vector information
 * @param {boolean} checkOnly - if true, only check if the file exists, do not return the cached data
 * @returns {Promise<{exists: boolean, chunks: any[]}>} - a promise that resolves to an object containing the existence of the file and its cached chunks
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  // Cache files are keyed by a deterministic UUIDv5 digest of the filename.
  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(vectorCachePath, `${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
// vectorData: pre-chunked vectorized data for a given file that includes the proper metadata and chunk-size limit so it can be iterated and dumped into Pinecone, etc
// filename is the fullpath to the doc so we can compare by filename to find cached matches.
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );
  if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);

  // Cache files are keyed by a deterministic UUIDv5 digest of the filename so
  // cachedVectorInformation() can find them later.
  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}
// Purges a file from the documents/ folder.
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  const filePath = path.resolve(documentsPath, normalizePath(filename));

  // Only delete regular files that actually live inside documentsPath.
  if (
    !fs.existsSync(filePath) ||
    !isWithin(documentsPath, filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;

  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
// Purges a vector-cache file from the vector-cache/ folder.
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  // Cache entries are keyed by the UUIDv5 digest of the filename.
  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);

  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;
  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
// Search for a specific document by its unique name in the entire `documents`
// folder via iteration of all folders and checking if the expected file exists.
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  for (const entry of fs.readdirSync(documentsPath)) {
    const entryPath = path.join(documentsPath, entry);
    if (!fs.lstatSync(entryPath).isDirectory()) continue;

    const targetFilename = normalizePath(documentName);
    const candidate = path.join(entryPath, targetFilename);
    if (!fs.existsSync(candidate) || !isWithin(documentsPath, candidate))
      continue;

    const { pageContent, ...metadata } = JSON.parse(
      fs.readFileSync(candidate, "utf8")
    );
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(`${entry}/${targetFilename}`, true),
    };
  }

  return null;
}
/**
 * Checks if a given path is within another path.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);
  // Fix: the previous check only matched the "/" separator, so on Windows a
  // traversal producing "..\\evil" slipped past. Use path.sep, and also reject
  // absolute results (path.relative can return one on Windows for paths on a
  // different drive). POSIX behavior is unchanged.
  return (
    rel !== ".." && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
  );
}
/**
 * Sanitizes a user-supplied relative path: trims whitespace, normalizes it,
 * and strips any leading "../" (or "..\") traversal segments.
 * @param {string} filepath - raw path supplied by the caller
 * @returns {string} the sanitized relative path
 * @throws {Error} when the sanitized result is "..", ".", or "/"
 */
function normalizePath(filepath = "") {
  const normalized = path.normalize(filepath.trim());
  const result = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();
  if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  return result;
}
// Check if the vector-cache folder contains any cached embeddings.
// Useful when the user is changing embedders, as that will
// break the previous cache. Returns false on any FS error (e.g. missing dir).
function hasVectorCachedFiles() {
  try {
    const entries = fs.readdirSync(vectorCachePath) ?? [];
    return entries.some((name) => name.endsWith(".json"));
  } catch {}
  return false;
}
/** * @param {string[]} filenames - array of filenames to check for pinned workspaces * @returns {Promise<Record<string, string[]>>} - a record of filenames and their corresponding workspaceIds */async function getPinnedWorkspacesByDocument(filenames = []) { return ( await Document.where( { docpath: { in: Object.keys(filenames), }, pinned: true, }, null, null, null, { workspaceId: true, docpath: true, } ) ).reduce((result, { workspaceId, docpath }) => { const filename = filenames[docpath]; if (!result[filename]) result[filename] = []; if (!result[filename].includes(workspaceId)) result[filename].push(workspaceId); return result; }, {});}
/** * Get a record of filenames and their corresponding workspaceIds that have watched a document * that will be used to determine if a document should be displayed in the watched documents sidebar * @param {string[]} filenames - array of filenames to check for watched workspaces * @returns {Promise<Record<string, string[]>>} - a record of filenames and their corresponding workspaceIds */async function getWatchedDocumentFilenames(filenames = []) { return ( await Document.where( { docpath: { in: Object.keys(filenames) }, watched: true, }, null, null, null, { workspaceId: true, docpath: true } ) ).reduce((result, { workspaceId, docpath }) => { const filename = filenames[docpath]; result[filename] = workspaceId; return result; }, {});}
/**
 * Purges the entire vector-cache folder and recreates it.
 * @returns {void}
 */
function purgeEntireVectorCache() {
  fs.rmSync(vectorCachePath, { recursive: true, force: true });
  // recursive:true also creates any missing parent storage directory instead
  // of throwing ENOENT, and is a no-op if the folder already exists.
  fs.mkdirSync(vectorCachePath, { recursive: true });
  return;
}
/**
 * Moves a file into a target directory under a new name.
 * Creates the target directory (including parents) when it does not exist.
 * @param {string} sourceFilePath - path of the file to move
 * @param {string} targetDirectory - directory to move the file into
 * @param {string} newFileName - filename to use at the destination
 * @throws {Error} when the source file does not exist
 */
function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
  if (!fs.existsSync(sourceFilePath)) {
    throw new Error(`源文件不存在: ${sourceFilePath}`);
  }

  // No-op when the directory already exists; creates parents otherwise.
  fs.mkdirSync(targetDirectory, { recursive: true });

  const targetFilePath = path.join(targetDirectory, newFileName);
  // NOTE(review): renameSync fails with EXDEV if source and target are on
  // different filesystems — confirm both live on the same volume.
  fs.renameSync(sourceFilePath, targetFilePath);
  console.log(`文件已移动到: ${targetFilePath}`);
}
module.exports = { findDocumentInDocuments, cachedVectorInformation, viewLocalFiles, purgeSourceDocument, purgeVectorCache, storeVectorResult, fileData, normalizePath, isWithin, documentsPath, hasVectorCachedFiles, purgeEntireVectorCache, moveAndRenameFile,};
|