|
|
const fs = require("fs");const path = require("path");const { v5: uuidv5 } = require("uuid");const { Document } = require("../../models/documents");const { DocumentSyncQueue } = require("../../models/documentSyncQueue");const { userFromSession } = require("../http");const { DeptDocument } = require("../../models/deptDocument");const documentsPath = process.env.NODE_ENV === "development" ? path.resolve(__dirname, `../../storage/documents`) : path.resolve(process.env.STORAGE_DIR, `documents`);const vectorCachePath = process.env.NODE_ENV === "development" ? path.resolve(__dirname, `../../storage/vector-cache`) : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
// Should take in a folder that is a subfolder of documents
// eg: youtube-subject/video-123.json
/**
 * Reads and parses a stored document JSON file by its documents-relative path.
 * @param {string} filePath - path relative to the documents root
 * @returns {Promise<object|null>} parsed document, or null when the file is
 *   missing or resolves outside of documentsPath
 * @throws {Error} when no path is provided
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const resolvedPath = path.resolve(documentsPath, normalizePath(filePath));
  const isReadable =
    fs.existsSync(resolvedPath) && isWithin(documentsPath, resolvedPath);
  if (!isReadable) return null;

  return JSON.parse(fs.readFileSync(resolvedPath, "utf8"));
}
// async function viewLocalFiles() {
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// for (const file of fs.readdirSync(documentsPath)) {
// // console.log("file:", file);
// if (path.extname(file) === ".md") continue;
// const folderPath = path.resolve(documentsPath, file);
// const isFolder = fs.lstatSync(folderPath).isDirectory();
// if (isFolder) {
// const subdocs = {
// name: file,
// type: "folder",
// items: [],
// };
// const subfiles = fs.readdirSync(folderPath);
// const filenames = {};
// for (const subfile of subfiles) {
// if (path.extname(subfile) !== ".json") continue;
// const filePath = path.join(folderPath, subfile);
// const rawData = fs.readFileSync(filePath, "utf8");
// // console.log("rawData:", rawData);
// const cachefilename = `${file}/${subfile}`;
// const { pageContent, ...metadata } = JSON.parse(rawData);
// subdocs.items.push({
// name: subfile,
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(cachefilename, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
// // watched: false, // boolean to indicate if this document is watched in ANY workspace
// });
// filenames[cachefilename] = subfile;
// }
//
// // Grab the pinned workspaces and watched documents for this folder's documents
// // at the time of the query so we don't have to re-query the database for each file
// const pinnedWorkspacesByDocument =
// await getPinnedWorkspacesByDocument(filenames);
// const watchedDocumentsFilenames =
// await getWatchedDocumentFilenames(filenames);
// for (const item of subdocs.items) {
// item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
// item.watched =
// watchedDocumentsFilenames.hasOwnProperty(item.name) || false;
// }
//
// directory.items.push(subdocs);
// }
// }
//
// // Make sure custom-documents is always the first folder in picker
// directory.items = [
// directory.items.find((folder) => folder.name === "custom-documents"),
// ...directory.items.filter((folder) => folder.name !== "custom-documents"),
// ].filter((i) => !!i);
//
// return directory;
// }
// async function viewLocalFiles(deptId) {
// const directory = {
// name: "documents",
// type: "folder",
// items: [],
// };
// if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
// const liveSyncAvailable = await DocumentSyncQueue.enabled();
//
// // 查询 deptDocuments
// const deptDocuments = await DeptDocument.where({ deptId: deptId, delTag: false });
// if (!deptDocuments || deptDocuments.length === 0) {
// return directory;
// }
//
// // 初始化分类对象
// const publicd = {
// name: "公开",
// type: "folder",
// items: [],
// };
// const privated = {
// name: "私有",
// type: "folder",
// items: [],
// };
// const temp = {
// name: "临时",
// type: "folder",
// items: [],
// };
//
// // 遍历 deptDocuments
// for (const doc of deptDocuments) {
// try {
// const filePath = doc.parsedFilePath; // 获取文件路径
// if (!fs.existsSync(filePath)) continue; // 如果文件不存在,跳过
//
// // 读取文件内容
// const rawData = fs.readFileSync(filePath, 'utf8');
// const { pageContent, ...metadata } = JSON.parse(rawData);
//
// // 构造文件信息对象(保持与原方法一致的字段)
// const fileInfo = {
// name: path.basename(filePath), // 文件名
// type: "file",
// ...metadata,
// cached: await cachedVectorInformation(filePath, true),
// canWatch: liveSyncAvailable
// ? DocumentSyncQueue.canWatch(metadata)
// : false,
// pinnedWorkspaces: [], // 初始化为空数组
// watched: false, // 初始化为 false
// };
//
// // 根据 isPublic 属性分类
// if (doc.isPublic === 0) {
// publicd.items.push(fileInfo);
// } else if (doc.isPublic === 1) {
// privated.items.push(fileInfo);
// } else {
// temp.items.push(fileInfo);
// }
// } catch (error) {
// console.error(`Error processing file ${doc.parsedFilePath}:`, error);
// }
// }
//
// directory.items = [publicd, privated, temp];
// // 返回嵌套结构
// return directory;
// }
/**
 * Builds the document-picker tree for a department. Files registered in
 * DeptDocument are grouped into three fixed folders by their `isPublic` flag.
 * @param {string|number} deptId - department identifier to filter records by
 * @returns {Promise<{name: string, type: string, items: object[]}>}
 */
async function viewLocalFiles(deptId) {
  const directory = { name: "documents", type: "folder", items: [] };
  if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  const liveSyncAvailable = await DocumentSyncQueue.enabled();

  // Fetch the department's non-deleted document records.
  const deptDocuments = await DeptDocument.where({
    deptId: deptId,
    delTag: false,
  });
  if (!deptDocuments || deptDocuments.length === 0) return directory;

  // Category buckets: public (公开) / private (私有) / temporary (临时).
  const publicd = { name: "公开", type: "folder", items: [] };
  const privated = { name: "私有", type: "folder", items: [] };
  const temp = { name: "临时", type: "folder", items: [] };

  for (const doc of deptDocuments) {
    try {
      const filePath = doc.parsedFilePath;
      if (!fs.existsSync(filePath)) continue; // skip records whose file is gone

      const rawData = fs.readFileSync(filePath, "utf8");
      const { pageContent, ...metadata } = JSON.parse(rawData);
      // Path relative to the documents root, normalized to `/` separators.
      const relativePath = path
        .relative(documentsPath, filePath)
        .replace(/\\/g, "/");

      const fileInfo = {
        name: path.basename(filePath),
        type: "file",
        ...metadata,
        // NOTE(review): cache lookup keys on the absolute filePath here —
        // confirm this matches the key used when the vectors were cached.
        cached: await cachedVectorInformation(filePath, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
        pinnedWorkspaces: [],
        watched: false,
        relativePath: relativePath,
      };

      if (doc.isPublic === 0) {
        publicd.items.push(fileInfo);
      } else if (doc.isPublic === 1) {
        privated.items.push(fileInfo);
      } else {
        temp.items.push(fileInfo);
      }
    } catch (error) {
      console.error(`Error processing file ${doc.parsedFilePath}:`, error);
    }
  }

  directory.items = [publicd, privated, temp];
  return directory;
}
/**
 * Searches the vector-cache folder for existing information so we don't have to re-embed a
 * document and can instead push directly to vector db.
 * @param {string} filename - the filename to check for cached vector information
 * @param {boolean} checkOnly - if true, only check if the file exists, do not return the cached data
 * @returns {Promise<{exists: boolean, chunks: any[]}>} - a promise that resolves to an object containing the existence of the file and its cached chunks
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  // Cache files are keyed by a deterministic UUIDv5 digest of the filename.
  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(vectorCachePath, `${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
// vectorData: pre-chunked vectorized data for a given file that includes the proper metadata and chunk-size limit so it can be iterated and dumped into Pinecone, etc
// filename is the fullpath to the doc so we can compare by filename to find cached matches.
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );
  if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);

  // Cache files are keyed by a deterministic UUIDv5 digest of the filename so
  // cachedVectorInformation() can find them later.
  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}
// Purges a file from the documents/ folder.
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  const filePath = path.resolve(documentsPath, normalizePath(filename));

  // Only delete regular files that actually live inside documentsPath.
  if (
    !fs.existsSync(filePath) ||
    !isWithin(documentsPath, filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;

  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
// Purges a vector-cache file from the vector-cache/ folder.
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  // Cache entries are keyed by the UUIDv5 digest of the filename.
  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);

  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;
  // Fix: this log previously printed the literal "$(unknown)" — the template
  // interpolation was broken. Restored `${filename}`.
  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
// Search for a specific document by its unique name in the entire `documents`
// folder via iteration of all folders and checking if the expected file exists.
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  for (const entry of fs.readdirSync(documentsPath)) {
    const entryPath = path.join(documentsPath, entry);
    if (!fs.lstatSync(entryPath).isDirectory()) continue;

    const targetFilename = normalizePath(documentName);
    const candidate = path.join(entryPath, targetFilename);
    if (!fs.existsSync(candidate) || !isWithin(documentsPath, candidate))
      continue;

    const { pageContent, ...metadata } = JSON.parse(
      fs.readFileSync(candidate, "utf8")
    );
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(`${entry}/${targetFilename}`, true),
    };
  }

  return null;
}
/**
 * Checks if a given path is within another path.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);
  // Fix: the previous check only matched the "/" separator, so on Windows a
  // traversal producing "..\\evil" slipped past. Use path.sep, and also reject
  // absolute results (path.relative can return one on Windows for paths on a
  // different drive). POSIX behavior is unchanged.
  return (
    rel !== ".." && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
  );
}
/**
 * Sanitizes a user-supplied relative path: trims whitespace, normalizes it,
 * and strips any leading "../" (or "..\") traversal segments.
 * @param {string} filepath - raw path supplied by the caller
 * @returns {string} the sanitized relative path
 * @throws {Error} when the sanitized result is "..", ".", or "/"
 */
function normalizePath(filepath = "") {
  const normalized = path.normalize(filepath.trim());
  const result = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();
  if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  return result;
}
// Check if the vector-cache folder contains any cached embeddings.
// Useful when the user is changing embedders, as that will
// break the previous cache. Returns false on any FS error (e.g. missing dir).
function hasVectorCachedFiles() {
  try {
    const entries = fs.readdirSync(vectorCachePath) ?? [];
    return entries.some((name) => name.endsWith(".json"));
  } catch {}
  return false;
}
/** * @param {string[]} filenames - array of filenames to check for pinned workspaces * @returns {Promise<Record<string, string[]>>} - a record of filenames and their corresponding workspaceIds */async function getPinnedWorkspacesByDocument(filenames = []) { return ( await Document.where( { docpath: { in: Object.keys(filenames), }, pinned: true, }, null, null, null, { workspaceId: true, docpath: true, } ) ).reduce((result, { workspaceId, docpath }) => { const filename = filenames[docpath]; if (!result[filename]) result[filename] = []; if (!result[filename].includes(workspaceId)) result[filename].push(workspaceId); return result; }, {});}
/** * Get a record of filenames and their corresponding workspaceIds that have watched a document * that will be used to determine if a document should be displayed in the watched documents sidebar * @param {string[]} filenames - array of filenames to check for watched workspaces * @returns {Promise<Record<string, string[]>>} - a record of filenames and their corresponding workspaceIds */async function getWatchedDocumentFilenames(filenames = []) { return ( await Document.where( { docpath: { in: Object.keys(filenames) }, watched: true, }, null, null, null, { workspaceId: true, docpath: true } ) ).reduce((result, { workspaceId, docpath }) => { const filename = filenames[docpath]; result[filename] = workspaceId; return result; }, {});}
/**
 * Purges the entire vector-cache folder and recreates it.
 * @returns {void}
 */
function purgeEntireVectorCache() {
  fs.rmSync(vectorCachePath, { recursive: true, force: true });
  // recursive:true also creates any missing parent storage directory instead
  // of throwing ENOENT, and is a no-op if the folder already exists.
  fs.mkdirSync(vectorCachePath, { recursive: true });
  return;
}
/**
 * Moves a file into a target directory under a new name.
 * Creates the target directory (including parents) when it does not exist.
 * @param {string} sourceFilePath - path of the file to move
 * @param {string} targetDirectory - directory to move the file into
 * @param {string} newFileName - filename to use at the destination
 * @throws {Error} when the source file does not exist
 */
function moveAndRenameFile(sourceFilePath, targetDirectory, newFileName) {
  if (!fs.existsSync(sourceFilePath)) {
    throw new Error(`源文件不存在: ${sourceFilePath}`);
  }

  // No-op when the directory already exists; creates parents otherwise.
  fs.mkdirSync(targetDirectory, { recursive: true });

  const targetFilePath = path.join(targetDirectory, newFileName);
  // NOTE(review): renameSync fails with EXDEV if source and target are on
  // different filesystems — confirm both live on the same volume.
  fs.renameSync(sourceFilePath, targetFilePath);
  console.log(`文件已移动到: ${targetFilePath}`);
}
module.exports = { findDocumentInDocuments, cachedVectorInformation, viewLocalFiles, purgeSourceDocument, purgeVectorCache, storeVectorResult, fileData, normalizePath, isWithin, documentsPath, hasVectorCachedFiles, purgeEntireVectorCache, moveAndRenameFile,};
|