You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

264 lines
7.9 KiB

11 months ago
11 months ago
11 months ago
  1. const { v4: uuidv4 } = require("uuid");
  2. const { getVectorDbClass } = require("../utils/helpers");
  3. const prisma = require("../utils/prisma");
  4. const { Telemetry } = require("./telemetry");
  5. const { EventLogs } = require("./eventLogs");
  6. const { safeJsonParse } = require("../utils/http");
  7. const Document = {
  8. writable: ["pinned", "watched", "lastUpdatedAt"],
  9. /**
  10. * @param {import("@prisma/client").workspace_documents} document - Document PrismaRecord
  11. * @returns {{
  12. * metadata: (null|object),
  13. * type: import("./documentSyncQueue.js").validFileType,
  14. * source: string
  15. * }}
  16. */
  17. parseDocumentTypeAndSource: function (document) {
  18. const metadata = safeJsonParse(document.metadata, null);
  19. if (!metadata) return { metadata: null, type: null, source: null };
  20. // Parse the correct type of source and its original source path.
  21. const idx = metadata.chunkSource.indexOf("://");
  22. const [type, source] = [
  23. metadata.chunkSource.slice(0, idx),
  24. metadata.chunkSource.slice(idx + 3),
  25. ];
  26. return { metadata, type, source: this._stripSource(source, type) };
  27. },
  28. forWorkspace: async function (workspaceId = null) {
  29. if (!workspaceId) return [];
  30. return await prisma.workspace_documents.findMany({
  31. where: { workspaceId },
  32. });
  33. },
  34. delete: async function (clause = {}) {
  35. try {
  36. await prisma.workspace_documents.deleteMany({ where: clause });
  37. return true;
  38. } catch (error) {
  39. console.error(error.message);
  40. return false;
  41. }
  42. },
  43. get: async function (clause = {}) {
  44. try {
  45. const document = await prisma.workspace_documents.findFirst({
  46. where: clause,
  47. });
  48. return document || null;
  49. } catch (error) {
  50. console.error(error.message);
  51. return null;
  52. }
  53. },
  54. where: async function (
  55. clause = {},
  56. limit = null,
  57. orderBy = null,
  58. include = null,
  59. select = null
  60. ) {
  61. try {
  62. const results = await prisma.workspace_documents.findMany({
  63. where: clause,
  64. ...(limit !== null ? { take: limit } : {}),
  65. ...(orderBy !== null ? { orderBy } : {}),
  66. ...(include !== null ? { include } : {}),
  67. ...(select !== null ? { select: { ...select } } : {}),
  68. });
  69. return results;
  70. } catch (error) {
  71. console.error(error.message);
  72. return [];
  73. }
  74. },
  75. addDocuments: async function (workspace, additions = [], userId = null) {
  76. const VectorDb = getVectorDbClass();
  77. if (additions.length === 0) return { failed: [], embedded: [] };
  78. const { fileData } = require("../utils/files");
  79. const embedded = [];
  80. const failedToEmbed = [];
  81. const errors = new Set();
  82. for (const path of additions) {
  83. const data = await fileData(path);
  84. console.log("data+++++++++++++++++++++++++++++++++", data);
  85. console.log("path.split(\"/\")+++++++++++++++++++++++++++++++++", path.split("/"));
  86. if (!data) continue;
  87. const docId = uuidv4();
  88. const { pageContent, ...metadata } = data;
  89. const newDoc = {
  90. docId,
  91. filename: path.split("/")[1],
  92. docpath: path,
  93. workspaceId: workspace.id,
  94. metadata: JSON.stringify(metadata),
  95. };
  96. const { vectorized, error } = await VectorDb.addDocumentToNamespace(
  97. workspace.slug,
  98. { ...data, docId },
  99. path
  100. );
  101. if (!vectorized) {
  102. console.error(
  103. "Failed to vectorize",
  104. metadata?.title || newDoc.filename
  105. );
  106. failedToEmbed.push(metadata?.title || newDoc.filename);
  107. errors.add(error);
  108. continue;
  109. }
  110. console.log("newDoc+++++++++++++++++++++++++++++++++", newDoc);
  111. try {
  112. await prisma.workspace_documents.create({ data: newDoc });
  113. embedded.push(path);
  114. } catch (error) {
  115. console.error(error.message);
  116. }
  117. }
  118. await Telemetry.sendTelemetry("documents_embedded_in_workspace", {
  119. LLMSelection: process.env.LLM_PROVIDER || "openai",
  120. Embedder: process.env.EMBEDDING_ENGINE || "inherit",
  121. VectorDbSelection: process.env.VECTOR_DB || "lancedb",
  122. TTSSelection: process.env.TTS_PROVIDER || "native",
  123. });
  124. await EventLogs.logEvent(
  125. "workspace_documents_added",
  126. {
  127. workspaceName: workspace?.name || "Unknown Workspace",
  128. numberOfDocumentsAdded: additions.length,
  129. },
  130. userId
  131. );
  132. return { failedToEmbed, errors: Array.from(errors), embedded };
  133. },
  134. removeDocuments: async function (workspace, removals = [], userId = null) {
  135. const VectorDb = getVectorDbClass();
  136. if (removals.length === 0) return;
  137. for (const path of removals) {
  138. const document = await this.get({
  139. docpath: path,
  140. workspaceId: workspace.id,
  141. });
  142. if (!document) continue;
  143. await VectorDb.deleteDocumentFromNamespace(
  144. workspace.slug,
  145. document.docId
  146. );
  147. try {
  148. await prisma.workspace_documents.delete({
  149. where: { id: document.id, workspaceId: workspace.id },
  150. });
  151. await prisma.document_vectors.deleteMany({
  152. where: { docId: document.docId },
  153. });
  154. } catch (error) {
  155. console.error(error.message);
  156. }
  157. }
  158. await Telemetry.sendTelemetry("documents_removed_in_workspace", {
  159. LLMSelection: process.env.LLM_PROVIDER || "openai",
  160. Embedder: process.env.EMBEDDING_ENGINE || "inherit",
  161. VectorDbSelection: process.env.VECTOR_DB || "lancedb",
  162. TTSSelection: process.env.TTS_PROVIDER || "native",
  163. });
  164. await EventLogs.logEvent(
  165. "workspace_documents_removed",
  166. {
  167. workspaceName: workspace?.name || "Unknown Workspace",
  168. numberOfDocuments: removals.length,
  169. },
  170. userId
  171. );
  172. return true;
  173. },
  174. count: async function (clause = {}, limit = null) {
  175. try {
  176. const count = await prisma.workspace_documents.count({
  177. where: clause,
  178. ...(limit !== null ? { take: limit } : {}),
  179. });
  180. return count;
  181. } catch (error) {
  182. console.error("FAILED TO COUNT DOCUMENTS.", error.message);
  183. return 0;
  184. }
  185. },
  186. update: async function (id = null, data = {}) {
  187. if (!id) throw new Error("No workspace document id provided for update");
  188. const validKeys = Object.keys(data).filter((key) =>
  189. this.writable.includes(key)
  190. );
  191. if (validKeys.length === 0)
  192. return { document: { id }, message: "No valid fields to update!" };
  193. try {
  194. const document = await prisma.workspace_documents.update({
  195. where: { id },
  196. data,
  197. });
  198. return { document, message: null };
  199. } catch (error) {
  200. console.error(error.message);
  201. return { document: null, message: error.message };
  202. }
  203. },
  204. _updateAll: async function (clause = {}, data = {}) {
  205. try {
  206. await prisma.workspace_documents.updateMany({
  207. where: clause,
  208. data,
  209. });
  210. return true;
  211. } catch (error) {
  212. console.error(error.message);
  213. return false;
  214. }
  215. },
  216. content: async function (docId) {
  217. if (!docId) throw new Error("No workspace docId provided!");
  218. const document = await this.get({ docId: String(docId) });
  219. if (!document) throw new Error(`Could not find a document by id ${docId}`);
  220. const { fileData } = require("../utils/files");
  221. const data = await fileData(document.docpath);
  222. return { title: data.title, content: data.pageContent };
  223. },
  224. contentByDocPath: async function (docPath) {
  225. const { fileData } = require("../utils/files");
  226. const data = await fileData(docPath);
  227. return { title: data.title, content: data.pageContent };
  228. },
  229. // Some data sources have encoded params in them we don't want to log - so strip those details.
  230. _stripSource: function (sourceString, type) {
  231. if (["confluence", "github"].includes(type)) {
  232. const _src = new URL(sourceString);
  233. _src.search = ""; // remove all search params that are encoded for resync.
  234. return _src.toString();
  235. }
  236. return sourceString;
  237. },
  238. };
  239. module.exports = { Document };