You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

260 lines
7.7 KiB

11 months ago
  1. const { v4: uuidv4 } = require("uuid");
  2. const { getVectorDbClass } = require("../utils/helpers");
  3. const prisma = require("../utils/prisma");
  4. const { Telemetry } = require("./telemetry");
  5. const { EventLogs } = require("./eventLogs");
  6. const { safeJsonParse } = require("../utils/http");
  /**
   * Helper object wrapping the `workspace_documents` Prisma model: lookups,
   * counting, restricted updates, and embedding/removing documents for a
   * workspace through the configured vector database class.
   */
  const Document = {
    // Columns that `update` is allowed to change; every other key is dropped.
    writable: ["pinned", "watched", "lastUpdatedAt"],

    /**
     * Splits a document's `metadata.chunkSource` ("<type>://<source>") into its
     * type and source parts.
     *
     * @param {import("@prisma/client").workspace_documents} document - Document PrismaRecord
     * @returns {{
     *  metadata: (null|object),
     *  type: import("./documentSyncQueue.js").validFileType,
     *  source: string
     * }} `type` and `source` are null when the metadata cannot be parsed or
     * when `chunkSource` is missing or has no "://" separator.
     */
    parseDocumentTypeAndSource: function (document) {
      const metadata = safeJsonParse(document?.metadata, null);
      if (!metadata) return { metadata: null, type: null, source: null };

      // Without a string containing "://" there is nothing reliable to split;
      // slicing on an index of -1 would yield garbage for both parts.
      const chunkSource = metadata.chunkSource;
      if (typeof chunkSource !== "string")
        return { metadata, type: null, source: null };
      const idx = chunkSource.indexOf("://");
      if (idx === -1) return { metadata, type: null, source: null };

      // Parse the correct type of source and its original source path.
      const type = chunkSource.slice(0, idx);
      const source = chunkSource.slice(idx + 3);
      return { metadata, type, source: this._stripSource(source, type) };
    },

    /**
     * Lists every document record attached to a workspace.
     *
     * @param {number|null} workspaceId
     * @returns {Promise<object[]>} Empty array when no workspace id is given.
     */
    forWorkspace: async function (workspaceId = null) {
      if (!workspaceId) return [];
      return await prisma.workspace_documents.findMany({
        where: { workspaceId },
      });
    },

    /**
     * Deletes all document records matching the clause.
     *
     * @param {object} clause - Prisma `where` clause.
     * @returns {Promise<boolean>} true on success, false if the query threw.
     */
    delete: async function (clause = {}) {
      try {
        await prisma.workspace_documents.deleteMany({ where: clause });
        return true;
      } catch (error) {
        console.error(error.message);
        return false;
      }
    },

    /**
     * Fetches the first document record matching the clause.
     *
     * @param {object} clause - Prisma `where` clause.
     * @returns {Promise<object|null>} null when nothing matches or the query threw.
     */
    get: async function (clause = {}) {
      try {
        const document = await prisma.workspace_documents.findFirst({
          where: clause,
        });
        return document || null;
      } catch (error) {
        console.error(error.message);
        return null;
      }
    },

    /**
     * Fetches all document records matching the clause. Optional arguments are
     * only forwarded to Prisma when they are not null.
     *
     * @param {object} clause - Prisma `where` clause.
     * @param {number|null} limit - Forwarded as `take`.
     * @param {object|null} orderBy
     * @param {object|null} include
     * @param {object|null} select
     * @returns {Promise<object[]>} Empty array if the query threw.
     */
    where: async function (
      clause = {},
      limit = null,
      orderBy = null,
      include = null,
      select = null
    ) {
      try {
        const results = await prisma.workspace_documents.findMany({
          where: clause,
          ...(limit !== null ? { take: limit } : {}),
          ...(orderBy !== null ? { orderBy } : {}),
          ...(include !== null ? { include } : {}),
          ...(select !== null ? { select: { ...select } } : {}),
        });
        return results;
      } catch (error) {
        console.error(error.message);
        return [];
      }
    },

    /**
     * Vectorizes each file path into the workspace namespace and, on success,
     * stores a matching `workspace_documents` record. Telemetry and an event
     * log entry are sent once after the loop.
     *
     * @param {{id: number, slug: string, name?: string}} workspace
     * @param {string[]} additions - Document paths to embed.
     * @param {number|null} userId - Passed through to the event log.
     * @returns {Promise<{failedToEmbed: string[], errors: any[], embedded: string[]}>}
     * For an empty `additions` list the result also carries the legacy `failed`
     * key so older callers keep working.
     */
    addDocuments: async function (workspace, additions = [], userId = null) {
      const VectorDb = getVectorDbClass();
      // Same shape as the normal return value; `failed` is kept for
      // backward compatibility with the previous empty-input result.
      if (additions.length === 0)
        return { failed: [], failedToEmbed: [], errors: [], embedded: [] };

      const { fileData } = require("../utils/files");
      const embedded = [];
      const failedToEmbed = [];
      const errors = new Set();

      for (const path of additions) {
        const data = await fileData(path);
        // Paths with no readable file data are skipped without being reported.
        if (!data) continue;

        const docId = uuidv4();
        // The page content is not stored in the record's metadata column.
        const { pageContent, ...metadata } = data;
        const newDoc = {
          docId,
          filename: path.split("/")[1],
          docpath: path,
          workspaceId: workspace.id,
          metadata: JSON.stringify(metadata),
        };

        const { vectorized, error } = await VectorDb.addDocumentToNamespace(
          workspace.slug,
          { ...data, docId },
          path
        );

        if (!vectorized) {
          console.error(
            "Failed to vectorize",
            metadata?.title || newDoc.filename
          );
          failedToEmbed.push(metadata?.title || newDoc.filename);
          errors.add(error);
          continue;
        }

        try {
          await prisma.workspace_documents.create({ data: newDoc });
          embedded.push(path);
        } catch (error) {
          console.error(error.message);
        }
      }

      await Telemetry.sendTelemetry("documents_embedded_in_workspace", {
        LLMSelection: process.env.LLM_PROVIDER || "openai",
        Embedder: process.env.EMBEDDING_ENGINE || "inherit",
        VectorDbSelection: process.env.VECTOR_DB || "lancedb",
        TTSSelection: process.env.TTS_PROVIDER || "native",
      });
      await EventLogs.logEvent(
        "workspace_documents_added",
        {
          workspaceName: workspace?.name || "Unknown Workspace",
          numberOfDocumentsAdded: additions.length,
        },
        userId
      );
      return { failedToEmbed, errors: Array.from(errors), embedded };
    },

    /**
     * Removes each listed document from the workspace namespace in the vector
     * database, then deletes its `workspace_documents` record and its
     * `document_vectors` rows. Paths with no matching record are skipped.
     *
     * @param {{id: number, slug: string, name?: string}} workspace
     * @param {string[]} removals - Document paths to remove.
     * @param {number|null} userId - Passed through to the event log.
     * @returns {Promise<true|undefined>} undefined when `removals` is empty.
     */
    removeDocuments: async function (workspace, removals = [], userId = null) {
      const VectorDb = getVectorDbClass();
      if (removals.length === 0) return;

      for (const path of removals) {
        const document = await this.get({
          docpath: path,
          workspaceId: workspace.id,
        });
        if (!document) continue;
        await VectorDb.deleteDocumentFromNamespace(
          workspace.slug,
          document.docId
        );

        try {
          await prisma.workspace_documents.delete({
            where: { id: document.id, workspaceId: workspace.id },
          });
          await prisma.document_vectors.deleteMany({
            where: { docId: document.docId },
          });
        } catch (error) {
          console.error(error.message);
        }
      }

      await Telemetry.sendTelemetry("documents_removed_in_workspace", {
        LLMSelection: process.env.LLM_PROVIDER || "openai",
        Embedder: process.env.EMBEDDING_ENGINE || "inherit",
        VectorDbSelection: process.env.VECTOR_DB || "lancedb",
        TTSSelection: process.env.TTS_PROVIDER || "native",
      });
      await EventLogs.logEvent(
        "workspace_documents_removed",
        {
          workspaceName: workspace?.name || "Unknown Workspace",
          numberOfDocuments: removals.length,
        },
        userId
      );
      return true;
    },

    /**
     * Counts document records matching the clause.
     *
     * @param {object} clause - Prisma `where` clause.
     * @param {number|null} limit - Forwarded as `take` when not null.
     * @returns {Promise<number>} 0 if the query threw.
     */
    count: async function (clause = {}, limit = null) {
      try {
        const count = await prisma.workspace_documents.count({
          where: clause,
          ...(limit !== null ? { take: limit } : {}),
        });
        return count;
      } catch (error) {
        console.error("FAILED TO COUNT DOCUMENTS.", error.message);
        return 0;
      }
    },

    /**
     * Updates a single document record, writing only the keys listed in
     * `this.writable`.
     *
     * @param {number|null} id - Record id.
     * @param {object} data - Candidate field updates; non-writable keys are ignored.
     * @returns {Promise<{document: (object|null), message: (string|null)}>}
     * @throws {Error} When no id is provided.
     */
    update: async function (id = null, data = {}) {
      if (!id) throw new Error("No workspace document id provided for update");

      // Build the payload from the allow-list so that a single valid key cannot
      // be used to smuggle writes to other columns.
      const updates = Object.fromEntries(
        Object.entries(data ?? {}).filter(([key]) => this.writable.includes(key))
      );
      if (Object.keys(updates).length === 0)
        return { document: { id }, message: "No valid fields to update!" };

      try {
        const document = await prisma.workspace_documents.update({
          where: { id },
          data: updates,
        });
        return { document, message: null };
      } catch (error) {
        console.error(error.message);
        return { document: null, message: error.message };
      }
    },

    /**
     * Applies `data` to every record matching the clause. Unlike `update`, the
     * payload is passed to Prisma as-is with no writable-key filtering.
     *
     * @param {object} clause - Prisma `where` clause.
     * @param {object} data
     * @returns {Promise<boolean>} true on success, false if the query threw.
     */
    _updateAll: async function (clause = {}, data = {}) {
      try {
        await prisma.workspace_documents.updateMany({
          where: clause,
          data,
        });
        return true;
      } catch (error) {
        console.error(error.message);
        return false;
      }
    },

    /**
     * Loads the title and page content of a document looked up by its docId.
     *
     * @param {string} docId
     * @returns {Promise<{title: string, content: string}>}
     * @throws {Error} When docId is missing, no record matches, or the file
     * data for the record cannot be loaded.
     */
    content: async function (docId) {
      if (!docId) throw new Error("No workspace docId provided!");
      const document = await this.get({ docId: String(docId) });
      if (!document) throw new Error(`Could not find a document by id ${docId}`);

      const { fileData } = require("../utils/files");
      const data = await fileData(document.docpath);
      if (!data)
        throw new Error(`Could not load file data for document ${docId}`);
      return { title: data.title, content: data.pageContent };
    },

    /**
     * Loads the title and page content of a document directly from its path.
     *
     * @param {string} docPath
     * @returns {Promise<{title: string, content: string}>}
     * @throws {Error} When the file data for the path cannot be loaded.
     */
    contentByDocPath: async function (docPath) {
      const { fileData } = require("../utils/files");
      const data = await fileData(docPath);
      if (!data) throw new Error(`Could not load file data for ${docPath}`);
      return { title: data.title, content: data.pageContent };
    },

    /**
     * Some data sources have encoded params in them we don't want to log - so
     * strip those details.
     *
     * @param {string} sourceString
     * @param {string} type - Only "confluence" and "github" sources are stripped.
     * @returns {string} The source without its query string for those types;
     * the input unchanged otherwise.
     */
    _stripSource: function (sourceString, type) {
      if (!["confluence", "github"].includes(type)) return sourceString;
      if (typeof sourceString !== "string") return sourceString;

      try {
        const _src = new URL(sourceString);
        _src.search = ""; // remove all search params that are encoded for resync.
        return _src.toString();
      } catch {
        // Not a parseable URL: still drop everything from the first "?" so the
        // encoded params are never returned.
        return sourceString.split("?")[0];
      }
    },
  };
  // Expose the Document helper object as a named CommonJS export.
  module.exports = { Document };