You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

400 lines
14 KiB

11 months ago
  1. const {
  2. DataType,
  3. MetricType,
  4. IndexType,
  5. MilvusClient,
  6. } = require("@zilliz/milvus2-sdk-node");
  7. const { TextSplitter } = require("../../TextSplitter");
  8. const { SystemSettings } = require("../../../models/systemSettings");
  9. const { v4: uuidv4 } = require("uuid");
  10. const { storeVectorResult, cachedVectorInformation } = require("../../files");
  11. const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
  12. const { sourceIdentifier } = require("../../chats");
  13. // Zilliz is basically a copy of Milvus DB class with a different constructor
  14. // to connect to the cloud
  /**
   * Vector database provider backed by a Zilliz endpoint, accessed through the
   * Milvus node SDK client. A workspace "namespace" maps to one collection
   * whose name is derived with `normalize`.
   */
  const Zilliz = {
    name: "Zilliz",

    /**
     * Derive the collection name used for a namespace.
     *
     * Milvus/Zilliz only allows letters, numbers, and underscores in collection
     * names, and the first character must be a letter or an underscore. Every
     * disallowed character is replaced by "_" and the result is always prefixed
     * with `anythingllm_`. Prefixing unconditionally keeps the names previously
     * produced for letter/underscore-leading namespaces and also makes
     * digit-leading namespaces valid (they used to be left unprefixed).
     *
     * @param {string} inputString - Raw namespace value.
     * @returns {string} Collection name to use when talking to the DB.
     */
    normalize: function (inputString) {
      const sanitized = inputString.replace(/[^a-zA-Z0-9_]/g, "_");
      return `anythingllm_${sanitized}`;
    },

    /**
     * Build a client from the ZILLIZ_* environment variables and verify it
     * reports healthy.
     *
     * @returns {Promise<{client: object}>}
     * @throws {Error} When VECTOR_DB is not "zilliz" or the health check fails.
     */
    connect: async function () {
      if (process.env.VECTOR_DB !== "zilliz")
        throw new Error("Zilliz::Invalid ENV settings");

      const client = new MilvusClient({
        address: process.env.ZILLIZ_ENDPOINT,
        token: process.env.ZILLIZ_API_TOKEN,
      });

      const { isHealthy } = await client.checkHealth();
      if (!isHealthy)
        throw new Error(
          "Zilliz::Invalid Heartbeat received - is the instance online?"
        );

      return { client };
    },

    /**
     * Confirm the instance is reachable.
     *
     * @returns {Promise<{heartbeat: number}>} Current time in epoch milliseconds.
     */
    heartbeat: async function () {
      await this.connect();
      return { heartbeat: Number(new Date()) };
    },

    /**
     * Sum the row counts of every collection returned by `listCollections`.
     *
     * The names come straight from the DB, so they are used as-is (running
     * them through `normalize` again would produce names that do not exist).
     * Collections are queried one at a time because an async reducer would
     * hand a Promise, not a number, to the next iteration.
     *
     * @returns {Promise<number>}
     */
    totalVectors: async function () {
      const { client } = await this.connect();
      const { collection_names } = await client.listCollections();

      let total = 0;
      for (const collection_name of collection_names ?? []) {
        const statistics = await client.getCollectionStatistics({
          collection_name,
        });
        total += Number(statistics?.data?.row_count ?? 0);
      }
      return total;
    },

    /**
     * Row count of a single namespace.
     *
     * @param {string|null} _namespace
     * @returns {Promise<number>} 0 when no namespace is given.
     */
    namespaceCount: async function (_namespace = null) {
      if (!_namespace) return 0;
      const { client } = await this.connect();
      const statistics = await client.getCollectionStatistics({
        collection_name: this.normalize(_namespace),
      });
      return Number(statistics?.data?.row_count ?? 0);
    },

    /**
     * Fetch the statistics response for a namespace's collection.
     *
     * @param {object} client - Connected client.
     * @param {string|null} namespace
     * @returns {Promise<object|null>} Statistics, or null if the call rejected.
     * @throws {Error} When no namespace is provided.
     */
    namespace: async function (client, namespace = null) {
      if (!namespace) throw new Error("No namespace value provided.");
      const collection = await client
        .getCollectionStatistics({ collection_name: this.normalize(namespace) })
        .catch(() => null);
      return collection;
    },

    /**
     * Connect and report whether the namespace's collection exists.
     *
     * @param {string|null} namespace
     * @returns {Promise<boolean>} false when no namespace is given.
     */
    hasNamespace: async function (namespace = null) {
      if (!namespace) return false;
      const { client } = await this.connect();
      return await this.namespaceExists(client, namespace);
    },

    /**
     * Report whether the namespace's collection exists using an existing client.
     * A rejected `hasCollection` call is logged and treated as "does not exist".
     *
     * @param {object} client - Connected client.
     * @param {string|null} namespace
     * @returns {Promise<boolean>}
     * @throws {Error} When no namespace is provided.
     */
    namespaceExists: async function (client, namespace = null) {
      if (!namespace) throw new Error("No namespace value provided.");
      const { value } = await client
        .hasCollection({ collection_name: this.normalize(namespace) })
        .catch((e) => {
          console.error("Zilliz::namespaceExists", e.message);
          return { value: false };
        });
      return value;
    },

    /**
     * Drop the whole collection backing a namespace.
     *
     * @param {object} client - Connected client.
     * @param {string|null} namespace
     * @returns {Promise<true>}
     */
    deleteVectorsInNamespace: async function (client, namespace = null) {
      await client.dropCollection({ collection_name: this.normalize(namespace) });
      return true;
    },

    /**
     * Ensure the namespace's collection exists, creating, indexing and loading
     * it when it does not.
     *
     * Zilliz requires a dimension aspect for collection creation; callers pass
     * it in from the first chunk to infer the dimensions like other providers do.
     *
     * @param {object} client - Connected client.
     * @param {string} namespace
     * @param {number|null} dimensions - Vector dimension for a new collection.
     * @returns {Promise<void>}
     * @throws {Error} When the collection is missing and no dimension is given.
     */
    getOrCreateCollection: async function (client, namespace, dimensions = null) {
      const alreadyExists = await this.namespaceExists(client, namespace);
      if (alreadyExists) return;

      if (!dimensions)
        throw new Error(
          `Zilliz:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
        );

      const collection_name = this.normalize(namespace);
      await client.createCollection({
        collection_name,
        fields: [
          {
            name: "id",
            description: "id",
            data_type: DataType.VarChar,
            max_length: 255,
            is_primary_key: true,
          },
          {
            name: "vector",
            description: "vector",
            data_type: DataType.FloatVector,
            dim: dimensions,
          },
          {
            name: "metadata",
            description: "metadata",
            data_type: DataType.JSON,
          },
        ],
      });
      await client.createIndex({
        collection_name,
        field_name: "vector",
        index_type: IndexType.AUTOINDEX,
        metric_type: MetricType.COSINE,
      });
      await client.loadCollectionSync({ collection_name });
    },

    /**
     * Embed a document and insert its vectors into the namespace's collection.
     *
     * Unless `skipCache` is set, previously cached vectors for `fullFilePath`
     * are reused instead of re-embedding. Otherwise the text is split, embedded,
     * inserted in batches of 100 and the result is written to the vector cache.
     * Any failure is logged and reported in the return value, never thrown.
     *
     * @param {string} namespace
     * @param {object} documentData - Must contain `pageContent` and `docId`;
     *   all remaining keys are stored as metadata on each vector.
     * @param {string|null} fullFilePath - Key used for the vector cache.
     * @param {boolean} skipCache - When true, ignore any cached vectors.
     * @returns {Promise<false|{vectorized: boolean, error: string|null}>}
     *   false when the document has no content.
     */
    addDocumentToNamespace: async function (
      namespace,
      documentData = {},
      fullFilePath = null,
      skipCache = false
    ) {
      const { DocumentVectors } = require("../../../models/vectors");
      try {
        let vectorDimension = null;
        const { pageContent, docId, ...metadata } = documentData;
        if (!pageContent || pageContent.length === 0) return false;

        console.log("Adding new vectorized document into namespace", namespace);
        if (!skipCache) {
          const cacheResult = await cachedVectorInformation(fullFilePath);
          if (cacheResult.exists) {
            const { client } = await this.connect();
            const { chunks } = cacheResult;
            const documentVectors = [];
            vectorDimension = chunks[0][0].values.length || null;

            await this.getOrCreateCollection(client, namespace, vectorDimension);
            for (const chunk of chunks) {
              // Before sending to Zilliz and saving the records to our db
              // we need to assign the id of each chunk that is stored in the cached file.
              const newChunks = chunk.map((cachedItem) => {
                const id = uuidv4();
                documentVectors.push({ docId, vectorId: id });
                return {
                  id,
                  vector: cachedItem.values,
                  metadata: cachedItem.metadata,
                };
              });
              const insertResult = await client.insert({
                collection_name: this.normalize(namespace),
                data: newChunks,
              });

              if (insertResult?.status?.error_code !== "Success") {
                throw new Error(
                  `Error embedding into Zilliz! Reason:${insertResult?.status?.reason}`
                );
              }
            }
            await DocumentVectors.bulkInsert(documentVectors);
            await client.flushSync({
              collection_names: [this.normalize(namespace)],
            });
            return { vectorized: true, error: null };
          }
        }

        const EmbedderEngine = getEmbeddingEngineSelection();
        const textSplitter = new TextSplitter({
          chunkSize: TextSplitter.determineMaxChunkSize(
            await SystemSettings.getValueOrFallback({
              label: "text_splitter_chunk_size",
            }),
            EmbedderEngine?.embeddingMaxChunkLength
          ),
          chunkOverlap: await SystemSettings.getValueOrFallback(
            { label: "text_splitter_chunk_overlap" },
            20
          ),
          chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
        });
        const textChunks = await textSplitter.splitText(pageContent);

        console.log("Chunks created from document:", textChunks.length);
        const documentVectors = [];
        const vectors = [];
        const vectorValues = await EmbedderEngine.embedChunks(textChunks);

        if (!!vectorValues && vectorValues.length > 0) {
          for (const [i, vector] of vectorValues.entries()) {
            if (!vectorDimension) vectorDimension = vector.length;
            const vectorRecord = {
              id: uuidv4(),
              values: vector,
              // [DO NOT REMOVE]
              // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
              metadata: { ...metadata, text: textChunks[i] },
            };

            vectors.push(vectorRecord);
            documentVectors.push({ docId, vectorId: vectorRecord.id });
          }
        } else {
          throw new Error(
            "Could not embed document chunks! This document will not be recorded."
          );
        }

        if (vectors.length > 0) {
          const chunks = [];
          const { client } = await this.connect();
          await this.getOrCreateCollection(client, namespace, vectorDimension);

          console.log("Inserting vectorized chunks into Zilliz.");
          for (const chunk of toChunks(vectors, 100)) {
            chunks.push(chunk);
            const insertResult = await client.insert({
              collection_name: this.normalize(namespace),
              // Metadata lives on each record; the batch array itself has none.
              data: chunk.map((item) => ({
                id: item.id,
                vector: item.values,
                metadata: item.metadata,
              })),
            });

            if (insertResult?.status?.error_code !== "Success") {
              throw new Error(
                `Error embedding into Zilliz! Reason:${insertResult?.status?.reason}`
              );
            }
          }
          await storeVectorResult(chunks, fullFilePath);
          await client.flushSync({
            collection_names: [this.normalize(namespace)],
          });
        }

        await DocumentVectors.bulkInsert(documentVectors);
        return { vectorized: true, error: null };
      } catch (e) {
        console.error("addDocumentToNamespace", e.message);
        return { vectorized: false, error: e.message };
      }
    },

    /**
     * Remove every vector recorded for `docId` from the namespace's collection
     * and delete the matching DocumentVectors rows.
     *
     * @param {string} namespace
     * @param {string} docId
     * @returns {Promise<true|undefined>} undefined when the namespace does not
     *   exist or no vectors are recorded for the document.
     */
    deleteDocumentFromNamespace: async function (namespace, docId) {
      const { DocumentVectors } = require("../../../models/vectors");
      const { client } = await this.connect();
      if (!(await this.namespaceExists(client, namespace))) return;

      const knownDocuments = await DocumentVectors.where({ docId });
      if (knownDocuments.length === 0) return;

      const vectorIds = knownDocuments.map((doc) => doc.vectorId);
      const queryIn = vectorIds.map((v) => `'${v}'`).join(",");
      await client.deleteEntities({
        collection_name: this.normalize(namespace),
        expr: `id in [${queryIn}]`,
      });

      const indexes = knownDocuments.map((doc) => doc.id);
      await DocumentVectors.deleteIds(indexes);

      // Even after flushing Zilliz can take some time to re-calc the count
      // so all we can hope to do is flushSync so that the count can be correct
      // on a later call.
      await client.flushSync({ collection_names: [this.normalize(namespace)] });
      return true;
    },

    /**
     * Embed `input` with the given connector and return the closest stored
     * chunks of the namespace as context texts plus curated sources.
     *
     * @param {object} params
     * @param {string|null} params.namespace
     * @param {string} params.input - Text to search for.
     * @param {object|null} params.LLMConnector - Must expose `embedTextInput`.
     * @param {number} params.similarityThreshold - Minimum score to keep a match.
     * @param {number} params.topN - Maximum number of matches requested.
     * @param {string[]} params.filterIdentifiers - Source identifiers to exclude.
     * @returns {Promise<{contextTexts: string[], sources: object[], message: string|false}>}
     * @throws {Error} When namespace, input or LLMConnector is missing.
     */
    performSimilaritySearch: async function ({
      namespace = null,
      input = "",
      LLMConnector = null,
      similarityThreshold = 0.25,
      topN = 4,
      filterIdentifiers = [],
    }) {
      if (!namespace || !input || !LLMConnector)
        throw new Error("Invalid request to performSimilaritySearch.");

      const { client } = await this.connect();
      if (!(await this.namespaceExists(client, namespace))) {
        return {
          contextTexts: [],
          sources: [],
          message: "Invalid query - no documents found for workspace!",
        };
      }

      const queryVector = await LLMConnector.embedTextInput(input);
      const { contextTexts, sourceDocuments } = await this.similarityResponse({
        client,
        namespace,
        queryVector,
        similarityThreshold,
        topN,
        filterIdentifiers,
      });

      const sources = sourceDocuments.map((metadata, i) => {
        return { ...metadata, text: contextTexts[i] };
      });
      return {
        contextTexts,
        sources: this.curateSources(sources),
        message: false,
      };
    },

    /**
     * Run the vector search and keep matches that meet the score threshold and
     * are not listed in `filterIdentifiers`.
     *
     * @param {object} params
     * @param {object} params.client - Connected client.
     * @param {string} params.namespace
     * @param {number[]} params.queryVector
     * @param {number} params.similarityThreshold
     * @param {number} params.topN
     * @param {string[]} params.filterIdentifiers
     * @returns {Promise<{contextTexts: string[], sourceDocuments: object[], scores: number[]}>}
     */
    similarityResponse: async function ({
      client,
      namespace,
      queryVector,
      similarityThreshold = 0.25,
      topN = 4,
      filterIdentifiers = [],
    }) {
      const result = {
        contextTexts: [],
        sourceDocuments: [],
        scores: [],
      };
      const response = await client.search({
        collection_name: this.normalize(namespace),
        vectors: queryVector,
        limit: topN,
      });

      for (const match of response?.results ?? []) {
        if (match.score < similarityThreshold) continue;
        if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
          console.log(
            "Zilliz: A source was filtered from context as it's parent document is pinned."
          );
          continue;
        }
        result.contextTexts.push(match.metadata.text);
        result.sourceDocuments.push(match);
        result.scores.push(match.score);
      }
      return result;
    },

    /**
     * Return the statistics of a namespace, or a message object when none
     * could be fetched.
     *
     * @param {{namespace?: string|null}} reqBody
     * @returns {Promise<object>}
     * @throws {Error} When the namespace is missing or does not exist.
     */
    "namespace-stats": async function (reqBody = {}) {
      const { namespace = null } = reqBody;
      if (!namespace) throw new Error("namespace required");
      const { client } = await this.connect();
      if (!(await this.namespaceExists(client, namespace)))
        throw new Error("Namespace by that name does not exist.");
      const stats = await this.namespace(client, namespace);
      return stats
        ? stats
        : { message: "No stats were able to be fetched from DB for namespace" };
    },

    /**
     * Drop a namespace's collection and report how many vectors it held.
     *
     * @param {{namespace?: string|null}} reqBody
     * @returns {Promise<{message: string}>}
     * @throws {Error} When the namespace is missing or does not exist.
     */
    "delete-namespace": async function (reqBody = {}) {
      const { namespace = null } = reqBody;
      const { client } = await this.connect();
      if (!(await this.namespaceExists(client, namespace)))
        throw new Error("Namespace by that name does not exist.");

      // Read the count before dropping; it is unavailable afterwards.
      const statistics = await this.namespace(client, namespace);
      await this.deleteVectorsInNamespace(client, namespace);
      const vectorCount = Number(statistics?.data?.row_count ?? 0);
      return {
        message: `Namespace ${namespace} was deleted along with ${vectorCount} vectors.`,
      };
    },

    /**
     * Flatten search sources down to their metadata. Sources with empty or
     * missing metadata are dropped; a source's own `pageContent`, when present,
     * is exposed as `text`.
     *
     * @param {object[]} sources
     * @returns {object[]}
     */
    curateSources: function (sources = []) {
      const documents = [];
      for (const source of sources) {
        const { metadata = {} } = source;
        if (Object.keys(metadata).length === 0) continue;

        const hasPageContent = Object.prototype.hasOwnProperty.call(
          source,
          "pageContent"
        );
        documents.push({
          ...metadata,
          ...(hasPageContent ? { text: source.pageContent } : {}),
        });
      }
      return documents;
    },
  };
  375. module.exports.Zilliz = Zilliz;