const { v4: uuidv4 } = require("uuid");
const { DocumentManager } = require("../DocumentManager");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { writeResponseChunk } = require("../helpers/chat/responses");
const { grepAgents } = require("./agents");
const {
  grepCommand,
  VALID_COMMANDS,
  chatPrompt,
  recentChatHistory,
  sourceIdentifier,
} = require("./index");

const VALID_CHAT_MODE = ["chat", "query"];

async function streamChatWithWorkspace(
  response,
  workspace,
  message,
  chatMode = "chat",
  user = null,
  thread = null,
  attachments = []
) {
  const uuid = uuidv4();
  const updatedMessage = await grepCommand(message, user);
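
  // If the user's message resolves to a registered slash command, run that
  // command's handler, write its result back, and stop here.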
  if (Object.keys(VALID_COMMANDS).includes(updatedMessage)) {
    const data = await VALID_COMMANDS[updatedMessage](
      workspace,
      message,
      uuid,
      user,
      thread
    );
    writeResponseChunk(response, data);
    return;
  }

  // If this is an agent-enabled chat we will exit this flow early.
  const isAgentChat = await grepAgents({
    uuid,
    response,
    message,
    user,
    workspace,
    thread,
  });
  if (isAgentChat) return;
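
  // Resolve the workspace's configured LLM provider/model and the vector
  // database client used for retrieval.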
  const LLMConnector = getLLMProvider({
    provider: workspace?.chatProvider,
    model: workspace?.chatModel,
  });
  const VectorDb = getVectorDbClass();

  const messageLimit = workspace?.openAiHistory || 20;
  const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
  const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);

  // User is trying to query-mode chat a workspace that has no data in it - so
  // we should exit early as no information can be found under these conditions.
  if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse,
      sources: [],
      attachments,
      close: true,
      error: null,
    });
    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
        attachments,
      },
      threadId: thread?.id || null,
      include: false,
      user,
    });
    return;
  }

  // If we are here we know that we are in a workspace that is:
  // 1. Chatting in "chat" mode and may or may _not_ have embeddings
  // 2. Chatting in "query" mode and has at least 1 embedding
  let completeText;
  let metrics = {};
  let contextTexts = [];
  let sources = [];
  let pinnedDocIdentifiers = [];
  const { rawHistory, chatHistory } = await recentChatHistory({
    user,
    workspace,
    thread,
    messageLimit,
  });

  // Look for pinned documents to see if the user decided to use this feature. We still run a vector
  // search, since pinning is a supplemental tool and should be used with caution - it can easily blow
  // up the context window. We limit the appended pinned context to roughly 80% of the window's overall
  // size: anything beyond that would undergo prompt compression anyway to fit, which defeats the point
  // of pinning. Pinning is best suited for high-context models.
  await new DocumentManager({
    workspace,
    maxTokens: LLMConnector.promptWindowLimit(),
  })
    .pinnedDocs()
    .then((pinnedDocs) => {
      pinnedDocs.forEach((doc) => {
        const { pageContent, ...metadata } = doc;
        pinnedDocIdentifiers.push(sourceIdentifier(doc));
        contextTexts.push(doc.pageContent);
        sources.push({
          text:
            pageContent.slice(0, 1_000) +
            "...continued on in source document...",
          ...metadata,
        });
      });
    });
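
  // Run a similarity search over the workspace's vector namespace, passing the pinned
  // document identifiers as filters so pinned content is not pulled in a second time.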
  const vectorSearchResults =
    embeddingsCount !== 0
      ? await VectorDb.performSimilaritySearch({
          namespace: workspace.slug,
          input: message,
          LLMConnector,
          similarityThreshold: workspace?.similarityThreshold,
          topN: workspace?.topN,
          filterIdentifiers: pinnedDocIdentifiers,
          rerank: workspace?.vectorSearchMode === "rerank",
        })
      : {
          contextTexts: [],
          sources: [],
          message: null,
        };

  // Abort if the similarity search was run and returned an error message.
  if (!!vectorSearchResults.message) {
    writeResponseChunk(response, {
      id: uuid,
      type: "abort",
      textResponse: null,
      sources: [],
      close: true,
      error: vectorSearchResults.message,
    });
    return;
  }
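
  // Backfill the source window from recent chat history when the current search
  // returns fewer than topN documents, still excluding pinned documents.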
  const { fillSourceWindow } = require("../helpers/chat");
  const filledSources = fillSourceWindow({
    nDocs: workspace?.topN || 4,
    searchResults: vectorSearchResults.sources,
    history: rawHistory,
    filterIdentifiers: pinnedDocIdentifiers,
  });

  // Why does contextTexts get all the info, but sources only get the current search?
  // This lets the LLM "comprehend" a contextual response without populating the Citations under a
  // response with documents the user "thinks" are irrelevant - a side effect of how we backfill
  // context to keep the LLM's answers more correct. If a past citation was used to answer the
  // question, it is already visible in the history, so it does not appear to the user that a new
  // response used information that is irrelevant to the current prompt.
  // TLDR; reduces GitHub issues for "LLM citing document that has no answer in it" while keeping
  // answers highly accurate.
  contextTexts = [...contextTexts, ...filledSources.contextTexts];
  sources = [...sources, ...vectorSearchResults.sources];

  // If in query mode and no context chunks were found from search, backfill, or pins,
  // exit early so the LLM cannot hallucinate a response or answer from general knowledge.
  if (chatMode === "query" && contextTexts.length === 0) {
    const textResponse =
      workspace?.queryRefusalResponse ??
      "There is no relevant information in this workspace to answer your query.";
    writeResponseChunk(response, {
      id: uuid,
      type: "textResponse",
      textResponse,
      sources: [],
      close: true,
      error: null,
    });
    await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: textResponse,
        sources: [],
        type: chatMode,
        attachments,
      },
      threadId: thread?.id || null,
      include: false,
      user,
    });
    return;
  }

  // Compress & assemble the messages so the prompt fits within the token limit with room for
  // the response, and build the system message from the workspace settings and chat history.
  const messages = await LLMConnector.compressMessages(
    {
      systemPrompt: chatPrompt(workspace),
      userPrompt: updatedMessage,
      contextTexts,
      chatHistory,
      attachments,
    },
    rawHistory
  );

  // If streaming is not explicitly enabled for the connector, we wait for the full
  // response and send it back as a single chunk.
  if (LLMConnector.streamingEnabled() !== true) {
    console.log(
      `\x1b[31m[STREAMING DISABLED]\x1b[0m Streaming is not available for ${LLMConnector.constructor.name}. Will use regular chat method.`
    );
    const { textResponse, metrics: performanceMetrics } =
      await LLMConnector.getChatCompletion(messages, {
        temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
      });
    completeText = textResponse;
    metrics = performanceMetrics;
    writeResponseChunk(response, {
      uuid,
      sources,
      type: "textResponseChunk",
      textResponse: completeText,
      close: true,
      error: false,
      metrics,
    });
  } else {
    const stream = await LLMConnector.streamGetChatCompletion(messages, {
      temperature: workspace?.openAiTemp ?? LLMConnector.defaultTemp,
    });
    completeText = await LLMConnector.handleStream(response, stream, {
      uuid,
      sources,
    });
    metrics = stream.metrics;
  }
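
  // If we got a non-empty response, persist the exchange as a workspace chat and
  // finalize the stream with the saved chat id and metrics.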
  if (completeText?.length > 0) {
    const { chat } = await WorkspaceChats.new({
      workspaceId: workspace.id,
      prompt: message,
      response: {
        text: completeText,
        sources,
        type: chatMode,
        attachments,
        metrics,
      },
      threadId: thread?.id || null,
      user,
    });

    writeResponseChunk(response, {
      uuid,
      type: "finalizeResponseStream",
      close: true,
      error: false,
      chatId: chat.id,
      metrics,
    });
    return;
  }
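
  // No text was produced (e.g. the stream was aborted), so finalize the stream
  // without saving a chat record.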
  writeResponseChunk(response, {
    uuid,
    type: "finalizeResponseStream",
    close: true,
    error: false,
    metrics,
  });
  return;
}

module.exports = {
  VALID_CHAT_MODE,
  streamChatWithWorkspace,
};