const { sourceIdentifier } = require("../../chats");
const { safeJsonParse } = require("../../http");
const { TokenManager } = require("../tiktoken");
const { convertToPromptHistory } = require("./responses");
/*
 What is the message Array compressor?
 TLDR: So anyway, I started blasting (your prompts & stuff)

 messageArrayCompressor arose out of a need for users to be able to insert unlimited-token prompts
 while also maintaining coherent history, system instructions, and context, if applicable.

 We took an opinionated approach that, after much back-testing, we found retains a highly coherent answer
 under most conditions a user would encounter while using this specific system. While other systems may
 use a more advanced model for compressing message history or simplify text through a recursive approach - ours is much simpler.

 We "cannonball" the input.
 Cannonball (verb): To ensure a prompt fits through a model window, we blast a hole in the center of any inputs blocking our path to doing so.
 This starts by dissecting the input into tokens and deleting from the middle out, bi-directionally, until the prompt window is satisfied.

 You may think: "Doesn't this result in massive data loss?" - yes & no.
 Under the use cases we expect the tool to be used for - mostly chatting with documents - we are able to use this approach with minimal blowback
 on the quality of responses.

 We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than just OpenAI models, this needs to
 be generic, and reliance on a "better summary" model is simply not a luxury we can afford. The added latency overhead during prompting is also unacceptable.

 In general (see the worked example below this comment):
 system: at best 15% of token capacity
 history: at best 15% of token capacity
 prompt: at best 70% of token capacity.

 We handle overflows by taking an aggressive path for two main cases.

 1. Very large user prompt
  - Likely uninterested in context, history, or even the system prompt. This is a "standalone" prompt that hijacks the whole thread.
  - We run this prompt on its own, since a prompt that is over 70% of the context window is certainly standalone.

 2. Context window is exceeded in regular use.
  - We do not touch the prompt since it is very likely to be <70% of the window.
  - We check that the system prompt is not outrageous - if it is, we cannonball it and keep the context if present.
  - We check a sliding window of history, only allowing up to 15% of the history to pass through if it fits, with a
    preference for recent history if we can cannonball to fit it; otherwise it is omitted.

 We end up with a rather large prompt that fits through a given window with a lot of room for the response in most use cases.
 We also take the approach that history is the least important and most flexible of the items in this array of responses.

 There is a supplemental version of this function that also returns a formatted string for models like Claude-2.
*/
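// A rough sketch of the budget split above (the window size is hypothetical, not tied to any provider):
// for a 4,096-token window the percentages work out to approximately
//   system  limit: 4096 * 0.15 ≈ 614 tokens
//   history limit: 4096 * 0.15 ≈ 614 tokens
//   user    limit: 4096 * 0.70 ≈ 2867 tokens
// with anything unused, plus the 600-token buffer reserved below, available for the model's reply.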
async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
  // Assume the response will be at least 600 tokens. If the total prompt + reply is over, we need to proactively
  // run the compressor to ensure the prompt has enough space to reply.
  // Realistically - most users will not be impacted by this.
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);

  // If no work needs to be done, just pass through.
  if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
    return messages;

  const system = messages.shift();
  const user = messages.pop();
  const userPromptSize = tokenManager.countFromString(user.content);

  // The user prompt is the main focus here - we prioritize it and allow
  // it to hijack the entire conversation thread. We are going to
  // cannonball the prompt through to ensure the reply has at least 20% of
  // the token supply to reply with.
  if (userPromptSize > llm.limits.user) {
    return [
      {
        role: "user",
        content: cannonball({
          input: user.content,
          targetTokenSize: llm.promptWindowLimit() * 0.8,
          tiktokenInstance: tokenManager,
        }),
      },
    ];
  }
  const compressedSystem = new Promise(async (resolve) => {
    const count = tokenManager.countFromString(system.content);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }

    // Split the context from the system prompt - cannonball it since it's over the window.
    // We assume the context + user prompt is enough tokens to fit.
    const [prompt, context = ""] = system.content.split("Context:");
    let compressedPrompt;
    let compressedContext;

    // If the user's contribution to the system prompt is more than
    // 25% of the system limit, we will cannonball it - this favors the context
    // over the instruction from the user.
    if (tokenManager.countFromString(prompt) >= llm.limits.system * 0.25) {
      compressedPrompt = cannonball({
        input: prompt,
        targetTokenSize: llm.limits.system * 0.25,
        tiktokenInstance: tokenManager,
      });
    } else {
      compressedPrompt = prompt;
    }

    if (tokenManager.countFromString(context) >= llm.limits.system * 0.75) {
      compressedContext = cannonball({
        input: context,
        targetTokenSize: llm.limits.system * 0.75,
        tiktokenInstance: tokenManager,
      });
    } else {
      compressedContext = context;
    }

    system.content = `${compressedPrompt}${
      compressedContext ? `\nContext: ${compressedContext}` : ""
    }`;
    resolve(system);
  });
  // The prompt is allowed to take up to 70% of the window - we know it's under
  // if we are here, so pass it through.
  const compressedPrompt = new Promise(async (resolve) => resolve(user));

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    var historyTokenCount = 0;

    for (const [i, history] of rawHistory.reverse().entries()) {
      const [user, assistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(user.content),
        tokenManager.countFromString(assistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If during the loop the token cost of adding this history
      // item is small, we can add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(user, assistant);
        historyTokenCount += total;
        continue;
      }

      // If we reach here, the overhead of adding this history item will
      // be too much of the limit. So now we are prioritizing
      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
      // trying to make history work.
      if (i > 2) break;

      // We are over the limit and we are within the 3 most recent chats,
      // so now we cannonball them to make them fit into the window.
      // max size = llm.limits.history; each component of the message can at most
      // be 50% of the history. We cannonball whichever is the problem.
      // The math isn't perfect for tokens, so we have to add a fudge factor for safety.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        user.content = cannonball({
          input: user.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }
      if (assistantTokens > maxTargetSize) {
        assistant.content = cannonball({
          input: assistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([user, assistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(user, assistant);
      historyTokenCount += newTotal;
    }

    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);
  return [cSystem, ...cHistory, cPrompt];
}
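// Hypothetical usage (caller names are illustrative, not from this module): an LLM provider wrapper
// would pass the ordered message array - system first, user last - plus the raw DB history, e.g.
//   const compressedMessages = await messageArrayCompressor(
//     llmProvider,
//     [systemMessage, ...formattedHistory, userMessage],
//     rawHistory
//   );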
// Implementation of messageArrayCompressor, but for string-only completion models
async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);
  const initialPrompt = llm.constructPrompt(promptArgs);
  if (
    tokenManager.statsFrom(initialPrompt) + tokenBuffer <
    llm.promptWindowLimit()
  )
    return initialPrompt;

  const system = promptArgs.systemPrompt;
  const user = promptArgs.userPrompt;
  const userPromptSize = tokenManager.countFromString(user);

  // The user prompt is the main focus here - we prioritize it and allow
  // it to hijack the entire conversation thread. We are going to
  // cannonball the prompt through to ensure the reply has at least 20% of
  // the token supply to reply with.
  if (userPromptSize > llm.limits.user) {
    return llm.constructPrompt({
      userPrompt: cannonball({
        input: user,
        targetTokenSize: llm.promptWindowLimit() * 0.8,
        tiktokenInstance: tokenManager,
      }),
    });
  }

  const compressedSystem = new Promise(async (resolve) => {
    const count = tokenManager.countFromString(system);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }
    resolve(
      cannonball({
        input: system,
        targetTokenSize: llm.limits.system,
        tiktokenInstance: tokenManager,
      })
    );
  });
  // The prompt is allowed to take up to 70% of the window - we know it's under
  // if we are here, so pass it through.
  const compressedPrompt = new Promise(async (resolve) => resolve(user));

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    var historyTokenCount = 0;

    for (const [i, history] of rawHistory.reverse().entries()) {
      const [user, assistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(user.content),
        tokenManager.countFromString(assistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If during the loop the token cost of adding this history
      // item is small, we can add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(user, assistant);
        historyTokenCount += total;
        continue;
      }

      // If we reach here, the overhead of adding this history item will
      // be too much of the limit. So now we are prioritizing
      // the most recent 3 message pairs - if we are already past those, exit the loop and stop
      // trying to make history work.
      if (i > 2) break;

      // We are over the limit and we are within the 3 most recent chats,
      // so now we cannonball them to make them fit into the window.
      // max size = llm.limits.history; each component of the message can at most
      // be 50% of the history. We cannonball whichever is the problem.
      // The math isn't perfect for tokens, so we have to add a fudge factor for safety.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        user.content = cannonball({
          input: user.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }
      if (assistantTokens > maxTargetSize) {
        assistant.content = cannonball({
          input: assistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([user, assistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(user, assistant);
      historyTokenCount += newTotal;
    }

    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);

  return llm.constructPrompt({
    systemPrompt: cSystem,
    contextTexts: promptArgs?.contextTexts || [],
    chatHistory: cHistory,
    userPrompt: cPrompt,
  });
}
// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportionally large prompt.
// Nobody should be sending prompts this big, but there is no reason we shouldn't allow it if the results remain good even when we do it.
function cannonball({
  input = "",
  targetTokenSize = 0,
  tiktokenInstance = null,
  ellipsesStr = null,
}) {
  if (!input || !targetTokenSize) return input;
  const tokenManager = tiktokenInstance || new TokenManager();
  const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
  const initialInputSize = tokenManager.countFromString(input);
  if (initialInputSize < targetTokenSize) return input;

  // The delta is the token difference between where our prompt currently is in size
  // and where we ideally need to land.
  const delta = initialInputSize - targetTokenSize;
  const tokenChunks = tokenManager.tokensFromString(input);
  const middleIdx = Math.floor(tokenChunks.length / 2);

  // Middle-truncate the text, going left and right of the midpoint.
  const leftChunks = tokenChunks.slice(0, middleIdx - Math.round(delta / 2));
  const rightChunks = tokenChunks.slice(middleIdx + Math.round(delta / 2));
  const truncatedText =
    tokenManager.bytesFromTokens(leftChunks) +
    truncText +
    tokenManager.bytesFromTokens(rightChunks);

  console.log(
    `Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
      truncatedText
    )} tokens.`
  );
  return truncatedText;
}
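// Worked example (hypothetical sizes): for an input of ~10,000 tokens and a target of 2,000,
// delta is 8,000, so roughly the first 1,000 and last 1,000 tokens survive and the middle is
// replaced by the truncation marker:
//   cannonball({ input: longText, targetTokenSize: 2000, tiktokenInstance: tokenManager });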
/**
 * Fill the sources window with the priority of
 * 1. Pinned documents (handled prior to this function)
 * 2. VectorSearch results
 * 3. prevSources in chat history - starting from the most recent.
 *
 * This ensures the window always has the desired number of sources so that follow-up questions
 * in any chat mode have relevant sources, but not infinite sources. This function is used during chatting
 * and allows follow-up questions within a query chat that otherwise would have zero sources and would fail.
 * The added benefit is that during regular RAG chat we have better coherence of citations that would otherwise
 * yield no results, with no need for a ReRanker to run and take much longer to return a response.
 *
 * The side effect is that unrelated follow-up questions now have citations that may look totally irrelevant; however,
 * we would rather optimize for the correctness of a response than avoid showing extraneous sources alongside it. Since search
 * results always take priority, an unrelated question that produces good RAG results will still function as desired, and because of
 * previous-history backfill, sources "changing context" mid-chat are handled appropriately.
 * Example:
 * ---previous implementation---
 * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
 * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> bad response due to bad context mgmt
 * ---next implementation---
 * prompt 1: "What is anythingllm?" -> possibly get 4 good sources
 * prompt 2: "Tell me some features" -> possibly get 0-1 maybe-relevant sources + previous answer response -> backfill with 3 good sources from previous -> much better response
 *
 * @param {Object} config - params to call
 * @param {number} config.nDocs - Fill size of the window
 * @param {object[]} config.searchResults - Vector `similarityResponse` results for .sources
 * @param {object[]} config.history - rawHistory of the chat containing sources
 * @param {string[]} config.filterIdentifiers - Pinned document identifiers to prevent duplicate context
 * @returns {{
 *  contextTexts: string[],
 *  sources: object[],
 * }} - The sources (and their context texts) that should be added to the window
 */
function fillSourceWindow({
  nDocs = 4, // Number of documents
  searchResults = [], // Sources from similarity search
  history = [], // Raw history
  filterIdentifiers = [], // Pinned document sources
} = {}) {
  const sources = [...searchResults];
  if (sources.length >= nDocs || history.length === 0) {
    return {
      sources,
      contextTexts: sources.map((src) => src.text),
    };
  }

  const log = (text, ...args) => {
    console.log(`\x1b[36m[fillSourceWindow]\x1b[0m ${text}`, ...args);
  };

  log(
    `Need to backfill ${nDocs - searchResults.length} chunks to fill in the source window for RAG!`
  );
  const seenChunks = new Set(searchResults.map((source) => source.id));

  // We need to reverse again because we need to iterate from the bottom of the array (most recent chats).
  // Looking at this function by itself you may think this loop could be extreme for chats with long history,
  // but that was already handled where `history` was derived. It comes from `recentChatHistory`, which
  // applies a limit on history (default: 20), so this loop is not as extreme as it looks at first glance.
  for (const chat of history.reverse()) {
    if (sources.length >= nDocs) {
      log(
        `Citations backfilled to ${nDocs} references from ${searchResults.length} original citations.`
      );
      break;
    }

    const chatSources =
      safeJsonParse(chat.response, { sources: [] })?.sources || [];
    if (!chatSources?.length || !Array.isArray(chatSources)) continue;

    const validSources = chatSources.filter((source) => {
      return (
        filterIdentifiers.includes(sourceIdentifier(source)) == false && // source cannot be in the current pins
        source.hasOwnProperty("score") && // source must have a score, so it cannot have come from a previously pinned document
        source.hasOwnProperty("text") && // source has a valid text property we can use
        seenChunks.has(source.id) == false // source is unique
      );
    });

    for (const validSource of validSources) {
      if (sources.length >= nDocs) break;
      sources.push(validSource);
      seenChunks.add(validSource.id);
    }
  }

  return {
    sources,
    contextTexts: sources.map((src) => src.text),
  };
}
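// Hypothetical usage during a chat turn (variable names are illustrative, not from this module):
//   const { contextTexts, sources } = fillSourceWindow({
//     nDocs: 4,
//     searchResults: vectorSearchResponse.sources,
//     history: rawHistory,
//     filterIdentifiers: pinnedDocIdentifiers,
//   });
// contextTexts would then be injected into the prompt's context and sources returned as citations.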
module.exports = {
  messageArrayCompressor,
  messageStringCompressor,
  fillSourceWindow,
};