const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const { v4: uuidv4 } = require("uuid");
const {
  writeResponseChunk,
  clientAbortedHandler,
  formatChatHistory,
} = require("../../helpers/chat/responses");
const fs = require("fs");
const path = require("path");
const { safeJsonParse } = require("../../http");
const {
  LLMPerformanceMonitor,
} = require("../../helpers/chat/LLMPerformanceMonitor");

const cacheFolder = path.resolve(
  process.env.STORAGE_DIR
    ? path.resolve(process.env.STORAGE_DIR, "models", "openrouter")
    : path.resolve(__dirname, `../../../storage/models/openrouter`)
);

class OpenRouterLLM {
  constructor(embedder = null, modelPreference = null) {
    if (!process.env.OPENROUTER_API_KEY)
      throw new Error("No OpenRouter API key was set.");

    const { OpenAI: OpenAIApi } = require("openai");
    this.basePath = "https://openrouter.ai/api/v1";
    this.openai = new OpenAIApi({
      baseURL: this.basePath,
      apiKey: process.env.OPENROUTER_API_KEY ?? null,
      defaultHeaders: {
        "HTTP-Referer": "https://anythingllm.com",
        "X-Title": "AnythingLLM",
      },
    });
    this.model =
      modelPreference || process.env.OPENROUTER_MODEL_PREF || "openrouter/auto";
    this.limits = {
      history: this.promptWindowLimit() * 0.15,
      system: this.promptWindowLimit() * 0.15,
      user: this.promptWindowLimit() * 0.7,
    };

    this.embedder = embedder ?? new NativeEmbedder();
    this.defaultTemp = 0.7;
    this.timeout = this.#parseTimeout();

    if (!fs.existsSync(cacheFolder))
      fs.mkdirSync(cacheFolder, { recursive: true });
    this.cacheModelPath = path.resolve(cacheFolder, "models.json");
    this.cacheAtPath = path.resolve(cacheFolder, ".cached_at");
    this.log("Initialized with model:", this.model);
  }

  log(text, ...args) {
    console.log(`\x1b[36m[${this.constructor.name}]\x1b[0m ${text}`, ...args);
  }

  /**
   * OpenRouter has various models that never return a `finish_reason` and thus leave the stream open,
   * which causes issues in subsequent messages. This timeout forces us to close the stream after
   * x milliseconds with no new chunks. It is configurable via the OPENROUTER_TIMEOUT_MS
   * environment variable.
   * @returns {number} The timeout value in milliseconds (default and minimum: 500)
   */
  #parseTimeout() {
    this.log(
      `OpenRouter timeout is set to ${process.env.OPENROUTER_TIMEOUT_MS ?? 500}ms`
    );
    if (isNaN(Number(process.env.OPENROUTER_TIMEOUT_MS))) return 500;
    const setValue = Number(process.env.OPENROUTER_TIMEOUT_MS);
    if (setValue < 500) return 500;
    return setValue;
  }

  // This checks if the .cached_at file has a timestamp that is more than one week (in ms)
  // older than the current date. If it is, we refetch from the API so that all the models
  // are up to date.
  #cacheIsStale() {
    const MAX_STALE = 6.048e8; // 1 week in ms
    if (!fs.existsSync(this.cacheAtPath)) return true;
    const now = Number(new Date());
    const timestampMs = Number(fs.readFileSync(this.cacheAtPath));
    return now - timestampMs > MAX_STALE;
  }

  // The OpenRouter model API exposes a lot of models, so we cache the list locally in this directory.
  // If the cached JSON file is stale or does not exist, we fetch from the API and store it.
  // This might slow down the first request, but we need the proper token context window
  // for each model, and since that is a constructor property we can only get it if this cache exists.
  // We used to have this as a chore, but given there is an API to get the info, that makes little sense.
  async #syncModels() {
    if (fs.existsSync(this.cacheModelPath) && !this.#cacheIsStale())
      return false;
    this.log(
      "Model cache is not present or stale. Fetching from OpenRouter API."
    );
    await fetchOpenRouterModels();
    return;
  }
  #appendContext(contextTexts = []) {
    if (!contextTexts || !contextTexts.length) return "";
    return (
      "\nContext:\n" +
      contextTexts
        .map((text, i) => {
          return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
        })
        .join("")
    );
  }
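
  /**
   * Returns the locally cached OpenRouter model map, keyed by model id.
   * Returns an empty object if the cache file does not exist yet.
   * @returns {Object}
   */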
  models() {
    if (!fs.existsSync(this.cacheModelPath)) return {};
    return safeJsonParse(
      fs.readFileSync(this.cacheModelPath, { encoding: "utf-8" }),
      {}
    );
  }

  streamingEnabled() {
    return "streamGetChatCompletion" in this;
  }
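
  /**
   * Gets the context window (in tokens) for a given model from the local cache.
   * Falls back to 4096 when the model is not found in the cache.
   * @param {string} modelName
   * @returns {number}
   */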
  static promptWindowLimit(modelName) {
    const cacheModelPath = path.resolve(cacheFolder, "models.json");
    const availableModels = fs.existsSync(cacheModelPath)
      ? safeJsonParse(
          fs.readFileSync(cacheModelPath, { encoding: "utf-8" }),
          {}
        )
      : {};
    return availableModels[modelName]?.maxLength || 4096;
  }
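
  /**
   * Gets the context window (in tokens) for the currently selected model.
   * Falls back to 4096 when the model is not found in the cache.
   * @returns {number}
   */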
  promptWindowLimit() {
    const availableModels = this.models();
    return availableModels[this.model]?.maxLength || 4096;
  }
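
  /**
   * Checks if a model id exists in the (freshly synced) OpenRouter model cache.
   * @param {string} model
   * @returns {Promise<boolean>}
   */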
  async isValidChatCompletionModel(model = "") {
    await this.#syncModels();
    const availableModels = this.models();
    return availableModels.hasOwnProperty(model);
  }

  /**
   * Generates appropriate content array for a message + attachments.
   * @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
   * @returns {string|object[]}
   */
  #generateContent({ userPrompt, attachments = [] }) {
    if (!attachments.length) {
      return userPrompt;
    }

    const content = [{ type: "text", text: userPrompt }];
    for (let attachment of attachments) {
      content.push({
        type: "image_url",
        image_url: {
          url: attachment.contentString,
          detail: "auto",
        },
      });
    }
    return content.flat();
  }

  /**
   * Parses reasoning from the response (if any), prepends it wrapped in <think> tags,
   * and returns the full text response.
   * @param {{message: Object}} choice - The first choice from the completion response
   * @returns {string}
   */
  #parseReasoningFromResponse({ message }) {
    let textResponse = message?.content;
    if (!!message?.reasoning && message.reasoning.trim().length > 0)
      textResponse = `<think>${message.reasoning}</think>${textResponse}`;
    return textResponse;
  }
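
  /**
   * Constructs the OpenAI-style message array: system prompt (with appended context),
   * formatted chat history, and the final user message with any attachments.
   * @param {Object} promptArgs
   * @returns {Object[]}
   */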
  constructPrompt({
    systemPrompt = "",
    contextTexts = [],
    chatHistory = [],
    userPrompt = "",
    attachments = [],
  }) {
    const prompt = {
      role: "system",
      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
    };
    return [
      prompt,
      ...formatChatHistory(chatHistory, this.#generateContent),
      {
        role: "user",
        content: this.#generateContent({ userPrompt, attachments }),
      },
    ];
  }
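
  /**
   * Sends a non-streaming chat completion request and returns the text response with metrics.
   * Illustrative usage (assuming a constructed provider instance and message array):
   *   const { textResponse, metrics } = await provider.getChatCompletion(messages, { temperature: 0.7 });
   * @param {Object[]} messages - Message array from constructPrompt()
   * @param {{temperature: number}} options
   * @returns {Promise<{textResponse: string, metrics: Object}>}
   */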
  async getChatCompletion(messages = null, { temperature = 0.7 }) {
    if (!(await this.isValidChatCompletionModel(this.model)))
      throw new Error(
        `OpenRouter chat: ${this.model} is not valid for chat completion!`
      );

    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.openai.chat.completions
        .create({
          model: this.model,
          messages,
          temperature,
          // This is an OpenRouter-specific option that allows us to get the reasoning text
          // before the token text.
          include_reasoning: true,
        })
        .catch((e) => {
          throw new Error(e.message);
        })
    );

    if (
      !result?.output?.hasOwnProperty("choices") ||
      result?.output?.choices?.length === 0
    )
      throw new Error(
        `Invalid response body returned from OpenRouter: ${result.output?.error?.message || "Unknown error"} ${result.output?.error?.code || "Unknown code"}`
      );

    return {
      textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
      metrics: {
        prompt_tokens: result.output.usage.prompt_tokens || 0,
        completion_tokens: result.output.usage.completion_tokens || 0,
        total_tokens: result.output.usage.total_tokens || 0,
        outputTps: result.output.usage.completion_tokens / result.duration,
        duration: result.duration,
      },
    };
  }
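
  /**
   * Opens a streaming chat completion request and returns the monitored stream
   * to be consumed by handleStream().
   * @param {Object[]} messages - Message array from constructPrompt()
   * @param {{temperature: number}} options
   * @returns {Promise<import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream>}
   */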
  async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
    if (!(await this.isValidChatCompletionModel(this.model)))
      throw new Error(
        `OpenRouter chat: ${this.model} is not valid for chat completion!`
      );

    const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
      this.openai.chat.completions.create({
        model: this.model,
        stream: true,
        messages,
        temperature,
        // This is an OpenRouter-specific option that allows us to get the reasoning text
        // before the token text.
        include_reasoning: true,
      }),
      messages
      // We have to manually count the tokens.
      // OpenRouter has a ton of providers and they can all return slightly differently:
      // some return chunk.usage on STOP, some do it after stop - it's inconsistent.
      // So it is possible reported metrics are inaccurate since we cannot reliably
      // catch the metrics before resolving the stream - so we just pretend this functionality
      // is not available.
    );
    return measuredStreamRequest;
  }

  /**
   * Handles the default stream response for a chat.
   * @param {import("express").Response} response
   * @param {import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream} stream
   * @param {Object} responseProps
   * @returns {Promise<string>}
   */
  handleStream(response, stream, responseProps) {
    const timeoutThresholdMs = this.timeout;
    const { uuid = uuidv4(), sources = [] } = responseProps;

    return new Promise(async (resolve) => {
      let fullText = "";
      let reasoningText = "";
      let lastChunkTime = null; // null when first token is still not received.

      // Establish listener to early-abort a streaming response
      // in case things go sideways or the user does not like the response.
      // We preserve the generated text but continue as if chat was completed
      // to preserve previously generated content.
      const handleAbort = () => {
        stream?.endMeasurement({
          completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
        });
        clientAbortedHandler(resolve, fullText);
      };
      response.on("close", handleAbort);

      // NOTICE: Not all OpenRouter models will return a stop reason,
      // which keeps the connection open and so the model never finalizes the stream
      // like the traditional OpenAI response schema does. So in the case the response stream
      // never reaches a formal close state we maintain an interval timer: if we go >=timeoutThresholdMs with
      // no new chunks, we kill the stream and assume it to be complete. OpenRouter is quite fast,
      // so this threshold should permit most responses, but we can adjust `timeoutThresholdMs` if
      // we find it is too aggressive.
      const timeoutCheck = setInterval(() => {
        if (lastChunkTime === null) return;

        const now = Number(new Date());
        const diffMs = now - lastChunkTime;
        if (diffMs >= timeoutThresholdMs) {
          console.log(
            `OpenRouter stream did not self-close and has been stale for >${timeoutThresholdMs}ms. Closing response stream.`
          );
          writeResponseChunk(response, {
            uuid,
            sources,
            type: "textResponseChunk",
            textResponse: "",
            close: true,
            error: false,
          });
          clearInterval(timeoutCheck);
          response.removeListener("close", handleAbort);
          stream?.endMeasurement({
            completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
          });
          resolve(fullText);
        }
      }, 500);

      try {
        for await (const chunk of stream) {
          const message = chunk?.choices?.[0];
          const token = message?.delta?.content;
          const reasoningToken = message?.delta?.reasoning;
          lastChunkTime = Number(new Date());

          // Reasoning models will always return the reasoning text before the token text.
          // The reasoning token can be null or ''.
          if (reasoningToken) {
            // If the reasoning text is empty (''), we need to initialize it
            // and send the first chunk of reasoning text.
            if (reasoningText.length === 0) {
              writeResponseChunk(response, {
                uuid,
                sources: [],
                type: "textResponseChunk",
                textResponse: `<think>${reasoningToken}`,
                close: false,
                error: false,
              });
              reasoningText += `<think>${reasoningToken}`;
              continue;
            } else {
              // If the reasoning text is not empty, we append the new reasoning token
              // to the existing reasoning text.
              writeResponseChunk(response, {
                uuid,
                sources: [],
                type: "textResponseChunk",
                textResponse: reasoningToken,
                close: false,
                error: false,
              });
              reasoningText += reasoningToken;
            }
          }

          // If the reasoning text is not empty, but the reasoning token is empty
          // and the token text is not empty, we close the reasoning text and begin sending the token text.
          if (!!reasoningText && !reasoningToken && token) {
            writeResponseChunk(response, {
              uuid,
              sources: [],
              type: "textResponseChunk",
              textResponse: `</think>`,
              close: false,
              error: false,
            });
            fullText += `${reasoningText}</think>`;
            reasoningText = "";
          }

          if (token) {
            fullText += token;
            writeResponseChunk(response, {
              uuid,
              sources: [],
              type: "textResponseChunk",
              textResponse: token,
              close: false,
              error: false,
            });
          }

          if (message.finish_reason !== null) {
            writeResponseChunk(response, {
              uuid,
              sources,
              type: "textResponseChunk",
              textResponse: "",
              close: true,
              error: false,
            });
            response.removeListener("close", handleAbort);
            clearInterval(timeoutCheck);
            stream?.endMeasurement({
              completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
            });
            resolve(fullText);
          }
        }
      } catch (e) {
        writeResponseChunk(response, {
          uuid,
          sources,
          type: "abort",
          textResponse: null,
          close: true,
          error: e.message,
        });
        response.removeListener("close", handleAbort);
        clearInterval(timeoutCheck);
        stream?.endMeasurement({
          completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
        });
        resolve(fullText);
      }
    });
  }

  // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
  async embedTextInput(textInput) {
    return await this.embedder.embedTextInput(textInput);
  }
  async embedChunks(textChunks = []) {
    return await this.embedder.embedChunks(textChunks);
  }
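
  /**
   * Builds the prompt and compresses the message array so it fits within the
   * model's context window limits.
   * @param {Object} promptArgs
   * @param {Object[]} rawHistory
   * @returns {Promise<Object[]>}
   */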
  async compressMessages(promptArgs = {}, rawHistory = []) {
    const { messageArrayCompressor } = require("../../helpers/chat");
    const messageArray = this.constructPrompt(promptArgs);
    return await messageArrayCompressor(this, messageArray, rawHistory);
  }
}
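
/**
 * Fetches the full model list from the OpenRouter API and writes it (plus a
 * .cached_at timestamp) to the local cache folder. Returns the model map, or
 * an empty object on failure.
 * @returns {Promise<Object>}
 */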
async function fetchOpenRouterModels() {
  return await fetch(`https://openrouter.ai/api/v1/models`, {
    method: "GET",
    headers: {
      "Content-Type": "application/json",
    },
  })
    .then((res) => res.json())
    .then(({ data = [] }) => {
      const models = {};
      data.forEach((model) => {
        models[model.id] = {
          id: model.id,
          name: model.name,
          organization:
            model.id.split("/")[0].charAt(0).toUpperCase() +
            model.id.split("/")[0].slice(1),
          maxLength: model.context_length,
        };
      });

      // Cache all response information
      if (!fs.existsSync(cacheFolder))
        fs.mkdirSync(cacheFolder, { recursive: true });
      fs.writeFileSync(
        path.resolve(cacheFolder, "models.json"),
        JSON.stringify(models),
        {
          encoding: "utf-8",
        }
      );
      fs.writeFileSync(
        path.resolve(cacheFolder, ".cached_at"),
        String(Number(new Date())),
        {
          encoding: "utf-8",
        }
      );
      return models;
    })
    .catch((e) => {
      console.error(e);
      return {};
    });
}

module.exports = {
  OpenRouterLLM,
  fetchOpenRouterModels,
};