const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const {
  LLMPerformanceMonitor,
} = require("../../helpers/chat/LLMPerformanceMonitor");
const {
  formatChatHistory,
  writeResponseChunk,
  clientAbortedHandler,
} = require("../../helpers/chat/responses");
const { v4: uuidv4 } = require("uuid");
const { toValidNumber } = require("../../http");

class GenericOpenAiLLM {
  constructor(embedder = null, modelPreference = null) {
    const { OpenAI: OpenAIApi } = require("openai");
    if (!process.env.GENERIC_OPEN_AI_BASE_PATH)
      throw new Error(
        "GenericOpenAI must have a valid base path to use for the api."
      );
    this.basePath = process.env.GENERIC_OPEN_AI_BASE_PATH;
    this.openai = new OpenAIApi({
      baseURL: this.basePath,
      apiKey: process.env.GENERIC_OPEN_AI_API_KEY ?? null,
    });
    this.model =
      modelPreference ?? process.env.GENERIC_OPEN_AI_MODEL_PREF ?? null;
    this.maxTokens = process.env.GENERIC_OPEN_AI_MAX_TOKENS
      ? toValidNumber(process.env.GENERIC_OPEN_AI_MAX_TOKENS, 1024)
      : 1024;
    if (!this.model)
      throw new Error("GenericOpenAI must have a valid model set.");
    this.limits = {
      history: this.promptWindowLimit() * 0.15,
      system: this.promptWindowLimit() * 0.15,
      user: this.promptWindowLimit() * 0.7,
    };

    this.embedder = embedder ?? new NativeEmbedder();
    this.defaultTemp = 0.7;
    this.log(`Inference API: ${this.basePath} Model: ${this.model}`);
  }
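
  // Example environment configuration consumed by this provider. Values are
  // illustrative only; any OpenAI-compatible endpoint and model name can be used:
  //
  //   GENERIC_OPEN_AI_BASE_PATH="http://localhost:8000/v1"
  //   GENERIC_OPEN_AI_API_KEY="sk-..."             # optional, defaults to null
  //   GENERIC_OPEN_AI_MODEL_PREF="my-local-model"  # required if no modelPreference is passed
  //   GENERIC_OPEN_AI_MAX_TOKENS=1024              # optional, defaults to 1024
  //   GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT=4096       # optional, defaults to 4096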

  log(text, ...args) {
    console.log(`\x1b[36m[${this.constructor.name}]\x1b[0m ${text}`, ...args);
  }

  #appendContext(contextTexts = []) {
    if (!contextTexts || !contextTexts.length) return "";
    return (
      "\nContext:\n" +
      contextTexts
        .map((text, i) => {
          return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
        })
        .join("")
    );
  }
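
  // For illustration, two context snippets yield a string shaped like this
  // (snippet text invented):
  //
  //   "\nContext:\n[CONTEXT 0]:\nFirst snippet\n[END CONTEXT 0]\n\n[CONTEXT 1]:\nSecond snippet\n[END CONTEXT 1]\n\n"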

  streamingEnabled() {
    return "streamGetChatCompletion" in this;
  }

  static promptWindowLimit(_modelName) {
    const limit = process.env.GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No token context limit was set.");
    return Number(limit);
  }

  // Ensure the user set a value for the token limit
  // and if undefined - assume a 4096 token window.
  promptWindowLimit() {
    const limit = process.env.GENERIC_OPEN_AI_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No token context limit was set.");
    return Number(limit);
  }

  // Short circuit since we have no idea if the model is valid or not
  // in pre-flight for generic endpoints.
  isValidChatCompletionModel(_modelName = "") {
    return true;
  }

  /**
   * Generates the appropriate content array for a message + attachments.
   *
   * ## Developer Note
   * This function assumes the generic OpenAI provider is _actually_ OpenAI compatible.
   * For example, Ollama is "OpenAI compatible" but does not support images as a content array.
   * The contentString is also the base64 string WITH the `data:image/xxx;base64,` prefix, which may not be the case for all providers.
   * If your provider does not work exactly this way, then attachments will not function or may even break vision requests.
   * If you encounter this issue, you are welcome to open an issue asking for your specific provider to be supported.
   *
   * This function will **not** be updated for providers that **do not** support images as a content array like OpenAI does.
   * Do not open issues to update this function because your specific provider is not compatible. Open an issue to request support for your specific provider.
   * @param {Object} props
   * @param {string} props.userPrompt - the user prompt to be sent to the model
   * @param {import("../../helpers").Attachment[]} props.attachments - the array of attachments to be sent to the model
   * @returns {string|object[]}
   */
  #generateContent({ userPrompt, attachments = [] }) {
    if (!attachments.length) {
      return userPrompt;
    }

    const content = [{ type: "text", text: userPrompt }];
    for (let attachment of attachments) {
      content.push({
        type: "image_url",
        image_url: {
          url: attachment.contentString,
          detail: "high",
        },
      });
    }
    return content.flat();
  }
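
  // For illustration, a prompt with a single image attachment becomes a content
  // array shaped roughly like this (attachment values are hypothetical):
  //
  //   [
  //     { type: "text", text: "What is in this image?" },
  //     {
  //       type: "image_url",
  //       image_url: { url: "data:image/png;base64,iVBOR...", detail: "high" },
  //     },
  //   ]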

  /**
   * Construct the user prompt for this model.
   * @param {{attachments: import("../../helpers").Attachment[]}} param0
   * @returns
   */
  constructPrompt({
    systemPrompt = "",
    contextTexts = [],
    chatHistory = [],
    userPrompt = "",
    attachments = [],
  }) {
    const prompt = {
      role: "system",
      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
    };
    return [
      prompt,
      ...formatChatHistory(chatHistory, this.#generateContent),
      {
        role: "user",
        content: this.#generateContent({ userPrompt, attachments }),
      },
    ];
  }
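
  // A sketch of the message array this produces, assuming one prior exchange and
  // no attachments (all strings below are placeholder values):
  //
  //   [
  //     { role: "system", content: "You are a helpful assistant.\nContext:\n[CONTEXT 0]:\n..." },
  //     { role: "user", content: "Earlier question" },
  //     { role: "assistant", content: "Earlier answer" },
  //     { role: "user", content: "Current question" },
  //   ]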

  /**
   * Parses and prepends reasoning from the response and returns the full text response.
   * @param {Object} response
   * @returns {string}
   */
  #parseReasoningFromResponse({ message }) {
    let textResponse = message?.content;
    if (
      !!message?.reasoning_content &&
      message.reasoning_content.trim().length > 0
    )
      textResponse = `<think>${message.reasoning_content}</think>${textResponse}`;
    return textResponse;
  }
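
  // Illustrative input/output for the parser above (field values are made up):
  //
  //   { message: { content: "The answer is 4.", reasoning_content: "2 + 2 = 4" } }
  //     -> "<think>2 + 2 = 4</think>The answer is 4."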

  async getChatCompletion(messages = null, { temperature = 0.7 }) {
    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.openai.chat.completions
        .create({
          model: this.model,
          messages,
          temperature,
          max_tokens: this.maxTokens,
        })
        .catch((e) => {
          throw new Error(e.message);
        })
    );

    if (
      !result.output.hasOwnProperty("choices") ||
      result.output.choices.length === 0
    )
      return null;

    return {
      textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
      metrics: {
        prompt_tokens: result.output?.usage?.prompt_tokens || 0,
        completion_tokens: result.output?.usage?.completion_tokens || 0,
        total_tokens: result.output?.usage?.total_tokens || 0,
        outputTps:
          (result.output?.usage?.completion_tokens || 0) / result.duration,
        duration: result.duration,
      },
    };
  }

  async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
    const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
      this.openai.chat.completions.create({
        model: this.model,
        stream: true,
        messages,
        temperature,
        max_tokens: this.maxTokens,
      }),
      messages
      // runPromptTokenCalculation: true - There is no way to know if the generic
      // provider connected is returning the correct usage metrics, if any at all,
      // since any provider could be connected.
    );
    return measuredStreamRequest;
  }

  // TODO: This is a copy of the generic handleStream function in responses.js
  // to specifically handle the DeepSeek reasoning model `reasoning_content` field.
  // When or if ever possible, we should refactor this to be in the generic function.
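  //
  // For reference, a streamed chunk from a reasoning-capable backend is expected to
  // look roughly like this (shape assumed from the fields read below, values invented):
  //
  //   {
  //     choices: [
  //       {
  //         delta: { content: null, reasoning_content: "Let me think..." },
  //         finish_reason: null,
  //       },
  //     ],
  //     usage: { prompt_tokens: 12, completion_tokens: 3 },
  //   }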
  handleStream(response, stream, responseProps) {
    const { uuid = uuidv4(), sources = [] } = responseProps;
    let hasUsageMetrics = false;
    let usage = {
      completion_tokens: 0,
    };

    return new Promise(async (resolve) => {
      let fullText = "";
      let reasoningText = "";

      // Establish listener to early-abort a streaming response
      // in case things go sideways or the user does not like the response.
      // We preserve the generated text but continue as if chat was completed
      // to preserve previously generated content.
      const handleAbort = () => {
        stream?.endMeasurement(usage);
        clientAbortedHandler(resolve, fullText);
      };
      response.on("close", handleAbort);

      try {
        for await (const chunk of stream) {
          const message = chunk?.choices?.[0];
          const token = message?.delta?.content;
          const reasoningToken = message?.delta?.reasoning_content;

          if (
            chunk.hasOwnProperty("usage") && // exists
            !!chunk.usage && // is not null
            Object.values(chunk.usage).length > 0 // has values
          ) {
            if (chunk.usage.hasOwnProperty("prompt_tokens")) {
              usage.prompt_tokens = Number(chunk.usage.prompt_tokens);
            }
            if (chunk.usage.hasOwnProperty("completion_tokens")) {
              hasUsageMetrics = true; // to stop estimating counter
              usage.completion_tokens = Number(chunk.usage.completion_tokens);
            }
          }

          // Reasoning models will always return the reasoning text before the token text.
          if (reasoningToken) {
            // If the reasoning text is empty (''), we need to initialize it
            // and send the first chunk of reasoning text.
            if (reasoningText.length === 0) {
              writeResponseChunk(response, {
                uuid,
                sources: [],
                type: "textResponseChunk",
                textResponse: `<think>${reasoningToken}`,
                close: false,
                error: false,
              });
              reasoningText += `<think>${reasoningToken}`;
              continue;
            } else {
              writeResponseChunk(response, {
                uuid,
                sources: [],
                type: "textResponseChunk",
                textResponse: reasoningToken,
                close: false,
                error: false,
              });
              reasoningText += reasoningToken;
            }
          }

          // If the reasoning text is not empty, but the reasoning token is empty
          // and the token text is not empty, we need to close the reasoning text
          // and begin sending the token text.
          if (!!reasoningText && !reasoningToken && token) {
            writeResponseChunk(response, {
              uuid,
              sources: [],
              type: "textResponseChunk",
              textResponse: `</think>`,
              close: false,
              error: false,
            });
            fullText += `${reasoningText}</think>`;
            reasoningText = "";
          }

          if (token) {
            fullText += token;
            // If we never saw a usage metric, we can estimate them by number of completion chunks
            if (!hasUsageMetrics) usage.completion_tokens++;
            writeResponseChunk(response, {
              uuid,
              sources: [],
              type: "textResponseChunk",
              textResponse: token,
              close: false,
              error: false,
            });
          }

          if (
            message?.hasOwnProperty("finish_reason") && // Got valid message and it is an object with finish_reason
            message.finish_reason !== "" &&
            message.finish_reason !== null
          ) {
            writeResponseChunk(response, {
              uuid,
              sources,
              type: "textResponseChunk",
              textResponse: "",
              close: true,
              error: false,
            });
            response.removeListener("close", handleAbort);
            stream?.endMeasurement(usage);
            resolve(fullText);
            break; // Break streaming when a valid finish_reason is first encountered
          }
        }
      } catch (e) {
        console.log(`\x1b[43m\x1b[34m[STREAMING ERROR]\x1b[0m ${e.message}`);
        writeResponseChunk(response, {
          uuid,
          type: "abort",
          textResponse: null,
          sources: [],
          close: true,
          error: e.message,
        });
        stream?.endMeasurement(usage);
        resolve(fullText);
      }
    });
  }

  // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
  async embedTextInput(textInput) {
    return await this.embedder.embedTextInput(textInput);
  }

  async embedChunks(textChunks = []) {
    return await this.embedder.embedChunks(textChunks);
  }

  async compressMessages(promptArgs = {}, rawHistory = []) {
    const { messageArrayCompressor } = require("../../helpers/chat");
    const messageArray = this.constructPrompt(promptArgs);
    return await messageArrayCompressor(this, messageArray, rawHistory);
  }
}

module.exports = {
  GenericOpenAiLLM,
};
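
// Minimal usage sketch, assuming the environment variables above are set and this
// module is loaded from within the server (paths and prompt values are illustrative):
//
//   const { GenericOpenAiLLM } = require("./utils/AiProviders/genericOpenAi");
//   const llm = new GenericOpenAiLLM(null, "my-local-model");
//   const messages = llm.constructPrompt({
//     systemPrompt: "You are a helpful assistant.",
//     userPrompt: "Hello!",
//   });
//   const { textResponse, metrics } = await llm.getChatCompletion(messages, {
//     temperature: llm.defaultTemp,
//   });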