const fs = require("fs");
const path = require("path");
const { v4 } = require("uuid");

const defaultWhisper = "Xenova/whisper-small"; // Model Card: https://huggingface.co/Xenova/whisper-small
const fileSize = {
  "Xenova/whisper-small": "250mb",
  "Xenova/whisper-large": "1.56GB",
};
class LocalWhisper {
  constructor({ options }) {
    this.model = options?.WhisperModelPref ?? defaultWhisper;
    this.fileSize = fileSize[this.model];
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`)
        : path.resolve(__dirname, `../../../server/storage/models`)
    );
    this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));

    // Make directory when it does not exist in existing installations
    if (!fs.existsSync(this.cacheDir))
      fs.mkdirSync(this.cacheDir, { recursive: true });

    this.#log("Initialized.");
  }
  #log(text, ...args) {
    console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
  }
  #validateAudioFile(wavFile) {
    const sampleRate = wavFile.fmt.sampleRate;
    // data.samples holds the raw PCM bytes of the "data" chunk, so divide by
    // bytes-per-sample and channel count to get the duration in seconds.
    const bytesPerSample = wavFile.fmt.bitsPerSample / 8;
    const duration =
      wavFile.data.samples.length /
      (sampleRate * bytesPerSample * wavFile.fmt.numChannels);

    // Most speech recognition systems expect a minimum of 8kHz,
    // but we'll set it lower to be safe.
    if (sampleRate < 4000) {
      // 4kHz minimum
      throw new Error(
        "Audio file sample rate is too low for accurate transcription. Minimum required is 4kHz."
      );
    }

    // Typical audio file duration limits
    const MAX_DURATION_SECONDS = 4 * 60 * 60; // 4 hours
    if (duration > MAX_DURATION_SECONDS) {
      throw new Error("Audio file duration exceeds maximum limit of 4 hours.");
    }

    // Check final sample count after upsampling to prevent memory issues
    const targetSampleRate = 16000;
    const upsampledSamples = duration * targetSampleRate;
    const MAX_SAMPLES = 230_400_000; // ~4 hours at 16kHz
    if (upsampledSamples > MAX_SAMPLES) {
      throw new Error("Audio file exceeds maximum allowed length.");
    }

    return true;
  }
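
  // Worked example (illustrative only): a full 4-hour recording resampled to
  // 16kHz yields 4 * 60 * 60 * 16000 = 230,400,000 samples, which is exactly
  // the MAX_SAMPLES cap above; anything longer is rejected before resampling.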
  async #convertToWavAudioData(sourcePath) {
    try {
      let buffer;
      const wavefile = require("wavefile");
      const ffmpeg = require("fluent-ffmpeg");
      const outFolder = path.resolve(__dirname, `../../storage/tmp`);
      if (!fs.existsSync(outFolder))
        fs.mkdirSync(outFolder, { recursive: true });

      const fileExtension = path.extname(sourcePath).toLowerCase();
      if (fileExtension !== ".wav") {
        this.#log(
          `File conversion required! ${fileExtension} file detected - converting to .wav`
        );
        const outputFile = path.resolve(outFolder, `${v4()}.wav`);
        const convert = new Promise((resolve) => {
          ffmpeg(sourcePath)
            .toFormat("wav")
            .on("error", (error) => {
              this.#log(`Conversion Error! ${error.message}`);
              resolve(false);
            })
            .on("progress", (progress) =>
              this.#log(
                `Conversion Processing! ${progress.targetSize}KB converted`
              )
            )
            .on("end", () => {
              this.#log(`Conversion Complete! File converted to .wav!`);
              resolve(true);
            })
            .save(outputFile);
        });
        const success = await convert;

        if (!success)
          throw new Error(
            "[Conversion Failed]: Could not convert file to .wav format!"
          );

        const chunks = [];
        const stream = fs.createReadStream(outputFile);
        for await (let chunk of stream) chunks.push(chunk);
        buffer = Buffer.concat(chunks);
        fs.rmSync(outputFile);
      } else {
        const chunks = [];
        const stream = fs.createReadStream(sourcePath);
        for await (let chunk of stream) chunks.push(chunk);
        buffer = Buffer.concat(chunks);
      }

      const wavFile = new wavefile.WaveFile(buffer);
      try {
        this.#validateAudioFile(wavFile);
      } catch (error) {
        this.#log(`Audio validation failed: ${error.message}`);
        throw new Error(`Invalid audio file: ${error.message}`);
      }

      wavFile.toBitDepth("32f");
      wavFile.toSampleRate(16000);

      let audioData = wavFile.getSamples();
      if (Array.isArray(audioData)) {
        if (audioData.length > 1) {
          const SCALING_FACTOR = Math.sqrt(2);

          // Merge channels into first channel to save memory
          for (let i = 0; i < audioData[0].length; ++i) {
            audioData[0][i] =
              (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
          }
        }
        audioData = audioData[0];
      }

      return audioData;
    } catch (error) {
      console.error(`convertToWavAudioData`, error);
      return null;
    }
  }
  async client() {
    if (!fs.existsSync(this.modelPath)) {
      this.#log(
        `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~${this.fileSize})`
      );
    }

    try {
      // Convert ESM to CommonJS via import so we can load this library.
      const pipeline = (...args) =>
        import("@xenova/transformers").then(({ pipeline }) =>
          pipeline(...args)
        );
      return await pipeline("automatic-speech-recognition", this.model, {
        cache_dir: this.cacheDir,
        ...(!fs.existsSync(this.modelPath)
          ? {
              // Show download progress if we need to download any files
              progress_callback: (data) => {
                if (!data.hasOwnProperty("progress")) return;
                console.log(
                  `\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
                    data.file
                  } ${~~data?.progress}%`
                );
              },
            }
          : {}),
      });
    } catch (error) {
      this.#log("Failed to load the native whisper model:", error);
      throw error;
    }
  }
  async processFile(fullFilePath, filename) {
    try {
      // Both calls already return promises, so run them concurrently instead
      // of re-wrapping them in new Promise constructors (the wrappers also
      // swallowed any rejection from client(), bypassing this try/catch).
      const [audioData, transcriber] = await Promise.all([
        this.#convertToWavAudioData(fullFilePath),
        this.client(),
      ]);

      if (!audioData) {
        this.#log(`Failed to parse content from ${filename}.`);
        return {
          content: null,
          error: `Failed to parse content from ${filename}.`,
        };
      }

      this.#log(`Transcribing audio data to text...`);
      const { text } = await transcriber(audioData, {
        chunk_length_s: 30,
        stride_length_s: 5,
      });

      return { content: text, error: null };
    } catch (error) {
      return { content: null, error: error.message };
    }
  }
}
module.exports = {
  LocalWhisper,
};
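
// Example usage (a minimal sketch, not part of this module): the require path,
// the shape of `options`, and the input file below are assumptions made for
// illustration only.
//
// const { LocalWhisper } = require("./localWhisper"); // hypothetical path
//
// (async () => {
//   const whisper = new LocalWhisper({
//     options: { WhisperModelPref: "Xenova/whisper-small" },
//   });
//   const { content, error } = await whisper.processFile(
//     "/tmp/meeting-recording.mp3", // any audio format ffmpeg can read
//     "meeting-recording.mp3"
//   );
//   if (error) console.error("Transcription failed:", error);
//   else console.log("Transcript:", content);
// })();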