You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

117 lines
3.6 KiB

11 months ago
  1. const { parse } = require("node-html-parser");
  /**
   * Extracts the 11-character video ID (capture group 1) from a YouTube URL.
   *
   * Recognised shapes: `youtu.be/<id>`, and on `youtube.com`: `/v/<id>`, `/e/<id>`,
   * `/embed/<id>`, anything carrying a `v=<id>` query parameter, nested paths of
   * the form `/<segment>/.../<id>`, plus `/shorts/<id>` and `/live/<id>`.
   *
   * The shorts/live alternative is deliberately listed last: alternatives are
   * tried in order, so every URL the earlier alternatives already matched keeps
   * resolving to exactly the same ID.
   */
  const RE_YOUTUBE =
    /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=|(?:shorts|live)\/)|youtu\.be\/)([^"&?\/\s]{11})/i;

  /** Desktop-browser User-Agent header value sent with the watch-page request. */
  const USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
  /**
   * Error type raised by this module. Every message is prefixed with
   * "[YoutubeTranscript] " so the origin is visible in logs.
   */
  class YoutubeTranscriptError extends Error {
    /**
     * @param {*} message Description of the failure; it is string-interpolated
     * after the "[YoutubeTranscript] " prefix.
     * @param {{cause?: unknown}} [options] Optional standard Error options,
     * forwarded to the Error constructor (e.g. `{ cause }`).
     */
    constructor(message, options) {
      super(`[YoutubeTranscript] ${message}`, options);
      // Without this the instance reports itself as a plain "Error".
      this.name = "YoutubeTranscriptError";
    }
  }
  /**
   * Retrieves the caption transcript of a YouTube video, when one exists.
   */
  class YoutubeTranscript {
    /**
     * Fetch the transcript of a YouTube video as a single string.
     *
     * @param {string} videoId Video URL or 11-character video identifier.
     * @param {{lang?: string}} [config] Options. `lang` is the preferred caption
     * language code (e.g. en, es, hk, uk) and defaults to "en". When no track
     * matches it, the first available caption track is used instead.
     * @returns {Promise<string>} Caption text, chunks joined by single spaces with
     * all whitespace runs collapsed.
     * @throws {YoutubeTranscriptError} If the video ID cannot be determined, a
     * request fails, or no caption track can be located. For failures inside the
     * request/parse phase the original error is attached as `cause`.
     */
    static async fetchTranscript(videoId, config = {}) {
      const identifier = this.retrieveVideoId(videoId);
      const lang = config?.lang ?? "en";
      try {
        // encodeURIComponent is a no-op for real IDs but keeps an arbitrary
        // 11-character input from altering the request URL.
        const pageResponse = await fetch(
          `https://www.youtube.com/watch?v=${encodeURIComponent(identifier)}`,
          { headers: { "User-Agent": USER_AGENT } }
        );
        if (!pageResponse.ok) {
          throw new Error(
            `Video page request failed with HTTP status ${pageResponse.status}`
          );
        }
        const page = parse(await pageResponse.text());

        const transcriptUrl = this.#parseTranscriptEndpoint(page, lang);
        if (!transcriptUrl) {
          throw new Error("Failed to locate a transcript for this video!");
        }

        // Result is hopefully some XML. An error status would otherwise parse
        // to zero <text> nodes and silently yield an empty transcript.
        const transcriptResponse = await fetch(transcriptUrl);
        if (!transcriptResponse.ok) {
          throw new Error(
            `Transcript request failed with HTTP status ${transcriptResponse.status}`
          );
        }
        const transcriptXML = parse(await transcriptResponse.text());

        // One space between chunks, then collapse every whitespace run.
        const chunks = transcriptXML.getElementsByTagName("text");
        const transcript = Array.from(chunks, (chunk) => chunk.textContent).join(" ");
        return transcript.trim().replace(/\s+/g, " ");
      } catch (e) {
        // Message format is unchanged; the original error (and its stack) is
        // preserved on `cause` for debugging.
        const wrapped = new YoutubeTranscriptError(e);
        wrapped.cause = e;
        throw wrapped;
      }
    }

    /**
     * Locate the caption-track URL inside a parsed watch page.
     *
     * @param document Parsed watch page (must support `getElementsByTagName`).
     * @param {?string} [langCode] Preferred language code. A track matches when its
     * `languageCode` contains this value; otherwise the first track is used.
     * @returns {string|null|undefined} The chosen track's `baseUrl`, `undefined`
     * when the page lists no caption tracks, or `null` when the player data could
     * not be found or parsed (the reason is logged via `console.error`).
     */
    static #parseTranscriptEndpoint(document, langCode = null) {
      const marker = "var ytInitialPlayerResponse = ";
      try {
        // Find the script tag that assigns the player response object.
        const playerScript = document
          .getElementsByTagName("script")
          .find((script) => script.textContent.includes(`${marker}{`));
        if (!playerScript) {
          console.error(
            "YoutubeTranscript.#parseTranscriptEndpoint player response script not found"
          );
          return null;
        }

        // Everything after the assignment: the object literal followed by code.
        const data = this.#parsePlayerResponse(
          playerScript.textContent.split(marker)[1]
        );
        const availableCaptions =
          data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];

        // Prefer a track for the requested language, otherwise take the first.
        // `?.` keeps one track without a languageCode from aborting the search.
        const fallbackTrack = availableCaptions[0];
        const captionTrack = langCode
          ? availableCaptions.find((track) =>
              track.languageCode?.includes(langCode)
            ) ?? fallbackTrack
          : fallbackTrack;
        return captionTrack?.baseUrl;
      } catch (e) {
        console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
        return null;
      }
    }

    /**
     * Parse the JSON object at the start of `source`, ignoring trailing script code.
     *
     * The object is assumed to end at a "};" boundary. Because "};" may also occur
     * inside string values, each occurrence is tried in order until the prefix
     * parses; a strict prefix of a JSON object is never itself valid JSON, so the
     * first successful parse is the complete object.
     *
     * @param {string} source Script text beginning at the object's opening brace.
     * @returns {object} The parsed player response.
     * @throws {SyntaxError} If no "};" boundary yields valid JSON.
     */
    static #parsePlayerResponse(source) {
      let lastError = new SyntaxError("player response object is not terminated");
      let end = source.indexOf("};");
      while (end !== -1) {
        try {
          return JSON.parse(source.slice(0, end + 1).trim());
        } catch (e) {
          lastError = e;
          end = source.indexOf("};", end + 1);
        }
      }
      throw lastError;
    }

    /**
     * Retrieve the video ID from a URL or a bare identifier.
     *
     * @param {string} videoId Video URL or video ID. Any 11-character string is
     * returned unchanged; anything else must match the YouTube URL pattern.
     * @returns {string} The 11-character video ID.
     * @throws {YoutubeTranscriptError} If the input is not a string or no ID can
     * be extracted from it.
     */
    static retrieveVideoId(videoId) {
      // Guard first so null/undefined/numbers raise the module's own error type
      // instead of a raw TypeError.
      if (typeof videoId !== "string") {
        throw new YoutubeTranscriptError(
          "Impossible to retrieve Youtube video ID."
        );
      }
      if (videoId.length === 11) {
        return videoId;
      }
      const matchId = videoId.match(RE_YOUTUBE);
      if (matchId) {
        return matchId[1];
      }
      throw new YoutubeTranscriptError(
        "Impossible to retrieve Youtube video ID."
      );
    }
  }
  // Public CommonJS surface: the transcript fetcher and the error type it throws.
  module.exports = { YoutubeTranscript, YoutubeTranscriptError };