You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

127 lines
3.7 KiB

11 months ago
  1. const { v4 } = require("uuid");
  2. const {
  3. PuppeteerWebBaseLoader,
  4. } = require("langchain/document_loaders/web/puppeteer");
  5. const { writeToServerDocuments } = require("../../utils/files");
  6. const { tokenizeString } = require("../../utils/tokenizer");
  7. const { default: slugify } = require("slugify");
  /**
   * Scrape a generic URL and either return its content directly or wrap it in a
   * document record that is handed to `writeToServerDocuments`.
   *
   * @param {string} link - The URL to scrape
   * @param {('html' | 'text')} captureAs - The format to capture the page content as
   * @param {boolean} processAsDocument - When false, the raw content is returned
   *   as `{ success, content }` and no document is written
   * @returns {Promise<Object>} One of:
   *   - `{ success: false, reason, documents: [] }` when no content could be retrieved
   *   - `{ success: true, content }` when `processAsDocument` is false
   *   - `{ success: true, reason: null, documents: [document] }` otherwise
   */
  async function scrapeGenericUrl(
    link,
    captureAs = "text",
    processAsDocument = true
  ) {
    console.log(`-- Working URL ${link} => (${captureAs}) --`);
    const content = await getPageContent(link, captureAs);

    // getPageContent can resolve to a non-string (null) when every retrieval
    // strategy fails; check the type before touching `.length` so that case is
    // reported as a normal failure instead of throwing a TypeError.
    if (typeof content !== "string" || content.length === 0) {
      console.error(`Resulting URL content was empty at ${link}.`);
      return {
        success: false,
        reason: `No URL content found at ${link}.`,
        documents: [],
      };
    }

    if (!processAsDocument) {
      return {
        success: true,
        content,
      };
    }

    // Build a filesystem-friendly name from the host and the decoded path,
    // with path separators flattened to underscores before slugifying.
    const url = new URL(link);
    const decodedPathname = decodeURIComponent(url.pathname);
    const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
    const slug = slugify(filename);

    const data = {
      id: v4(),
      url: `file://${slug}.html`,
      title: `${slug}.html`,
      docAuthor: "no author found",
      description: "No description found.",
      docSource: "URL link uploaded by the user.",
      chunkSource: `link://${link}`,
      published: new Date().toLocaleString(),
      // Approximate count: splits on single spaces only.
      wordCount: content.split(" ").length,
      pageContent: content,
      token_count_estimate: tokenizeString(content),
    };

    const document = writeToServerDocuments(data, `url-${slug}-${data.id}`);
    console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
    return { success: true, reason: null, documents: [document] };
  }
  /**
   * Get the content of a page.
   *
   * First tries a headless browser through `PuppeteerWebBaseLoader`; if that
   * throws for any reason, falls back to a plain `fetch` GET of the link.
   *
   * @param {string} link - The URL to get the content of
   * @param {('html' | 'text')} captureAs - The format to capture the page content as.
   *   Only applies to the puppeteer path; any other value is treated as "text".
   *   The fetch fallback always returns the raw response body.
   * @returns {Promise<string>} - The content of the page, or an empty string
   *   when neither strategy could retrieve anything
   */
  async function getPageContent(link, captureAs = "text") {
    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: {
          headless: "new",
          ignoreHTTPSErrors: true,
        },
        gotoOptions: {
          waitUntil: "networkidle2",
        },
        async evaluate(page, browser) {
          try {
            return await page.evaluate((captureAs) => {
              if (captureAs === "html") return document.documentElement.innerHTML;
              return document.body.innerText;
            }, captureAs);
          } finally {
            // Close the browser even when evaluation throws so a failed scrape
            // does not leave a browser instance running.
            await browser.close();
          }
        },
      });

      const docs = await loader.load();
      return docs.map((doc) => doc.pageContent).join(" ");
    } catch (error) {
      console.error(
        "getPageContent failed to be fetched by puppeteer - falling back to fetch!",
        error
      );
    }

    try {
      const response = await fetch(link, {
        method: "GET",
        headers: {
          "Content-Type": "text/plain",
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
        },
      });
      return await response.text();
    } catch (error) {
      console.error("getPageContent failed to be fetched by any method.", error);
    }

    // Honor the documented string return type: callers test `.length`, which
    // would throw on null.
    return "";
  }
  // Public API of this module: only the scraper entry point is exported.
  module.exports = { scrapeGenericUrl };