const { CollectorApi } = require("../../../collectorApi");
const Provider = require("../providers/ai-provider");
const { summarizeContent } = require("../utils/summarize");
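
// Agent skill exposing a `web-scraping` function to the LLM: it fetches a page's
// text via the document collector and returns it directly, or summarizes it first
// when it exceeds the model's context window.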
const webScraping = {
  name: "web-scraping",
  startupConfig: {
    params: {},
  },
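  // Builds the aibitat plugin definition; setup() registers the callable
  // function, its JSON-schema parameters, and example invocations with the agent.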
  plugin: function () {
    return {
      name: this.name,
      setup(aibitat) {
        aibitat.function({
          super: aibitat,
          name: this.name,
          controller: new AbortController(),
          description:
            "Scrapes the content of a webpage or online resource from a provided URL.",
          examples: [
            {
              prompt: "What is anythingllm.com about?",
              call: JSON.stringify({ url: "https://anythingllm.com" }),
            },
            {
              prompt: "Scrape https://example.com",
              call: JSON.stringify({ url: "https://example.com" }),
            },
          ],
          parameters: {
            $schema: "http://json-schema.org/draft-07/schema#",
            type: "object",
            properties: {
              url: {
                type: "string",
                format: "uri",
                description:
                  "A complete web address URL including protocol. Assumes https if not provided.",
              },
            },
            additionalProperties: false,
          },
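          // Invoked when the model calls this function. Errors are returned as
          // plain text so the agent can relay them to the user instead of crashing.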
          handler: async function ({ url }) {
            try {
              if (url) return await this.scrape(url);
              return "There is nothing we can do. This function call returns no information.";
            } catch (error) {
              return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
            }
          },

          /**
           * Scrape a website and summarize its content when it is too large to
           * fit in the model's context window. We leverage the document
           * collector to get the raw website text quickly.
           *
           * @param {string} url - The URL of the website to scrape.
           * @returns {Promise<string>} The page content, or a summary of it when it exceeds the context limit.
           */
          scrape: async function (url) {
            this.super.introspect(
              `${this.caller}: Scraping the content of ${url}`
            );
            const { success, content } =
              await new CollectorApi().getLinkContent(url);

            if (!success) {
              this.super.introspect(
                `${this.caller}: could not scrape ${url}. I can't use this page's content.`
              );
              throw new Error(
                `URL could not be scraped and no content was found.`
              );
            }

            if (!content || content?.length === 0) {
              throw new Error("There was no content to be collected or read.");
            }

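            // If the scraped text already fits in the model's context window,
            // return it as-is; otherwise fall through to summarization below.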
            const { TokenManager } = require("../../../helpers/tiktoken");
            if (
              new TokenManager(this.super.model).countFromString(content) <
              Provider.contextLimit(this.super.provider, this.super.model)
            ) {
              return content;
            }

            this.super.introspect(
              `${this.caller}: This page's content is way too long. I will summarize it right now.`
            );
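            // Forward agent aborts to this function's AbortController so an
            // in-flight summarization can be cancelled early.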
            this.super.onAbort(() => {
              this.super.handlerProps.log(
                "Abort was triggered, exiting summarization early."
              );
              this.controller.abort();
            });

            return summarizeContent({
              provider: this.super.provider,
              model: this.super.model,
              controllerSignal: this.controller.signal,
              content,
            });
          },
        });
      },
    };
  },
};

module.exports = {
  webScraping,
};