You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

141 lines
4.1 KiB

11 months ago
  1. /*
  2. * This is a custom implementation of the Confluence langchain loader. There was an issue where
  3. * code blocks were not being extracted. This is a temporary fix until this issue is resolved.*/
  4. const { htmlToText } = require("html-to-text");
  /**
   * Loads the pages of a Confluence space through the REST content API and
   * turns each page into a `{ pageContent, metadata }` document.
   *
   * Code macros in the storage-format body are rewritten as fenced code blocks
   * before the HTML is converted to plain text, so code samples survive the
   * conversion.
   */
  class ConfluencePagesLoader {
    /**
     * @param {object} config
     * @param {string} config.baseUrl - Site root; "/wiki" is appended for API calls when `cloud` is true.
     * @param {string} config.spaceKey - Key of the space whose pages are loaded.
     * @param {string} [config.username] - Used together with `accessToken` for Basic auth.
     * @param {string} [config.accessToken] - Used together with `username` for Basic auth.
     * @param {number} [config.limit=25] - Page size requested per API call.
     * @param {string} [config.expand="body.storage,version"] - Value of the `expand` query parameter.
     * @param {string} [config.personalAccessToken] - Bearer token; takes precedence over Basic auth.
     * @param {boolean} [config.cloud=true] - Whether API calls are prefixed with "/wiki".
     */
    constructor({
      baseUrl,
      spaceKey,
      username,
      accessToken,
      limit = 25,
      expand = "body.storage,version",
      personalAccessToken,
      cloud = true,
    }) {
      this.baseUrl = baseUrl;
      this.spaceKey = spaceKey;
      this.username = username;
      this.accessToken = accessToken;
      this.limit = limit;
      this.expand = expand;
      this.personalAccessToken = personalAccessToken;
      this.cloud = cloud;
    }

    /**
     * Value for the `Authorization` header, or `undefined` when no credentials
     * were configured. A personal access token wins over username/accessToken.
     *
     * @returns {string|undefined}
     */
    get authorizationHeader() {
      if (this.personalAccessToken) {
        return `Bearer ${this.personalAccessToken}`;
      }
      if (this.username && this.accessToken) {
        const authToken = Buffer.from(
          `${this.username}:${this.accessToken}`
        ).toString("base64");
        return `Basic ${authToken}`;
      }
      return undefined;
    }

    /**
     * Fetches every page of the space and converts each to a document.
     *
     * Best-effort by design: any failure is logged with `console.error` and an
     * empty array is returned instead of throwing.
     *
     * @param {{start?: number, limit?: number}} [options]
     * @returns {Promise<Array<{pageContent: string, metadata: object}>>}
     */
    async load(options) {
      try {
        const pages = await this.fetchAllPagesInSpace(
          options?.start,
          options?.limit
        );
        return pages.map((page) => this.createDocumentFromPage(page));
      } catch (error) {
        console.error("Error:", error);
        return [];
      }
    }

    /**
     * Performs an authenticated GET against `url` and returns the parsed JSON.
     *
     * @param {string} url
     * @returns {Promise<any>}
     * @throws {Error} When the request fails, the status is not OK, or the body
     *   is not valid JSON. Transport/parse failures keep the original error in
     *   `cause`.
     */
    async fetchConfluenceData(url) {
      const headers = {
        "Content-Type": "application/json",
        Accept: "application/json",
      };
      const authHeader = this.authorizationHeader;
      if (authHeader) {
        headers.Authorization = authHeader;
      }

      let response;
      try {
        response = await fetch(url, { headers });
      } catch (error) {
        throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
          cause: error,
        });
      }

      // Thrown outside any try block so the status error is not re-wrapped
      // into a doubled "Failed to fetch ... Failed to fetch ..." message.
      if (!response.ok) {
        throw new Error(
          `Failed to fetch ${url} from Confluence: ${response.status}`
        );
      }

      try {
        return await response.json();
      } catch (error) {
        throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
          cause: error,
        });
      }
    }

    // https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
    /**
     * Collects all pages of the space by requesting successive result windows
     * until the API returns an empty one.
     *
     * @param {number} [start=0] - Offset of the first result to request.
     * @param {number} [limit=this.limit] - Page size requested per call.
     * @returns {Promise<object[]>} Raw page objects in API order.
     * @throws {Error} Propagates failures from `fetchConfluenceData`.
     */
    async fetchAllPagesInSpace(start = 0, limit = this.limit) {
      const apiRoot = `${this.baseUrl}${this.cloud ? "/wiki" : ""}/rest/api/content`;
      // Encode caller-supplied values so characters such as "&" or "#" cannot
      // break the query string.
      const spaceKey = encodeURIComponent(this.spaceKey);
      const expand = encodeURIComponent(this.expand);

      const pages = [];
      let offset = start;
      for (;;) {
        const url = `${apiRoot}?spaceKey=${spaceKey}&limit=${limit}&start=${offset}&expand=${expand}`;
        const data = await this.fetchConfluenceData(url);
        const results = Array.isArray(data?.results) ? data.results : [];
        if (results.length === 0) {
          break;
        }
        pages.push(...results);
        // Advance by what was actually received; the server may return fewer
        // items than requested.
        offset += results.length;
      }
      return pages;
    }

    /**
     * Converts one raw API page into a `{ pageContent, metadata }` document.
     *
     * @param {object} page - Page object as returned by the content API.
     * @returns {{pageContent: string, metadata: object}}
     */
    createDocumentFromPage(page) {
      // The code is later fed through an HTML-to-text conversion; escaping
      // keeps "<", ">" and "&" in code from being treated as markup/entities.
      const escapeHtml = (text) =>
        text
          .replaceAll("&", "&amp;")
          .replaceAll("<", "&lt;")
          .replaceAll(">", "&gt;");

      // Rewrites Confluence "code" macros as fenced code blocks.
      const extractCodeBlocks = (content) => {
        const codeBlockRegex =
          /<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g;
        const languageRegex =
          /<ac:parameter ac:name="language">(.*?)<\/ac:parameter>/;
        return content.replace(codeBlockRegex, (macro, code) => {
          const language = macro.match(languageRegex)?.[1] || "";
          return `\n\`\`\`${escapeHtml(language)}\n${escapeHtml(
            code.trim()
          )}\n\`\`\`\n`;
        });
      };

      // A page fetched without an expanded storage body yields empty content
      // instead of throwing (which would make load() drop every page).
      const storageHtml = page.body?.storage?.value ?? "";
      const contentWithCodeBlocks = extractCodeBlocks(storageHtml);
      const plainTextContent = htmlToText(contentWithCodeBlocks, {
        wordwrap: false,
        preserveNewlines: true,
      });
      // Collapse runs of three or more newlines into a single blank line.
      const textWithPreservedStructure = plainTextContent.replace(
        /\n{3,}/g,
        "\n\n"
      );

      // NOTE(review): unlike the API URL, this link does not add "/wiki" when
      // `cloud` is true - confirm whether that is intended before changing it.
      const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;
      return {
        pageContent: textWithPreservedStructure,
        metadata: {
          id: page.id,
          status: page.status,
          title: page.title,
          type: page.type,
          url: pageUrl,
          version: page.version?.number,
          updated_by: page.version?.by?.displayName,
          updated_at: page.version?.when,
        },
      };
    }
  }
  130. module.exports = { ConfluencePagesLoader };