From 6db5fb8cb96d46b1c8a78cea5205071210917c24 Mon Sep 17 00:00:00 2001 From: Fares <46092308+FaresKi@users.noreply.github.com> Date: Mon, 11 Nov 2024 22:42:20 +0100 Subject: [PATCH] feat(community): added code blocks in markdown into document's page content (#7178) --- .../src/document_loaders/web/confluence.ts | 39 +++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/libs/langchain-community/src/document_loaders/web/confluence.ts b/libs/langchain-community/src/document_loaders/web/confluence.ts index 5a821b9628a3..848d36b8edf6 100644 --- a/libs/langchain-community/src/document_loaders/web/confluence.ts +++ b/libs/langchain-community/src/document_loaders/web/confluence.ts @@ -218,19 +218,44 @@ export class ConfluencePagesLoader extends BaseDocumentLoader { * @returns A Document instance. */ private createDocumentFromPage(page: ConfluencePage): Document { + const htmlContent = page.body.storage.value; + + // Handle both self-closing and regular macros for attachments and view-file + const htmlWithoutOtherMacros = htmlContent.replace( + /]*(?:\/?>|>.*?<\/ac:structured-macro>)/gs, + "[ATTACHMENT]" + ); + + // Extract and preserve code blocks with unique placeholders + const codeBlocks: { language: string; code: string }[] = []; + const htmlWithPlaceholders = htmlWithoutOtherMacros.replace( + /(.*?)<\/ac:parameter>.*?<\/ac:plain-text-body><\/ac:structured-macro>/g, + (_, language, code) => { + const placeholder = `CODE_BLOCK_${codeBlocks.length}`; + codeBlocks.push({ language, code: code.trim() }); + return `\n${placeholder}\n`; + } + ); + // Convert the HTML content to plain text - const plainTextContent = htmlToText(page.body.storage.value, { + let plainTextContent = htmlToText(htmlWithPlaceholders, { wordwrap: false, - preserveNewlines: false, + preserveNewlines: true, + }); + + // Reinsert code blocks with proper markdown formatting + codeBlocks.forEach(({ language, code }, index) => { + const placeholder = `CODE_BLOCK_${index}`; + plainTextContent = plainTextContent.replace( + placeholder, + `\`\`\`${language}\n${code}\n\`\`\`` + ); }); // Remove empty lines const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, ""); - // Generate the URL - const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`; - - // Return a langchain document + // Rest of the method remains the same... return new Document({ pageContent: textWithoutEmptyLines, metadata: { @@ -238,7 +263,7 @@ export class ConfluencePagesLoader extends BaseDocumentLoader { status: page.status, title: page.title, type: page.type, - url: pageUrl, + url: `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`, version: page.version?.number, updated_by: page.version?.by?.displayName, updated_at: page.version?.when,