Fixed the error on ConfluenceLoader when content_format=VIEW and `kee…

…p_markdown_format`=True (langchain-ai#9633) - Description: When I set `content_format=ContentFormat.VIEW` and `keep_markdown_format=True` on ConfluenceLoader, it raises the following error: ``` langchain/document_loaders/confluence.py", line 459, in process_page page["body"]["storage"]["value"], heading_style="ATX" KeyError: 'storage' ``` The reason is that the content format was set to `view`, but the loader was still trying to get the content from `page["body"]["storage"]["value"]`. Also added the other content formats that are supported by the Atlassian API: https://stackoverflow.com/questions/34353955/confluence-rest-api-expanding-page-body-when-retrieving-page-by-title/34363386#34363386 - Issue: Not applicable. - Dependencies: Added the optional dependency `markdownify` for anyone who wants to extract content in Markdown format. --------- Co-authored-by: Bagatur <[email protected]>
FalkorDB · Aug 23, 2023 · b379c5f · b379c5f
1 parent e1f4f9a
commit b379c5f
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 17 deletions.
diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py
@@ -20,16 +20,14 @@
 class ContentFormat(str, Enum):
     """Enumerator of the content formats of Confluence page."""
 
+    EDITOR = "body.editor"
+    EXPORT_VIEW = "body.export_view"
+    ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view"
     STORAGE = "body.storage"
     VIEW = "body.view"
 
     def get_content(self, page: dict) -> str:
-        if self == ContentFormat.STORAGE:
-            return page["body"]["storage"]["value"]
-        elif self == ContentFormat.VIEW:
-            return page["body"]["view"]["value"]
-
-        raise ValueError("unknown content format")
+        return page["body"][self.name.lower()]["value"]
 
 
 class ConfluenceLoader(BaseLoader):
@@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader):
     raw XML representation for storage. The view format is the HTML representation for
     viewing with macros are rendered as though it is viewed by users. You can pass
     a enum `content_format` argument to `load()` to specify the content format, this is
-    set to `ContentFormat.STORAGE` by default.
+    set to `ContentFormat.STORAGE` by default, the supported values are:
+    `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
+    `ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`,
+    and `ContentFormat.VIEW`.
 
     Hint: space_key and page_id can both be found in the URL of a page in Confluence
     - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
@@ -238,7 +239,11 @@ def load(
         :type include_attachments: bool, optional
         :param include_comments: defaults to False
         :type include_comments: bool, optional
-        :param content_format: Specify content format, defaults to ContentFormat.STORAGE
+        :param content_format: Specify content format, defaults to
+                                ContentFormat.STORAGE, the supported values are:
+                                `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
+                                `ContentFormat.ANONYMOUS_EXPORT_VIEW`,
+                                `ContentFormat.STORAGE`, and `ContentFormat.VIEW`.
         :type content_format: ContentFormat
         :param limit: Maximum number of pages to retrieve per request, defaults to 50
         :type limit: int, optional
@@ -473,14 +478,12 @@ def process_page(
         else:
             attachment_texts = []
 
+        content = content_format.get_content(page)
         if keep_markdown_format:
             # Use markdownify to keep the page Markdown style
-            text = markdownify(
-                page["body"]["storage"]["value"], heading_style="ATX"
-            ) + "".join(attachment_texts)
+            text = markdownify(content, heading_style="ATX") + "".join(attachment_texts)
 
         else:
-            content = content_format.get_content(page)
             if keep_newlines:
                 text = BeautifulSoup(
                     content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"

diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml
@@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true}
 xata = {version = "^1.0.0a7", optional = true}
 xmltodict = {version = "^0.13.0", optional = true}
 google-api-core = {version = "^2.11.1", optional = true}
+markdownify = {version = "^0.11.6", optional = true}
 
 
 [tool.poetry.group.test.dependencies]
@@ -338,6 +339,7 @@ extended_testing = [
  "xmltodict",
  "faiss-cpu",
  "openapi-schema-pydantic",
+ "markdownify",
 ]
 
 [tool.ruff]

diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py b/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py
@@ -6,7 +6,7 @@
 import requests
 
 from langchain.docstore.document import Document
-from langchain.document_loaders.confluence import ConfluenceLoader
+from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat
 
 
 @pytest.fixture
@@ -152,6 +152,40 @@ def test_confluence_loader_load_data_by_space_id(
         assert mock_confluence.cql.call_count == 0
         assert mock_confluence.get_page_child_by_type.call_count == 0
 
+    def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled(
+        self, mock_confluence: MagicMock
+    ) -> None:
+        # one response with two pages
+        mock_confluence.get_all_pages_from_space.return_value = [
+            self._get_mock_page("123", ContentFormat.VIEW),
+            self._get_mock_page("456", ContentFormat.VIEW),
+        ]
+        mock_confluence.get_all_restrictions_for_content.side_effect = [
+            self._get_mock_page_restrictions("123"),
+            self._get_mock_page_restrictions("456"),
+        ]
+
+        confluence_loader = self._get_mock_confluence_loader(mock_confluence)
+
+        documents = confluence_loader.load(
+            space_key=self.MOCK_SPACE_KEY,
+            content_format=ContentFormat.VIEW,
+            keep_markdown_format=True,
+            max_pages=2,
+        )
+
+        assert mock_confluence.get_all_pages_from_space.call_count == 1
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+        assert documents[0].page_content == "Content 123\n\n"
+        assert documents[1].page_content == "Content 456\n\n"
+
+        assert mock_confluence.get_page_by_id.call_count == 0
+        assert mock_confluence.get_all_pages_by_label.call_count == 0
+        assert mock_confluence.cql.call_count == 0
+        assert mock_confluence.get_page_child_by_type.call_count == 0
+
     def _get_mock_confluence_loader(
         self, mock_confluence: MagicMock
     ) -> ConfluenceLoader:
@@ -163,11 +197,15 @@ def _get_mock_confluence_loader(
         confluence_loader.confluence = mock_confluence
         return confluence_loader
 
-    def _get_mock_page(self, page_id: str) -> Dict:
+    def _get_mock_page(
+        self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE
+    ) -> Dict:
         return {
             "id": f"{page_id}",
             "title": f"Page {page_id}",
-            "body": {"storage": {"value": f"<p>Content {page_id}</p>"}},
+            "body": {
+                f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
+            },
             "status": "current",
             "type": "page",
             "_links": {