From b379c5f9c89199370d9175ce6ee34d469567e533 Mon Sep 17 00:00:00 2001
From: Jun Liu
Date: Wed, 23 Aug 2023 14:00:15 +1000
Subject: [PATCH] Fixed the error on ConfluenceLoader when content_format=VIEW and `keep_markdown_format`=True (#9633)

- Description: When I set `content_format=ContentFormat.VIEW` and
  `keep_markdown_format=True` on ConfluenceLoader, the loader raised the
  following error:

  ```
  langchain/document_loaders/confluence.py", line 459, in process_page
      page["body"]["storage"]["value"], heading_style="ATX"
  KeyError: 'storage'
  ```

  This happened because the content format was set to `view`, but the loader
  was still trying to read the content from `page["body"]["storage"]["value"]`.
  Also added the other content formats supported by the Atlassian API:
  https://stackoverflow.com/questions/34353955/confluence-rest-api-expanding-page-body-when-retrieving-page-by-title/34363386#34363386
- Issue: Not applicable.
- Dependencies: Added optional dependency `markdownify` for anyone who wants
  to extract pages in Markdown format.

---------

Co-authored-by: Bagatur
---
 .../langchain/document_loaders/confluence.py | 27 +++++++-----
 libs/langchain/poetry.lock | 19 +++++++-
 libs/langchain/pyproject.toml | 2 +
 .../document_loaders/test_confluence.py | 44 +++++++++++++++++--
 4 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index a1151c08e2cd4..739f52f48ed09 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -20,16 +20,14 @@ class ContentFormat(str, Enum): """Enumerator of the content formats of Confluence page.""" + EDITOR = "body.editor" + EXPORT_VIEW = "body.export_view" + ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view" STORAGE = "body.storage" VIEW = "body.view" def get_content(self, page: dict) -> str: - if self == ContentFormat.STORAGE: - return page["body"]["storage"]["value"] - elif self == ContentFormat.VIEW: - return page["body"]["view"]["value"] - - raise ValueError("unknown content format") + return page["body"][self.name.lower()]["value"] class ConfluenceLoader(BaseLoader): @@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader): raw XML representation for storage. The view format is the HTML representation for viewing with macros are rendered as though it is viewed by users. You can pass a enum `content_format` argument to `load()` to specify the content format, this is - set to `ContentFormat.STORAGE` by default. + set to `ContentFormat.STORAGE` by default, the supported values are: + `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`, + `ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`, + and `ContentFormat.VIEW`. Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces//pages/ @@ -238,7 +239,11 @@ def load( :type include_attachments: bool, optional :param include_comments: defaults to False :type include_comments: bool, optional - :param content_format: Specify content format, defaults to ContentFormat.STORAGE + :param content_format: Specify content format, defaults to + ContentFormat.STORAGE, the supported values are: + `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`, + `ContentFormat.ANONYMOUS_EXPORT_VIEW`, + `ContentFormat.STORAGE`, and `ContentFormat.VIEW`.
:type content_format: ContentFormat :param limit: Maximum number of pages to retrieve per request, defaults to 50 :type limit: int, optional @@ -473,14 +478,12 @@ def process_page( else: attachment_texts = [] + content = content_format.get_content(page) if keep_markdown_format: # Use markdownify to keep the page Markdown style - text = markdownify( - page["body"]["storage"]["value"], heading_style="ATX" - ) + "".join(attachment_texts) + text = markdownify(content, heading_style="ATX") + "".join(attachment_texts) else: - content = content_format.get_content(page) if keep_newlines: text = BeautifulSoup( content.replace("
</p>", "\n</p>").replace("<br>
", "\n"), "lxml" diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index f065fd0c38d32..3ea9c47341f6b 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -4288,6 +4288,21 @@ profiling = ["gprof2dot"] rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] +[[package]] +name = "markdownify" +version = "0.11.6" +description = "Convert HTML to markdown." +optional = true +python-versions = "*" +files = [ + {file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"}, + {file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"}, +] + +[package.dependencies] +beautifulsoup4 = ">=4.9,<5" +six = ">=1.15,<2" + [[package]] name = "markupsafe" version = "2.1.3" @@ -10323,7 +10338,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] +extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10333,4 +10348,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "27c44e64d872c51f42b58f9f5185f20914dc4360e91860cfc260b1acbdaa3272" +content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index d02712349038d..61f05175ddbe7 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true} xata = {version = "^1.0.0a7", optional = true} xmltodict = {version = "^0.13.0", optional = true} google-api-core = {version = "^2.11.1", optional = true} +markdownify = {version = "^0.11.6", optional = true} [tool.poetry.group.test.dependencies] @@ -338,6 +339,7 @@ extended_testing = [ "xmltodict", "faiss-cpu", "openapi-schema-pydantic", + "markdownify", ] [tool.ruff] diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py b/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py index 42de78598a6b7..0048a8fba4171 100644 --- 
a/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py @@ -6,7 +6,7 @@ import requests from langchain.docstore.document import Document -from langchain.document_loaders.confluence import ConfluenceLoader +from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat @pytest.fixture @@ -152,6 +152,40 @@ def test_confluence_loader_load_data_by_space_id( assert mock_confluence.cql.call_count == 0 assert mock_confluence.get_page_child_by_type.call_count == 0 + def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled( + self, mock_confluence: MagicMock + ) -> None: + # one response with two pages + mock_confluence.get_all_pages_from_space.return_value = [ + self._get_mock_page("123", ContentFormat.VIEW), + self._get_mock_page("456", ContentFormat.VIEW), + ] + mock_confluence.get_all_restrictions_for_content.side_effect = [ + self._get_mock_page_restrictions("123"), + self._get_mock_page_restrictions("456"), + ] + + confluence_loader = self._get_mock_confluence_loader(mock_confluence) + + documents = confluence_loader.load( + space_key=self.MOCK_SPACE_KEY, + content_format=ContentFormat.VIEW, + keep_markdown_format=True, + max_pages=2, + ) + + assert mock_confluence.get_all_pages_from_space.call_count == 1 + + assert len(documents) == 2 + assert all(isinstance(doc, Document) for doc in documents) + assert documents[0].page_content == "Content 123\n\n" + assert documents[1].page_content == "Content 456\n\n" + + assert mock_confluence.get_page_by_id.call_count == 0 + assert mock_confluence.get_all_pages_by_label.call_count == 0 + assert mock_confluence.cql.call_count == 0 + assert mock_confluence.get_page_child_by_type.call_count == 0 + def _get_mock_confluence_loader( self, mock_confluence: MagicMock ) -> ConfluenceLoader: @@ -163,11 +197,15 @@ def _get_mock_confluence_loader( confluence_loader.confluence = mock_confluence return confluence_loader - def _get_mock_page(self, page_id: str) -> Dict: + def _get_mock_page( + self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE + ) -> Dict: return { "id": f"{page_id}", "title": f"Page {page_id}", - "body": {"storage": {"value": f"
<p>Content {page_id}</p>
"}}, + "body": { + f"{content_format.name.lower()}": {"value": f"
<p>Content {page_id}</p>
"} + }, "status": "current", "type": "page", "_links": {