Skip to content

Commit

Permalink
Fixed the error on ConfluenceLoader when content_format=VIEW and `keep_markdown_format`=True (langchain-ai#9633)
Browse files Browse the repository at this point in the history

- Description: a description of the change

When `content_format=ContentFormat.VIEW` and `keep_markdown_format=True`
are both set on ConfluenceLoader, loading fails with the following
error:
```
langchain/document_loaders/confluence.py", line 459, in process_page
    page["body"]["storage"]["value"], heading_style="ATX"
KeyError: 'storage'
```
The cause is that the content format was set to `view`, but the loader was
still reading the content from `page["body"]["storage"]["value"]`.

Also added the other content formats supported by the Atlassian
API:

https://stackoverflow.com/questions/34353955/confluence-rest-api-expanding-page-body-when-retrieving-page-by-title/34363386#34363386

  - Issue: the issue # it fixes (if applicable),

Not applicable.

  - Dependencies: any dependencies required for this change,

Added `markdownify` as an optional dependency for anyone who wants to
extract page content in Markdown format.

---------

Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
ericmm and baskaryan authored Aug 23, 2023
1 parent e1f4f9a commit b379c5f
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 17 deletions.
27 changes: 15 additions & 12 deletions libs/langchain/langchain/document_loaders/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@
class ContentFormat(str, Enum):
"""Enumerator of the content formats of Confluence page."""

EDITOR = "body.editor"
EXPORT_VIEW = "body.export_view"
ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view"
STORAGE = "body.storage"
VIEW = "body.view"

def get_content(self, page: dict) -> str:
if self == ContentFormat.STORAGE:
return page["body"]["storage"]["value"]
elif self == ContentFormat.VIEW:
return page["body"]["view"]["value"]

raise ValueError("unknown content format")
return page["body"][self.name.lower()]["value"]


class ConfluenceLoader(BaseLoader):
Expand All @@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader):
raw XML representation for storage. The view format is the HTML representation for
viewing with macros are rendered as though it is viewed by users. You can pass
a enum `content_format` argument to `load()` to specify the content format, this is
set to `ContentFormat.STORAGE` by default.
set to `ContentFormat.STORAGE` by default, the supported values are:
`ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
`ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`,
and `ContentFormat.VIEW`.
Hint: space_key and page_id can both be found in the URL of a page in Confluence
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
Expand Down Expand Up @@ -238,7 +239,11 @@ def load(
:type include_attachments: bool, optional
:param include_comments: defaults to False
:type include_comments: bool, optional
:param content_format: Specify content format, defaults to ContentFormat.STORAGE
:param content_format: Specify content format, defaults to
ContentFormat.STORAGE, the supported values are:
`ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
`ContentFormat.ANONYMOUS_EXPORT_VIEW`,
`ContentFormat.STORAGE`, and `ContentFormat.VIEW`.
:type content_format: ContentFormat
:param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional
Expand Down Expand Up @@ -473,14 +478,12 @@ def process_page(
else:
attachment_texts = []

content = content_format.get_content(page)
if keep_markdown_format:
# Use markdownify to keep the page Markdown style
text = markdownify(
page["body"]["storage"]["value"], heading_style="ATX"
) + "".join(attachment_texts)
text = markdownify(content, heading_style="ATX") + "".join(attachment_texts)

else:
content = content_format.get_content(page)
if keep_newlines:
text = BeautifulSoup(
content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"
Expand Down
19 changes: 17 additions & 2 deletions libs/langchain/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions libs/langchain/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true}
xata = {version = "^1.0.0a7", optional = true}
xmltodict = {version = "^0.13.0", optional = true}
google-api-core = {version = "^2.11.1", optional = true}
markdownify = {version = "^0.11.6", optional = true}


[tool.poetry.group.test.dependencies]
Expand Down Expand Up @@ -338,6 +339,7 @@ extended_testing = [
"xmltodict",
"faiss-cpu",
"openapi-schema-pydantic",
"markdownify",
]

[tool.ruff]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import requests

from langchain.docstore.document import Document
from langchain.document_loaders.confluence import ConfluenceLoader
from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat


@pytest.fixture
Expand Down Expand Up @@ -152,6 +152,40 @@ def test_confluence_loader_load_data_by_space_id(
assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0

def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled(
self, mock_confluence: MagicMock
) -> None:
# Regression test: calling load() with content_format=ContentFormat.VIEW
# together with keep_markdown_format=True must yield one Document per page
# instead of failing on a missing body key.
#
# One response with two pages, both built for the VIEW content format.
mock_confluence.get_all_pages_from_space.return_value = [
self._get_mock_page("123", ContentFormat.VIEW),
self._get_mock_page("456", ContentFormat.VIEW),
]
# One restrictions payload per page; side_effect hands them out in order
# on successive calls.
mock_confluence.get_all_restrictions_for_content.side_effect = [
self._get_mock_page_restrictions("123"),
self._get_mock_page_restrictions("456"),
]

confluence_loader = self._get_mock_confluence_loader(mock_confluence)

# Exercise the combination under test: VIEW format plus Markdown output.
documents = confluence_loader.load(
space_key=self.MOCK_SPACE_KEY,
content_format=ContentFormat.VIEW,
keep_markdown_format=True,
max_pages=2,
)

# The space listing should have been fetched exactly once.
assert mock_confluence.get_all_pages_from_space.call_count == 1

# Both pages come back as Documents carrying the expected converted text
# for each mock page body.
assert len(documents) == 2
assert all(isinstance(doc, Document) for doc in documents)
assert documents[0].page_content == "Content 123\n\n"
assert documents[1].page_content == "Content 456\n\n"

# Loading by space key must not touch any of the other retrieval APIs.
assert mock_confluence.get_page_by_id.call_count == 0
assert mock_confluence.get_all_pages_by_label.call_count == 0
assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0


def _get_mock_confluence_loader(
self, mock_confluence: MagicMock
) -> ConfluenceLoader:
Expand All @@ -163,11 +197,15 @@ def _get_mock_confluence_loader(
confluence_loader.confluence = mock_confluence
return confluence_loader

def _get_mock_page(self, page_id: str) -> Dict:
def _get_mock_page(
self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE
) -> Dict:
return {
"id": f"{page_id}",
"title": f"Page {page_id}",
"body": {"storage": {"value": f"<p>Content {page_id}</p>"}},
"body": {
f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
},
"status": "current",
"type": "page",
"_links": {
Expand Down

0 comments on commit b379c5f

Please sign in to comment.