Add summarize to sdk (#1731)

nuclia · Jan 16, 2024 · 14ca50a · 14ca50a · github-actions · Jan 16, 2024
1 parent 4804ba2
commit 14ca50a
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 5 deletions.
diff --git a/nucliadb/nucliadb/search/search/summarize.py b/nucliadb/nucliadb/search/search/summarize.py
@@ -38,6 +38,8 @@
 )
 from nucliadb_utils.utilities import get_storage
 
+ExtractedTexts = list[tuple[str, str, Optional[ExtractedText]]]
+
 MAX_GET_EXTRACTED_TEXT_OPS = 20
 
 
@@ -59,10 +61,8 @@ async def summarize(kbid: str, request: SummarizeRequest) -> SummarizedResponse:
     return await predict.summarize(kbid, predict_request)
 
 
-async def get_extracted_texts(
-    kbid: str, resource_uuids: list[str]
-) -> list[tuple[str, str, Optional[ExtractedText]]]:
-    results = []
+async def get_extracted_texts(kbid: str, resource_uuids: list[str]) -> ExtractedTexts:
+    results: ExtractedTexts = []
 
     driver = get_driver()
     storage = await get_storage()
@@ -83,6 +83,11 @@ async def get_extracted_texts(
             for _, field in fields.items():
                 task = asyncio.create_task(get_extracted_text(rid, field, max_tasks))
                 tasks.append(task)
+
+        if len(tasks) == 0:
+            # No extracted text to get
+            return results
+
         done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION)
 
     # Parse the task results

diff --git a/nucliadb_models/nucliadb_models/search.py b/nucliadb_models/nucliadb_models/search.py
@@ -813,7 +813,7 @@ class SummarizedResponse(BaseModel):
         default={}, title="Resources", description="Individual resource summaries"
     )
     summary: str = Field(
-        default="", title="Summary", description="Globla summary of all resources"
+        default="", title="Summary", description="Global summary of all resources"
     )
 
 

diff --git a/nucliadb_sdk/nucliadb_sdk/tests/test_sdk.py b/nucliadb_sdk/nucliadb_sdk/tests/test_sdk.py
@@ -96,3 +96,4 @@ def test_search_endpoints(sdk: nucliadb_sdk.NucliaDB, kb):
     resource = sdk.create_resource(kbid=kb.uuid, title="Resource", slug="resource")
     sdk.chat_on_resource(kbid=kb.uuid, rid=resource.uuid, query="foo")
     sdk.feedback(kbid=kb.uuid, ident="bar", good=True, feedback="baz", task="CHAT")
+    sdk.summarize(kbid=kb.uuid, resources=["foobar"])
diff --git a/nucliadb_sdk/nucliadb_sdk/tests/test_sdk_async.py b/nucliadb_sdk/nucliadb_sdk/tests/test_sdk_async.py
@@ -89,3 +89,4 @@ async def test_search_endpoints(sdk_async: nucliadb_sdk.NucliaDBAsync, kb):
     await sdk_async.feedback(
         kbid=kb.uuid, ident="bar", good=True, feedback="baz", task=FeedbackTasks.CHAT
     )
+    await sdk_async.summarize(kbid=kb.uuid, resources=["foobar"])
diff --git a/nucliadb_sdk/nucliadb_sdk/tests/test_summarize.py b/nucliadb_sdk/nucliadb_sdk/tests/test_summarize.py
@@ -0,0 +1,32 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import nucliadb_sdk
+from nucliadb_models.search import KnowledgeboxFindResults, SummarizeRequest
+
+
+def test_summarize(docs_dataset, sdk: nucliadb_sdk.NucliaDB):
+    results: KnowledgeboxFindResults = sdk.find(kbid=docs_dataset, query="love")
+    resource_uuids = [uuid for uuid in results.resources.keys()]
+
+    response = sdk.summarize(kbid=docs_dataset, resources=[resource_uuids[0]])
+    assert response.summary == "global summary"
+
+    content = SummarizeRequest(resources=[resource_uuids[0]])
+    response = sdk.summarize(kbid=docs_dataset, content=content)
+    assert response.summary == "global summary"
diff --git a/nucliadb_sdk/nucliadb_sdk/v2/docstrings.py b/nucliadb_sdk/nucliadb_sdk/v2/docstrings.py
@@ -276,6 +276,19 @@ class Docstring(BaseModel):
     ],
 )
 
+SUMMARIZE = Docstring(
+    doc="""Summarize your documents""",
+    examples=[
+        Example(
+            description="Get a summary of a document or a list of documents",
+            code=""">>> summary = sdk.summarize(kbid="mykbid", resources=["uuid1"]).summary
+>>> print(summary)
+'The document talks about Seville and its temperature. It also mentions the coldest month of the year, which is January.'  # noqa
+""",
+        ),
+    ],
+)
+
 
 DELETE_LABELSET = Docstring(
     doc="Delete a specific set of labels",

diff --git a/nucliadb_sdk/nucliadb_sdk/v2/sdk.py b/nucliadb_sdk/nucliadb_sdk/v2/sdk.py
@@ -71,6 +71,8 @@
     KnowledgeboxSearchResults,
     Relations,
     SearchRequest,
+    SummarizedResponse,
+    SummarizeRequest,
 )
 from nucliadb_models.vectors import VectorSet, VectorSets
 from nucliadb_models.writer import (
@@ -652,6 +654,16 @@ def _check_response(self, response: httpx.Response):
         response_type=chat_response_parser,
         docstring=docstrings.RESOURCE_CHAT,
     )
+    summarize = _request_builder(
+        name="summarize",
+        path_template="/v1/kb/{kbid}/summarize",
+        method="POST",
+        path_params=("kbid",),
+        request_type=SummarizeRequest,
+        response_type=SummarizedResponse,
+        docstring=docstrings.SUMMARIZE,
+    )
+
     feedback = _request_builder(
         name="feedback",
         path_template="/v1/kb/{kbid}/feedback",