Merge branch 'main' into remove-datasets-from-datasets

nuclia · Jan 23, 2024 · 656eae6 · 656eae6
2 parents dee9428 + 4d7edf7
commit 656eae6
Show file tree

Hide file tree

Showing 20 changed files with 340 additions and 190 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -1,4 +1,4 @@
-name: PyPI Releases
+name: PyPI and Helm Releases
 
 on:
   push:
@@ -155,4 +155,54 @@ jobs:
             nuclia/nucliadb:${{ steps.version_step.outputs.version_number }}
             nuclia/nucliadb:${{ steps.version_step.outputs.hash }}
           cache-from: type=gha
-          cache-to: type=gha,mode=min
+          cache-to: type=gha,mode=min
+
+  deploy:
+    name: Deploy Helm chart and trigger internal CI
+    runs-on: ubuntu-latest
+    needs: push-docker
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Calculate short sha
+        id: env-vars
+        run: |-
+          HASH=`git rev-parse --short HEAD`
+          echo "short_sha=$HASH" >> $GITHUB_OUTPUT
+
+      - name: Set helm package image
+        id: version_step
+        run: |-
+          sed -i.bak "s#IMAGE_TO_REPLACE#$IMAGE_NAME:${{ steps.env-vars.outputs.short_sha }}#" ./charts/nucliadb_writer/values.yaml
+          sed -i.bak "s#CONTAINER_REGISTRY_TO_REPLACE#$CONTAINER_REGISTRY#" ./charts/nucliadb_writer/values.yaml
+          VERSION=`cat VERSION`
+          VERSION_SHA=$VERSION+${{ steps.env-vars.outputs.short_sha }}
+          sed -i.bak "s#99999.99999.99999#$VERSION_SHA#" ./charts/nucliadb_writer/Chart.yaml
+          echo "version_number=$VERSION_SHA" >> $GITHUB_OUTPUT
+
+      - name: Configure Git
+        run: |
+          git config user.name "$GITHUB_ACTOR"
+          git config user.email "[email protected]"
+
+      - name: Install Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.4.0
+
+      - name: Push helm package
+        run: |-
+          helm lint charts/nucliadb
+          helm package charts/nucliadb
+          curl --data-binary "@nucliadb-${{ steps.version_step.outputs.version_number }}.tgz" ${{ secrets.HELM_CHART_URL }}/api/charts
+
+      # Not working yet, disabled for now
+      # - name: Repository Dispatch
+      #   uses: peter-evans/repository-dispatch@v2
+      #   with:
+      #     token: ${{ secrets.GH_CICD_PUBLIC }}
+      #     repository: nuclia/nucliadb_deploy
+      #     event-type: promote
+      #     client-payload: '{"component": "nucliadb_standalone", "chart-version": "${{ steps.version_step.outputs.version_number }}" }'
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.40.1
+2.41.0
diff --git a/charts/nucliadb/templates/sts.yaml b/charts/nucliadb/templates/sts.yaml
@@ -20,6 +20,10 @@ spec:
       heritage: "{{ .Release.Service }}"
   template:
     metadata:
+      {{- with .Values.podAnnotations }}
+      annotations:
+{{ toYaml . | indent 8 }}
+      {{- end }}
       name: nucliadb
       labels:
         app: nucliadb

diff --git a/charts/nucliadb/values.yaml b/charts/nucliadb/values.yaml
@@ -6,6 +6,7 @@ image: nuclia/nucliadb
 imageVersion:
 
 replicas: 2
+podAnnotations: {}
 
 # app settings
 env:

diff --git a/e2e/test_e2e.py b/e2e/test_e2e.py
@@ -190,7 +190,7 @@ def _test_predict_proxy_chat(kbid: str):
         json={
             "question": "Who is the best football player?",
             "query_context": [
-                "Many football players have existed. Cristiano Ronaldo and Messi among them, but Messi is by far the greatest."
+                "Many football players have existed. Messi is by far the greatest."
             ],
             "user_id": "[email protected]",
         },
@@ -252,4 +252,4 @@ def raise_for_status(resp):
         print("Error response")
         print("Status code:", resp.status_code)
         print(resp.text)
-        raise
+        raise
diff --git a/nucliadb/nucliadb/common/http_clients/processing.py b/nucliadb/nucliadb/common/http_clients/processing.py
@@ -203,6 +203,11 @@ class StatusResultV2(pydantic.BaseModel):
         title="Schedule ETA",
         description="Estimated time until the resource is scheduled.",
     )
+    schedule_order: int = pydantic.Field(
+        0,
+        title="Schedule Order",
+        description="Order of the resource in the schedule queue.",
+    )
 
 
 class StatusResultsV2(pydantic.BaseModel):

diff --git a/nucliadb/nucliadb/ingest/orm/brain.py b/nucliadb/nucliadb/ingest/orm/brain.py
@@ -413,6 +413,9 @@ def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
             if origin.source_id != "":
                 self.labels["u"].append(f"s/{origin.source_id}")
 
+            if origin.path:
+                self.labels["p"].append(origin.path.lstrip("/"))
+
             # origin contributors
             for contrib in origin.colaborators:
                 self.labels["u"].append(f"o/{contrib}")

diff --git a/nucliadb/nucliadb/search/requesters/utils.py b/nucliadb/nucliadb/search/requesters/utils.py
@@ -130,7 +130,9 @@ async def node_query(
     target_replicas: Optional[list[str]] = None,
     read_only: bool = True,
 ) -> tuple[list[T], bool, list[tuple[str, str, str]], list[str]]:
-    read_only = read_only and has_feature(const.Features.READ_REPLICA_SEARCHES)
+    read_only = read_only and has_feature(
+        const.Features.READ_REPLICA_SEARCHES, context={"kbid": kbid}
+    )
 
     shard_manager = get_shard_manager()
 

diff --git a/nucliadb/nucliadb/search/search/summarize.py b/nucliadb/nucliadb/search/search/summarize.py
@@ -46,6 +46,7 @@
 async def summarize(kbid: str, request: SummarizeRequest) -> SummarizedResponse:
     predict_request = SummarizeModel()
     predict_request.user_prompt = request.user_prompt
+    predict_request.summary_kind = request.summary_kind
 
     for rid, field_id, extracted_text in await get_extracted_texts(
         kbid, request.resources

diff --git a/nucliadb/nucliadb/tests/integration/search/test_search.py b/nucliadb/nucliadb/tests/integration/search/test_search.py
@@ -436,7 +436,7 @@ async def test_catalog_can_filter_by_processing_status(
 @pytest.mark.skip(reason="Needs sc-5626")
 @pytest.mark.asyncio
 @pytest.mark.parametrize("knowledgebox", ("EXPERIMENTAL", "STABLE"), indirect=True)
-async def test_catalog_prefix_search(
+async def test_(
     nucliadb_reader: AsyncClient,
     nucliadb_writer: AsyncClient,
     knowledgebox,
@@ -1499,3 +1499,58 @@ def check_fuzzy_paragraphs(search_response, *, fuzzy_result: bool, n_expected: i
         assert paragraph["fuzzy_result"] is fuzzy_result
         found += 1
     assert found == n_expected
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("knowledgebox", ("EXPERIMENTAL", "STABLE"), indirect=True)
+async def test_search_by_path_filter(
+    nucliadb_reader: AsyncClient,
+    nucliadb_writer: AsyncClient,
+    nucliadb_grpc: WriterStub,
+    knowledgebox,
+):
+    paths = ["/foo", "foo/bar", "foo/bar/1", "foo/bar/2", "foo/bar/3", "foo/bar/4"]
+
+    for path in paths:
+        resp = await nucliadb_writer.post(
+            f"/kb/{knowledgebox}/resources",
+            headers={"X-Synchronous": "true"},
+            json={
+                "title": f"My resource: {path}",
+                "summary": "Some summary",
+                "origin": {
+                    "path": path,
+                },
+            },
+        )
+        assert resp.status_code == 201
+
+    resp = await nucliadb_reader.get(
+        f"/kb/{knowledgebox}/catalog",
+        params={
+            "query": "",
+        },
+    )
+    assert resp.status_code == 200
+    assert len(resp.json()["resources"]) == len(paths)
+
+    # Get the list of all resources (every path is under foo)
+    resp = await nucliadb_reader.get(
+        f"/kb/{knowledgebox}/search?filters=/origin.path/foo"
+    )
+    assert resp.status_code == 200
+    assert len(resp.json()["resources"]) == len(paths)
+
+    # Get the list of resources under foo/bar
+    resp = await nucliadb_reader.get(
+        f"/kb/{knowledgebox}/search?filters=/origin.path/foo/bar"
+    )
+    assert resp.status_code == 200
+    assert len(resp.json()["resources"]) == len(paths) - 1
+
+    # Get the list of resources under foo/bar/4
+    resp = await nucliadb_reader.get(
+        f"/kb/{knowledgebox}/search?filters=/origin.path/foo/bar/4"
+    )
+    assert resp.status_code == 200
+    assert len(resp.json()["resources"]) == 1
diff --git a/nucliadb/nucliadb/writer/resource/origin.py b/nucliadb/nucliadb/writer/resource/origin.py
@@ -41,6 +41,8 @@ def parse_origin(origin: Origin, origin_payload: InputOrigin):
         origin.related.extend(origin_payload.related)
     if origin_payload.metadata:
         origin.metadata.update(origin_payload.metadata)
+    if origin_payload.path:
+        origin.path = origin_payload.path
     origin.source = Origin.Source.API
 
 

diff --git a/nucliadb_models/nucliadb_models/labels.py b/nucliadb_models/nucliadb_models/labels.py
@@ -33,6 +33,7 @@
     "f": [],  # field keyword field (field/keyword)
     "fg": [],  # field keyword (keywords) flat
     "m": [],  # origin metadata in the form of (key/value). Max key/value size is 255
+    "p": [],  # origin path (indexed without its leading "/")
 }
 
 
@@ -44,6 +45,7 @@
     "metadata.languages": "s/s",
     "origin.tags": "t",
     "origin.metadata": "m",
+    "origin.path": "p",
     "classification.labels": "l",
     "entities": "e",
     "field": "f",

diff --git a/nucliadb_models/nucliadb_models/metadata.py b/nucliadb_models/nucliadb_models/metadata.py
@@ -354,6 +354,7 @@ class InputOrigin(BaseModel):
     # old field was "colaborators"
     filename: Optional[str] = None
     related: List[str] = []
+    path: Optional[str] = None
 
 
 class Origin(InputOrigin):

diff --git a/nucliadb_models/nucliadb_models/search.py b/nucliadb_models/nucliadb_models/search.py
@@ -782,13 +782,19 @@ class SummarizeResourceModel(BaseModel):
     fields: Dict[str, str] = {}
 
 
+class SummaryKind(str, Enum):
+    SIMPLE = "simple"
+    EXTENDED = "extended"
+
+
 class SummarizeModel(BaseModel):
     """
     Model for the summarize predict api request payload
     """
 
     resources: Dict[str, SummarizeResourceModel] = {}
     user_prompt: Optional[str] = None
+    summary_kind: SummaryKind = SummaryKind.SIMPLE
 
 
 class SummarizeRequest(BaseModel):
@@ -809,6 +815,12 @@ class SummarizeRequest(BaseModel):
         description="Uids of the resources to summarize",
     )
 
+    summary_kind: SummaryKind = Field(
+        default=SummaryKind.SIMPLE,
+        title="Summary kind",
+        description="Option to customize how the summary will be",
+    )
+
 
 class SummarizedResource(BaseModel):
     summary: str

diff --git a/nucliadb_node/tests/test_search_relations.rs b/nucliadb_node/tests/test_search_relations.rs
@@ -379,6 +379,10 @@ async fn test_search_relations_prefixed(
             shard_id: shard_id.clone(),
             prefix: Some(RelationPrefixSearchRequest {
                 prefix: "cat".to_string(),
+                node_filters: vec![RelationNodeFilter {
+                    node_subtype: None,
+                    node_type: NodeType::Entity as i32,
+                }],
                 ..Default::default()
             }),
             ..Default::default()

diff --git a/nucliadb_protos/python/nucliadb_protos/resources_pb2.py b/nucliadb_protos/python/nucliadb_protos/resources_pb2.py
diff --git a/nucliadb_protos/python/nucliadb_protos/resources_pb2.pyi b/nucliadb_protos/python/nucliadb_protos/resources_pb2.pyi
@@ -284,6 +284,7 @@ class Origin(google.protobuf.message.Message):
     COLABORATORS_FIELD_NUMBER: builtins.int
     FILENAME_FIELD_NUMBER: builtins.int
     RELATED_FIELD_NUMBER: builtins.int
+    PATH_FIELD_NUMBER: builtins.int
     source: global___Origin.Source.ValueType
     source_id: builtins.str
     url: builtins.str
@@ -300,6 +301,7 @@ class Origin(google.protobuf.message.Message):
     filename: builtins.str
     @property
     def related(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
+    path: builtins.str
     def __init__(
         self,
         *,
@@ -313,9 +315,10 @@ class Origin(google.protobuf.message.Message):
         colaborators: collections.abc.Iterable[builtins.str] | None = ...,
         filename: builtins.str = ...,
         related: collections.abc.Iterable[builtins.str] | None = ...,
+        path: builtins.str = ...,
     ) -> None: ...
     def HasField(self, field_name: typing_extensions.Literal["created", b"created", "modified", b"modified"]) -> builtins.bool: ...
-    def ClearField(self, field_name: typing_extensions.Literal["colaborators", b"colaborators", "created", b"created", "filename", b"filename", "metadata", b"metadata", "modified", b"modified", "related", b"related", "source", b"source", "source_id", b"source_id", "tags", b"tags", "url", b"url"]) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["colaborators", b"colaborators", "created", b"created", "filename", b"filename", "metadata", b"metadata", "modified", b"modified", "path", b"path", "related", b"related", "source", b"source", "source_id", b"source_id", "tags", b"tags", "url", b"url"]) -> None: ...
 
 global___Origin = Origin
 

diff --git a/nucliadb_protos/resources.proto b/nucliadb_protos/resources.proto
@@ -88,6 +88,7 @@ message Origin {
     repeated string colaborators = 8;
     string filename = 9;
     repeated string related = 10;
+    string path = 11;
 }
 
 message Extra {

diff --git a/nucliadb_protos/rust/src/resources.rs b/nucliadb_protos/rust/src/resources.rs
@@ -117,6 +117,8 @@ pub struct Origin {
     pub filename: ::prost::alloc::string::String,
     #[prost(string, repeated, tag="10")]
     pub related: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
+    #[prost(string, tag="11")]
+    pub path: ::prost::alloc::string::String,
 }
 /// Nested message and enum types in `Origin`.
 pub mod origin {

diff --git a/nucliadb_telemetry/nucliadb_telemetry/errors.py b/nucliadb_telemetry/nucliadb_telemetry/errors.py
@@ -143,5 +143,7 @@ def setup_sentry_logging_integration(for_loggers: List[str]) -> None:
     if settings.sentry_url:
         sentry_sdk.init(
             dsn=settings.sentry_url,
+            environment=settings.environment,
             integrations=[SentryLoggingIntegration(for_loggers)],
         )
+        sentry_sdk.set_tag("zone", settings.zone)