Skip to content

Commit

Permalink
Merge branch 'main' into traces-service
Browse files Browse the repository at this point in the history
  • Loading branch information
tarekziade authored Sep 4, 2023
2 parents 1e30406 + 23be4a9 commit 2a709e7
Show file tree
Hide file tree
Showing 23 changed files with 559 additions and 337 deletions.
54 changes: 0 additions & 54 deletions .github/workflows/nucliadb_nightly_bench.yml

This file was deleted.

94 changes: 94 additions & 0 deletions .github/workflows/nucliadb_performance.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Records vectors_benchmark performance metrics into InfluxDB.
# Three triggers, one job each:
#   - nightly / manual dispatch: full "nightly" bench on main
#   - push to main: quick "fast-bench" for trend tracking
#   - "/bench" comment on a PR: fast-bench on the PR branch, reported back
name: Performance Regression
on:
  push:
    branches:
      - main
  issue_comment:
    types: [created]
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:

permissions: write-all

jobs:
  check-perf-main-nightly:
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    name: Record performance on main (nightly)
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark nightly
      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks

  check-perf-main:
    if: github.event_name == 'push'
    name: Record performance on main
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      # FIX: the cache and toolchain must be in place BEFORE the bench
      # runs (the bench builds Rust code); the original ordering ran the
      # bench first, making the toolchain install and build cache useless.
      # This now matches the other two jobs.
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark fast-bench
      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks

  check-perf-pr:
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/bench')
    runs-on: ubuntu-latest
    name: Check performance on the PR
    steps:
      - name: Get PR branch
        uses: xt0rted/pull-request-comment-branch@v1
        id: comment-branch
      - uses: actions/checkout@v3
        if: success()
        with:
          ref: ${{ steps.comment-branch.outputs.head_ref }}
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark fast-bench
      - name: Store Performance data and report back
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          head_ref: ${{ steps.comment-branch.outputs.head_ref }}
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks
16 changes: 14 additions & 2 deletions nucliadb/nucliadb/ingest/orm/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,16 @@ def apply_field_vectors(
ssentence.ClearField("vector") # clear first to prevent duplicates
ssentence.vector.extend(vector.vector)

# we only care about start/stop position of the paragraph for a given sentence here
# the key has the sentence position
ssentence.metadata.position.start = vector.start_paragraph
ssentence.metadata.position.end = vector.end_paragraph

ssentence.metadata.position.page_number = (
sparagraph.metadata.position.page_number
)
ssentence.metadata.position.index = sparagraph.metadata.position.index

for index, vector in enumerate(vo.vectors.vectors):
para_key = f"{self.rid}/{field_key}/{vector.start_paragraph}-{vector.end_paragraph}"
paragraph = self.brain.paragraphs[field_key].paragraphs[para_key]
Expand All @@ -257,8 +267,10 @@ def apply_field_vectors(
sentence.ClearField("vector") # clear first to prevent duplicates
sentence.vector.extend(vector.vector)

sentence.metadata.position.start = vector.start
sentence.metadata.position.end = vector.end
# we only care about start/stop position of the paragraph for a given sentence here
# the key has the sentence position
sentence.metadata.position.start = vector.start_paragraph
sentence.metadata.position.end = vector.end_paragraph

# does it make sense to copy forward paragraph values here?
sentence.metadata.position.page_number = (
Expand Down
4 changes: 1 addition & 3 deletions nucliadb/nucliadb/search/api/v1/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,7 @@ async def chat_knowledgebox(
)

find_request = FindRequest()
find_request.features = [
SearchOptions.VECTOR,
]
find_request.features = [SearchOptions.VECTOR]
if ChatOptions.PARAGRAPHS in item.features:
find_request.features.append(SearchOptions.PARAGRAPH)
find_request.query = rephrased_query or user_query
Expand Down
9 changes: 2 additions & 7 deletions nucliadb/nucliadb/search/api/v1/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,7 @@
from nucliadb.search.api.v1.utils import fastapi_query
from nucliadb.search.requesters.utils import Method, node_query
from nucliadb.search.search.find_merge import find_merge_results
from nucliadb.search.search.query import (
get_default_min_score,
global_query_to_pb,
pre_process_query,
)
from nucliadb.search.search.query import get_default_min_score, global_query_to_pb
from nucliadb.search.search.utils import should_disable_vector_search
from nucliadb_models.common import FieldTypeName
from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
Expand Down Expand Up @@ -211,11 +207,10 @@ async def find(
min_score = await get_default_min_score(kbid)

# We need to query all nodes
processed_query = pre_process_query(item.query)
pb_query, incomplete_results, autofilters = await global_query_to_pb(
kbid,
features=item.features,
query=processed_query,
query=item.query,
filters=item.filters,
faceted=item.faceted,
sort=None,
Expand Down
9 changes: 2 additions & 7 deletions nucliadb/nucliadb/search/api/v1/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@
from nucliadb.search.api.v1.utils import fastapi_query
from nucliadb.search.requesters.utils import Method, node_query
from nucliadb.search.search.merge import merge_results
from nucliadb.search.search.query import (
get_default_min_score,
global_query_to_pb,
pre_process_query,
)
from nucliadb.search.search.query import get_default_min_score, global_query_to_pb
from nucliadb.search.search.utils import (
parse_sort_options,
should_disable_vector_search,
Expand Down Expand Up @@ -313,11 +309,10 @@ async def search(
min_score = await get_default_min_score(kbid)

# We need to query all nodes
processed_query = pre_process_query(item.query)
pb_query, incomplete_results, autofilters = await global_query_to_pb(
kbid,
features=item.features,
query=processed_query,
query=item.query,
filters=item.filters,
faceted=item.faceted,
sort=sort_options,
Expand Down
9 changes: 7 additions & 2 deletions nucliadb/nucliadb/search/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ def convert_relations(data: Dict[str, List[Dict[str, str]]]) -> List[RelationNod
class DummyPredictEngine:
def __init__(self):
    """Set up the dummy engine's recorded calls and canned streamed answer."""
    # Every request made to this dummy engine is appended here so that
    # tests can inspect what was asked of it.
    self.calls = []
    # Chunks that chat_query streams back; the final chunk is the encoded
    # status code, mirroring the real predict API's answer stream.
    answer_chunks = [b"valid ", b"answer ", b" to"]
    self.generated_answer = answer_chunks + [AnswerStatusCode.SUCCESS.encode()]

async def initialize(self):
    # No-op: the dummy engine has no startup work (nothing visible to
    # connect to or load), unlike the real predict engine it stands in for.
    pass
Expand Down Expand Up @@ -151,9 +157,8 @@ async def chat_query(
self.calls.append(item)

async def generate():
for i in [b"valid ", b"answer ", b" to"]:
for i in self.generated_answer:
yield i
yield AnswerStatusCode.SUCCESS.encode()

return (DUMMY_LEARNING_ID, generate())

Expand Down
Loading

0 comments on commit 2a709e7

Please sign in to comment.