Skip to content

Commit

Permalink
Merge branch 'main' into traces-service
Browse files Browse the repository at this point in the history
  • Loading branch information
tarekziade authored Sep 4, 2023
2 parents 1e30406 + 23be4a9 commit 2a709e7
Show file tree
Hide file tree
Showing 23 changed files with 559 additions and 337 deletions.
54 changes: 0 additions & 54 deletions .github/workflows/nucliadb_nightly_bench.yml

This file was deleted.

94 changes: 94 additions & 0 deletions .github/workflows/nucliadb_performance.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Records vectors_benchmark performance metrics into InfluxDB.
# Three triggers, one job each:
#   - nightly / manual dispatch: full "nightly" bench on main
#   - push to main: quick "fast-bench" for trend tracking
#   - "/bench" comment on a PR: fast-bench on the PR branch, reported back
name: Performance Regression
on:
  push:
    branches:
      - main
  issue_comment:
    types: [created]
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:

permissions: write-all

jobs:
  check-perf-main-nightly:
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    name: Record performance on main (nightly)
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark nightly
      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks

  check-perf-main:
    if: github.event_name == 'push'
    name: Record performance on main
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      # FIX: the cache and toolchain must be in place BEFORE the bench
      # runs (the bench builds Rust code); the original ordering ran the
      # bench first, making the toolchain install and build cache useless.
      # This now matches the other two jobs.
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark fast-bench
      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks

  check-perf-pr:
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/bench')
    runs-on: ubuntu-latest
    name: Check performance on the PR
    steps:
      - name: Get PR branch
        uses: xt0rted/pull-request-comment-branch@v1
        id: comment-branch
      - uses: actions/checkout@v3
        if: success()
        with:
          ref: ${{ steps.comment-branch.outputs.head_ref }}
      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true
      - name: Run the bench
        run: |
          make -C vectors_benchmark fast-bench
      - name: Store Performance data and report back
        uses: nuclia/nucliadb_performance@main
        with:
          metrics_file: vectors_benchmark/benchmark.json
          head_ref: ${{ steps.comment-branch.outputs.head_ref }}
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks
16 changes: 14 additions & 2 deletions nucliadb/nucliadb/ingest/orm/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,16 @@ def apply_field_vectors(
ssentence.ClearField("vector") # clear first to prevent duplicates
ssentence.vector.extend(vector.vector)

# we only care about start/stop position of the paragraph for a given sentence here
# the key has the sentence position
ssentence.metadata.position.start = vector.start_paragraph
ssentence.metadata.position.end = vector.end_paragraph

ssentence.metadata.position.page_number = (
sparagraph.metadata.position.page_number
)
ssentence.metadata.position.index = sparagraph.metadata.position.index

for index, vector in enumerate(vo.vectors.vectors):
para_key = f"{self.rid}/{field_key}/{vector.start_paragraph}-{vector.end_paragraph}"
paragraph = self.brain.paragraphs[field_key].paragraphs[para_key]
Expand All @@ -257,8 +267,10 @@ def apply_field_vectors(
sentence.ClearField("vector") # clear first to prevent duplicates
sentence.vector.extend(vector.vector)

sentence.metadata.position.start = vector.start
sentence.metadata.position.end = vector.end
# we only care about start/stop position of the paragraph for a given sentence here
# the key has the sentence position
sentence.metadata.position.start = vector.start_paragraph
sentence.metadata.position.end = vector.end_paragraph

# does it make sense to copy forward paragraph values here?
sentence.metadata.position.page_number = (
Expand Down
4 changes: 1 addition & 3 deletions nucliadb/nucliadb/search/api/v1/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,7 @@ async def chat_knowledgebox(
)

find_request = FindRequest()
find_request.features = [
SearchOptions.VECTOR,
]
find_request.features = [SearchOptions.VECTOR]
if ChatOptions.PARAGRAPHS in item.features:
find_request.features.append(SearchOptions.PARAGRAPH)
find_request.query = rephrased_query or user_query
Expand Down
9 changes: 2 additions & 7 deletions nucliadb/nucliadb/search/api/v1/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,7 @@
from nucliadb.search.api.v1.utils import fastapi_query
from nucliadb.search.requesters.utils import Method, node_query
from nucliadb.search.search.find_merge import find_merge_results
from nucliadb.search.search.query import (
get_default_min_score,
global_query_to_pb,
pre_process_query,
)
from nucliadb.search.search.query import get_default_min_score, global_query_to_pb
from nucliadb.search.search.utils import should_disable_vector_search
from nucliadb_models.common import FieldTypeName
from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
Expand Down Expand Up @@ -211,11 +207,10 @@ async def find(
min_score = await get_default_min_score(kbid)

# We need to query all nodes
processed_query = pre_process_query(item.query)
pb_query, incomplete_results, autofilters = await global_query_to_pb(
kbid,
features=item.features,
query=processed_query,
query=item.query,
filters=item.filters,
faceted=item.faceted,
sort=None,
Expand Down
9 changes: 2 additions & 7 deletions nucliadb/nucliadb/search/api/v1/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@
from nucliadb.search.api.v1.utils import fastapi_query
from nucliadb.search.requesters.utils import Method, node_query
from nucliadb.search.search.merge import merge_results
from nucliadb.search.search.query import (
get_default_min_score,
global_query_to_pb,
pre_process_query,
)
from nucliadb.search.search.query import get_default_min_score, global_query_to_pb
from nucliadb.search.search.utils import (
parse_sort_options,
should_disable_vector_search,
Expand Down Expand Up @@ -313,11 +309,10 @@ async def search(
min_score = await get_default_min_score(kbid)

# We need to query all nodes
processed_query = pre_process_query(item.query)
pb_query, incomplete_results, autofilters = await global_query_to_pb(
kbid,
features=item.features,
query=processed_query,
query=item.query,
filters=item.filters,
faceted=item.faceted,
sort=sort_options,
Expand Down
9 changes: 7 additions & 2 deletions nucliadb/nucliadb/search/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ def convert_relations(data: Dict[str, List[Dict[str, str]]]) -> List[RelationNod
class DummyPredictEngine:
def __init__(self):
    """Set up the dummy engine's recorded calls and canned streamed answer."""
    # Every request made to this dummy engine is appended here so that
    # tests can inspect what was asked of it.
    self.calls = []
    # Chunks that chat_query streams back; the final chunk is the encoded
    # status code, mirroring the real predict API's answer stream.
    answer_chunks = [b"valid ", b"answer ", b" to"]
    self.generated_answer = answer_chunks + [AnswerStatusCode.SUCCESS.encode()]

async def initialize(self):
    # No-op: the dummy engine has no startup work (nothing visible to
    # connect to or load), unlike the real predict engine it stands in for.
    pass
Expand Down Expand Up @@ -151,9 +157,8 @@ async def chat_query(
self.calls.append(item)

async def generate():
for i in [b"valid ", b"answer ", b" to"]:
for i in self.generated_answer:
yield i
yield AnswerStatusCode.SUCCESS.encode()

return (DUMMY_LEARNING_ID, generate())

Expand Down
Loading

0 comments on commit 2a709e7

Please sign in to comment.