Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
lferran committed Nov 30, 2023
1 parent 84cebd9 commit 0d5c4d0
Show file tree
Hide file tree
Showing 4 changed files with 240 additions and 1 deletion.
60 changes: 60 additions & 0 deletions .github/workflows/nucliadb_performance2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
name: Performance Regression
on:
  push:
    branches:
      - ferran/sc-7816/integrate-search-benchmarks-on-ci
  issue_comment:
    types: [created]
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:

permissions: write-all

jobs:
  check-performance:
    if: github.event_name == 'push' || github.event.issue.pull_request && contains(github.event.comment.body, '/bench')
    name: Run NucliaDB standalone performance tests
    runs-on: ubuntu-latest
    env:
      KB_SLUG: small
      EXPORTS_URI: ${{ secrets.EXPORTS_URI }}

    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      - name: Install NucliaDB
        run: |
          make -C nucliadb install
      - name: Start NucliaDB
        run: |
          DEBUG=true nucliadb &
          echo NDB_PID=$! >> "$GITHUB_ENV"
      - name: Import data
        run: |
          # NOTE(review): this commit adds nucliadb_performance/export_import.py,
          # not nucliadb/nucliadb_performance/import-kb.py, and that script
          # dispatches on an explicit --action flag — fixed the invocation.
          python nucliadb_performance/export_import.py --action=import --kb=$KB_SLUG --uri=$EXPORTS_URI/$KB_SLUG
      - name: Run the bench
        run: |
          make -C nucliadb_performance test-standalone-search
      - name: Stop NucliaDB
        run: kill $NDB_PID

      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true

      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          # Fixed typo: "nucliadb_performace" -> "nucliadb_performance".
          metrics_file: nucliadb_performance/standalone.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks
162 changes: 162 additions & 0 deletions nucliadb_performance/export_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import argparse
import os
import time
from dataclasses import dataclass

import requests
from tqdm import tqdm

from nucliadb_models.resource import ReleaseChannel
from nucliadb_sdk import NucliaSDK
from nucliadb_sdk.v2.exceptions import NotFoundError


@dataclass
class NucliaDB:
    """Pair of NucliaDB SDK clients, one per role set.

    The standalone server authorizes via the X-Nucliadb-Roles header, so
    reads and writes need separately-configured clients.
    """

    # Client sending the READER role header.
    reader: NucliaSDK
    # Client sending the WRITER;MANAGER role headers.
    writer: NucliaSDK


# Base URL of the locally running NucliaDB standalone server.
API = "http://localhost:8080/api"
# Streaming chunk size: 5 MiB. The original read `1024 * 1025 * 5`;
# "1025" looks like a typo for 1024. The value is only a buffer size,
# so normalizing it is behavior-safe.
CHUNK_SIZE = 1024 * 1024 * 5


# Module-level client pair used by the import/export helpers below.
ndb = NucliaDB(
    reader=NucliaSDK(url=API, headers={"X-Nucliadb-Roles": "READER"}),
    writer=NucliaSDK(url=API, headers={"X-Nucliadb-Roles": "WRITER;MANAGER"}),
)


def get_or_create_kb(ndb, slug, release_channel=None) -> str:
    """Return the uuid of the knowledge box with `slug`, creating it if absent.

    Looks the KB up by slug first (EAFP); on NotFoundError it is created
    with the given release channel.
    """
    try:
        return ndb.reader.get_knowledge_box_by_slug(slug=slug).uuid
    except NotFoundError:
        created = ndb.writer.create_knowledge_box(
            slug=slug, release_channel=release_channel
        )
        return created.uuid


def import_kb(*, uri, slug, release_channel=None):
    """Import the export stream at `uri` into the KB with slug `slug`.

    Creates the knowledge box if needed, starts an import task on the
    server, then polls its status every 2 seconds until it finishes.

    Raises:
        RuntimeError: if the server reports the import task errored.
    """
    kbid = get_or_create_kb(ndb, slug, release_channel=release_channel)
    print(f"Importing from {uri} to kb={slug}")

    import_id = ndb.writer.start_import(
        kbid=kbid, content=read_import_stream(uri)
    ).import_id
    print(f"Started import task. Import id: {import_id}")

    print("Waiting for the data to be imported")
    status = ndb.reader.import_status(kbid=kbid, import_id=import_id)
    while status.status != "finished":
        print(f"Status: {status.status} {status.processed}/{status.total}")
        # Explicit raise instead of `assert`: assertions are stripped
        # under `python -O`, which would turn an errored import into an
        # infinite polling loop.
        if status.status == "error":
            raise RuntimeError(f"Import task {import_id} reported an error")
        time.sleep(2)
        status = ndb.reader.import_status(kbid=kbid, import_id=import_id)
    print("Import finished!")


def export_kb(*, uri, slug):
    """Export the KB with slug `slug` and save the stream to `uri`.

    Starts a server-side export task, polls its status every 2 seconds
    until it finishes, then downloads the export and persists it to a
    local path or an HTTP(S) destination.

    Raises:
        RuntimeError: if the server reports the export task errored.
    """
    kbid = ndb.reader.get_knowledge_box_by_slug(slug=slug).uuid
    export_id = ndb.writer.start_export(kbid=kbid).export_id

    print(f"Starting export for {slug}. Export id: {export_id}")
    status = ndb.reader.export_status(kbid=kbid, export_id=export_id)
    while status.status != "finished":
        print(f"Status: {status.status} {status.processed}/{status.total}")
        # Explicit raise instead of `assert`: assertions are stripped
        # under `python -O` and would leave this loop spinning forever.
        if status.status == "error":
            raise RuntimeError(f"Export task {export_id} reported an error")
        time.sleep(2)
        status = ndb.reader.export_status(kbid=kbid, export_id=export_id)

    print(f"Downloading export at {uri}")
    export_generator = ndb.reader.download_export(kbid=kbid, export_id=export_id)
    save_export_stream(uri, export_generator)


def save_export_stream(uri, export_generator):
    """Persist an export chunk stream to `uri`, showing a progress bar.

    HTTP(S) URIs are uploaded with a PUT; anything else is treated as a
    local file path.
    """
    progress = progressify(
        export_generator,
        desc="Downloading export from NucliaDB",
        unit="iB",
        unit_scale=True,
    )
    saver = save_export_to_url if uri.startswith("http") else save_export_to_file
    saver(uri, progress)


def save_export_to_file(export_path, export_generator):
    """Write every chunk of `export_generator` to the file at `export_path`.

    Bug fix: the original did `export_generator(chunk_size=CHUNK_SIZE * 10)`,
    but the argument is already an iterable of byte chunks (the
    progress-wrapped generator built in save_export_stream), not a
    callable — calling it raised TypeError. Iterate it directly.
    """
    with open(export_path, "wb") as f:
        for chunk in export_generator:
            f.write(chunk)


def save_export_to_url(uri, export_generator):
    """Stream the export chunks to `uri` in a single chunked HTTP PUT."""
    resp = requests.put(uri, data=export_generator)
    resp.raise_for_status()


def read_import_stream(uri):
    """Yield byte chunks of the export at `uri`, with an upload progress bar.

    HTTP(S) URIs are streamed over the network; anything else is read as a
    local file (whose size gives the bar a total).
    """
    progress = dict(
        desc="Uploading export to NucliaDB",
        unit="iB",
        unit_scale=True,
    )
    if uri.startswith("http"):
        reader = read_from_url
    else:
        reader = read_from_file
        progress["total"] = os.path.getsize(uri)
    yield from progressify(reader(uri), **progress)


def read_from_file(path):
    """Yield the file at `path` as CHUNK_SIZE-sized byte chunks."""
    with open(path, mode="rb") as f:
        # iter() with a b"" sentinel stops exactly where the original
        # `if not chunk: break` loop did.
        yield from iter(lambda: f.read(CHUNK_SIZE), b"")


def read_from_url(uri):
    """Return an iterator over the body at `uri`, in CHUNK_SIZE pieces."""
    resp = requests.get(uri, stream=True)
    resp.raise_for_status()
    return resp.iter_content(chunk_size=CHUNK_SIZE)


def progressify(func, **tqdm_kwargs):
    """Re-yield chunks from the iterable `func`, advancing a tqdm bar.

    The bar is updated by the byte length of each chunk, so `unit="iB"`
    with `unit_scale=True` renders a transfer-rate style progress bar.
    """
    with tqdm(**tqdm_kwargs) as bar:
        for piece in func:
            bar.update(len(piece))
            yield piece


def parse_arguments():
    """Parse the CLI arguments of the export/import script.

    Returns the parsed namespace with `action`, `uri`, `kb` and
    `release_channel` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["export", "import"])
    parser.add_argument("--uri", type=str)
    parser.add_argument("--kb", type=str)
    channel_values = [channel.value for channel in ReleaseChannel]
    parser.add_argument(
        "--release_channel",
        type=str,
        choices=channel_values,
        default=ReleaseChannel.STABLE.value,
    )
    return parser.parse_args()


def main():
    """Script entry point: dispatch to export or import per CLI args."""
    args = parse_arguments()
    if args.action == "export":
        export_kb(uri=args.uri, slug=args.kb)
    elif args.action == "import":
        channel = ReleaseChannel(args.release_channel)
        import_kb(uri=args.uri, slug=args.kb, release_channel=channel)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
save_benchmark_json_results,
)
from nucliadb_performance.utils.misc import (
get_fake_word,
get_kb,
get_request,
make_kbid_request,
Expand Down Expand Up @@ -43,6 +44,20 @@ async def test_find(session):
)


@scenario(weight=2)
async def test_suggest(session):
    """Benchmark scenario: GET the suggest endpoint with a random word."""
    kbid, _ = get_test_kb()
    await make_kbid_request(
        session,
        kbid,
        "GET",
        f"/v1/kb/{kbid}/suggest",
        params={"query": get_fake_word()},
    )


@global_teardown()
def end_test():
    # Runs once after all benchmark scenarios have completed.
    print("This is the end of the test.")
Expand Down
4 changes: 3 additions & 1 deletion nucliadb_performance/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ nucliadb-sdk
certifi
faker
httpx
sentence-transformers
sentence-transformers
tqdm
types-tqdm

0 comments on commit 0d5c4d0

Please sign in to comment.