Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
lferran committed Nov 30, 2023
1 parent 84cebd9 commit 0d5c4d0
Show file tree
Hide file tree
Showing 4 changed files with 240 additions and 1 deletion.
60 changes: 60 additions & 0 deletions .github/workflows/nucliadb_performance2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
name: Performance Regression
on:
  push:
    branches:
      - ferran/sc-7816/integrate-search-benchmarks-on-ci
  issue_comment:
    types: [created]
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:

permissions: write-all

jobs:
  check-performance:
    if: github.event_name == 'push' || github.event.issue.pull_request && contains(github.event.comment.body, '/bench')
    name: Run NucliaDB standalone performance tests
    runs-on: ubuntu-latest
    env:
      KB_SLUG: small
      EXPORTS_URI: ${{ secrets.EXPORTS_URI }}

    steps:
      - name: Checkout the repository
        uses: actions/checkout@v3
      - name: Install NucliaDB
        run: |
          make -C nucliadb install
      - name: Start NucliaDB
        run: |
          DEBUG=true nucliadb &
          echo NDB_PID=$! >> "$GITHUB_ENV"
      - name: Import data
        run: |
          # NOTE(review): this commit adds nucliadb_performance/export_import.py,
          # not nucliadb/nucliadb_performance/import-kb.py, and that script
          # dispatches on an explicit --action flag — fixed the invocation.
          python nucliadb_performance/export_import.py --action=import --kb=$KB_SLUG --uri=$EXPORTS_URI/$KB_SLUG
      - name: Run the bench
        run: |
          make -C nucliadb_performance test-standalone-search
      - name: Stop NucliaDB
        run: kill $NDB_PID

      - uses: Swatinem/rust-cache@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true

      - name: Store performance data
        uses: nuclia/nucliadb_performance@main
        with:
          # Fixed typo: "nucliadb_performace" -> "nucliadb_performance".
          metrics_file: nucliadb_performance/standalone.json
          influxdb_url: ${{ secrets.INFLUXDB_SERVER }}
          influxdb_token: ${{ secrets.INFLUXDB_TOKEN }}
          influxdb_org: nuclia
          influxdb_bucket: benchmarks
162 changes: 162 additions & 0 deletions nucliadb_performance/export_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import argparse
import os
import time
from dataclasses import dataclass

import requests
from tqdm import tqdm

from nucliadb_models.resource import ReleaseChannel
from nucliadb_sdk import NucliaSDK
from nucliadb_sdk.v2.exceptions import NotFoundError


@dataclass
class NucliaDB:
    """Pair of NucliaDB SDK clients, one per role set.

    The standalone server authorizes via the X-Nucliadb-Roles header, so
    reads and writes need separately-configured clients.
    """

    # Client sending the READER role header.
    reader: NucliaSDK
    # Client sending the WRITER;MANAGER role headers.
    writer: NucliaSDK


# Base URL of the locally running NucliaDB standalone server.
API = "http://localhost:8080/api"
# Streaming chunk size: 5 MiB. The original read `1024 * 1025 * 5`;
# "1025" looks like a typo for 1024. The value is only a buffer size,
# so normalizing it is behavior-safe.
CHUNK_SIZE = 1024 * 1024 * 5


# Module-level client pair used by the import/export helpers below.
ndb = NucliaDB(
    reader=NucliaSDK(url=API, headers={"X-Nucliadb-Roles": "READER"}),
    writer=NucliaSDK(url=API, headers={"X-Nucliadb-Roles": "WRITER;MANAGER"}),
)


def get_or_create_kb(ndb, slug, release_channel=None) -> str:
    """Return the uuid of the knowledge box with `slug`, creating it if absent.

    Looks the KB up by slug first (EAFP); on NotFoundError it is created
    with the given release channel.
    """
    try:
        return ndb.reader.get_knowledge_box_by_slug(slug=slug).uuid
    except NotFoundError:
        created = ndb.writer.create_knowledge_box(
            slug=slug, release_channel=release_channel
        )
        return created.uuid


def import_kb(*, uri, slug, release_channel=None):
    """Import the export stream at `uri` into the KB with slug `slug`.

    Creates the knowledge box if needed, starts an import task on the
    server, then polls its status every 2 seconds until it finishes.

    Raises:
        RuntimeError: if the server reports the import task errored.
    """
    kbid = get_or_create_kb(ndb, slug, release_channel=release_channel)
    print(f"Importing from {uri} to kb={slug}")

    import_id = ndb.writer.start_import(
        kbid=kbid, content=read_import_stream(uri)
    ).import_id
    print(f"Started import task. Import id: {import_id}")

    print("Waiting for the data to be imported")
    status = ndb.reader.import_status(kbid=kbid, import_id=import_id)
    while status.status != "finished":
        print(f"Status: {status.status} {status.processed}/{status.total}")
        # Explicit raise instead of `assert`: assertions are stripped
        # under `python -O`, which would turn an errored import into an
        # infinite polling loop.
        if status.status == "error":
            raise RuntimeError(f"Import task {import_id} reported an error")
        time.sleep(2)
        status = ndb.reader.import_status(kbid=kbid, import_id=import_id)
    print("Import finished!")


def export_kb(*, uri, slug):
    """Export the KB with slug `slug` and save the stream to `uri`.

    Starts a server-side export task, polls its status every 2 seconds
    until it finishes, then downloads the export and persists it to a
    local path or an HTTP(S) destination.

    Raises:
        RuntimeError: if the server reports the export task errored.
    """
    kbid = ndb.reader.get_knowledge_box_by_slug(slug=slug).uuid
    export_id = ndb.writer.start_export(kbid=kbid).export_id

    print(f"Starting export for {slug}. Export id: {export_id}")
    status = ndb.reader.export_status(kbid=kbid, export_id=export_id)
    while status.status != "finished":
        print(f"Status: {status.status} {status.processed}/{status.total}")
        # Explicit raise instead of `assert`: assertions are stripped
        # under `python -O` and would leave this loop spinning forever.
        if status.status == "error":
            raise RuntimeError(f"Export task {export_id} reported an error")
        time.sleep(2)
        status = ndb.reader.export_status(kbid=kbid, export_id=export_id)

    print(f"Downloading export at {uri}")
    export_generator = ndb.reader.download_export(kbid=kbid, export_id=export_id)
    save_export_stream(uri, export_generator)


def save_export_stream(uri, export_generator):
    """Persist an export chunk stream to `uri`, showing a progress bar.

    HTTP(S) URIs are uploaded with a PUT; anything else is treated as a
    local file path.
    """
    progress = progressify(
        export_generator,
        desc="Downloading export from NucliaDB",
        unit="iB",
        unit_scale=True,
    )
    saver = save_export_to_url if uri.startswith("http") else save_export_to_file
    saver(uri, progress)


def save_export_to_file(export_path, export_generator):
    """Write every chunk of `export_generator` to the file at `export_path`.

    Bug fix: the original did `export_generator(chunk_size=CHUNK_SIZE * 10)`,
    but the argument is already an iterable of byte chunks (the
    progress-wrapped generator built in save_export_stream), not a
    callable — calling it raised TypeError. Iterate it directly.
    """
    with open(export_path, "wb") as f:
        for chunk in export_generator:
            f.write(chunk)


def save_export_to_url(uri, export_generator):
    """Stream the export chunks to `uri` in a single chunked HTTP PUT."""
    resp = requests.put(uri, data=export_generator)
    resp.raise_for_status()


def read_import_stream(uri):
    """Yield byte chunks of the export at `uri`, with an upload progress bar.

    HTTP(S) URIs are streamed over the network; anything else is read as a
    local file (whose size gives the bar a total).
    """
    progress = dict(
        desc="Uploading export to NucliaDB",
        unit="iB",
        unit_scale=True,
    )
    if uri.startswith("http"):
        reader = read_from_url
    else:
        reader = read_from_file
        progress["total"] = os.path.getsize(uri)
    yield from progressify(reader(uri), **progress)


def read_from_file(path):
    """Yield the file at `path` as CHUNK_SIZE-sized byte chunks."""
    with open(path, mode="rb") as f:
        # iter() with a b"" sentinel stops exactly where the original
        # `if not chunk: break` loop did.
        yield from iter(lambda: f.read(CHUNK_SIZE), b"")


def read_from_url(uri):
    """Return an iterator over the body at `uri`, in CHUNK_SIZE pieces."""
    resp = requests.get(uri, stream=True)
    resp.raise_for_status()
    return resp.iter_content(chunk_size=CHUNK_SIZE)


def progressify(func, **tqdm_kwargs):
    """Re-yield chunks from the iterable `func`, advancing a tqdm bar.

    The bar is updated by the byte length of each chunk, so `unit="iB"`
    with `unit_scale=True` renders a transfer-rate style progress bar.
    """
    with tqdm(**tqdm_kwargs) as bar:
        for piece in func:
            bar.update(len(piece))
            yield piece


def parse_arguments():
    """Parse the CLI arguments of the export/import script.

    Returns the parsed namespace with `action`, `uri`, `kb` and
    `release_channel` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["export", "import"])
    parser.add_argument("--uri", type=str)
    parser.add_argument("--kb", type=str)
    channel_values = [channel.value for channel in ReleaseChannel]
    parser.add_argument(
        "--release_channel",
        type=str,
        choices=channel_values,
        default=ReleaseChannel.STABLE.value,
    )
    return parser.parse_args()


def main():
    """Script entry point: dispatch to export or import per CLI args."""
    args = parse_arguments()
    if args.action == "export":
        export_kb(uri=args.uri, slug=args.kb)
    elif args.action == "import":
        channel = ReleaseChannel(args.release_channel)
        import_kb(uri=args.uri, slug=args.kb, release_channel=channel)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
save_benchmark_json_results,
)
from nucliadb_performance.utils.misc import (
get_fake_word,
get_kb,
get_request,
make_kbid_request,
Expand Down Expand Up @@ -43,6 +44,20 @@ async def test_find(session):
)


@scenario(weight=2)
async def test_suggest(session):
    """Benchmark scenario: GET the suggest endpoint with a random word."""
    kbid, _ = get_test_kb()
    await make_kbid_request(
        session,
        kbid,
        "GET",
        f"/v1/kb/{kbid}/suggest",
        params={"query": get_fake_word()},
    )


@global_teardown()
def end_test():
    # Runs once after all benchmark scenarios have completed.
    print("This is the end of the test.")
Expand Down
4 changes: 3 additions & 1 deletion nucliadb_performance/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ nucliadb-sdk
certifi
faker
httpx
sentence-transformers
sentence-transformers
tqdm
types-tqdm

0 comments on commit 0d5c4d0

Please sign in to comment.