Commit 30560eb: add diff query benchmark

LucasG0 committed Oct 25, 2024
1 parent 27f4f86 commit 30560eb
Showing 8 changed files with 285 additions and 122 deletions.
4 changes: 2 additions & 2 deletions backend/infrahub/config.py
@@ -197,9 +197,9 @@ class DatabaseSettings(BaseSettings):
tls_insecure: bool = Field(default=False, description="Indicates if TLS certificates are verified")
tls_ca_file: Optional[str] = Field(default=None, description="File path to CA cert or bundle in PEM format")
query_size_limit: int = Field(
default=5000,
default=5_000,
ge=1,
le=20000,
le=20_000,
description="The max number of records to fetch in a single query before performing internal pagination.",
)
max_depth_search_hierarchy: int = Field(
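
The ge/le bounds on query_size_limit are enforced by pydantic when the settings object is built. A minimal sketch of that behaviour (not part of this commit, assuming DatabaseSettings is importable from infrahub.config and can be constructed with defaults):

from pydantic import ValidationError

from infrahub.config import DatabaseSettings

# Within the allowed range [1, 20_000]: accepted.
settings = DatabaseSettings(query_size_limit=10_000)

# Outside the range: pydantic rejects the value at construction time.
try:
    DatabaseSettings(query_size_limit=50_000)  # above le=20_000
except ValidationError as exc:
    print(exc)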
1 change: 1 addition & 0 deletions backend/infrahub/core/query/relationship.py
@@ -446,6 +446,7 @@ async def query_init(self, db: InfrahubDatabase, **kwargs) -> None:
r1 = f"{arrows.left.start}[r1:{self.rel_type} $rel_prop ]{arrows.left.end}"
r2 = f"{arrows.right.start}[r2:{self.rel_type} $rel_prop ]{arrows.right.end}"

# Specifying relationship type might improve query performance here.
query = """
MATCH (s:Node { uuid: $source_id })-[]-(rl:Relationship {uuid: $rel_id})-[]-(d:Node { uuid: $destination_id })
CREATE (s)%s(rl)
14 changes: 14 additions & 0 deletions backend/tests/helpers/query_benchmark/benchmark_config.py
@@ -0,0 +1,14 @@
from dataclasses import dataclass

from infrahub.database import Neo4jRuntime
from tests.helpers.constants import NEO4J_ENTERPRISE_IMAGE


@dataclass
class BenchmarkConfig:
neo4j_image: str = NEO4J_ENTERPRISE_IMAGE
neo4j_runtime: Neo4jRuntime = Neo4jRuntime.DEFAULT
load_db_indexes: bool = False

def __str__(self) -> str:
return f"{self.neo4j_image=} ; runtime: {self.neo4j_runtime} ; indexes: {self.load_db_indexes}"
216 changes: 130 additions & 86 deletions backend/tests/helpers/query_benchmark/car_person_generators.py
@@ -1,44 +1,154 @@
import random
import uuid
from typing import Optional, Tuple
from typing import Any, Optional, Tuple

from infrahub.core import registry
from infrahub.core.branch import Branch
from infrahub.core.manager import NodeManager
from infrahub.core.node import Node
from tests.helpers.query_benchmark.data_generator import DataGenerator
from tests.helpers.query_benchmark.db_query_profiler import InfrahubDatabaseProfiler


class CarGenerator(DataGenerator):
async def load_data(self, nb_elements: int) -> None:
await self.load_cars(nb_elements)
default_branch = await registry.get_branch(db=self.db)
await self.load_cars(default_branch, nb_elements)

async def load_cars(self, nb_cars: int, persons: Optional[dict[str, Node]] = None) -> dict[str, Node]:
"""
Load cars and return a mapping car_name -> car_node.
If 'persons' is specified, each car created is linked to a person.
"""
async def load_car_random_name(self, branch: Branch, nbr_seats: int, **kwargs: Any) -> Node:
car_schema = registry.schema.get_node_schema(name="TestCar", branch=branch)

default_branch = await registry.get_branch(db=self.db)
car_schema = registry.schema.get_node_schema(name="TestCar", branch=default_branch)
short_id = str(uuid.uuid4())[:8]
car_name = f"car-{short_id}"
car_node = await Node.init(db=self.db, schema=car_schema, branch=branch)
await car_node.new(db=self.db, name=car_name, nbr_seats=nbr_seats, **kwargs)

return await car_node.save(db=self.db)

async def load_cars(self, branch: Branch, nb_cars: int, **kwargs: Any) -> dict[str, Node]:
cars = {}
for _ in range(nb_cars):
short_id = str(uuid.uuid4())[:8]
car_name = f"car-{short_id}"
car_node = await Node.init(db=self.db, schema=car_schema, branch=default_branch)
if persons is not None:
random_person = random.choice([persons[person_name] for person_name in persons])
await car_node.new(db=self.db, name=car_name, nbr_seats=4, owner=random_person)
else:
await car_node.new(db=self.db, name=car_name, nbr_seats=4)
car_node = await self.load_car_random_name(nbr_seats=4, branch=branch, **kwargs)
cars[car_node.name.value] = car_node # type: ignore[attr-defined]

async with self.db.start_session():
await car_node.save(db=self.db)
return cars

cars[car_name] = car_node

class EngineGenerator(DataGenerator):
async def load_data(self, nb_elements: int) -> None:
default_branch = await registry.get_branch(db=self.db)
await self.load_engines(default_branch, nb_elements)

async def load_engines(self, branch: Branch, nb_cars: int, **kwargs: Any) -> dict[str, Node]:
engines = {}
for _ in range(nb_cars):
engine_node = await self.load_engine_random_name(branch=branch, **kwargs)
engines[engine_node.name.value] = engine_node # type: ignore[attr-defined]

return engines

async def load_engine_random_name(self, branch: Branch, **kwargs: Any) -> Node:
engine_schema = registry.schema.get_node_schema(name="TestEngine", branch=branch)

short_id = str(uuid.uuid4())[:8]
engine_name = f"engine-{short_id}"
engine_node = await Node.init(db=self.db, schema=engine_schema, branch=branch)
await engine_node.new(db=self.db, name=engine_name, **kwargs)

return await engine_node.save(db=self.db)


class CarWithDiffInSecondBranchGenerator(CarGenerator):
persons: Optional[dict[str, Node]] # mapping of existing person names -> node
nb_persons: int
diff_ratio: float # 0.1 means 10% of nodes added and 10% of nodes modified in the diff branch
main_branch: Branch
diff_branch: Branch

def __init__(
self, db: InfrahubDatabaseProfiler, nb_persons: int, diff_ratio: float, main_branch: Branch, diff_branch: Branch
) -> None:
super().__init__(db)
self.persons = None
self.nb_persons = nb_persons
self.diff_ratio = diff_ratio
self.main_branch = main_branch
self.diff_branch = diff_branch

async def init(self) -> None:
"""Load persons, that will be later connected to generated cars."""
self.persons = await PersonGenerator(self.db).load_persons(nb_persons=self.nb_persons)

async def load_cars_with_multiple_rels(self, branch: Branch, nb_cars: int) -> dict[str, Node]:
assert self.persons is not None
engine_generator = EngineGenerator(db=self.db)

cars = {}
for _ in range(nb_cars):
owner = random.choice([self.persons[person_name] for person_name in self.persons])
drivers = random.choices([self.persons[person_name] for person_name in self.persons], k=nb_cars)
engine = await engine_generator.load_engine_random_name(branch=branch)
car = await self.load_car_random_name(
branch=branch, nbr_seats=4, owner=owner, drivers=drivers, engine=engine
)
cars[car.name.value] = car # type: ignore[attr-defined]

return cars

async def load_data(self, nb_elements: int) -> None:
"""
Load cars in the main branch, rebase the diff branch on the main branch, then load changes
within the diff branch according to the given ratio.
The differences are:
- Update some car attributes as well as 1:1, 1:N, and N:N relationships.
- Add new cars.
Note that we do not delete cars within the diff branch, as deletions appear to take too long.
"""

assert self.persons is not None, "'init' method should be called before 'load_data'"

if nb_elements == 0:
return

# Load cars in main branch
new_cars = await self.load_cars_with_multiple_rels(nb_cars=nb_elements, branch=self.main_branch)

# Integrate these new cars in diff branch
await self.diff_branch.rebase(self.db)

# Retrieve car nodes from diff branch, including the ones not present in main branch
# that were created by prior calls to `load_data`
car_schema = registry.schema.get_node_schema(name="TestCar", branch=self.diff_branch)
car_nodes = await NodeManager.query(db=self.db, schema=car_schema, branch=self.diff_branch)
new_car_nodes = [car_node for car_node in car_nodes if car_node.name.value in new_cars]

nb_diff = max(int(nb_elements * self.diff_ratio), 1)

# Update cars in diff branch
car_nodes_updatable = new_car_nodes
car_nodes_to_update = random.choices(car_nodes_updatable, k=nb_diff)
for i, car_node in enumerate(car_nodes_to_update):
car_node.name.value = f"updated-car-{str(uuid.uuid4())[:8]}"

# Permute engines among the car nodes being updated, so the one-to-one relationship between cars and engines is preserved
new_engine = car_nodes_to_update[(i + 1) % len(car_nodes_to_update)].engine
car_node.engine.update(db=self.db, data=new_engine)

# Update one-to-many relationship
new_owner = random.choice([self.persons[person_name] for person_name in self.persons])
car_node.owner.update(db=self.db, data=new_owner)

# Update many-to-many relationship
new_drivers = random.choices([self.persons[person_name] for person_name in self.persons])
car_node.drivers.update(db=self.db, data=new_drivers)

await car_node.save(db=self.db)

# Add a few cars in diff branch
added_cars = await self.load_cars_with_multiple_rels(nb_cars=nb_diff, branch=self.diff_branch)

assert len(added_cars) == len(car_nodes_to_update) == nb_diff


class PersonGenerator(DataGenerator):
async def load_data(self, nb_elements: int) -> None:
@@ -77,42 +187,6 @@ async def load_persons(
return persons_names_to_nodes


class PersonFromExistingCarGenerator(PersonGenerator):
cars: Optional[dict[str, Node]] # mapping of existing cars names -> node
nb_cars: int

def __init__(self, db: InfrahubDatabaseProfiler, nb_cars: int) -> None:
super().__init__(db)
self.nb_cars = nb_cars
self.cars = None

async def init(self) -> None:
"""Load cars, that will be later connected to generated persons."""
self.cars = await CarGenerator(self.db).load_cars(nb_cars=self.nb_cars)

async def load_data(self, nb_elements: int) -> None:
assert self.cars is not None, "'init' method should be called before 'load_data'"
await self.load_persons(nb_persons=nb_elements, cars=self.cars)


class CarFromExistingPersonGenerator(CarGenerator):
persons: Optional[dict[str, Node]] # mapping of existing cars names -> node
nb_persons: int

def __init__(self, db: InfrahubDatabaseProfiler, nb_persons: int) -> None:
super().__init__(db)
self.nb_persons = nb_persons
self.persons = None

async def init(self) -> None:
"""Load persons, that will be later connected to generated cars."""
self.persons = await PersonGenerator(self.db).load_persons(nb_persons=self.nb_persons)

async def load_data(self, nb_elements: int) -> None:
assert self.persons is not None, "'init' method should be called before 'load_data'"
await self.load_cars(nb_cars=nb_elements, persons=self.persons)


class CarGeneratorWithOwnerHavingUniqueCar(CarGenerator):
persons: list[Tuple[str, Node]] # list of (person name, node) tuples for existing persons
nb_persons: int
@@ -154,33 +228,3 @@ async def load_data(self, nb_elements: int) -> None:
await car_node.save(db=self.db)

self.nb_cars_loaded += nb_elements


class CarAndPersonIsolatedGenerator(DataGenerator):
def __init__(self, db: InfrahubDatabaseProfiler) -> None:
super().__init__(db)
self.car_generator: CarGenerator = CarGenerator(db)
self.person_generator: PersonGenerator = PersonGenerator(db)

async def load_data(self, nb_elements: int) -> None:
"""
Load not connected cars and persons. Note that 'nb_elements' cars plus 'nb_elements' persons are loaded.
"""

await self.car_generator.load_cars(nb_cars=nb_elements)
await self.person_generator.load_persons(nb_persons=nb_elements)


class CarAndPersonConnectedGenerator(DataGenerator):
def __init__(self, db: InfrahubDatabaseProfiler) -> None:
super().__init__(db)
self.car_generator: CarGenerator = CarGenerator(db)
self.person_generator: PersonGenerator = PersonGenerator(db)

async def load_data(self, nb_elements: int) -> None:
"""
Load connected cars and persons. Note that 'nb_elements' cars plus 'nb_elements' persons are loaded.
"""

persons = await self.person_generator.load_persons(nb_persons=nb_elements)
await self.car_generator.load_cars(nb_cars=nb_elements, persons=persons)
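
A minimal sketch (not part of this commit) of how the new CarWithDiffInSecondBranchGenerator might be driven from an async test; db, main_branch and diff_branch are assumed to come from existing fixtures, only the generator API above is real:

generator = CarWithDiffInSecondBranchGenerator(
    db=db,                 # InfrahubDatabaseProfiler instance
    nb_persons=100,
    diff_ratio=0.1,        # ~10% of the loaded cars are updated and ~10% added in the diff branch
    main_branch=main_branch,
    diff_branch=diff_branch,
)
await generator.init()            # loads the persons that cars will be linked to
await generator.load_data(1_000)  # loads cars in main, rebases the diff branch, then creates the diff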
1 change: 1 addition & 0 deletions backend/tests/helpers/query_benchmark/data_generator.py
@@ -68,6 +68,7 @@ async def load_data_and_profile(
)

for i, nb_elem_to_load in enumerate(nb_elem_per_batch):
print(f"Before loading batch {i=}. Current elements: {i * nb_elem_to_load=}")
await data_generator.load_data(nb_elements=nb_elem_to_load)
db_profiling_queries.increase_nb_elements_loaded(nb_elem_to_load)
profile_memory = i % memory_profiling_rate == 0 if memory_profiling_rate is not None else False
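
An illustration (not from the repository) of the memory-profiling cadence in the loop above, assuming memory_profiling_rate=3 and equal batch sizes:

nb_elem_per_batch = [500] * 6
memory_profiling_rate = 3

for i, nb_elem_to_load in enumerate(nb_elem_per_batch):
    profile_memory = i % memory_profiling_rate == 0  # True for batches 0 and 3
    print(f"batch {i}: load {nb_elem_to_load} elements, profile_memory={profile_memory}")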
44 changes: 43 additions & 1 deletion backend/tests/query_benchmark/conftest.py
@@ -3,6 +3,7 @@

import pytest

from infrahub import config
from infrahub.core.constants import BranchSupportType
from infrahub.core.schema import SchemaRoot
from tests.helpers.query_benchmark.db_query_profiler import GraphProfileGenerator
@@ -40,6 +41,14 @@ async def car_person_schema_root() -> SchemaRoot:
"peer": "TestPerson",
"cardinality": "one",
},
{
"name": "drivers",
"label": "Who already drove the car",
"peer": "TestPerson",
"identifier": "testcar__drivers",
"cardinality": "many",
},
{"name": "engine", "label": "engine of the car", "peer": "TestEngine", "cardinality": "one"},
],
},
{
@@ -54,7 +63,32 @@
{"name": "height", "kind": "Number", "optional": True},
],
"relationships": [
{"name": "cars", "peer": "TestCar", "cardinality": "many"},
{
"name": "cars",
"peer": "TestCar",
"cardinality": "many",
},
{
"name": "driven_cars",
"label": "Already driven by the Person",
"peer": "TestCar",
"identifier": "testcar__drivers",
"cardinality": "many",
},
],
},
{
"name": "Engine",
"namespace": "Test",
"default_filter": "name__value",
"display_labels": ["name__value"],
"branch": BranchSupportType.AWARE.value,
"uniqueness_constraints": [["name__value"]],
"attributes": [
{"name": "name", "kind": "Text", "unique": True},
],
"relationships": [
{"name": "car", "peer": "TestCar", "cardinality": "one"},
],
},
],
Expand All @@ -71,3 +105,11 @@ async def graph_generator() -> GraphProfileGenerator:
"""

return GraphProfileGenerator()


@pytest.fixture(scope="function")
async def increase_query_size_limit() -> None:
original_query_size_limit = config.SETTINGS.database.query_size_limit
config.SETTINGS.database.query_size_limit = 1_000_000
yield
config.SETTINGS.database.query_size_limit = original_query_size_limit
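
A minimal sketch (not part of this commit) of a benchmark test opting into the larger query size limit; the test name and body are hypothetical, only the fixtures are real:

async def test_diff_query_benchmark(increase_query_size_limit, graph_generator):
    # config.SETTINGS.database.query_size_limit is 1_000_000 for the duration of this test
    # and is restored to its original value afterwards by the fixture.
    ...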
