From 6bb96f53df68f57014445480cfa36976478889bf Mon Sep 17 00:00:00 2001 From: Peter Marsh Date: Sat, 2 Sep 2023 22:44:42 +0200 Subject: [PATCH] Open historical data files in binary mode (it's faster) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both json and orjson can parse JSON directly from raw bytes; they don't require the bytes to first be decoded to str. It turns out that reading files in binary mode and parsing the bytes directly to JSON is 1.1 to 1.2 times faster than reading the raw data in text mode. This does mean there is a type discrepancy between StreamListener.on_data and HistoricListener.on_data (str vs bytes respectively). It wouldn't be difficult to change betfairlightweight to have StreamListener.on_data accept bytes, and while it would be an API change I doubt anyone would need to change any code as more or less any .on_data implementation needs to parse the data to JSON immediately anyway and that would work the same with str or bytes. From my benchmark I can't see any significant difference in str vs bytes inside betfairlightweight and as such it seems to me to be more prudent to bend the type hinting a little here and not make any changes in betfairlightweight itself. 
Benchmark https://gist.github.com/petedmarsh/802f1d2cde79d957afcc744d63d34347 Benchmarks, repeat=5, number=5 ┌────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────────────┬─────────────────┬─────────────────┐ │ Benchmark │ Min │ Max │ Mean │ Min (+) │ Max (+) │ Mean (+) │ ├────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────────────┼─────────────────┼─────────────────┤ │ (In Memory) json(str) vs json(bytes) │ 1.188 │ 1.194 │ 1.192 │ 1.199 (-1.0x) │ 1.208 (-1.0x) │ 1.203 (-1.0x) │ │ (In Memory) orjson(str) vs orjson(bytes) │ 0.577 │ 0.580 │ 0.578 │ 0.576 (1.0x) │ 0.580 (-1.0x) │ 0.578 (1.0x) │ │ (In Memory) json(decoded bytes) vs json(bytes) │ 1.193 │ 1.200 │ 1.196 │ 1.200 (-1.0x) │ 1.207 (-1.0x) │ 1.203 (-1.0x) │ │ (In Memory) orjson(decoded bytes) vs orjson(bytes) │ 0.585 │ 0.590 │ 0.587 │ 0.579 (1.0x) │ 0.585 (1.0x) │ 0.582 (1.0x) │ │ (From File) rt json vs rt orjson │ 1.832 │ 1.855 │ 1.839 │ 1.219 (1.5x) │ 1.220 (1.5x) │ 1.220 (1.5x) │ │ (From File) rt json vs rb json │ 1.834 │ 1.836 │ 1.835 │ 1.633 (1.1x) │ 1.649 (1.1x) │ 1.641 (1.1x) │ │ (From File) rt orjson vs rb orjson │ 1.217 │ 1.226 │ 1.221 │ 1.000 (1.2x) │ 1.003 (1.2x) │ 1.002 (1.2x) │ │ (From File) rb json vs rb orjson │ 1.635 │ 1.643 │ 1.638 │ 1.000 (1.6x) │ 1.010 (1.6x) │ 1.007 (1.6x) │ └────────────────────────────────────────────────────┴─────────┴─────────┴─────────┴─────────────────┴─────────────────┴─────────────────┘ --- flumine/streams/historicalstream.py | 4 ++-- flumine/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flumine/streams/historicalstream.py b/flumine/streams/historicalstream.py index f33a7e15..56849ee9 100644 --- a/flumine/streams/historicalstream.py +++ b/flumine/streams/historicalstream.py @@ -184,7 +184,7 @@ def _add_stream(self, unique_id: int, operation: str): else: raise ListenerError("Unable to process '{0}' stream".format(operation)) - def on_data(self, raw_data: 
str) -> Optional[bool]: + def on_data(self, raw_data: bytes) -> Optional[bool]: try: data = json.loads(raw_data) except ValueError: @@ -205,7 +205,7 @@ def _read_loop(self) -> dict: self.listener.register_stream(self.unique_id, self.operation) listener_on_data = self.listener.on_data # cache functions stream_snap = self.listener.stream.snap - with open(self.file_path, "r") as f: + with open(self.file_path, "rb") as f: for update in f: if listener_on_data(update): yield stream_snap() diff --git a/flumine/utils.py b/flumine/utils.py index 3e823739..c054b962 100644 --- a/flumine/utils.py +++ b/flumine/utils.py @@ -63,7 +63,7 @@ def get_file_md(file_dir: Union[str, tuple], value: str) -> Optional[str]: # get value from raw streaming file marketDefinition if isinstance(file_dir, tuple): file_dir = file_dir[0] - with open(file_dir, "r") as f: + with open(file_dir, "rb") as f: first_line = f.readline() update = json.loads(first_line) if "mc" not in update or not isinstance(update["mc"], list) or not update["mc"]: