Consider applying black to format the code #110

Closed (wants to merge 2 commits)
16 changes: 13 additions & 3 deletions clkhash/backports.py
@@ -22,7 +22,9 @@ def __int_from_bytes(bytes, byteorder, signed=False):
:param byteorder: Either `'big'` or `'little'`.
"""
if signed:
raise NotImplementedError("Signed integers are not currently supported in this " "backport.")
raise NotImplementedError(
"Signed integers are not currently supported in this " "backport."
Contributor: Black doesn't recognise that these two literals will be merged into one string, so it doesn't remove the superfluous " ".

)
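One way to sidestep this (a hand-edit sketch, not part of this PR): merge the two adjacent literals into one before running Black, after which the stray " " disappears and the wrapping stays clean.

    # Hypothetical pre-Black edit; the merged literal is an assumption, not a change in this PR.
    if signed:
        raise NotImplementedError(
            "Signed integers are not currently supported in this backport."
        )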

if byteorder == "big":
pass
@@ -38,7 +40,14 @@ def __int_from_bytes(bytes, byteorder, signed=False):
# named arguments. Hence, must cast so Mypy thinks it matches the
# original function.
int_from_bytes = cast(
Callable[[Arg(Sequence[int], "bytes"), Arg(str, "byteorder"), DefaultNamedArg(bool, "signed")], int],
Callable[
[
Arg(Sequence[int], "bytes"),
Arg(str, "byteorder"),
DefaultNamedArg(bool, "signed"),
],
int,
],
__int_from_bytes,
)

@@ -93,5 +102,6 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):


unicode_reader = (
_p2_unicode_reader if sys.version_info < (3, 0) else csv.reader # Python 2 with hacky workarounds.
_p2_unicode_reader if sys.version_info
< (3, 0) else csv.reader # Python 2 with hacky workarounds.
Contributor: I find this much less readable than the previous form.

) # Py3 with native Unicode support.
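A possible alternative (a sketch, not something proposed in this PR): an explicit if/else keeps the version check on one line, keeps each comment next to the value it describes, and is a form Black leaves untouched.

    # Hypothetical rewrite of the conditional assignment above.
    if sys.version_info < (3, 0):
        unicode_reader = _p2_unicode_reader  # Python 2 with hacky workarounds.
    else:
        unicode_reader = csv.reader  # Py3 with native Unicode support.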
6 changes: 5 additions & 1 deletion clkhash/benchmark.py
@@ -35,7 +35,11 @@ def compute_hash_speed(n, quiet=False):

elapsed_time = end - start
if not quiet:
print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n / (1000 * elapsed_time)))
print(
"{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(
n, elapsed_time, n / (1000 * elapsed_time)
)
)
return n / elapsed_time


53 changes: 42 additions & 11 deletions clkhash/bloomfilter.py
@@ -31,7 +31,11 @@


def double_hash_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: MyPy compatibility is broken here.

):
# type: (...) -> bitarray
"""
@@ -52,16 +56,24 @@ def double_hash_encode_ngrams(
bf = bitarray(l)
bf.setall(False)
for m in ngrams:
sha1hm = int(hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16) % l
md5hm = int(hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16) % l
sha1hm = int(
hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16
) % l
md5hm = int(
hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16
) % l
for i in range(k):
gi = (sha1hm + i * md5hm) % l
bf[gi] = 1
return bf
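On the Mypy comments above: comment-style annotations only work when each "# type:" comment sits on the same line as its own argument, so stacking them all after the encoding argument breaks type checking. A hedged sketch of a layout that mypy accepts and that Black should preserve (it does not merge lines carrying trailing comments); the exact spacing is an assumption, not taken from this PR:

    def double_hash_encode_ngrams(
        ngrams,    # type: Iterable[str]
        keys,      # type: Sequence[bytes]
        k,         # type: int
        l,         # type: int
        encoding,  # type: str
    ):
        # type: (...) -> bitarray
        ...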


def double_hash_encode_ngrams_non_singular(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: Ditto regarding Mypy compatibility.

):
# type: (...) -> bitarray.bitarray
"""
@@ -124,7 +136,11 @@


def blake_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: Ditto regarding Mypy compatibility.

):
# type: (...) -> bitarray.bitarray
"""
@@ -180,7 +196,11 @@ def blake_encode_ngrams(

log_l = int(math.log(l, 2))
if not 2 ** log_l == l:
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
raise ValueError(
'parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(
l
)
)
Contributor: This is really ugly.

Collaborator (Author): Agree - it looks better with the longer line length - 8254286#diff-f36426d58cf2623d909f2f77aae79799R183
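For reference, Black's line limit is configurable with the --line-length option. Under a longer limit such as "black --line-length 120", the original one-liner fits and would be expected to survive unchanged, which is presumably why the linked diff reads better (a hedged sketch of the expected output, not copied from that commit):

    # Expected formatting under a 120-character limit (assumption).
    if not 2 ** log_l == l:
        raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))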


bf = bitarray(l)
bf.setall(False)
@@ -192,8 +212,12 @@
for m in ngrams:
random_shorts = [] # type: List[int]
for i in range(num_macs):
hash_bytes = blake2b(m.encode(encoding=encoding), key=key, salt=str(i).encode()).digest()
random_shorts.extend(struct.unpack("32H", hash_bytes)) # interpret hash bytes as 32 unsigned shorts.
hash_bytes = blake2b(
m.encode(encoding=encoding), key=key, salt=str(i).encode()
).digest()
random_shorts.extend(
struct.unpack("32H", hash_bytes)
) # interpret hash bytes as 32 unsigned shorts.
for i in range(k):
idx = random_shorts[i] % l
bf[idx] = 1
@@ -316,7 +340,9 @@ def crypto_bloom_filter(


def stream_bloom_filters(
dataset, keys, schema # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema
dataset,
keys,
schema, # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema
Contributor: Mypy.

):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
@@ -328,11 +354,16 @@ def stream_bloom_filters(
:param xor_folds: number of XOR folds to perform
:return: Yields bloom filters as 3-tuples
"""
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields]
tokenizers = [
tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields
]
field_hashing = [field.hashing_properties for field in schema.fields]
hash_properties = schema.hashing_globals

return (crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) for s in dataset)
return (
crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties)
for s in dataset
)


def serialize_bitarray(ba):
91 changes: 72 additions & 19 deletions clkhash/cli.py
@@ -47,8 +47,12 @@ def cli(verbose=False):
@click.argument("keys", nargs=2, type=click.Tuple([str, str]))
@click.argument("schema", type=click.File("r", lazy=True))
@click.argument("output", type=click.File("w"))
@click.option("-q", "--quiet", default=False, is_flag=True, help="Quiet any progress messaging")
@click.option("--no-header", default=False, is_flag=True, help="Don't skip the first row")
@click.option(
"-q", "--quiet", default=False, is_flag=True, help="Quiet any progress messaging"
)
@click.option(
"--no-header", default=False, is_flag=True, help="Don't skip the first row"
)
def hash(input, keys, schema, output, quiet, no_header):
"""Process data to create CLKs

@@ -67,16 +71,25 @@ def hash(input, keys, schema, output, quiet, no_header):

schema_object = clkhash.schema.Schema.from_json_file(schema_file=schema)

clk_data = clk.generate_clk_from_csv(input, keys, schema_object, header=not no_header, progress_bar=not quiet)
clk_data = clk.generate_clk_from_csv(
input, keys, schema_object, header=not no_header, progress_bar=not quiet
)
json.dump({"clks": clk_data}, output)
if hasattr(output, "name"):
log("CLK data written to {}".format(output.name))


@cli.command("status", short_help="Get status of entity service")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def status(server, output, verbose):
"""Connect to an entity matching server and check the service status.

@@ -118,11 +131,22 @@ def status(server, output, verbose):
default="permutation_unencrypted_mask",
help="Alternative protocol/view type of the mapping. Default is unencrypted permutation and mask.",
)
@click.option("--schema", type=click.File("r"), help="Schema to publicly share with participating parties.")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--schema",
type=click.File("r"),
help="Schema to publicly share with participating parties.",
)
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-t", "--threshold", type=float, default=0.95)
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def create(type, schema, server, output, threshold, verbose):
"""Create a new mapping on an entity matching server.

@@ -149,7 +173,8 @@ def create(type, schema, server, output, threshold, verbose):

log("Creating new mapping")
response = requests.post(
"{}/api/v1/mappings".format(server), json={"schema": schema_json, "result_type": type, "threshold": threshold}
"{}/api/v1/mappings".format(server),
json={"schema": schema_json, "result_type": type, "threshold": threshold},
)

if response.status_code != 200:
@@ -166,9 +191,16 @@
@click.argument("input", type=click.File("r"))
@click.option("--mapping", help="Server identifier of the mapping")
@click.option("--apikey", help="Authentication API key for the server.")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def upload(input, mapping, apikey, server, output, verbose):
"""Upload CLK data to entity matching server.

@@ -196,16 +228,25 @@ def upload(input, mapping, apikey, server, output, verbose):

if verbose:
log(response.text)
log("When the other party has uploaded their CLKS, you should be able to watch for results")
log(
"When the other party has uploaded their CLKS, you should be able to watch for results"
)

print(response.text, file=output)


@cli.command("results", short_help="fetch results from entity service")
@click.option("--mapping", help="Server identifier of the mapping")
@click.option("--apikey", help="Authentication API key for the server.")
@click.option("-w", "--watch", help="Follow/wait until results are available", is_flag=True)
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"-w", "--watch", help="Follow/wait until results are available", is_flag=True
)
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
def results(mapping, apikey, watch, server, output):
"""
@@ -223,7 +264,10 @@ def results(mapping, apikey, watch, server, output):
log("Status: {}".format(status))

def get_result():
return requests.get("{}/api/v1/mappings/{}".format(server, mapping), headers={"Authorization": apikey})
return requests.get(
"{}/api/v1/mappings/{}".format(server, mapping),
headers={"Authorization": apikey},
)

response = get_result()
log("Response code: {}".format(response.status_code))
@@ -257,14 +301,23 @@ def generate(size, output, schema):
if schema is not None:
raise NotImplementedError

randomnames.save_csv(pii_data.names, [f.identifier for f in pii_data.SCHEMA.fields], output)
randomnames.save_csv(
pii_data.names, [f.identifier for f in pii_data.SCHEMA.fields], output
)


@cli.command("generate-default-schema", short_help="get the default schema used in generated random PII")
@click.argument("output", type=click.Path(writable=True, readable=False, resolve_path=True))
@cli.command(
"generate-default-schema",
short_help="get the default schema used in generated random PII",
)
@click.argument(
"output", type=click.Path(writable=True, readable=False, resolve_path=True)
)
def generate_default_schema(output):
"""Get default schema for fake PII"""
original_path = os.path.join(os.path.dirname(__file__), "data", "randomnames-schema.json")
original_path = os.path.join(
os.path.dirname(__file__), "data", "randomnames-schema.json"
)
shutil.copyfile(original_path, output)


16 changes: 12 additions & 4 deletions clkhash/clk.py
@@ -5,7 +5,9 @@
import concurrent.futures
import logging
import time
from typing import AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar
from typing import (
AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar
)
Contributor: I can get behind this!


from tqdm import tqdm

@@ -23,7 +25,9 @@


def hash_and_serialize_chunk(
chunk_pii_data, keys, schema # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema
chunk_pii_data,
keys,
schema, # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema
):
# type: (...) -> Tuple[List[str], Sequence[int]]
"""
@@ -91,7 +95,9 @@ def callback(tics, clk_stats):
pbar.set_postfix(mean=stats.mean(), std=stats.std(), refresh=False)
pbar.update(tics)

results = generate_clks(pii_data, schema, keys, validate=validate, callback=callback)
results = generate_clks(
pii_data, schema, keys, validate=validate, callback=callback
)
else:
results = generate_clks(pii_data, schema, keys, validate=validate)

@@ -132,7 +138,9 @@ def generate_clks(
for chunk in chunks(pii_data, chunk_size):
future = executor.submit(hash_and_serialize_chunk, chunk, key_lists, schema)
if callback is not None:
future.add_done_callback(lambda f: callback(len(f.result()[0]), f.result()[1]))
future.add_done_callback(
lambda f: callback(len(f.result()[0]), f.result()[1])
)
futures.append(future)

results = []