data61 · hardbyte · Apr 13, 2018 · Apr 13, 2018
diff --git a/clkhash/__init__.py b/clkhash/__init__.py
@@ -7,8 +7,8 @@
 from . import randomnames
 
 try:
-    __version__ = pkg_resources.get_distribution('clkhash').version
+    __version__ = pkg_resources.get_distribution("clkhash").version
 except pkg_resources.DistributionNotFound:
     __version__ = "development"
 
-__author__ = 'N1 Analytics'
+__author__ = "N1 Analytics"
diff --git a/clkhash/backports.py b/clkhash/backports.py
@@ -22,28 +22,25 @@ def __int_from_bytes(bytes, byteorder, signed=False):
             :param byteorder: Either `'big'` or `'little'`.
         """
         if signed:
-            raise NotImplementedError(
-                "Signed integers are not currently supported in this "
-                "backport.")
+            raise NotImplementedError("Signed integers are not currently supported in this " "backport.")
 
-        if byteorder == 'big':
+        if byteorder == "big":
             pass
-        elif byteorder == 'little':
+        elif byteorder == "little":
             bytes = bytes[::-1]
         else:
             raise ValueError("byteorder must be either 'little' or 'big'")
 
-        hex_str = codecs.encode(bytes, 'hex')  # type: ignore
+        hex_str = codecs.encode(bytes, "hex")  # type: ignore
         return int(hex_str, 16)
 
     # Make this cast since Python 2 doesn't have syntax for default
     # named arguments. Hence, must cast so Mypy thinks it matches the
     # original function.
-    int_from_bytes = cast(Callable[[Arg(Sequence[int], 'bytes'),
-                                Arg(str, 'byteorder'),
-                                DefaultNamedArg(bool, 'signed')],
-                               int],
-                      __int_from_bytes)
+    int_from_bytes = cast(
+        Callable[[Arg(Sequence[int], "bytes"), Arg(str, "byteorder"), DefaultNamedArg(bool, "signed")], int],
+        __int_from_bytes,
+    )
 
 
 def re_compile_full(pattern, flags=0):
@@ -65,11 +62,11 @@ def re_compile_full(pattern, flags=0):
     # A pattern of type bytes doesn't make sense in Python 3.
     assert type(pattern) is not bytes or str is bytes
 
-    return re.compile('(?:{})\Z'.format(pattern), flags=flags)
+    return re.compile("(?:{})\Z".format(pattern), flags=flags)
 
 
 def _utf_8_encoder(unicode_csv_data):
-    return (line.encode('utf-8') for line in unicode_csv_data)
+    return (line.encode("utf-8") for line in unicode_csv_data)
 
 
 def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
@@ -92,9 +89,9 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
     csv_reader = csv.reader(utf8_csv_data, dialect=dialect, **kwargs)
 
     # Decode UTF-8 back to Unicode, cell by cell:
-    return ([unicode(cell, 'utf-8') for cell in row] for row in csv_reader)
+    return ([unicode(cell, "utf-8") for cell in row] for row in csv_reader)
 
 
-unicode_reader = (_p2_unicode_reader  # Python 2 with hacky workarounds.
-                  if sys.version_info < (3,0)
-                  else csv.reader)  # Py3 with native Unicode support.
+unicode_reader = (
+    _p2_unicode_reader if sys.version_info < (3, 0) else csv.reader
+)  # Python 2: _p2_unicode_reader (hacky workarounds). Python 3: csv.reader (native Unicode support).
diff --git a/clkhash/benchmark.py b/clkhash/benchmark.py
@@ -17,28 +17,28 @@ def compute_hash_speed(n, quiet=False):
     os_fd, tmpfile_name = tempfile.mkstemp(text=True)
 
     schema = NameList.SCHEMA
-    header_row = ','.join([f.identifier for f in schema.fields])
+    header_row = ",".join([f.identifier for f in schema.fields])
 
-    with open(tmpfile_name, 'wt') as f:
+    with open(tmpfile_name, "wt") as f:
         f.write(header_row)
-        f.write('\n')
+        f.write("\n")
         for person in namelist.names:
-            print(','.join([str(field) for field in person]), file=f)
+            print(",".join([str(field) for field in person]), file=f)
 
-    with open(tmpfile_name, 'rt') as f:
+    with open(tmpfile_name, "rt") as f:
         start = timer()
-        generate_clk_from_csv(f, ('key1', 'key2'), schema, progress_bar=not quiet)
+        generate_clk_from_csv(f, ("key1", "key2"), schema, progress_bar=not quiet)
         end = timer()
 
     os.close(os_fd)
     os.remove(tmpfile_name)
 
     elapsed_time = end - start
     if not quiet:
-        print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n/(1000*elapsed_time)))
+        print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n / (1000 * elapsed_time)))
     return n / elapsed_time
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     for n in [100, 1000, 10000, 50000, 100000]:
-        compute_hash_speed(n, quiet=n<=10000)
+        compute_hash_speed(n, quiet=n <= 10000)
diff --git a/clkhash/bloomfilter.py b/clkhash/bloomfilter.py
@@ -26,16 +26,13 @@
 except ImportError:
     # We are in Python older than 3.6.
     from pyblake2 import blake2b  # type: ignore
     # Ignore because otherwise Mypy raises errors, thinking that
     # blake2b is already defined.
 
 
-def double_hash_encode_ngrams(ngrams,          # type: Iterable[str]
-                              keys,            # type: Sequence[bytes]
-                              k,               # type: int
-                              l,               # type: int
-                              encoding         # type: str
-                              ):
+def double_hash_encode_ngrams(
+    ngrams, keys, k, l, encoding
+):
-    # type: (...) -> bitarray
+    # type: (Iterable[str], Sequence[bytes], int, int, str) -> bitarray
     """
     Computes the double hash encoding of the provided ngrams with the given keys.
@@ -63,12 +60,9 @@ def double_hash_encode_ngrams(ngrams,          # type: Iterable[str]
     return bf
 
 
-def double_hash_encode_ngrams_non_singular(ngrams,          # type: Iterable[str]
-                              keys,            # type: Sequence[bytes]
-                              k,               # type: int
-                              l,               # type: int
-                              encoding         # type: str
-                              ):
+def double_hash_encode_ngrams_non_singular(
+    ngrams, keys, k, l, encoding
+):
-    # type: (...) -> bitarray.bitarray
+    # type: (Iterable[str], Sequence[bytes], int, int, str) -> bitarray.bitarray
     """
     computes the double hash encoding of the provided n-grams with the given keys.
@@ -114,14 +108,13 @@ def double_hash_encode_ngrams_non_singular(ngrams,          # type: Iterable[str
         sha1hm_bytes = hmac.new(key_sha1, m_bytes, sha1).digest()
         md5hm_bytes = hmac.new(key_md5, m_bytes, md5).digest()
 
-        sha1hm = int_from_bytes(sha1hm_bytes, 'big') % l
-        md5hm = int_from_bytes(md5hm_bytes, 'big') % l
+        sha1hm = int_from_bytes(sha1hm_bytes, "big") % l
+        md5hm = int_from_bytes(md5hm_bytes, "big") % l
 
         i = 0
         while md5hm == 0:
-            md5hm_bytes = hmac.new(
-                key_md5, m_bytes + chr(i).encode(), md5).digest()
-            md5hm = int_from_bytes(md5hm_bytes, 'big') % l
+            md5hm_bytes = hmac.new(key_md5, m_bytes + chr(i).encode(), md5).digest()
+            md5hm = int_from_bytes(md5hm_bytes, "big") % l
             i += 1
 
         for i in range(k):
@@ -130,12 +123,9 @@ def double_hash_encode_ngrams_non_singular(ngrams,          # type: Iterable[str
     return bf
 
 
-def blake_encode_ngrams(ngrams,          # type: Iterable[str]
-                        keys,            # type: Sequence[bytes]
-                        k,               # type: int
-                        l,               # type: int
-                        encoding         # type: str
-                        ):
+def blake_encode_ngrams(
+    ngrams, keys, k, l, encoding
+):
-    # type: (...) -> bitarray.bitarray
+    # type: (Iterable[str], Sequence[bytes], int, int, str) -> bitarray.bitarray
     """
     Computes the encoding of the provided ngrams using the BLAKE2 hash function.
@@ -189,19 +179,21 @@ def blake_encode_ngrams(ngrams,          # type: Iterable[str]
     key, = keys  # Unpack.
 
     log_l = int(math.log(l, 2))
-    if not 2**log_l == l:
+    if not 2 ** log_l == l:
         raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
+
     bf = bitarray(l)
     bf.setall(False)
     if k < 1:
         return bf
-    num_macs = (k+31) // 32
+
+    num_macs = (k + 31) // 32
 
     for m in ngrams:
         random_shorts = []  # type: List[int]
         for i in range(num_macs):
             hash_bytes = blake2b(m.encode(encoding=encoding), key=key, salt=str(i).encode()).digest()
-            random_shorts.extend(struct.unpack('32H', hash_bytes))  # interpret hash bytes as 32 unsigned shorts.
+            random_shorts.extend(struct.unpack("32H", hash_bytes))  # interpret hash bytes as 32 unsigned shorts.
         for i in range(k):
             idx = random_shorts[i] % l
             bf[idx] = 1
@@ -231,25 +223,24 @@ def __call__(self, *args):
         return self.value(*args)
 
     @classmethod
-    def from_properties(cls,
-                        properties  # type: GlobalHashingProperties
-                        ):
+    def from_properties(cls, properties):
-        # type: (...) -> Callable[[Iterable[str], Sequence[bytes], int, int, str], bitarray]
+        # type: (GlobalHashingProperties) -> Callable[[Iterable[str], Sequence[bytes], int, int, str], bitarray]
-        if properties.hash_type == 'doubleHash':
+        if properties.hash_type == "doubleHash":
             if properties.hash_prevent_singularity:
                 return cls.DOUBLE_HASH_NON_SINGULAR
+
             else:
                 return cls.DOUBLE_HASH
-        elif properties.hash_type == 'blakeHash':
+
+        elif properties.hash_type == "blakeHash":
             return cls.BLAKE_HASH
+
         else:
             msg = "Unsupported hash type '{}'".format(properties.hash_type)
             raise ValueError(msg)
 
 
-def fold_xor(bloomfilter,  # type: bitarray
-             folds         # type: int
-             ):
+def fold_xor(bloomfilter, folds):
-    # type: (...) -> bitarray
+    # type: (bitarray, int) -> bitarray
     """ Performs XOR folding on a Bloom filter.
 
@@ -263,10 +254,11 @@ def fold_xor(bloomfilter,  # type: bitarray
     """
 
     if len(bloomfilter) % 2 ** folds != 0:
-        msg = ('The length of the bloom filter is {length}. It is not '
-               'divisible by 2 ** {folds}, so it cannot be folded {folds} '
-               'times.'
-               .format(length=len(bloomfilter), folds=folds))
+        msg = (
+            "The length of the bloom filter is {length}. It is not "
+            "divisible by 2 ** {folds}, so it cannot be folded {folds} "
+            "times.".format(length=len(bloomfilter), folds=folds)
+        )
         raise ValueError(msg)
 
     for _ in range(folds):
@@ -278,12 +270,13 @@ def fold_xor(bloomfilter,  # type: bitarray
     return bloomfilter
 
 
-def crypto_bloom_filter(record,          # type: Sequence[Text]
-                        tokenizers,      # type: List[Callable[[Text], Iterable[Text]]]
-                        field_hashing,   # type: List[FieldHashingProperties]
-                        keys,            # type: Sequence[Sequence[bytes]]
-                        hash_properties  # type: GlobalHashingProperties
-                        ):
+def crypto_bloom_filter(
+    record,  # type: Sequence[Text]
+    tokenizers,  # type: List[Callable[[Text], Iterable[Text]]]
+    field_hashing,  # type: List[FieldHashingProperties]
+    keys,  # type: Sequence[Sequence[bytes]]
+    hash_properties,  # type: GlobalHashingProperties
+):
     # type: (...) -> Tuple[bitarray, Text, int]
     """
     Makes a Bloom filter from a record with given tokenizers and lists of keys.
@@ -311,23 +304,20 @@ def crypto_bloom_filter(record,          # type: Sequence[Text]
     bloomfilter = bitarray(l)
     bloomfilter.setall(False)
 
-    for (entry, tokenizer, field, key) \
-            in zip(record, tokenizers, field_hashing, keys):
+    for (entry, tokenizer, field, key) in zip(record, tokenizers, field_hashing, keys):
         ngrams = tokenizer(entry)
         adjusted_k = int(round(field.weight * k))
 
-        bloomfilter |= hash_function(
-            ngrams, key, adjusted_k, l, field.encoding)
+        bloomfilter |= hash_function(ngrams, key, adjusted_k, l, field.encoding)
 
     bloomfilter = fold_xor(bloomfilter, xor_folds)
 
     return bloomfilter, record[0], bloomfilter.count()
 
 
-def stream_bloom_filters(dataset,  # type: Iterable[Sequence[Text]]
-                         keys,     # type: Sequence[Sequence[bytes]]
-                         schema    # type: Schema
-                         ):
+def stream_bloom_filters(
+    dataset, keys, schema
+):
-    # type: (...) -> Iterable[Tuple[bitarray, Text, int]]
+    # type: (Iterable[Sequence[Text]], Sequence[Sequence[bytes]], Schema) -> Iterable[Tuple[bitarray, Text, int]]
     """
     Yield bloom filters
@@ -338,19 +328,16 @@ def stream_bloom_filters(dataset,  # type: Iterable[Sequence[Text]]
     :param xor_folds: number of XOR folds to perform
     :return: Yields bloom filters as 3-tuples
     """
-    tokenizers = [tokenizer.get_tokenizer(field.hashing_properties)
-                  for field in schema.fields]
+    tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields]
     field_hashing = [field.hashing_properties for field in schema.fields]
     hash_properties = schema.hashing_globals
 
-    return (crypto_bloom_filter(s, tokenizers, field_hashing,
-                                keys, hash_properties)
-            for s in dataset)
+    return (crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) for s in dataset)
 
 
 def serialize_bitarray(ba):
     # type: (bitarray) -> str
     """Serialize a bitarray (bloomfilter)
 
     """
-    return base64.b64encode(ba.tobytes()).decode('utf8')
+    return base64.b64encode(ba.tobytes()).decode("utf8")