Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consider applying black to format the code #110

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions clkhash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from . import randomnames

try:
__version__ = pkg_resources.get_distribution('clkhash').version
__version__ = pkg_resources.get_distribution("clkhash").version
except pkg_resources.DistributionNotFound:
__version__ = "development"

__author__ = 'N1 Analytics'
__author__ = "N1 Analytics"
31 changes: 14 additions & 17 deletions clkhash/backports.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,25 @@ def __int_from_bytes(bytes, byteorder, signed=False):
:param byteorder: Either `'big'` or `'little'`.
"""
if signed:
raise NotImplementedError(
"Signed integers are not currently supported in this "
"backport.")
raise NotImplementedError("Signed integers are not currently supported in this " "backport.")

if byteorder == 'big':
if byteorder == "big":
pass
elif byteorder == 'little':
elif byteorder == "little":
bytes = bytes[::-1]
else:
raise ValueError("byteorder must be either 'little' or 'big'")

hex_str = codecs.encode(bytes, 'hex') # type: ignore
hex_str = codecs.encode(bytes, "hex") # type: ignore
return int(hex_str, 16)

# Make this cast since Python 2 doesn't have syntax for default
# named arguments. Hence, must cast so Mypy thinks it matches the
# original function.
int_from_bytes = cast(Callable[[Arg(Sequence[int], 'bytes'),
Arg(str, 'byteorder'),
DefaultNamedArg(bool, 'signed')],
int],
__int_from_bytes)
int_from_bytes = cast(
Callable[[Arg(Sequence[int], "bytes"), Arg(str, "byteorder"), DefaultNamedArg(bool, "signed")], int],
__int_from_bytes,
)


def re_compile_full(pattern, flags=0):
Expand All @@ -65,11 +62,11 @@ def re_compile_full(pattern, flags=0):
# A pattern of type bytes doesn't make sense in Python 3.
assert type(pattern) is not bytes or str is bytes

return re.compile('(?:{})\Z'.format(pattern), flags=flags)
return re.compile("(?:{})\Z".format(pattern), flags=flags)


def _utf_8_encoder(unicode_csv_data):
return (line.encode('utf-8') for line in unicode_csv_data)
return (line.encode("utf-8") for line in unicode_csv_data)


def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
Expand All @@ -92,9 +89,9 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
csv_reader = csv.reader(utf8_csv_data, dialect=dialect, **kwargs)

# Decode UTF-8 back to Unicode, cell by cell:
return ([unicode(cell, 'utf-8') for cell in row] for row in csv_reader)
return ([unicode(cell, "utf-8") for cell in row] for row in csv_reader)


unicode_reader = (_p2_unicode_reader # Python 2 with hacky workarounds.
if sys.version_info < (3,0)
else csv.reader) # Py3 with native Unicode support.
unicode_reader = (
_p2_unicode_reader if sys.version_info < (3, 0) else csv.reader # Python 2 with hacky workarounds.
) # Py3 with native Unicode support.
18 changes: 9 additions & 9 deletions clkhash/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,28 @@ def compute_hash_speed(n, quiet=False):
os_fd, tmpfile_name = tempfile.mkstemp(text=True)

schema = NameList.SCHEMA
header_row = ','.join([f.identifier for f in schema.fields])
header_row = ",".join([f.identifier for f in schema.fields])

with open(tmpfile_name, 'wt') as f:
with open(tmpfile_name, "wt") as f:
f.write(header_row)
f.write('\n')
f.write("\n")
for person in namelist.names:
print(','.join([str(field) for field in person]), file=f)
print(",".join([str(field) for field in person]), file=f)

with open(tmpfile_name, 'rt') as f:
with open(tmpfile_name, "rt") as f:
start = timer()
generate_clk_from_csv(f, ('key1', 'key2'), schema, progress_bar=not quiet)
generate_clk_from_csv(f, ("key1", "key2"), schema, progress_bar=not quiet)
end = timer()

os.close(os_fd)
os.remove(tmpfile_name)

elapsed_time = end - start
if not quiet:
print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n/(1000*elapsed_time)))
print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n / (1000 * elapsed_time)))
return n / elapsed_time


if __name__ == '__main__':
if __name__ == "__main__":
for n in [100, 1000, 10000, 50000, 100000]:
compute_hash_speed(n, quiet=n<=10000)
compute_hash_speed(n, quiet=n <= 10000)
107 changes: 47 additions & 60 deletions clkhash/bloomfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,13 @@
except ImportError:
# We are in Python older than 3.6.
from pyblake2 import blake2b # type: ignore
# Ignore because otherwise Mypy raises errors, thinking that
# blake2b is already defined.
# Ignore because otherwise Mypy raises errors, thinking that
# blake2b is already defined.


def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
def double_hash_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
):
# type: (...) -> bitarray
"""
Computes the double hash encoding of the provided ngrams with the given keys.
Expand Down Expand Up @@ -63,12 +60,9 @@ def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
return bf


def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
def double_hash_encode_ngrams_non_singular(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
):
# type: (...) -> bitarray.bitarray
"""
computes the double hash encoding of the provided n-grams with the given keys.
Expand Down Expand Up @@ -114,14 +108,13 @@ def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str
sha1hm_bytes = hmac.new(key_sha1, m_bytes, sha1).digest()
md5hm_bytes = hmac.new(key_md5, m_bytes, md5).digest()

sha1hm = int_from_bytes(sha1hm_bytes, 'big') % l
md5hm = int_from_bytes(md5hm_bytes, 'big') % l
sha1hm = int_from_bytes(sha1hm_bytes, "big") % l
md5hm = int_from_bytes(md5hm_bytes, "big") % l

i = 0
while md5hm == 0:
md5hm_bytes = hmac.new(
key_md5, m_bytes + chr(i).encode(), md5).digest()
md5hm = int_from_bytes(md5hm_bytes, 'big') % l
md5hm_bytes = hmac.new(key_md5, m_bytes + chr(i).encode(), md5).digest()
md5hm = int_from_bytes(md5hm_bytes, "big") % l
i += 1

for i in range(k):
Expand All @@ -130,12 +123,9 @@ def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str
return bf


def blake_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
def blake_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
):
# type: (...) -> bitarray.bitarray
"""
Computes the encoding of the provided ngrams using the BLAKE2 hash function.
Expand Down Expand Up @@ -189,19 +179,21 @@ def blake_encode_ngrams(ngrams, # type: Iterable[str]
key, = keys # Unpack.

log_l = int(math.log(l, 2))
if not 2**log_l == l:
if not 2 ** log_l == l:
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))

bf = bitarray(l)
bf.setall(False)
if k < 1:
return bf
num_macs = (k+31) // 32

num_macs = (k + 31) // 32

for m in ngrams:
random_shorts = [] # type: List[int]
for i in range(num_macs):
hash_bytes = blake2b(m.encode(encoding=encoding), key=key, salt=str(i).encode()).digest()
random_shorts.extend(struct.unpack('32H', hash_bytes)) # interpret hash bytes as 32 unsigned shorts.
random_shorts.extend(struct.unpack("32H", hash_bytes)) # interpret hash bytes as 32 unsigned shorts.
for i in range(k):
idx = random_shorts[i] % l
bf[idx] = 1
Expand Down Expand Up @@ -231,25 +223,24 @@ def __call__(self, *args):
return self.value(*args)

@classmethod
def from_properties(cls,
properties # type: GlobalHashingProperties
):
def from_properties(cls, properties): # type: GlobalHashingProperties
# type: (...) -> Callable[[Iterable[str], Sequence[bytes], int, int, str], bitarray]
if properties.hash_type == 'doubleHash':
if properties.hash_type == "doubleHash":
if properties.hash_prevent_singularity:
return cls.DOUBLE_HASH_NON_SINGULAR

else:
return cls.DOUBLE_HASH
elif properties.hash_type == 'blakeHash':

elif properties.hash_type == "blakeHash":
return cls.BLAKE_HASH

else:
msg = "Unsupported hash type '{}'".format(properties.hash_type)
raise ValueError(msg)


def fold_xor(bloomfilter, # type: bitarray
folds # type: int
):
def fold_xor(bloomfilter, folds): # type: bitarray # type: int
# type: (...) -> bitarray
""" Performs XOR folding on a Bloom filter.

Expand All @@ -263,10 +254,11 @@ def fold_xor(bloomfilter, # type: bitarray
"""

if len(bloomfilter) % 2 ** folds != 0:
msg = ('The length of the bloom filter is {length}. It is not '
'divisible by 2 ** {folds}, so it cannot be folded {folds} '
'times.'
.format(length=len(bloomfilter), folds=folds))
msg = (
"The length of the bloom filter is {length}. It is not "
"divisible by 2 ** {folds}, so it cannot be folded {folds} "
"times.".format(length=len(bloomfilter), folds=folds)
)
raise ValueError(msg)

for _ in range(folds):
Expand All @@ -278,12 +270,13 @@ def fold_xor(bloomfilter, # type: bitarray
return bloomfilter


def crypto_bloom_filter(record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
field_hashing, # type: List[FieldHashingProperties]
keys, # type: Sequence[Sequence[bytes]]
hash_properties # type: GlobalHashingProperties
):
def crypto_bloom_filter(
record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
field_hashing, # type: List[FieldHashingProperties]
keys, # type: Sequence[Sequence[bytes]]
hash_properties, # type: GlobalHashingProperties
):
# type: (...) -> Tuple[bitarray, Text, int]
"""
Makes a Bloom filter from a record with given tokenizers and lists of keys.
Expand Down Expand Up @@ -311,23 +304,20 @@ def crypto_bloom_filter(record, # type: Sequence[Text]
bloomfilter = bitarray(l)
bloomfilter.setall(False)

for (entry, tokenizer, field, key) \
in zip(record, tokenizers, field_hashing, keys):
for (entry, tokenizer, field, key) in zip(record, tokenizers, field_hashing, keys):
ngrams = tokenizer(entry)
adjusted_k = int(round(field.weight * k))

bloomfilter |= hash_function(
ngrams, key, adjusted_k, l, field.encoding)
bloomfilter |= hash_function(ngrams, key, adjusted_k, l, field.encoding)

bloomfilter = fold_xor(bloomfilter, xor_folds)

return bloomfilter, record[0], bloomfilter.count()


def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]]
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
):
def stream_bloom_filters(
dataset, keys, schema # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema
):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
Yield bloom filters
Expand All @@ -338,19 +328,16 @@ def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]]
:param xor_folds: number of XOR folds to perform
:return: Yields bloom filters as 3-tuples
"""
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties)
for field in schema.fields]
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields]
field_hashing = [field.hashing_properties for field in schema.fields]
hash_properties = schema.hashing_globals

return (crypto_bloom_filter(s, tokenizers, field_hashing,
keys, hash_properties)
for s in dataset)
return (crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) for s in dataset)


def serialize_bitarray(ba):
# type: (bitarray) -> str
"""Serialize a bitarray (bloomfilter)

"""
return base64.b64encode(ba.tobytes()).decode('utf8')
return base64.b64encode(ba.tobytes()).decode("utf8")
Loading