-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Consider applying black to format the code #110
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,9 @@ def __int_from_bytes(bytes, byteorder, signed=False): | |
:param byteorder: Either `'big'` or `'little'`. | ||
""" | ||
if signed: | ||
raise NotImplementedError("Signed integers are not currently supported in this " "backport.") | ||
raise NotImplementedError( | ||
"Signed integers are not currently supported in this " "backport." | ||
) | ||
|
||
if byteorder == "big": | ||
pass | ||
|
@@ -38,7 +40,14 @@ def __int_from_bytes(bytes, byteorder, signed=False): | |
# named arguments. Hence, must cast so Mypy thinks it matches the | ||
# original function. | ||
int_from_bytes = cast( | ||
Callable[[Arg(Sequence[int], "bytes"), Arg(str, "byteorder"), DefaultNamedArg(bool, "signed")], int], | ||
Callable[ | ||
[ | ||
Arg(Sequence[int], "bytes"), | ||
Arg(str, "byteorder"), | ||
DefaultNamedArg(bool, "signed"), | ||
], | ||
int, | ||
], | ||
__int_from_bytes, | ||
) | ||
|
||
|
@@ -93,5 +102,6 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs): | |
|
||
|
||
unicode_reader = ( | ||
_p2_unicode_reader if sys.version_info < (3, 0) else csv.reader # Python 2 with hacky workarounds. | ||
_p2_unicode_reader if sys.version_info | ||
< (3, 0) else csv.reader # Python 2 with hacky workarounds. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I find this much less readable than the previous form. |
||
) # Py3 with native Unicode support. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,11 @@ | |
|
||
|
||
def double_hash_encode_ngrams( | ||
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
ngrams, | ||
keys, | ||
k, | ||
l, | ||
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. MyPy compatibility is broken here. |
||
): | ||
# type: (...) -> bitarray | ||
""" | ||
|
@@ -52,16 +56,24 @@ def double_hash_encode_ngrams( | |
bf = bitarray(l) | ||
bf.setall(False) | ||
for m in ngrams: | ||
sha1hm = int(hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16) % l | ||
md5hm = int(hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16) % l | ||
sha1hm = int( | ||
hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16 | ||
) % l | ||
md5hm = int( | ||
hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16 | ||
) % l | ||
for i in range(k): | ||
gi = (sha1hm + i * md5hm) % l | ||
bf[gi] = 1 | ||
return bf | ||
|
||
|
||
def double_hash_encode_ngrams_non_singular( | ||
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
ngrams, | ||
keys, | ||
k, | ||
l, | ||
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto regarding Mypy compatibility. |
||
): | ||
# type: (...) -> bitarray.bitarray | ||
""" | ||
|
@@ -124,7 +136,11 @@ def double_hash_encode_ngrams_non_singular( | |
|
||
|
||
def blake_encode_ngrams( | ||
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
ngrams, | ||
keys, | ||
k, | ||
l, | ||
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto regarding Mypy compatibility. |
||
): | ||
# type: (...) -> bitarray.bitarray | ||
""" | ||
|
@@ -180,7 +196,11 @@ def blake_encode_ngrams( | |
|
||
log_l = int(math.log(l, 2)) | ||
if not 2 ** log_l == l: | ||
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l)) | ||
raise ValueError( | ||
'parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format( | ||
l | ||
) | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is really ugly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree - it looks better with the longer line length - 8254286#diff-f36426d58cf2623d909f2f77aae79799R183 |
||
|
||
bf = bitarray(l) | ||
bf.setall(False) | ||
|
@@ -192,8 +212,12 @@ def blake_encode_ngrams( | |
for m in ngrams: | ||
random_shorts = [] # type: List[int] | ||
for i in range(num_macs): | ||
hash_bytes = blake2b(m.encode(encoding=encoding), key=key, salt=str(i).encode()).digest() | ||
random_shorts.extend(struct.unpack("32H", hash_bytes)) # interpret hash bytes as 32 unsigned shorts. | ||
hash_bytes = blake2b( | ||
m.encode(encoding=encoding), key=key, salt=str(i).encode() | ||
).digest() | ||
random_shorts.extend( | ||
struct.unpack("32H", hash_bytes) | ||
) # interpret hash bytes as 32 unsigned shorts. | ||
for i in range(k): | ||
idx = random_shorts[i] % l | ||
bf[idx] = 1 | ||
|
@@ -316,7 +340,9 @@ def crypto_bloom_filter( | |
|
||
|
||
def stream_bloom_filters( | ||
dataset, keys, schema # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema | ||
dataset, | ||
keys, | ||
schema, # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Mypy. |
||
): | ||
# type: (...) -> Iterable[Tuple[bitarray, Text, int]] | ||
""" | ||
|
@@ -328,11 +354,16 @@ def stream_bloom_filters( | |
:param xor_folds: number of XOR folds to perform | ||
:return: Yields bloom filters as 3-tuples | ||
""" | ||
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields] | ||
tokenizers = [ | ||
tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields | ||
] | ||
field_hashing = [field.hashing_properties for field in schema.fields] | ||
hash_properties = schema.hashing_globals | ||
|
||
return (crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) for s in dataset) | ||
return ( | ||
crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) | ||
for s in dataset | ||
) | ||
|
||
|
||
def serialize_bitarray(ba): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,9 @@ | |
import concurrent.futures | ||
import logging | ||
import time | ||
from typing import AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar | ||
from typing import ( | ||
AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can get behind this! |
||
|
||
from tqdm import tqdm | ||
|
||
|
@@ -23,7 +25,9 @@ | |
|
||
|
||
def hash_and_serialize_chunk( | ||
chunk_pii_data, keys, schema # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema | ||
chunk_pii_data, | ||
keys, | ||
schema, # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema | ||
): | ||
# type: (...) -> Tuple[List[str], Sequence[int]] | ||
""" | ||
|
@@ -91,7 +95,9 @@ def callback(tics, clk_stats): | |
pbar.set_postfix(mean=stats.mean(), std=stats.std(), refresh=False) | ||
pbar.update(tics) | ||
|
||
results = generate_clks(pii_data, schema, keys, validate=validate, callback=callback) | ||
results = generate_clks( | ||
pii_data, schema, keys, validate=validate, callback=callback | ||
) | ||
else: | ||
results = generate_clks(pii_data, schema, keys, validate=validate) | ||
|
||
|
@@ -132,7 +138,9 @@ def generate_clks( | |
for chunk in chunks(pii_data, chunk_size): | ||
future = executor.submit(hash_and_serialize_chunk, chunk, key_lists, schema) | ||
if callback is not None: | ||
future.add_done_callback(lambda f: callback(len(f.result()[0]), f.result()[1])) | ||
future.add_done_callback( | ||
lambda f: callback(len(f.result()[0]), f.result()[1]) | ||
) | ||
futures.append(future) | ||
|
||
results = [] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Black doesn’t recognise that these will be merged into one string so it doesn’t remove the superfluous `" "`.