Consider applying black to format the code #110

Closed (wants to merge 2 commits)
16 changes: 13 additions & 3 deletions clkhash/backports.py
@@ -22,7 +22,9 @@ def __int_from_bytes(bytes, byteorder, signed=False):
:param byteorder: Either `'big'` or `'little'`.
"""
if signed:
raise NotImplementedError("Signed integers are not currently supported in this " "backport.")
raise NotImplementedError(
"Signed integers are not currently supported in this " "backport."
Contributor: Black doesn't recognise that these two literals will be merged into one string, so it doesn't remove the superfluous " ".

)
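One way to sidestep this (a hand-edit sketch, not part of this PR): merge the two adjacent literals into one before running Black, after which the stray " " disappears and the wrapping stays clean.

    # Hypothetical pre-Black edit; the merged literal is an assumption, not a change in this PR.
    if signed:
        raise NotImplementedError(
            "Signed integers are not currently supported in this backport."
        )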

if byteorder == "big":
pass
@@ -38,7 +40,14 @@ def __int_from_bytes(bytes, byteorder, signed=False):
# named arguments. Hence, must cast so Mypy thinks it matches the
# original function.
int_from_bytes = cast(
Callable[[Arg(Sequence[int], "bytes"), Arg(str, "byteorder"), DefaultNamedArg(bool, "signed")], int],
Callable[
[
Arg(Sequence[int], "bytes"),
Arg(str, "byteorder"),
DefaultNamedArg(bool, "signed"),
],
int,
],
__int_from_bytes,
)

@@ -93,5 +102,6 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):


unicode_reader = (
_p2_unicode_reader if sys.version_info < (3, 0) else csv.reader # Python 2 with hacky workarounds.
_p2_unicode_reader if sys.version_info
< (3, 0) else csv.reader # Python 2 with hacky workarounds.
Contributor: I find this much less readable than the previous form.

) # Py3 with native Unicode support.
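A possible alternative (a sketch, not something proposed in this PR): an explicit if/else keeps the version check on one line, keeps each comment next to the value it describes, and is a form Black leaves untouched.

    # Hypothetical rewrite of the conditional assignment above.
    if sys.version_info < (3, 0):
        unicode_reader = _p2_unicode_reader  # Python 2 with hacky workarounds.
    else:
        unicode_reader = csv.reader  # Py3 with native Unicode support.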
6 changes: 5 additions & 1 deletion clkhash/benchmark.py
@@ -35,7 +35,11 @@ def compute_hash_speed(n, quiet=False):

elapsed_time = end - start
if not quiet:
print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(n, elapsed_time, n / (1000 * elapsed_time)))
print(
"{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(
n, elapsed_time, n / (1000 * elapsed_time)
)
)
return n / elapsed_time


53 changes: 42 additions & 11 deletions clkhash/bloomfilter.py
@@ -31,7 +31,11 @@


def double_hash_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: MyPy compatibility is broken here.

):
# type: (...) -> bitarray
"""
@@ -52,16 +56,24 @@ def double_hash_encode_ngrams(
bf = bitarray(l)
bf.setall(False)
for m in ngrams:
sha1hm = int(hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16) % l
md5hm = int(hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16) % l
sha1hm = int(
hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(), 16
) % l
md5hm = int(
hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(), 16
) % l
for i in range(k):
gi = (sha1hm + i * md5hm) % l
bf[gi] = 1
return bf
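On the Mypy comments above: comment-style annotations only work when each "# type:" comment sits on the same line as its own argument, so stacking them all after the encoding argument breaks type checking. A hedged sketch of a layout that mypy accepts and that Black should preserve (it does not merge lines carrying trailing comments); the exact spacing is an assumption, not taken from this PR:

    def double_hash_encode_ngrams(
        ngrams,    # type: Iterable[str]
        keys,      # type: Sequence[bytes]
        k,         # type: int
        l,         # type: int
        encoding,  # type: str
    ):
        # type: (...) -> bitarray
        ...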


def double_hash_encode_ngrams_non_singular(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: Ditto regarding Mypy compatibility.

):
# type: (...) -> bitarray.bitarray
"""
@@ -124,7 +136,11 @@


def blake_encode_ngrams(
ngrams, keys, k, l, encoding # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
ngrams,
keys,
k,
l,
encoding, # type: Iterable[str] # type: Sequence[bytes] # type: int # type: int # type: str
Contributor: Ditto regarding Mypy compatibility.

):
# type: (...) -> bitarray.bitarray
"""
@@ -180,7 +196,11 @@ def blake_encode_ngrams(

log_l = int(math.log(l, 2))
if not 2 ** log_l == l:
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
raise ValueError(
'parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(
l
)
)
Contributor: This is really ugly.

Collaborator (Author): Agree - it looks better with the longer line length - 8254286#diff-f36426d58cf2623d909f2f77aae79799R183
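For reference, Black's line limit is configurable with the --line-length option. Under a longer limit such as "black --line-length 120", the original one-liner fits and would be expected to survive unchanged, which is presumably why the linked diff reads better (a hedged sketch of the expected output, not copied from that commit):

    # Expected formatting under a 120-character limit (assumption).
    if not 2 ** log_l == l:
        raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))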


bf = bitarray(l)
bf.setall(False)
@@ -192,8 +212,12 @@
for m in ngrams:
random_shorts = [] # type: List[int]
for i in range(num_macs):
hash_bytes = blake2b(m.encode(encoding=encoding), key=key, salt=str(i).encode()).digest()
random_shorts.extend(struct.unpack("32H", hash_bytes)) # interpret hash bytes as 32 unsigned shorts.
hash_bytes = blake2b(
m.encode(encoding=encoding), key=key, salt=str(i).encode()
).digest()
random_shorts.extend(
struct.unpack("32H", hash_bytes)
) # interpret hash bytes as 32 unsigned shorts.
for i in range(k):
idx = random_shorts[i] % l
bf[idx] = 1
@@ -316,7 +340,9 @@ def crypto_bloom_filter(


def stream_bloom_filters(
dataset, keys, schema # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema
dataset,
keys,
schema, # type: Iterable[Sequence[Text]] # type: Sequence[Sequence[bytes]] # type: Schema
Contributor: Mypy.

):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
@@ -328,11 +354,16 @@ def stream_bloom_filters(
:param xor_folds: number of XOR folds to perform
:return: Yields bloom filters as 3-tuples
"""
tokenizers = [tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields]
tokenizers = [
tokenizer.get_tokenizer(field.hashing_properties) for field in schema.fields
]
field_hashing = [field.hashing_properties for field in schema.fields]
hash_properties = schema.hashing_globals

return (crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties) for s in dataset)
return (
crypto_bloom_filter(s, tokenizers, field_hashing, keys, hash_properties)
for s in dataset
)


def serialize_bitarray(ba):
91 changes: 72 additions & 19 deletions clkhash/cli.py
@@ -47,8 +47,12 @@ def cli(verbose=False):
@click.argument("keys", nargs=2, type=click.Tuple([str, str]))
@click.argument("schema", type=click.File("r", lazy=True))
@click.argument("output", type=click.File("w"))
@click.option("-q", "--quiet", default=False, is_flag=True, help="Quiet any progress messaging")
@click.option("--no-header", default=False, is_flag=True, help="Don't skip the first row")
@click.option(
"-q", "--quiet", default=False, is_flag=True, help="Quiet any progress messaging"
)
@click.option(
"--no-header", default=False, is_flag=True, help="Don't skip the first row"
)
def hash(input, keys, schema, output, quiet, no_header):
"""Process data to create CLKs

@@ -67,16 +71,25 @@ def hash(input, keys, schema, output, quiet, no_header):

schema_object = clkhash.schema.Schema.from_json_file(schema_file=schema)

clk_data = clk.generate_clk_from_csv(input, keys, schema_object, header=not no_header, progress_bar=not quiet)
clk_data = clk.generate_clk_from_csv(
input, keys, schema_object, header=not no_header, progress_bar=not quiet
)
json.dump({"clks": clk_data}, output)
if hasattr(output, "name"):
log("CLK data written to {}".format(output.name))


@cli.command("status", short_help="Get status of entity service")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def status(server, output, verbose):
"""Connect to an entity matching server and check the service status.

@@ -118,11 +131,22 @@ def status(server, output, verbose):
default="permutation_unencrypted_mask",
help="Alternative protocol/view type of the mapping. Default is unencrypted permutation and mask.",
)
@click.option("--schema", type=click.File("r"), help="Schema to publicly share with participating parties.")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--schema",
type=click.File("r"),
help="Schema to publicly share with participating parties.",
)
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-t", "--threshold", type=float, default=0.95)
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def create(type, schema, server, output, threshold, verbose):
"""Create a new mapping on an entity matching server.

@@ -149,7 +173,8 @@ def create(type, schema, server, output, threshold, verbose):

log("Creating new mapping")
response = requests.post(
"{}/api/v1/mappings".format(server), json={"schema": schema_json, "result_type": type, "threshold": threshold}
"{}/api/v1/mappings".format(server),
json={"schema": schema_json, "result_type": type, "threshold": threshold},
)

if response.status_code != 200:
@@ -166,9 +191,16 @@
@click.argument("input", type=click.File("r"))
@click.option("--mapping", help="Server identifier of the mapping")
@click.option("--apikey", help="Authentication API key for the server.")
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
@click.option("-v", "--verbose", default=False, is_flag=True, help="Script is more talkative")
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Script is more talkative"
)
def upload(input, mapping, apikey, server, output, verbose):
"""Upload CLK data to entity matching server.

@@ -196,16 +228,25 @@ def upload(input, mapping, apikey, server, output, verbose):

if verbose:
log(response.text)
log("When the other party has uploaded their CLKS, you should be able to watch for results")
log(
"When the other party has uploaded their CLKS, you should be able to watch for results"
)

print(response.text, file=output)


@cli.command("results", short_help="fetch results from entity service")
@click.option("--mapping", help="Server identifier of the mapping")
@click.option("--apikey", help="Authentication API key for the server.")
@click.option("-w", "--watch", help="Follow/wait until results are available", is_flag=True)
@click.option("--server", type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option(
"-w", "--watch", help="Follow/wait until results are available", is_flag=True
)
@click.option(
"--server",
type=str,
default=DEFAULT_SERVICE_URL,
help="Server address including protocol",
)
@click.option("-o", "--output", type=click.File("w"), default="-")
def results(mapping, apikey, watch, server, output):
"""
@@ -223,7 +264,10 @@ def results(mapping, apikey, watch, server, output):
log("Status: {}".format(status))

def get_result():
return requests.get("{}/api/v1/mappings/{}".format(server, mapping), headers={"Authorization": apikey})
return requests.get(
"{}/api/v1/mappings/{}".format(server, mapping),
headers={"Authorization": apikey},
)

response = get_result()
log("Response code: {}".format(response.status_code))
@@ -257,14 +301,23 @@ def generate(size, output, schema):
if schema is not None:
raise NotImplementedError

randomnames.save_csv(pii_data.names, [f.identifier for f in pii_data.SCHEMA.fields], output)
randomnames.save_csv(
pii_data.names, [f.identifier for f in pii_data.SCHEMA.fields], output
)


@cli.command("generate-default-schema", short_help="get the default schema used in generated random PII")
@click.argument("output", type=click.Path(writable=True, readable=False, resolve_path=True))
@cli.command(
"generate-default-schema",
short_help="get the default schema used in generated random PII",
)
@click.argument(
"output", type=click.Path(writable=True, readable=False, resolve_path=True)
)
def generate_default_schema(output):
"""Get default schema for fake PII"""
original_path = os.path.join(os.path.dirname(__file__), "data", "randomnames-schema.json")
original_path = os.path.join(
os.path.dirname(__file__), "data", "randomnames-schema.json"
)
shutil.copyfile(original_path, output)


16 changes: 12 additions & 4 deletions clkhash/clk.py
@@ -5,7 +5,9 @@
import concurrent.futures
import logging
import time
from typing import AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar
from typing import (
AnyStr, Callable, Iterable, List, Optional, Sequence, TextIO, Tuple, TypeVar
)
Contributor: I can get behind this!


from tqdm import tqdm

@@ -23,7 +25,9 @@


def hash_and_serialize_chunk(
chunk_pii_data, keys, schema # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema
chunk_pii_data,
keys,
schema, # type: Sequence[Sequence[str]] # type: Sequence[Sequence[bytes]] # type: Schema
):
# type: (...) -> Tuple[List[str], Sequence[int]]
"""
@@ -91,7 +95,9 @@ def callback(tics, clk_stats):
pbar.set_postfix(mean=stats.mean(), std=stats.std(), refresh=False)
pbar.update(tics)

results = generate_clks(pii_data, schema, keys, validate=validate, callback=callback)
results = generate_clks(
pii_data, schema, keys, validate=validate, callback=callback
)
else:
results = generate_clks(pii_data, schema, keys, validate=validate)

@@ -132,7 +138,9 @@ def generate_clks(
for chunk in chunks(pii_data, chunk_size):
future = executor.submit(hash_and_serialize_chunk, chunk, key_lists, schema)
if callback is not None:
future.add_done_callback(lambda f: callback(len(f.result()[0]), f.result()[1]))
future.add_done_callback(
lambda f: callback(len(f.result()[0]), f.result()[1])
)
futures.append(future)

results = []