-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_bloomfilter.py
65 lines (51 loc) · 2.03 KB
/
run_bloomfilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
"""
bloomfilter.py: A pure python implementation of a bloom filter
sources:
http://code.activestate.com/recipes/577686-bloom-filter/
http://stromberg.dnsalias.org/svn/bloom-filter/trunk/bloom_filter_mod.py
Modified by Dirk Moors
"""
from bloomfilter import BloomFilter
from bloomfilter.probegenerators import get_probegenerator
def assertListEquals(l1, l2):
assert len(l1) == len(l2)
for i in xrange(len(l1)):
assert l1[i] == l2[i]
if __name__ == '__main__':
from random import sample
from string import ascii_letters
states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
bf1 = BloomFilter(ideal_num_elements_n=100000, error_rate_p=0.001)
for state in states:
bf1.add(state)
json_bf = bf1.toJSON()
print "##################"
print json_bf
print "##################"
len_json = len(json_bf)
print "data size: %s bytes"%len_json
bf2 = BloomFilter.fromJSON(json_bf)
assertListEquals(bf1.data, bf2.data)
new_data = bf2.get_data()
m = sum(state in bf2 for state in states)
print('%d true positives out of %d trials' % (m, len(states)))
trials = 100000
fp = 0
for trial in range(trials):
while True:
candidate = ''.join(sample(ascii_letters, 5))
# If we accidentally found a real state, try again
if candidate in states:
continue
if candidate in bf2:
fp += 1
break
print('%d true negatives and %d false positives out of %d trials'
% (trials-fp, fp, trials))