Skip to content

Commit

Permalink
remove Jaccard calculation from manysearch (#72)
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored Aug 28, 2023
1 parent ea3f26c commit 7a002e8
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 20 deletions.
18 changes: 4 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,6 @@ fn prepare_query(search_sigs: &[Signature], template: &Sketch) -> Option<SmallSi
///
/// Note: this function loads all _queries_ into memory, and iterates over
/// the database once.
///
/// TODO:
/// - support jaccard as well as containment/overlap
/// - support md5 output columns; other?

fn manysearch<P: AsRef<Path>>(
querylist: P,
Expand Down Expand Up @@ -208,10 +204,10 @@ fn manysearch<P: AsRef<Path>>(
};
let thrd = std::thread::spawn(move || {
let mut writer = BufWriter::new(out);
writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,jaccard,intersect_hashes").unwrap();
for (query, query_md5, m, m_md5, cont, jaccard, overlap) in recv.into_iter() {
writeln!(&mut writer, "\"{}\",{},\"{}\",{},{},{},{}",
query, query_md5, m, m_md5, cont, jaccard, overlap).ok();
writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,intersect_hashes").unwrap();
for (query, query_md5, m, m_md5, cont, overlap) in recv.into_iter() {
writeln!(&mut writer, "\"{}\",{},\"{}\",{},{},{}",
query, query_md5, m, m_md5, cont, overlap).ok();
}
});

Expand Down Expand Up @@ -243,19 +239,13 @@ fn manysearch<P: AsRef<Path>>(
let overlap = q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64;
let query_size = q.minhash.size() as f64;

let mut merged = q.minhash.clone();
merged.merge(&search_sm.minhash).ok();
let total_size = merged.size() as f64;

let containment = overlap / query_size;
let jaccard = overlap / total_size;
if containment > threshold {
results.push((q.name.clone(),
q.md5sum.clone(),
search_sm.name.clone(),
search_sm.md5sum.clone(),
containment,
jaccard,
overlap))
}
}
Expand Down
7 changes: 1 addition & 6 deletions src/python/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,25 +53,20 @@ def test_simple(runtmp):
if row['query_md5'] == row['match_md5']:
assert row['match_name'] == row['query_name']
assert float(row['containment'] == 1.0)
assert float(row['jaccard'] == 1.0)
else:
# confirm hand-checked numbers
q = row['query_name'].split()[0]
m = row['match_name'].split()[0]
jaccard = float(row['jaccard'])
cont = float(row['containment'])
intersect_hashes = int(row['intersect_hashes'])

jaccard = round(jaccard, 4)
cont = round(cont, 4)
print(q, m, f"{jaccard:.04}", f"{cont:.04}")
print(q, m, f"{cont:.04}")

if q == 'NC_011665.1' and m == 'NC_009661.1':
assert jaccard == 0.3207
assert cont == 0.4828

if q == 'NC_009661.1' and m == 'NC_011665.1':
assert jaccard == 0.3207
assert cont == 0.4885


Expand Down

0 comments on commit 7a002e8

Please sign in to comment.