From 7a002e83f25bf532fc19b5a09cdcd040ae81fd12 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 28 Aug 2023 07:37:09 -0700 Subject: [PATCH] remove Jaccard calculation from manysearch (#72) --- src/lib.rs | 18 ++++-------------- src/python/tests/test_search.py | 7 +------ 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 19d23e28..d5713a7f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,10 +143,6 @@ fn prepare_query(search_sigs: &[Signature], template: &Sketch) -> Option>( querylist: P, @@ -208,10 +204,10 @@ fn manysearch>( }; let thrd = std::thread::spawn(move || { let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, jaccard, overlap) in recv.into_iter() { - writeln!(&mut writer, "\"{}\",{},\"{}\",{},{},{},{}", - query, query_md5, m, m_md5, cont, jaccard, overlap).ok(); + writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,intersect_hashes").unwrap(); + for (query, query_md5, m, m_md5, cont, overlap) in recv.into_iter() { + writeln!(&mut writer, "\"{}\",{},\"{}\",{},{},{}", + query, query_md5, m, m_md5, cont, overlap).ok(); } }); @@ -243,19 +239,13 @@ fn manysearch>( let overlap = q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64; let query_size = q.minhash.size() as f64; - let mut merged = q.minhash.clone(); - merged.merge(&search_sm.minhash).ok(); - let total_size = merged.size() as f64; - let containment = overlap / query_size; - let jaccard = overlap / total_size; if containment > threshold { results.push((q.name.clone(), q.md5sum.clone(), search_sm.name.clone(), search_sm.md5sum.clone(), containment, - jaccard, overlap)) } } diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 9970a366..6e207223 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -53,25 +53,20 @@ def test_simple(runtmp): if row['query_md5'] == row['match_md5']: assert row['match_name'] == row['query_name'] assert float(row['containment'] == 1.0) - assert float(row['jaccard'] == 1.0) else: # confirm hand-checked numbers q = row['query_name'].split()[0] m = row['match_name'].split()[0] - jaccard = float(row['jaccard']) cont = float(row['containment']) intersect_hashes = int(row['intersect_hashes']) - jaccard = round(jaccard, 4) cont = round(cont, 4) - print(q, m, f"{jaccard:.04}", f"{cont:.04}") + print(q, m, f"{cont:.04}") if q == 'NC_011665.1' and m == 'NC_009661.1': - assert jaccard == 0.3207 assert cont == 0.4828 if q == 'NC_009661.1' and m == 'NC_011665.1': - assert jaccard == 0.3207 assert cont == 0.4885