From 0c853dec3abed076163ec87aa91d382575bdab3d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 15 Aug 2023 08:03:59 -0700 Subject: [PATCH] MRG: reorganize code to all be under `src`/ (#29) * refactor manysearch to use common functions * rename things appropriately * more refactor * refactor more * refactor/simplify layout * update src/ README --- pyproject.toml | 11 +-- python/pyo3_branchwater/__init__.py | 1 - python/pyo3_branchwater/__main__.py | 26 ------ src/README.md | 4 +- src/lib.rs | 87 ++++++------------ .../python/branchwater_plugin.py | 14 +-- {python => src/python}/tests/conftest.py | 0 .../python}/tests/sourmash_tst_utils.py | 0 .../python}/tests/test-data/2.fa.sig.gz | Bin .../python}/tests/test-data/47.fa.sig.gz | Bin .../python}/tests/test-data/63.fa.sig.gz | Bin .../python}/tests/test-data/SRR606249.sig.gz | Bin {python => src/python}/tests/test_gather.py | 0 .../python}/tests/test_multigather.py | 0 {python => src/python}/tests/test_search.py | 0 15 files changed, 39 insertions(+), 104 deletions(-) delete mode 100644 python/pyo3_branchwater/__init__.py delete mode 100755 python/pyo3_branchwater/__main__.py rename python/pyo3_branchwater/sourmash_plugin.py => src/python/branchwater_plugin.py (91%) rename {python => src/python}/tests/conftest.py (100%) rename {python => src/python}/tests/sourmash_tst_utils.py (100%) rename {python => src/python}/tests/test-data/2.fa.sig.gz (100%) rename {python => src/python}/tests/test-data/47.fa.sig.gz (100%) rename {python => src/python}/tests/test-data/63.fa.sig.gz (100%) rename {python => src/python}/tests/test-data/SRR606249.sig.gz (100%) rename {python => src/python}/tests/test_gather.py (100%) rename {python => src/python}/tests/test_multigather.py (100%) rename {python => src/python}/tests/test_search.py (100%) diff --git a/pyproject.toml b/pyproject.toml index 75ec5c35..8a42fe1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,16 +13,13 @@ classifiers = [ requires = ["maturin>=1.1.0,<2"] build-backend = "maturin" -[project.scripts] -"pyo3_branchwater" = "pyo3_branchwater.__main__:main" - [project.entry-points."sourmash.cli_script"] -manysearch = "pyo3_branchwater.sourmash_plugin:Branchwater_Manysearch" -fastgather = "pyo3_branchwater.sourmash_plugin:Branchwater_Fastgather" -fastmultigather = "pyo3_branchwater.sourmash_plugin:Branchwater_Fastmultigather" +manysearch = "branchwater_plugin:Branchwater_Manysearch" +fastgather = "branchwater_plugin:Branchwater_Fastgather" +fastmultigather = "branchwater_plugin:Branchwater_Fastmultigather" [tool.maturin] -python-source = "python" +python-source = "src/python" [metadata] license = { text = "BSD 3-Clause License" } diff --git a/python/pyo3_branchwater/__init__.py b/python/pyo3_branchwater/__init__.py deleted file mode 100644 index 792d6005..00000000 --- a/python/pyo3_branchwater/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git a/python/pyo3_branchwater/__main__.py b/python/pyo3_branchwater/__main__.py deleted file mode 100755 index 094d17c3..00000000 --- a/python/pyo3_branchwater/__main__.py +++ /dev/null @@ -1,26 +0,0 @@ -#! /usr/bin/env python -import sys -import argparse -from . import _pyo3_branchwater - - -def main(): - p = argparse.ArgumentParser() - p.add_argument('query_paths') - p.add_argument('against_paths') - p.add_argument('-o', '--output', required=True) - p.add_argument('-t', '--threshold', default=0.01, type=float) - p.add_argument('-k', '--ksize', default=31, type=int) - p.add_argument('-s', '--scaled', default=1000, type=int) - args = p.parse_args() - - pyo3_branchwater.do_search(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.output) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/README.md b/src/README.md index cbdc4467..ece8afbe 100644 --- a/src/README.md +++ b/src/README.md @@ -1,3 +1,3 @@ -The Rust source lives here, under `src/.`. +The Rust source is in `lib.rs`. -See `python/branchwater/` for the Python modules! +The Python source code is under `python/`, and tests under `python/tests/`. diff --git a/src/lib.rs b/src/lib.rs index 0478852d..ca2cfd1e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -105,7 +105,7 @@ fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option>( +fn manysearch>( querylist: P, siglist: P, threshold: f64, @@ -124,36 +124,10 @@ fn search>( // Read in list of query paths. eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display()); - let querylist_file = BufReader::new(File::open(querylist)?); // Load all queries into memory at once. - let queries: Vec<(String, KmerMinHash)> = querylist_file - .lines() - .filter_map(|line| { - let line = line.unwrap(); - if !line.is_empty() { - // skip empty lines; load non-empty! - let mut path = PathBuf::new(); - path.push(line); - Some(path) - } else { - None - } - }) - // for non-empty paths, load whichever one matches template. - .filter_map(|query| { - let query_sig = Signature::from_path(query).unwrap(); - - let mut query = None; - for sig in &query_sig { - if let Some(mh) = prepare_query(sig, &template) { - query = Some((sig.name(), mh)); - break; - } - } - query - }) - .collect(); + let querylist_paths = load_sketchlist_filenames(querylist).unwrap(); + let queries = load_sketches(querylist_paths, &template).unwrap(); if queries.is_empty() { bail!("No query signatures loaded, exiting."); @@ -164,25 +138,12 @@ fn search>( // Load all _paths_, not signatures, into memory. eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display()); - let siglist_file = BufReader::new(File::open(siglist)?); - let search_sigs: Vec = siglist_file - .lines() - .filter_map(|line| { - let line = line.unwrap(); - if !line.is_empty() { - let mut path = PathBuf::new(); - path.push(line); - Some(path) - } else { - None - } - }) - .collect(); - if search_sigs.is_empty() { + let search_sigs_paths = load_sketchlist_filenames(siglist).unwrap(); + if search_sigs_paths.is_empty() { bail!("No signatures to search loaded, exiting."); } - eprintln!("Loaded {} sig paths to search.", search_sigs.len()); + eprintln!("Loaded {} sig paths to search.", search_sigs_paths.len()); // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -206,10 +167,12 @@ fn search>( // loading them individually and searching them. Stuff results into // the writer thread above. // + // CTB: might want to just load everything into memory here. + // let processed_sigs = AtomicUsize::new(0); - let send = search_sigs + let send = search_sigs_paths .par_iter() .filter_map(|filename| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); @@ -232,12 +195,14 @@ fn search>( let mut results = vec![]; // search for matches & save containment. - for (name, query) in &queries { - let overlap = - query.count_common(&search_mh, false).unwrap() as f64 / query.size() as f64; - if overlap > threshold { - results.push((name.clone(), - query.md5sum(), + for q in queries.iter() { + let overlap = q.minhash.count_common(&search_mh, false).unwrap() as f64; + let size = q.minhash.size() as f64; + + let containment = overlap / size; + if containment > threshold { + results.push((q.name.clone(), + q.minhash.md5sum(), search_sig.name(), search_sig.md5sum(), overlap)) @@ -672,15 +637,15 @@ fn multigather + std::fmt::Debug + Clone>( // #[pyfunction] -fn do_search(querylist_path: String, - siglist_path: String, - threshold: f64, - ksize: u8, - scaled: usize, - output_path: String +fn do_manysearch(querylist_path: String, + siglist_path: String, + threshold: f64, + ksize: u8, + scaled: usize, + output_path: String ) -> PyResult { - match search(querylist_path, siglist_path, threshold, ksize, scaled, - Some(output_path)) { + match manysearch(querylist_path, siglist_path, threshold, ksize, scaled, + Some(output_path)) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -734,7 +699,7 @@ fn get_num_threads() -> PyResult { #[pymodule] fn pyo3_branchwater(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(do_search, m)?)?; + m.add_function(wrap_pyfunction!(do_manysearch, m)?)?; m.add_function(wrap_pyfunction!(do_countergather, m)?)?; m.add_function(wrap_pyfunction!(do_multigather, m)?)?; m.add("SomeError", _py.get_type::())?; diff --git a/python/pyo3_branchwater/sourmash_plugin.py b/src/python/branchwater_plugin.py similarity index 91% rename from python/pyo3_branchwater/sourmash_plugin.py rename to src/python/branchwater_plugin.py index 7ca60466..0202fa8e 100755 --- a/python/pyo3_branchwater/sourmash_plugin.py +++ b/src/python/branchwater_plugin.py @@ -4,7 +4,7 @@ from sourmash.plugins import CommandLinePlugin from sourmash.logging import notify -from . import pyo3_branchwater +from pyo3_branchwater import pyo3_branchwater class Branchwater_Manysearch(CommandLinePlugin): command = 'manysearch' @@ -28,12 +28,12 @@ def main(self, args): num_threads = pyo3_branchwater.get_num_threads() notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads") super().main(args) - status = pyo3_branchwater.do_search(args.query_paths, - args.against_paths, - args.threshold, - args.ksize, - args.scaled, - args.output) + status = pyo3_branchwater.do_manysearch(args.query_paths, + args.against_paths, + args.threshold, + args.ksize, + args.scaled, + args.output) if status == 0: notify(f"...manysearch is done! results in '{args.output}'") return status diff --git a/python/tests/conftest.py b/src/python/tests/conftest.py similarity index 100% rename from python/tests/conftest.py rename to src/python/tests/conftest.py diff --git a/python/tests/sourmash_tst_utils.py b/src/python/tests/sourmash_tst_utils.py similarity index 100% rename from python/tests/sourmash_tst_utils.py rename to src/python/tests/sourmash_tst_utils.py diff --git a/python/tests/test-data/2.fa.sig.gz b/src/python/tests/test-data/2.fa.sig.gz similarity index 100% rename from python/tests/test-data/2.fa.sig.gz rename to src/python/tests/test-data/2.fa.sig.gz diff --git a/python/tests/test-data/47.fa.sig.gz b/src/python/tests/test-data/47.fa.sig.gz similarity index 100% rename from python/tests/test-data/47.fa.sig.gz rename to src/python/tests/test-data/47.fa.sig.gz diff --git a/python/tests/test-data/63.fa.sig.gz b/src/python/tests/test-data/63.fa.sig.gz similarity index 100% rename from python/tests/test-data/63.fa.sig.gz rename to src/python/tests/test-data/63.fa.sig.gz diff --git a/python/tests/test-data/SRR606249.sig.gz b/src/python/tests/test-data/SRR606249.sig.gz similarity index 100% rename from python/tests/test-data/SRR606249.sig.gz rename to src/python/tests/test-data/SRR606249.sig.gz diff --git a/python/tests/test_gather.py b/src/python/tests/test_gather.py similarity index 100% rename from python/tests/test_gather.py rename to src/python/tests/test_gather.py diff --git a/python/tests/test_multigather.py b/src/python/tests/test_multigather.py similarity index 100% rename from python/tests/test_multigather.py rename to src/python/tests/test_multigather.py diff --git a/python/tests/test_search.py b/src/python/tests/test_search.py similarity index 100% rename from python/tests/test_search.py rename to src/python/tests/test_search.py