Skip to content

Commit

Permalink
MRG: reorganize code to all be under src/ (#29)
Browse files Browse the repository at this point in the history
* refactor manysearch to use common functions

* rename things appropriately

* more refactor

* refactor more

* refactor/simplify layout

* update src/ README
  • Loading branch information
ctb authored Aug 15, 2023
1 parent f814805 commit 0c853de
Show file tree
Hide file tree
Showing 15 changed files with 39 additions and 104 deletions.
11 changes: 4 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,13 @@ classifiers = [
requires = ["maturin>=1.1.0,<2"]
build-backend = "maturin"

[project.scripts]
"pyo3_branchwater" = "pyo3_branchwater.__main__:main"

[project.entry-points."sourmash.cli_script"]
manysearch = "pyo3_branchwater.sourmash_plugin:Branchwater_Manysearch"
fastgather = "pyo3_branchwater.sourmash_plugin:Branchwater_Fastgather"
fastmultigather = "pyo3_branchwater.sourmash_plugin:Branchwater_Fastmultigather"
manysearch = "branchwater_plugin:Branchwater_Manysearch"
fastgather = "branchwater_plugin:Branchwater_Fastgather"
fastmultigather = "branchwater_plugin:Branchwater_Fastmultigather"

[tool.maturin]
python-source = "python"
python-source = "src/python"

[metadata]
license = { text = "BSD 3-Clause License" }
1 change: 0 additions & 1 deletion python/pyo3_branchwater/__init__.py

This file was deleted.

26 changes: 0 additions & 26 deletions python/pyo3_branchwater/__main__.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
The Rust source lives here, under `src/`.
The Rust source is in `lib.rs`.

See `python/branchwater/` for the Python modules!
The Python source code is under `python/`, and the tests are under `python/tests/`.
87 changes: 26 additions & 61 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option<KmerMinHas
/// - support jaccard as well as containment/overlap
/// - support md5 output columns; other?

fn search<P: AsRef<Path>>(
fn manysearch<P: AsRef<Path>>(
querylist: P,
siglist: P,
threshold: f64,
Expand All @@ -124,36 +124,10 @@ fn search<P: AsRef<Path>>(

// Read in list of query paths.
eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display());
let querylist_file = BufReader::new(File::open(querylist)?);

// Load all queries into memory at once.
let queries: Vec<(String, KmerMinHash)> = querylist_file
.lines()
.filter_map(|line| {
let line = line.unwrap();
if !line.is_empty() {
// skip empty lines; load non-empty!
let mut path = PathBuf::new();
path.push(line);
Some(path)
} else {
None
}
})
// for non-empty paths, load whichever one matches template.
.filter_map(|query| {
let query_sig = Signature::from_path(query).unwrap();

let mut query = None;
for sig in &query_sig {
if let Some(mh) = prepare_query(sig, &template) {
query = Some((sig.name(), mh));
break;
}
}
query
})
.collect();
let querylist_paths = load_sketchlist_filenames(querylist).unwrap();
let queries = load_sketches(querylist_paths, &template).unwrap();

if queries.is_empty() {
bail!("No query signatures loaded, exiting.");
Expand All @@ -164,25 +138,12 @@ fn search<P: AsRef<Path>>(
// Load all _paths_, not signatures, into memory.
eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display());

let siglist_file = BufReader::new(File::open(siglist)?);
let search_sigs: Vec<PathBuf> = siglist_file
.lines()
.filter_map(|line| {
let line = line.unwrap();
if !line.is_empty() {
let mut path = PathBuf::new();
path.push(line);
Some(path)
} else {
None
}
})
.collect();
if search_sigs.is_empty() {
let search_sigs_paths = load_sketchlist_filenames(siglist).unwrap();
if search_sigs_paths.is_empty() {
bail!("No signatures to search loaded, exiting.");
}

eprintln!("Loaded {} sig paths to search.", search_sigs.len());
eprintln!("Loaded {} sig paths to search.", search_sigs_paths.len());

// set up a multi-producer, single-consumer channel.
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads());
Expand All @@ -206,10 +167,12 @@ fn search<P: AsRef<Path>>(
// loading them individually and searching them. Stuff results into
// the writer thread above.
//
// CTB: might want to just load everything into memory here.
//

let processed_sigs = AtomicUsize::new(0);

let send = search_sigs
let send = search_sigs_paths
.par_iter()
.filter_map(|filename| {
let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst);
Expand All @@ -232,12 +195,14 @@ fn search<P: AsRef<Path>>(
let mut results = vec![];

// search for matches & save containment.
for (name, query) in &queries {
let overlap =
query.count_common(&search_mh, false).unwrap() as f64 / query.size() as f64;
if overlap > threshold {
results.push((name.clone(),
query.md5sum(),
for q in queries.iter() {
let overlap = q.minhash.count_common(&search_mh, false).unwrap() as f64;
let size = q.minhash.size() as f64;

let containment = overlap / size;
if containment > threshold {
results.push((q.name.clone(),
q.minhash.md5sum(),
search_sig.name(),
search_sig.md5sum(),
overlap))
Expand Down Expand Up @@ -672,15 +637,15 @@ fn multigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
//

#[pyfunction]
fn do_search(querylist_path: String,
siglist_path: String,
threshold: f64,
ksize: u8,
scaled: usize,
output_path: String
fn do_manysearch(querylist_path: String,
siglist_path: String,
threshold: f64,
ksize: u8,
scaled: usize,
output_path: String
) -> PyResult<u8> {
match search(querylist_path, siglist_path, threshold, ksize, scaled,
Some(output_path)) {
match manysearch(querylist_path, siglist_path, threshold, ksize, scaled,
Some(output_path)) {
Ok(_) => Ok(0),
Err(e) => {
eprintln!("Error: {e}");
Expand Down Expand Up @@ -734,7 +699,7 @@ fn get_num_threads() -> PyResult<usize> {

#[pymodule]
fn pyo3_branchwater(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(do_search, m)?)?;
m.add_function(wrap_pyfunction!(do_manysearch, m)?)?;
m.add_function(wrap_pyfunction!(do_countergather, m)?)?;
m.add_function(wrap_pyfunction!(do_multigather, m)?)?;
m.add("SomeError", _py.get_type::<SomeError>())?;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from sourmash.plugins import CommandLinePlugin
from sourmash.logging import notify

from . import pyo3_branchwater
from pyo3_branchwater import pyo3_branchwater

class Branchwater_Manysearch(CommandLinePlugin):
command = 'manysearch'
Expand All @@ -28,12 +28,12 @@ def main(self, args):
num_threads = pyo3_branchwater.get_num_threads()
notify(f"searching all sketches in '{args.query_paths}' against '{args.against_paths}' using {num_threads} threads")
super().main(args)
status = pyo3_branchwater.do_search(args.query_paths,
args.against_paths,
args.threshold,
args.ksize,
args.scaled,
args.output)
status = pyo3_branchwater.do_manysearch(args.query_paths,
args.against_paths,
args.threshold,
args.ksize,
args.scaled,
args.output)
if status == 0:
notify(f"...manysearch is done! results in '{args.output}'")
return status
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 0c853de

Please sign in to comment.