From de660142b009adee7aa47ebe6ba0bd6e6b43ad0c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 16:20:39 -0800 Subject: [PATCH 1/2] MRG: add `--input-moltype` to singlesketch (#515) * WIP: add --input-moltype to singlesketch * fix black * support -c on singlesketch * black * remove -c * add more tests * comment, test * remove zip mentions * fix black * rely only on input_moltype --- src/lib.rs | 5 +- .../sourmash_plugin_branchwater/__init__.py | 16 ++- src/python/tests/test_sketch.py | 128 +++++++++++++++++- src/singlesketch.rs | 17 +-- 4 files changed, 146 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 741b4980..afa5b857 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -318,14 +318,15 @@ fn do_manysketch( } #[pyfunction] -#[pyo3(signature = (input_filename, param_str, output, name))] +#[pyo3(signature = (input_filename, input_moltype, param_str, output, name))] fn do_singlesketch( input_filename: String, + input_moltype: String, param_str: String, output: String, name: String, ) -> anyhow::Result { - match singlesketch::singlesketch(input_filename, param_str, output, name) { + match singlesketch::singlesketch(input_filename, input_moltype, param_str, output, name) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 597f165f..3de354aa 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -603,6 +603,14 @@ def __init__(self, p): "--name", help="optional name for the signature, default is the basename of input path", ) + p.add_argument( + "-I", + "--input-moltype", + "--input-molecule-type", + choices=["DNA", "dna", "protein"], + default="DNA", + help="molecule type of input sequence (DNA or protein)", + ) def main(self, args): print_version() @@ -636,12 +644,16 @@ def main(self, args): ) notify( - f"sketching file 
'{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'" + f"sketching file '{args.input_filename}' ({args.input_moltype}) with params '{args.param_string}' and name '{signature_name}' using a single thread" ) super().main(args) status = sourmash_plugin_branchwater.do_singlesketch( - args.input_filename, args.param_string, args.output, signature_name + args.input_filename, + args.input_moltype, + args.param_string, + args.output, + signature_name, ) # Pass the name to Rust if status == 0: notify(f"...singlesketch is done! results in '{args.output}'") diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index b8cf4e23..33b85f46 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -1194,6 +1194,12 @@ def test_singlesketch_simple(runtmp): assert sig.minhash.is_dna assert sig.minhash.scaled == 1000 + # validate against sourmash sketch + output2 = runtmp.output("short2.sig") + runtmp.sourmash("sketch", "dna", fa1, "-o", output2) + sig2 = sourmash.load_one_signature(output2) + assert sig.minhash.hashes == sig2.minhash.hashes + def test_singlesketch_with_name(runtmp): """Test single sketching with a custom name.""" @@ -1241,14 +1247,74 @@ def test_singlesketch_mult_k(runtmp): assert any(sig.minhash.ksize == 31 for sig in sigs) -def test_singlesketch_mult_moltype(runtmp): +def test_singlesketch_mult_k_2(runtmp): + """Test single sketching with multiple k-mer sizes in one param string""" + fa1 = get_test_data("short.fa") + output = runtmp.output("short_mult_k.sig") + + # Run the singlesketch command with multiple k sizes + runtmp.sourmash( + "scripts", + "singlesketch", + fa1, + "-o", + output, + "-p", + "k=21,k=31,scaled=100", + ) + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sigs = list(sourmash.load_signatures(output)) + + # Verify that two signatures with different k-mer sizes exist + assert len(sigs) == 2 + assert 
any(sig.minhash.ksize == 21 for sig in sigs) + assert any(sig.minhash.ksize == 31 for sig in sigs) + + +def test_singlesketch_explicit_dna(runtmp): + """Test single sketching with explicit DNA in name""" + fa1 = get_test_data("short.fa") + output = runtmp.output("short_dna.sig") + + # Run the singlesketch command with multiple k sizes + runtmp.sourmash( + "scripts", + "singlesketch", + fa1, + "-o", + output, + "-p", + "k=21,k=31,scaled=100,dna", + ) + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sigs = list(sourmash.load_signatures(output)) + + # Verify that two signatures with different k-mer sizes exist + assert len(sigs) == 2 + assert any(sig.minhash.ksize == 21 for sig in sigs) + assert any(sig.minhash.ksize == 31 for sig in sigs) + + +def test_singlesketch_protein_moltype(runtmp): """Test single sketching with different molecule types.""" fa1 = get_test_data("short-protein.fa") output = runtmp.output("short_mult_moltype.sig") - # Run the singlesketch command with multiple molecule types + # Run the singlesketch command with prot molecule types runtmp.sourmash( - "scripts", "singlesketch", fa1, "-o", output, "-p", "protein,k=10,scaled=100" + "scripts", + "singlesketch", + fa1, + "-o", + output, + "-p", + "protein,k=10,scaled=100", + "--input-moltype", + "protein", ) # Check if the output exists and contains the expected data @@ -1260,6 +1326,12 @@ def test_singlesketch_mult_moltype(runtmp): assert sig.minhash.is_protein assert sig.minhash.scaled == 100 + # validate against sourmash sketch + output2 = runtmp.output("short2.sig") + runtmp.sourmash("sketch", "protein", fa1, "-p", "k=10,scaled=100", "-o", output2) + sig2 = sourmash.load_one_signature(output2) + assert sig.minhash.hashes == sig2.minhash.hashes + def test_singlesketch_invalid_params(runtmp, capfd): """Test singlesketch command with invalid parameters.""" @@ -1275,3 +1347,53 @@ def test_singlesketch_invalid_params(runtmp, capfd): # Check that the 
error message is correct captured = capfd.readouterr() assert "Failed to parse params string" in captured.err + + +@pytest.mark.xfail(reason="needs to be implemented") +def test_singlesketch_translate(runtmp): + """Test basic single sketching with input = DNA, output = protein""" + fa1 = get_test_data("short.fa") + output = runtmp.output("short.sig") + + # Run the singlesketch command + runtmp.sourmash( + "scripts", + "singlesketch", + fa1, + "-o", + output, + "--input-moltype", + "dna", + "-p", + "protein,k=7", + ) + + # Check if the output exists and contains the expected data + assert os.path.exists(output) + sig = sourmash.load_one_signature(output) + + assert sig.name == "short.fa" + assert sig.minhash.ksize == 31 + assert sig.minhash.is_dna + assert sig.minhash.scaled == 1000 + + +@pytest.mark.xfail(reason="needs to be implemented") +def test_singlesketch_multimoltype_fail(runtmp): + """Test failure with multiple moltype""" + fa1 = get_test_data("short.fa") + output = runtmp.output("short.sig") + + # Run the singlesketch command + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "scripts", + "singlesketch", + fa1, + "-o", + output, + "--input-moltype", + "dna", + "-p", + "protein,dna,k=7", + ) diff --git a/src/singlesketch.rs b/src/singlesketch.rs index 12231c9d..80226d37 100644 --- a/src/singlesketch.rs +++ b/src/singlesketch.rs @@ -7,6 +7,7 @@ use std::io::{self, BufWriter, Write}; pub fn singlesketch( input_filename: String, + input_moltype: String, param_str: String, output: String, name: String, @@ -21,20 +22,10 @@ pub fn singlesketch( } }; - let moltype = if param_str.contains("dna") { - "dna" - } else if param_str.contains("protein") { - "protein" - } else if param_str.contains("dayhoff") { - "dayhoff" - } else if param_str.contains("hp") { - "hp" - } else { - bail!("Unrecognized molecule type in params string"); - }; + let input_moltype = input_moltype.to_ascii_lowercase(); // Build signature templates based on parsed parameters and 
detected moltype - let mut sigs = crate::manysketch::build_siginfo(&params_vec, moltype); + let mut sigs = crate::manysketch::build_siginfo(&params_vec, input_moltype.as_str()); if sigs.is_empty() { bail!("No signatures to build for the given parameters."); @@ -56,7 +47,7 @@ pub fn singlesketch( match record_result { Ok(record) => { sigs.iter_mut().for_each(|sig| { - if moltype == "protein" { + if input_moltype == "protein" { sig.add_protein(&record.seq()) .expect("Failed to add protein"); } else { From 7d51b6a2a5ee3e91e18df0eea55225e4010a2b0a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 15 Nov 2024 16:37:55 -0800 Subject: [PATCH 2/2] MRG: bump to version 0.9.11 (#522) --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24f4e778..1fd2380c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1820,7 +1820,7 @@ dependencies = [ [[package]] name = "sourmash_plugin_branchwater" -version = "0.9.10" +version = "0.9.11" dependencies = [ "anyhow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index 064fa0c0..a8a5030d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash_plugin_branchwater" -version = "0.9.10" +version = "0.9.11" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html