Skip to content

Commit

Permalink
Merge branch 'main' into integrate-buildutils
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes authored Nov 16, 2024
2 parents 07062a9 + 7d51b6a commit c4f509d
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash_plugin_branchwater"
version = "0.9.10"
version = "0.9.11"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,15 @@ fn do_manysketch(
}

#[pyfunction]
#[pyo3(signature = (input_filename, param_str, output, name))]
#[pyo3(signature = (input_filename, input_moltype, param_str, output, name))]
fn do_singlesketch(
input_filename: String,
input_moltype: String,
param_str: String,
output: String,
name: String,
) -> anyhow::Result<u8> {
match singlesketch::singlesketch(input_filename, param_str, output, name) {
match singlesketch::singlesketch(input_filename, input_moltype, param_str, output, name) {
Ok(_) => Ok(0),
Err(e) => {
eprintln!("Error: {e}");
Expand Down
16 changes: 14 additions & 2 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,14 @@ def __init__(self, p):
"--name",
help="optional name for the signature, default is the basename of input path",
)
p.add_argument(
"-I",
"--input-moltype",
"--input-molecule-type",
choices=["DNA", "dna", "protein"],
default="DNA",
help="molecule type of input sequence (DNA or protein)",
)

def main(self, args):
print_version()
Expand Down Expand Up @@ -636,12 +644,16 @@ def main(self, args):
)

notify(
f"sketching file '{args.input_filename}' with params '{args.param_string}' and name '{signature_name}'"
f"sketching file '{args.input_filename}' ({args.input_moltype}) with params '{args.param_string}' and name '{signature_name}' using a single thread"
)

super().main(args)
status = sourmash_plugin_branchwater.do_singlesketch(
args.input_filename, args.param_string, args.output, signature_name
args.input_filename,
args.input_moltype,
args.param_string,
args.output,
signature_name,
) # Pass the name to Rust
if status == 0:
notify(f"...singlesketch is done! results in '{args.output}'")
Expand Down
128 changes: 125 additions & 3 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,12 @@ def test_singlesketch_simple(runtmp):
assert sig.minhash.is_dna
assert sig.minhash.scaled == 1000

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "dna", fa1, "-o", output2)
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes


def test_singlesketch_with_name(runtmp):
"""Test single sketching with a custom name."""
Expand Down Expand Up @@ -1241,14 +1247,74 @@ def test_singlesketch_mult_k(runtmp):
assert any(sig.minhash.ksize == 31 for sig in sigs)


def test_singlesketch_mult_moltype(runtmp):
def test_singlesketch_mult_k_2(runtmp):
    """Sketch one file at two k-mer sizes supplied in a single -p string."""
    query_fa = get_test_data("short.fa")
    sig_path = runtmp.output("short_mult_k.sig")

    # Both ksizes are requested through one comma-separated param string.
    cmd = [
        "scripts",
        "singlesketch",
        query_fa,
        "-o",
        sig_path,
        "-p",
        "k=21,k=31,scaled=100",
    ]
    runtmp.sourmash(*cmd)

    # The output file must exist and be loadable as signatures.
    assert os.path.exists(sig_path)
    loaded = list(sourmash.load_signatures(sig_path))

    # Expect exactly one signature per requested ksize.
    assert len(loaded) == 2
    found_ksizes = [s.minhash.ksize for s in loaded]
    assert 21 in found_ksizes
    assert 31 in found_ksizes


def test_singlesketch_explicit_dna(runtmp):
    """Test single sketching with 'dna' given explicitly in the param string."""
    fa1 = get_test_data("short.fa")
    output = runtmp.output("short_dna.sig")

    # Run the singlesketch command with two k sizes and an explicit 'dna' moltype
    runtmp.sourmash(
        "scripts",
        "singlesketch",
        fa1,
        "-o",
        output,
        "-p",
        "k=21,k=31,scaled=100,dna",
    )

    # Check if the output exists and contains the expected data
    assert os.path.exists(output)
    sigs = list(sourmash.load_signatures(output))

    # Verify that two signatures with different k-mer sizes exist
    assert len(sigs) == 2
    assert any(sig.minhash.ksize == 21 for sig in sigs)
    assert any(sig.minhash.ksize == 31 for sig in sigs)

    # The point of this test is the explicit 'dna' moltype, so check it too
    assert all(sig.minhash.is_dna for sig in sigs)


def test_singlesketch_protein_moltype(runtmp):
"""Test single sketching with different molecule types."""
fa1 = get_test_data("short-protein.fa")
output = runtmp.output("short_mult_moltype.sig")

# Run the singlesketch command with multiple molecule types
# Run the singlesketch command with the protein molecule type
runtmp.sourmash(
"scripts", "singlesketch", fa1, "-o", output, "-p", "protein,k=10,scaled=100"
"scripts",
"singlesketch",
fa1,
"-o",
output,
"-p",
"protein,k=10,scaled=100",
"--input-moltype",
"protein",
)

# Check if the output exists and contains the expected data
Expand All @@ -1260,6 +1326,12 @@ def test_singlesketch_mult_moltype(runtmp):
assert sig.minhash.is_protein
assert sig.minhash.scaled == 100

# validate against sourmash sketch
output2 = runtmp.output("short2.sig")
runtmp.sourmash("sketch", "protein", fa1, "-p", "k=10,scaled=100", "-o", output2)
sig2 = sourmash.load_one_signature(output2)
assert sig.minhash.hashes == sig2.minhash.hashes


def test_singlesketch_invalid_params(runtmp, capfd):
"""Test singlesketch command with invalid parameters."""
Expand All @@ -1275,3 +1347,53 @@ def test_singlesketch_invalid_params(runtmp, capfd):
# Check that the error message is correct
captured = capfd.readouterr()
assert "Failed to parse params string" in captured.err


@pytest.mark.xfail(reason="needs to be implemented")
def test_singlesketch_translate(runtmp):
    """Test basic single sketching with input = DNA, output = protein"""
    fa1 = get_test_data("short.fa")
    output = runtmp.output("short.sig")

    # Run the singlesketch command: DNA input, protein sketch requested
    runtmp.sourmash(
        "scripts",
        "singlesketch",
        fa1,
        "-o",
        output,
        "--input-moltype",
        "dna",
        "-p",
        "protein,k=7",
    )

    # Check if the output exists and contains the expected data
    assert os.path.exists(output)
    sig = sourmash.load_one_signature(output)

    assert sig.name == "short.fa"
    # The params request a protein sketch at k=7, so the result must reflect
    # that rather than the DNA/k=31 values copied from the plain DNA test.
    assert sig.minhash.ksize == 7
    assert sig.minhash.is_protein
    # NOTE(review): scaled=1000 is kept from the original test; confirm this is
    # the default applied when the param string gives no scaled value.
    assert sig.minhash.scaled == 1000


@pytest.mark.xfail(reason="needs to be implemented")
def test_singlesketch_multimoltype_fail(runtmp):
    """Requesting two molecule types in one param string should fail."""
    query_fa = get_test_data("short.fa")
    sig_path = runtmp.output("short.sig")

    # 'protein,dna' asks for two moltypes at once in a single -p string.
    cmd = [
        "scripts",
        "singlesketch",
        query_fa,
        "-o",
        sig_path,
        "--input-moltype",
        "dna",
        "-p",
        "protein,dna,k=7",
    ]

    # The command is expected to exit with an error.
    with pytest.raises(SourmashCommandFailed):
        runtmp.sourmash(*cmd)
17 changes: 4 additions & 13 deletions src/singlesketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::io::{self, BufWriter, Write};

pub fn singlesketch(
input_filename: String,
input_moltype: String,
param_str: String,
output: String,
name: String,
Expand All @@ -21,20 +22,10 @@ pub fn singlesketch(
}
};

let moltype = if param_str.contains("dna") {
"dna"
} else if param_str.contains("protein") {
"protein"
} else if param_str.contains("dayhoff") {
"dayhoff"
} else if param_str.contains("hp") {
"hp"
} else {
bail!("Unrecognized molecule type in params string");
};
let input_moltype = input_moltype.to_ascii_lowercase();

// Build signature templates based on parsed parameters and detected moltype
let mut sigs = crate::manysketch::build_siginfo(&params_vec, moltype);
let mut sigs = crate::manysketch::build_siginfo(&params_vec, input_moltype.as_str());

if sigs.is_empty() {
bail!("No signatures to build for the given parameters.");
Expand All @@ -56,7 +47,7 @@ pub fn singlesketch(
match record_result {
Ok(record) => {
sigs.iter_mut().for_each(|sig| {
if moltype == "protein" {
if input_moltype == "protein" {
sig.add_protein(&record.seq())
.expect("Failed to add protein");
} else {
Expand Down

0 comments on commit c4f509d

Please sign in to comment.