Skip to content

Commit

Permalink
fix column names to match sourmash prefetch (#50)
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored Aug 24, 2023
1 parent 9af88a4 commit de5b422
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 11 deletions.
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ fn write_prefetch<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display + Clone>(
None => Box::new(std::io::stdout()),
};
let mut writer = BufWriter::new(prefetch_out);
writeln!(&mut writer, "query_file,match,match_md5,overlap").ok();
writeln!(&mut writer, "query_filename,match_name,match_md5,intersect_bp").ok();

for m in matchlist.iter() {
writeln!(&mut writer, "{},\"{}\",{},{}", query_label,
Expand Down Expand Up @@ -490,7 +490,7 @@ fn consume_query_by_gather<P: AsRef<Path> + std::fmt::Debug + std::fmt::Display
None => Box::new(std::io::stdout()),
};
let mut writer = BufWriter::new(gather_out);
writeln!(&mut writer, "query_file,rank,match,match_md5,overlap").ok();
writeln!(&mut writer, "query_filename,rank,match_name,match_md5,intersect_bp").ok();

let mut matching_sketches = matchlist;
let mut rank = 0;
Expand Down
65 changes: 60 additions & 5 deletions src/python/tests/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_simple(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}


def test_simple_with_prefetch(runtmp):
Expand All @@ -71,12 +71,12 @@ def test_simple_with_prefetch(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}

df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}


def test_missing_query(runtmp, capfd):
Expand Down Expand Up @@ -317,17 +317,72 @@ def test_md5s(runtmp):
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '-s', '100000')
'-o', g_output, '--output-prefetch', p_output,
'-s', '100000')
assert os.path.exists(g_output)
assert os.path.exists(p_output)

# test gather output!
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}

md5s = list(df['match_md5'])
print(md5s)

for against_file in (sig2, sig47, sig63):
for ss in sourmash.load_file_as_signatures(against_file, ksize=31):
assert ss.md5sum() in md5s

# test prefetch output!
df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())

# prefetch output has no rank.
assert keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}

md5s = list(df['match_md5'])
print(md5s)

for against_file in (sig2, sig47, sig63):
for ss in sourmash.load_file_as_signatures(against_file, ksize=31):
assert ss.md5sum() in md5s


def test_csv_columns_vs_sourmash_prefetch(runtmp):
    """Check fastgather CSV column names against `sourmash prefetch`.

    Runs `fastgather` (writing both a gather CSV and a prefetch CSV), then
    runs `sourmash prefetch` on the same query and database list, and asserts
    that every fastgather column name (other than the gather-only 'rank')
    also appears in the `sourmash prefetch` CSV.
    """
    # the column names should be a subset of the sourmash prefetch cols
    query = get_test_data('SRR606249.sig.gz')
    against_list = runtmp.output('against.txt')

    sig2 = get_test_data('2.fa.sig.gz')
    sig47 = get_test_data('47.fa.sig.gz')
    sig63 = get_test_data('63.fa.sig.gz')

    make_file_list(against_list, [sig2, sig47, sig63])

    g_output = runtmp.output('gather.csv')
    p_output = runtmp.output('prefetch.csv')

    # first run fastgather
    runtmp.sourmash('scripts', 'fastgather', query, against_list,
                    '-o', g_output, '--output-prefetch', p_output,
                    '-s', '100000')
    assert os.path.exists(g_output)
    assert os.path.exists(p_output)

    # now run sourmash prefetch
    sp_output = runtmp.output('sourmash-prefetch.csv')
    runtmp.sourmash('prefetch', query, against_list,
                    '-o', sp_output, '--scaled', '100000')

    sourmash_prefetch_df = pandas.read_csv(sp_output)
    sp_keys = set(sourmash_prefetch_df.keys())

    # gather output: same columns as prefetch, plus 'rank'
    gather_df = pandas.read_csv(g_output)
    g_keys = set(gather_df.keys())
    assert g_keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}
    g_keys.remove('rank')   # 'rank' is not in sourmash prefetch!
    assert not g_keys - sp_keys, g_keys - sp_keys

    # prefetch output: check its columns too, not just that the file exists
    prefetch_df = pandas.read_csv(p_output)
    p_keys = set(prefetch_df.keys())
    assert p_keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}
    assert not p_keys - sp_keys, p_keys - sp_keys
50 changes: 46 additions & 4 deletions src/python/tests/test_multigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ def test_simple(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}

df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}


def test_missing_querylist(runtmp, capfd):
Expand Down Expand Up @@ -286,7 +286,7 @@ def test_md5(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}

md5s = set(df['match_md5'])
for against_file in (sig2, sig47, sig63):
Expand All @@ -297,9 +297,51 @@ def test_md5(runtmp):
df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'overlap'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}

md5s = set(df['match_md5'])
for against_file in (sig2, sig47, sig63):
for ss in sourmash.load_file_as_signatures(against_file, ksize=31):
assert ss.md5sum() in md5s


def test_csv_columns_vs_sourmash_prefetch(runtmp):
    """Check fastmultigather CSV column names against `sourmash prefetch`.

    Runs `fastmultigather` on a one-entry query list, then runs
    `sourmash prefetch` on the same query and database list, and asserts
    that every column name in the per-query gather and prefetch CSVs (other
    than the gather-only 'rank') also appears in the `sourmash prefetch` CSV.
    """
    # the column names should be a subset of the sourmash prefetch cols
    query = get_test_data('SRR606249.sig.gz')

    sig2 = get_test_data('2.fa.sig.gz')
    sig47 = get_test_data('47.fa.sig.gz')
    sig63 = get_test_data('63.fa.sig.gz')

    query_list = runtmp.output('query.txt')
    make_file_list(query_list, [query])
    against_list = runtmp.output('against.txt')
    make_file_list(against_list, [sig2, sig47, sig63])

    # No output path is passed to fastmultigather; the per-query CSVs are
    # looked up in the runtmp output directory below, so run it from there
    # and always restore the original working directory afterwards.
    cwd = os.getcwd()
    try:
        os.chdir(runtmp.output(''))
        runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
                        '-s', '100000', '-t', '0')
    finally:
        os.chdir(cwd)

    g_output = runtmp.output('SRR606249.sig.gz.gather.csv')
    assert os.path.exists(g_output)
    p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv')
    assert os.path.exists(p_output)

    # now run sourmash prefetch
    sp_output = runtmp.output('sourmash-prefetch.csv')
    runtmp.sourmash('prefetch', query, against_list,
                    '-o', sp_output, '--scaled', '100000')

    sourmash_prefetch_df = pandas.read_csv(sp_output)
    sp_keys = set(sourmash_prefetch_df.keys())

    # gather output: same columns as prefetch, plus 'rank'
    gather_df = pandas.read_csv(g_output)
    g_keys = set(gather_df.keys())
    assert g_keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}
    g_keys.remove('rank')   # 'rank' is not in sourmash prefetch!
    assert not g_keys - sp_keys, g_keys - sp_keys

    # prefetch output: check its columns too, not just that the file exists
    prefetch_df = pandas.read_csv(p_output)
    p_keys = set(prefetch_df.keys())
    assert p_keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}
    assert not p_keys - sp_keys, p_keys - sp_keys

0 comments on commit de5b422

Please sign in to comment.