Skip to content

Commit

Permalink
MRG: add tests that confirm that fastgather output can be used as pic…
Browse files Browse the repository at this point in the history
…klists for sourmash (#51)

* fix column names to match sourmash prefetch

* fix column name from #50

* fix column names in tests; test fastgather output as picklist

* add test of prefetch output as picklist too
  • Loading branch information
ctb authored Aug 24, 2023
1 parent de5b422 commit 3f0e3d2
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 4 deletions.
2 changes: 1 addition & 1 deletion doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ This CSV file can then be used as a picklist for `sourmash gather` like so:

```
sourmash gather SRR606249.trim.sig.gz /group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-k21.zip \
--picklist SRR606249.fastgather.csv:match:ident \
--picklist SRR606249.fastgather.csv:match_name:ident \
-o SRR606249.gather.csv
```

Expand Down
82 changes: 79 additions & 3 deletions src/python/tests/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_simple(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'intersect_bp'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}


def test_simple_with_prefetch(runtmp):
Expand Down Expand Up @@ -326,7 +326,7 @@ def test_md5s(runtmp):
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_file', 'match', 'match_md5', 'rank', 'intersect_bp'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'rank', 'intersect_bp'}

md5s = list(df['match_md5'])
print(md5s)
Expand All @@ -341,7 +341,7 @@ def test_md5s(runtmp):
keys = set(df.keys())

# prefetch output has no rank.
assert keys == {'query_file', 'match', 'match_md5', 'intersect_bp'}
assert keys == {'query_filename', 'match_name', 'match_md5', 'intersect_bp'}

md5s = list(df['match_md5'])
print(md5s)
Expand Down Expand Up @@ -386,3 +386,79 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp):
sp_keys = set(sourmash_prefetch_df.keys())
print(g_keys - sp_keys)
assert not g_keys - sp_keys, g_keys - sp_keys


def test_fastgather_gatherout_as_picklist(runtmp):
    """fastgather's gather CSV should be usable as a `sourmash gather` picklist.

    Runs `sourmash scripts fastgather` to produce gather and prefetch CSVs,
    then runs `sourmash gather` twice over the same query and database list:
    once restricted by a picklist built from the fastgather gather CSV
    (column `match_name`, type `ident`) and once with no picklist. The two
    resulting CSVs must load into equal DataFrames.
    """
    query = get_test_data('SRR606249.sig.gz')
    against_list = runtmp.output('against.txt')

    # write the list of signatures to search against
    sig_names = ('2.fa.sig.gz', '47.fa.sig.gz', '63.fa.sig.gz')
    make_file_list(against_list, [get_test_data(name) for name in sig_names])

    g_output = runtmp.output('gather.csv')
    p_output = runtmp.output('prefetch.csv')

    # step 1: run fastgather, saving both gather and prefetch results
    runtmp.sourmash('scripts', 'fastgather', query, against_list,
                    '-o', g_output, '--output-prefetch', p_output,
                    '-s', '100000')
    for produced in (g_output, p_output):
        assert os.path.exists(produced)

    scaled_args = ('--scaled', '100000')

    # step 2: run sourmash gather with fastgather gather output as picklist
    gather_picklist_output = runtmp.output('sourmash-gather+picklist.csv')
    runtmp.sourmash('gather', query, against_list,
                    '-o', gather_picklist_output, *scaled_args,
                    '--picklist', f'{g_output}:match_name:ident')

    # step 3: run sourmash gather without any picklist, as the baseline
    full_gather_output = runtmp.output('sourmash-gather.csv')
    runtmp.sourmash('gather', query, against_list,
                    '-o', full_gather_output, *scaled_args)

    # the picklist-restricted run must reproduce the baseline exactly
    picklist_df = pandas.read_csv(gather_picklist_output)
    full_df = pandas.read_csv(full_gather_output)
    assert picklist_df.equals(full_df)


def test_fastgather_prefetchout_as_picklist(runtmp):
    """fastgather's prefetch CSV should be usable as a `sourmash gather` picklist.

    Runs `sourmash scripts fastgather` to produce gather and prefetch CSVs,
    then runs `sourmash gather` twice over the same query and database list:
    once restricted by a picklist built from the fastgather prefetch CSV
    (column `match_name`, type `ident`) and once with no picklist. The two
    resulting CSVs must load into equal DataFrames.
    """
    query = get_test_data('SRR606249.sig.gz')
    against_list = runtmp.output('against.txt')

    # write the list of signatures to search against
    sig_names = ('2.fa.sig.gz', '47.fa.sig.gz', '63.fa.sig.gz')
    make_file_list(against_list, [get_test_data(name) for name in sig_names])

    g_output = runtmp.output('gather.csv')
    p_output = runtmp.output('prefetch.csv')

    # step 1: run fastgather, saving both gather and prefetch results
    runtmp.sourmash('scripts', 'fastgather', query, against_list,
                    '-o', g_output, '--output-prefetch', p_output,
                    '-s', '100000')
    for produced in (g_output, p_output):
        assert os.path.exists(produced)

    scaled_args = ('--scaled', '100000')

    # step 2: run sourmash gather with fastgather prefetch output as picklist
    gather_picklist_output = runtmp.output('sourmash-gather+picklist.csv')
    runtmp.sourmash('gather', query, against_list,
                    '-o', gather_picklist_output, *scaled_args,
                    '--picklist', f'{p_output}:match_name:ident')

    # step 3: run sourmash gather without any picklist, as the baseline
    full_gather_output = runtmp.output('sourmash-gather.csv')
    runtmp.sourmash('gather', query, against_list,
                    '-o', full_gather_output, *scaled_args)

    # the picklist-restricted run must reproduce the baseline exactly
    picklist_df = pandas.read_csv(gather_picklist_output)
    full_df = pandas.read_csv(full_gather_output)
    assert picklist_df.equals(full_df)

0 comments on commit 3f0e3d2

Please sign in to comment.