Skip to content

Commit

Permalink
Merge pull request #2171 from merenlab/issue-2170
Browse files Browse the repository at this point in the history
Verbose output for missing genomes in anvi-get-sequences-for-hmm-hits output
  • Loading branch information
meren authored Nov 6, 2023
2 parents 808a36e + ade8eb8 commit 1a95e66
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 7 deletions.
49 changes: 49 additions & 0 deletions anvio/hmmops.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,55 @@ def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hm
return (hmm_sequences_dict_for_splits, set([]))


def filter_hmm_sequences_dict_for_to_only_include_specific_genes(self, hmm_sequences_dict_for_splits, gene_names=[]):
"""This takes the dictionary for HMM hits, and removes all genes from it except the ones in `gene_names`.
It is critical to keep in mind that the removal of genes can leave behind no gene at all for some of the
genomes/bins. That's why this function tracks the genome names in the dictionary before and after to make
sure it can report the loss of genomes for the user to consider.
"""

# gather all bin names
bin_names_in_original_dict = set([])
for entry in hmm_sequences_dict_for_splits.values():
bin_names_in_original_dict.add(entry['bin_id'])

# filter out every gene hit except those in `gene_names`
hmm_sequences_dict_for_splits = utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'gene_name', set(gene_names))

# gather remaining bin names in the dict
bin_names_in_filtered_dict = set([])
for entry in hmm_sequences_dict_for_splits.values():
bin_names_in_filtered_dict.add(entry['bin_id'])

bins_that_are_lost = bin_names_in_original_dict.difference(bin_names_in_filtered_dict)

if not len(bins_that_are_lost):
# well, all bins are still in the data structure. we're good to return everything
return hmm_sequences_dict, []

# if we are still here, it means some bins were gon buh-bye. we start by letting
# the user gently that stuff went south
self.run.info_single("Yo yo yo! The anvi'o function that helps you focus only on a specific list of gene names "
"among your HMM hits is speaking (we are here most likely you used the --gene-names flag "
"to get rid of all the other genes in a given HMM collection). What follows is a report of "
"happened because ANVI'O ENDED UP LOSING SOME BINS/GENOMES FROM YOUR ANALYSIS AS THEY DID "
"NOT HAVE *ANY* OF THE GENES YOU LISTED (sorry for the CAPS lock here, but we wanted to "
"make sure you don't miss this, since this will certainly influence your downstream "
"analyses). If you want to keep more bins in your analysis, you can include more genes "
"in your `--gene-names` -- but of course it will not change the fact that some bins will "
"still be missing some genes, and how does this patchiness will impact your downstream "
"analyses (such as phylogenomics) is an important question that will require you to "
"consider. Pro tip: you can always use the program `anvi-script-gen-function-matrix-across-genomes` "
"to see the distribution of HMM hits across your bins/genomes.", nl_before=1, nl_after=1)

self.run.info('Num bins at the beginning of this filter', len(bin_names_in_original_dict), nl_after=1)
self.run.info(f'Num bins that lacked the {P("gene", len(gene_names))} in `--gene-names`', len(bins_that_are_lost), nl_after=1, mc='red')
self.run.info('Bins that are no more in the analysis', ', '.join(bins_that_are_lost), nl_after=1, mc='red')

return hmm_sequences_dict_for_splits, bins_that_are_lost


def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(self, hmm_sequences_dict_for_splits, gene_names, max_num_genes_missing=0):
"""This takes the output of `get_sequences_dict_for_hmm_hits_in_splits`, and goes through every bin\
to identify bins or genomes that have lack more than `max_num_genes_missing` from a list of genes.
Expand Down
24 changes: 17 additions & 7 deletions bin/anvi-get-sequences-for-hmm-hits
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ __resources__ = [("A tutorial on anvi'o phylogenomics workflow", "http://merenla

run = terminal.Run()
progress = terminal.Progress()
P = terminal.pluralize


def main(args):
Expand Down Expand Up @@ -160,10 +161,12 @@ def main(args):
hmm_sequences_dict = s.get_sequences_dict_for_hmm_hits_in_splits(splits_dict, return_amino_acid_sequences=args.get_aa_sequences)
CHK()


run.info('Sources', f"{', '.join(hmm_sources)}")
run.info('Hits', '%d HMM hits for %d source(s)' % (len(hmm_sequences_dict), len(s.sources)))

# keep track of bins removed from the analysis results due to various filters:
bins_removed_for_any_reason = set([])

# figure out gene names.. if the user provided a file, use that, otherwhise parse gene names out of the comma-separated text
if args.gene_names and filesnpaths.is_file_exists(args.gene_names, dont_raise=True):
gene_names = [g.strip() for g in open(args.gene_names, 'r').readlines()] if args.gene_names else []
Expand All @@ -173,21 +176,22 @@ def main(args):
run.info('Genes of interest', f"{', '.join(gene_names) if gene_names else None}")

if len(gene_names):
hmm_sequences_dict = utils.get_filtered_dict(hmm_sequences_dict, 'gene_name', set(gene_names))
run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))
hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_to_only_include_specific_genes(hmm_sequences_dict, gene_names)
CHK()

if len(bins_removed):
bins_removed_for_any_reason.update(bins_removed)

run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))

if args.max_num_genes_missing_from_bin is not None:
hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin))
CHK()

if len(bins_removed):
bins_removed_for_any_reason.update(bins_removed)
run.info('Filtered hits', '%d hits remain after filtering for `--max-num-genes-missing-from-bin` flag' % (len(hmm_sequences_dict)))

run.warning('The `--max-num-genes-missing-from-bin` flag caused the removal of %d bins (or genomes, whatever) '
'from your analysis. This is the list of bins that will live in our memories: %s' % \
(len(bins_removed), ', '.join(bins_removed)))

if args.min_num_bins_gene_occurs is not None:
hmm_sequences_dict, genes_removed = s.filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(hmm_sequences_dict, int(args.min_num_bins_gene_occurs))
CHK()
Expand Down Expand Up @@ -230,6 +234,12 @@ def main(args):

run.info('Filtered hits', '%d hits remain after removing weak hits for multiple models' % (len(hmm_sequences_dict)))

if len(bins_removed_for_any_reason):
run.warning(f"Please note that a total of {P('bin', len(bins_removed_for_any_reason))} from your final results "
f"were removed while anvi'o was applying all the filters to your HMM hits. Please carefully "
f"review the logs above to make sure you have a good grasp on what happened, and you're happy "
f"with the reults.", header='CONDOLENCES FROM DAAN TO YOU FOR YOUR BINS THAT WENT 💀')

if args.separator:
separator = args.separator
else:
Expand Down

0 comments on commit 1a95e66

Please sign in to comment.