Merge pull request #2171 from merenlab/issue-2170

Verbose output for missing genomes in anvi-get-sequences-for-hmm-hits output
merenlab · Nov 6, 2023 · 1a95e66 · 1a95e66
2 parents 808a36e + ade8eb8
commit 1a95e66
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 7 deletions.
diff --git a/anvio/hmmops.py b/anvio/hmmops.py
@@ -580,6 +580,55 @@ def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hm
             return (hmm_sequences_dict_for_splits, set([]))
 
 
+    def filter_hmm_sequences_dict_for_to_only_include_specific_genes(self, hmm_sequences_dict_for_splits, gene_names=[]):
+        """This takes the dictionary for HMM hits, and removes all genes from it except the ones in `gene_names`.
+
+        It is critical to keep in mind that the removal of genes can leave behind no gene at all for some of the
+        genomes/bins. That's why this function tracks the genome names in the dictionary before and after to make
+        sure it can report the loss of genomes for the user to consider.
+        """
+
+        # gather all bin names
+        bin_names_in_original_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_original_dict.add(entry['bin_id'])
+
+        # filter out every gene hit except those in `gene_names`
+        hmm_sequences_dict_for_splits = utils.get_filtered_dict(hmm_sequences_dict_for_splits, 'gene_name', set(gene_names))
+
+        # gather remaining bin names in the dict
+        bin_names_in_filtered_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_filtered_dict.add(entry['bin_id'])
+
+        bins_that_are_lost = bin_names_in_original_dict.difference(bin_names_in_filtered_dict)
+
+        if not len(bins_that_are_lost):
+            # well, all bins are still in the data structure. we're good to return everything
+            return hmm_sequences_dict, []
+
+        # if we are still here, it means some bins were gon buh-bye. we start by letting
+        # the user gently that stuff went south
+        self.run.info_single("Yo yo yo! The anvi'o function that helps you focus only on a specific list of gene names "
+                             "among your HMM hits is speaking (we are here most likely you used the --gene-names flag "
+                             "to get rid of all the other genes in a given HMM collection). What follows is a report of "
+                             "happened because ANVI'O ENDED UP LOSING SOME BINS/GENOMES FROM YOUR ANALYSIS AS THEY DID "
+                             "NOT HAVE *ANY* OF THE GENES YOU LISTED (sorry for the CAPS lock here, but we wanted to "
+                             "make sure you don't miss this, since this will certainly influence your downstream "
+                             "analyses). If you want to keep more bins in your analysis, you can include more genes "
+                             "in your `--gene-names` -- but of course it will not change the fact that some bins will "
+                             "still be missing some genes, and how does this patchiness will impact your downstream "
+                             "analyses (such as phylogenomics) is an important question that will require you to "
+                             "consider. Pro tip: you can always use the program `anvi-script-gen-function-matrix-across-genomes` "
+                             "to see the distribution of HMM hits across your bins/genomes.", nl_before=1, nl_after=1)
+
+        self.run.info('Num bins at the beginning of this filter', len(bin_names_in_original_dict), nl_after=1)
+        self.run.info(f'Num bins that lacked the {P("gene", len(gene_names))} in `--gene-names`', len(bins_that_are_lost), nl_after=1, mc='red')
+        self.run.info('Bins that are no more in the analysis', ', '.join(bins_that_are_lost), nl_after=1, mc='red')
+
+        return hmm_sequences_dict_for_splits, bins_that_are_lost
+
+
     def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(self, hmm_sequences_dict_for_splits, gene_names, max_num_genes_missing=0):
         """This takes the output of `get_sequences_dict_for_hmm_hits_in_splits`, and goes through every bin\
            to identify bins or genomes that have lack more than `max_num_genes_missing` from a list of genes.

diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits
@@ -66,6 +66,7 @@ __resources__ = [("A tutorial on anvi'o phylogenomics workflow", "http://merenla
 
 run = terminal.Run()
 progress = terminal.Progress()
+P = terminal.pluralize
 
 
 def main(args):
@@ -160,10 +161,12 @@ def main(args):
     hmm_sequences_dict = s.get_sequences_dict_for_hmm_hits_in_splits(splits_dict, return_amino_acid_sequences=args.get_aa_sequences)
     CHK()
 
-
     run.info('Sources', f"{', '.join(hmm_sources)}")
     run.info('Hits', '%d HMM hits for %d source(s)' % (len(hmm_sequences_dict), len(s.sources)))
 
+    # keep track of bins removed from the analysis results due to various filters:
+    bins_removed_for_any_reason = set([])
+
     # figure out gene names.. if the user provided a file, use that, otherwhise parse gene names out of the comma-separated text
     if args.gene_names and filesnpaths.is_file_exists(args.gene_names, dont_raise=True):
         gene_names = [g.strip() for g in open(args.gene_names, 'r').readlines()] if args.gene_names else []
@@ -173,21 +176,22 @@ def main(args):
     run.info('Genes of interest', f"{', '.join(gene_names) if gene_names else None}")
 
     if len(gene_names):
-        hmm_sequences_dict = utils.get_filtered_dict(hmm_sequences_dict, 'gene_name', set(gene_names))
-        run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))
+        hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_to_only_include_specific_genes(hmm_sequences_dict, gene_names)
         CHK()
 
+        if len(bins_removed):
+            bins_removed_for_any_reason.update(bins_removed)
+
+        run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))
+
     if args.max_num_genes_missing_from_bin is not None:
         hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin))
         CHK()
 
         if len(bins_removed):
+            bins_removed_for_any_reason.update(bins_removed)
             run.info('Filtered hits', '%d hits remain after filtering for `--max-num-genes-missing-from-bin` flag' % (len(hmm_sequences_dict)))
 
-            run.warning('The `--max-num-genes-missing-from-bin` flag caused the removal of %d bins (or genomes, whatever) '
-                        'from your analysis. This is the list of bins that will live in our memories: %s' % \
-                                                (len(bins_removed), ', '.join(bins_removed)))
-
     if args.min_num_bins_gene_occurs is not None:
         hmm_sequences_dict, genes_removed = s.filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(hmm_sequences_dict, int(args.min_num_bins_gene_occurs))
         CHK()
@@ -230,6 +234,12 @@ def main(args):
 
         run.info('Filtered hits', '%d hits remain after removing weak hits for multiple models' % (len(hmm_sequences_dict)))
 
+    if len(bins_removed_for_any_reason):
+            run.warning(f"Please note that a total of {P('bin', len(bins_removed_for_any_reason))} from your final results "
+                        f"were removed while anvi'o was applying all the filters to your HMM hits. Please carefully "
+                        f"review the logs above to make sure you have a good grasp on what happened, and you're happy "
+                        f"with the reults.", header='CONDOLENCES FROM DAAN TO YOU FOR YOUR BINS THAT WENT 💀')
+
     if args.separator:
         separator = args.separator
     else: