Skip to content

Commit

Permalink
Revert "Allow matches to be filtered to a set of variant IDs (e.g. ones that are shared across datasets)"

Browse files Browse the repository at this point in the history

This reverts commit 8f4f1b9.
  • Loading branch information
smlmbrt committed Jan 6, 2023
1 parent 8f4f1b9 commit 5bd8a2b
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 36 deletions.
3 changes: 0 additions & 3 deletions pgscatalog_utils/match/combine_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,6 @@ def _parse_args(args=None):
help='<Required> List of match files')
parser.add_argument('--min_overlap', dest='min_overlap', required=True,
type=float, help='<Required> Minimum proportion of variants to match before error')
parser.add_argument('-IDs', '--filter_IDs', dest='filter',
help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
'[useful for limiting scoring files to variants present in multiple datasets]')
parser = add_match_args(parser) # params for labelling matches
parser.add_argument('--outdir', dest='outdir', required=True,
help='<Required> Output directory')
Expand Down
28 changes: 2 additions & 26 deletions pgscatalog_utils/match/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,10 @@

def make_params_dict(args) -> dict[str, bool]:
""" Make a dictionary with parameters that control labelling match candidates """
filter_IDs = []
if args.filter:
logger.debug("Reading filter file (variant IDs)")
with open(args.filter, 'r') as f:
filter_IDs = [line.strip() for line in f]

return {'keep_first_match': args.keep_first_match,
'remove_ambiguous': args.remove_ambiguous,
'skip_flip': args.skip_flip,
'remove_multiallelic': args.remove_multiallelic,
'filter_IDs': filter_IDs}
'remove_multiallelic': args.remove_multiallelic}


def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
Expand All @@ -30,15 +23,14 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
- duplicate: True if more than one best match exists for the same accession and ID
- ambiguous: True if ambiguous
"""
assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip', 'filter_IDs'}
assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'}
labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_*
.pipe(_label_best_match)
.pipe(_label_duplicate_best_match)
.pipe(_label_duplicate_id, params['keep_first_match'])
.pipe(_label_biallelic_ambiguous, params['remove_ambiguous'])
.pipe(_label_multiallelic, params['remove_multiallelic'])
.pipe(_label_flips, params['skip_flip'])
.pipe(_label_filter, params['filter_IDs'])
.with_column(pl.lit(True).alias('match_candidate')))

return _encode_match_priority(labelled)
Expand Down Expand Up @@ -223,19 +215,3 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
else:
logger.debug("Not excluding flipped matches")
return df


def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:
    """ Label variants based on an optional allow-list of variant IDs.

    Adds a 'match_IDs' column and folds failures into the shared 'exclude'
    column consumed by the rest of the labelling pipeline.
    """
    nIDs = len(filter_IDs)
    if nIDs > 0:
        logger.debug("Excluding variants that are not in ID list (read {} IDs)".format(nIDs))
        # match_IDs: True iff the variant's ID appears in the allow-list
        df = df.with_column(pl.when(pl.col('ID').is_in(filter_IDs))
                            .then(pl.lit(True))
                            .otherwise(pl.lit(False))
                            .alias('match_IDs'))
        # Mark variants outside the allow-list as excluded; rows already
        # flagged by earlier _label_* steps keep their existing 'exclude' value
        return df.with_column(pl.when(pl.col('match_IDs') == False)
                              .then(True)
                              .otherwise(pl.col('exclude'))
                              .alias('exclude'))
    else:
        # No filter supplied: still create 'match_IDs', but as the string 'NA'
        # rather than a boolean — presumably so downstream logs/groupbys can
        # distinguish "no filter applied" from True/False (NOTE(review):
        # mixed column types; confirm downstream consumers expect this)
        return df.with_column((pl.lit('NA')).alias('match_IDs'))
8 changes: 4 additions & 4 deletions pgscatalog_utils/match/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def make_summary_log(match_candidates: pl.LazyFrame, scorefile: pl.LazyFrame, fi
.select(pl.exclude("^.*_right$"))
.with_columns([pl.col('match_status').fill_null(value='unmatched'),
pl.lit(dataset).alias('dataset')]) # fill in unmatched variants
.groupby(['dataset', 'accession', 'match_IDs', 'ambiguous', 'is_multiallelic', 'match_flipped',
'duplicate_best_match', 'duplicate_ID', 'match_status'])
.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped',
'duplicate_best_match', 'duplicate_ID'])
.agg(pl.count())
.join(filter_summary, how='left', on='accession')
.pipe(_prettify_summary))
Expand All @@ -45,7 +45,7 @@ def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None:

def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic",
"duplicate_best_match", "duplicate_ID", "match_IDs", "count", "percent"]
"duplicate_best_match", "duplicate_ID", "count", "percent"]
return (df.with_column((pl.col("count") / pl.sum("count") * 100)
.over(["dataset", "accession"])
.alias("percent"))
Expand All @@ -55,7 +55,7 @@ def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight",
"effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic",
"ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", "match_IDs",
"ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID",
"match_status", "dataset"]
pretty_df = (df.select(keep_cols)
.select(pl.exclude("^.*_right"))
Expand Down
3 changes: 0 additions & 3 deletions pgscatalog_utils/match/match_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,6 @@ def _parse_args(args=None):
help="<Optional> Only match, then write intermediate files, don't make scoring files")
parser.add_argument('--min_overlap', dest='min_overlap', required=False,
type=float, help='<Optional> Minimum proportion of variants to match before error')
parser.add_argument('-IDs', '--filter_IDs', dest='filter',
help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
'[useful for limiting scoring files to variants present in multiple datasets]')
parser = add_match_args(parser) # params for labelling matches
parser.add_argument('--outdir', dest='outdir', required=True,
help='<Required> Output directory')
Expand Down

0 comments on commit 5bd8a2b

Please sign in to comment.