Skip to content

Commit

Permalink
Revert "Allow matches to be filtered to a set of variant IDs (e.g. ones that are shared across datasets)"

Browse files Browse the repository at this point in the history

This reverts commit 8f4f1b9.
  • Loading branch information
smlmbrt committed Jan 6, 2023
1 parent 8f4f1b9 commit 5bd8a2b
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 36 deletions.
3 changes: 0 additions & 3 deletions pgscatalog_utils/match/combine_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,6 @@ def _parse_args(args=None):
help='<Required> List of match files')
parser.add_argument('--min_overlap', dest='min_overlap', required=True,
type=float, help='<Required> Minimum proportion of variants to match before error')
parser.add_argument('-IDs', '--filter_IDs', dest='filter',
help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
'[useful for limiting scoring files to variants present in multiple datasets]')
parser = add_match_args(parser) # params for labelling matches
parser.add_argument('--outdir', dest='outdir', required=True,
help='<Required> Output directory')
Expand Down
28 changes: 2 additions & 26 deletions pgscatalog_utils/match/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,10 @@

def make_params_dict(args) -> dict[str, bool]:
""" Make a dictionary with parameters that control labelling match candidates """
filter_IDs = []
if args.filter:
logger.debug("Reading filter file (variant IDs)")
with open(args.filter, 'r') as f:
filter_IDs = [line.strip() for line in f]

return {'keep_first_match': args.keep_first_match,
'remove_ambiguous': args.remove_ambiguous,
'skip_flip': args.skip_flip,
'remove_multiallelic': args.remove_multiallelic,
'filter_IDs': filter_IDs}
'remove_multiallelic': args.remove_multiallelic}


def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
Expand All @@ -30,15 +23,14 @@ def label_matches(df: pl.LazyFrame, params: dict[str, bool]) -> pl.LazyFrame:
- duplicate: True if more than one best match exists for the same accession and ID
- ambiguous: True if ambiguous
"""
assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip', 'filter_IDs'}
assert set(params.keys()) == {'keep_first_match', 'remove_ambiguous', 'remove_multiallelic', 'skip_flip'}
labelled = (df.with_column(pl.lit(False).alias('exclude')) # set up dummy exclude column for _label_*
.pipe(_label_best_match)
.pipe(_label_duplicate_best_match)
.pipe(_label_duplicate_id, params['keep_first_match'])
.pipe(_label_biallelic_ambiguous, params['remove_ambiguous'])
.pipe(_label_multiallelic, params['remove_multiallelic'])
.pipe(_label_flips, params['skip_flip'])
.pipe(_label_filter, params['filter_IDs'])
.with_column(pl.lit(True).alias('match_candidate')))

return _encode_match_priority(labelled)
Expand Down Expand Up @@ -223,19 +215,3 @@ def _label_flips(df: pl.LazyFrame, skip_flip: bool) -> pl.LazyFrame:
else:
logger.debug("Not excluding flipped matches")
return df


def _label_filter(df: pl.LazyFrame, filter_IDs: list) -> pl.LazyFrame:
    """ Label variants based on an optional allow-list of variant IDs.

    Adds a 'match_IDs' column and folds failures into the shared 'exclude'
    column consumed by the rest of the labelling pipeline.
    """
    nIDs = len(filter_IDs)
    if nIDs > 0:
        logger.debug("Excluding variants that are not in ID list (read {} IDs)".format(nIDs))
        # match_IDs: True iff the variant's ID appears in the allow-list
        df = df.with_column(pl.when(pl.col('ID').is_in(filter_IDs))
                            .then(pl.lit(True))
                            .otherwise(pl.lit(False))
                            .alias('match_IDs'))
        # Mark variants outside the allow-list as excluded; rows already
        # flagged by earlier _label_* steps keep their existing 'exclude' value
        return df.with_column(pl.when(pl.col('match_IDs') == False)
                              .then(True)
                              .otherwise(pl.col('exclude'))
                              .alias('exclude'))
    else:
        # No filter supplied: still create 'match_IDs', but as the string 'NA'
        # rather than a boolean — presumably so downstream logs/groupbys can
        # distinguish "no filter applied" from True/False (NOTE(review):
        # mixed column types; confirm downstream consumers expect this)
        return df.with_column((pl.lit('NA')).alias('match_IDs'))
8 changes: 4 additions & 4 deletions pgscatalog_utils/match/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def make_summary_log(match_candidates: pl.LazyFrame, scorefile: pl.LazyFrame, fi
.select(pl.exclude("^.*_right$"))
.with_columns([pl.col('match_status').fill_null(value='unmatched'),
pl.lit(dataset).alias('dataset')]) # fill in unmatched variants
.groupby(['dataset', 'accession', 'match_IDs', 'ambiguous', 'is_multiallelic', 'match_flipped',
'duplicate_best_match', 'duplicate_ID', 'match_status'])
.groupby(['dataset', 'accession', 'match_status', 'ambiguous', 'is_multiallelic', 'match_flipped',
'duplicate_best_match', 'duplicate_ID'])
.agg(pl.count())
.join(filter_summary, how='left', on='accession')
.pipe(_prettify_summary))
Expand All @@ -45,7 +45,7 @@ def check_log_count(scorefile: pl.LazyFrame, summary_log: pl.LazyFrame) -> None:

def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
keep_cols = ["dataset", "accession", "score_pass", "match_status", "ambiguous", "is_multiallelic",
"duplicate_best_match", "duplicate_ID", "match_IDs", "count", "percent"]
"duplicate_best_match", "duplicate_ID", "count", "percent"]
return (df.with_column((pl.col("count") / pl.sum("count") * 100)
.over(["dataset", "accession"])
.alias("percent"))
Expand All @@ -55,7 +55,7 @@ def _prettify_summary(df: pl.LazyFrame) -> pl.LazyFrame:
def _prettify_log(df: pl.LazyFrame) -> pl.LazyFrame:
keep_cols = ["row_nr", "accession", "chr_name", "chr_position", "effect_allele", "other_allele", "effect_weight",
"effect_type", "ID", "REF", "ALT", "matched_effect_allele", "match_type", "is_multiallelic",
"ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID", "match_IDs",
"ambiguous", "match_flipped", "best_match", "exclude", "duplicate_best_match", "duplicate_ID",
"match_status", "dataset"]
pretty_df = (df.select(keep_cols)
.select(pl.exclude("^.*_right"))
Expand Down
3 changes: 0 additions & 3 deletions pgscatalog_utils/match/match_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,6 @@ def _parse_args(args=None):
help="<Optional> Only match, then write intermediate files, don't make scoring files")
parser.add_argument('--min_overlap', dest='min_overlap', required=False,
type=float, help='<Optional> Minimum proportion of variants to match before error')
parser.add_argument('-IDs', '--filter_IDs', dest='filter',
help='<Optional> Path to file containing list of variant IDs that can be included in the final scorefile.'
'[useful for limiting scoring files to variants present in multiple datasets]')
parser = add_match_args(parser) # params for labelling matches
parser.add_argument('--outdir', dest='outdir', required=True,
help='<Required> Output directory')
Expand Down

0 comments on commit 5bd8a2b

Please sign in to comment.