Skip to content

Commit

Permalink
bug fix: Infer effect column
Browse files Browse the repository at this point in the history
  • Loading branch information
Al-Murphy committed Aug 1, 2024
1 parent c79a47a commit e1d6289
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 23 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.13.2
Version: 1.13.3
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## CHANGES IN VERSION 1.13.3

### Bug fix
* Bug fix for check 3 in infer effect column - previously A1 & A2 were swapped
when there were more matches for the ref genome in A1 rather than A2 which was
incorrect. Corrected now so it will only be flipped when A2 has more matches to
the reference genome.

## CHANGES IN VERSION 1.13.2

### New features
Expand Down
1 change: 1 addition & 0 deletions R/check_allele_flip.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ check_allele_flip <- function(sumstats_dt, path,
sumstats_dt[is.na(ref_gen_allele), match_type := TRUE]
sumstats_dt[A1 == ref_gen_allele, match_type := TRUE]
sumstats_dt[A2 == ref_gen_allele, match_type := FALSE]
print(sumstats_dt)
# drop cases that don't match either
if (allele_flip_drop &&
nrow(sumstats_dt[A1 != ref_gen_allele &
Expand Down
2 changes: 1 addition & 1 deletion R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ format_sumstats <- function(path,
#### Check 40:Check for log10 p instead of p ####
sumstats_return <-
read_log_pval(sumstats_dt = sumstats_return$sumstats_dt)

#### Check 2:Check for effect direction ####
sumstats_return <-
infer_effect_column(
Expand Down
6 changes: 4 additions & 2 deletions R/get_genome_build.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ get_genome_build <- function(sumstats,
on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"),
nomatch = FALSE
])

if (num_37 > num_38) {
ref_gen_num <- num_37
ref_genome <- "GRCH37"
Expand Down Expand Up @@ -221,11 +222,12 @@ get_genome_build <- function(sumstats,
"ref_allele"="A2","alt_alleles"="A1"),
nomatch = FALSE
])
if(num_a2>=num_a1){

if(num_a1>=num_a2){
message("Effect/frq column(s) relate to A2 in the inputted sumstats")
#this is what MSS expects so no action required
switch_req <- FALSE
}else{#num_a2<num_a1
}else{#num_a1<num_a2
message("Effect/frq column(s) relate to A1 in the inputted sumstats")
switch_req <- TRUE
}
Expand Down
30 changes: 16 additions & 14 deletions R/infer_effect_column.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' mentioned, if found then we know the direction and should update A1/A2
#' naming so A2 is the effect column. We can look for such columns by getting
#' every combination of A1/A2 naming and effect/frq naming.
#' 3. If note found in 2, a final check should be against the reference genome,
#' 3. If not found in 2, a final check should be against the reference genome,
#' whichever of A1 and A2 has more of a match with the reference genome should
#' be taken as **not** the effect allele. There is an assumption in this but is
#' still better than guessing the ambiguous allele naming.
Expand Down Expand Up @@ -49,13 +49,14 @@ infer_effect_column <-
# vs those that are interpretable
colnames(mapping_file) <- toupper(colnames(mapping_file))
allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED),]
ambig_allele_map <-
allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED),]
unambig_allele_map <-
allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED)),]
#as long as the sumstats contains 1 unambiguous allele column MSS will work
#as expected
#as long as the sumstats contains 1 unambiguous allele column MSS will
#work as expected
unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
toupper(column_headers))
ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
Expand All @@ -81,21 +82,22 @@ infer_effect_column <-
message("Renaming ambiguous allele columns so they won't be used")
#get the related ambig naming and change there name so won't be used
ambig_uncorrcted_rnme <-
ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme,
]$UNCORRECTED
ambig_allele_map[ambig_allele_map$CORRECTED %in%
ambig_corrcted_rnme,]$UNCORRECTED
#now rename any matches in sumstats
chng_nmes <- column_headers[toupper(column_headers) %in%
ambig_uncorrcted_rnme]
for(chng_i in chng_nmes){
data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED"))
data.table::setnames(sumstats_dt, chng_i,
paste0(chng_i,"_INPUTTED"))
}
}
} else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
#only continue if no unambiguous columns found but 2 ambig ones are found-
#less than 2 in total means allele info is missing which MSS can try fill
#in later
message("Allele columns are ambiguous, attempting to infer direction")
#get names for allele mared eff/frq columns
#get names for allele marked eff/frq columns
eff_frq_allele_matches <- get_eff_frq_allele_combns()
#now look for matches in sumstats
fnd_allele_indicator <-
Expand All @@ -107,10 +109,10 @@ infer_effect_column <-
a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
if(a2_mtch>=a1_mtch){
message("Effect/frq column(s) relate to A2 in the inputted sumstats")
message("Effect/frq column(s) relate to A2 in the sumstats")
#this is what MSS expects so no action required
}else{#a2_mtch<a1_mtch
message("Effect/frq column(s) relate to A1 in the inputted sumstats")
message("Effect/frq column(s) relate to A1 in the sumstats")
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
Expand Down Expand Up @@ -146,9 +148,9 @@ infer_effect_column <-
ref_genome = ref_genome
)
if(is.logical(switch_req)){
message(paste0("Found direction from matchine reference genome - N",
"OTE this assumes non-effect allele will macth the ",
"reference genome"))
message(paste0("Found direction from matching reference genome -",
" NOTE this assumes non-effect allele will match ",
"the reference genome"))
if(isTRUE(switch_req)){
#swap A1 and A2
#this is the opposite to what MSS expects so switch A1/A2 naming
Expand Down
8 changes: 4 additions & 4 deletions longtests/testthat/test-infer_effect_column.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ test_that("Test infer effect column function works", {
)


#finally check if the ref genome can be used to infer rather than being told in
#eff/frq cols - just subset for speed
#finally check if the ref genome can be used to infer rather than being
#told in eff/frq cols - just subset for speed
snps <- c("rs11210860","rs34305371","rs1008078","rs11588857","rs1777827",
"rs76076331","rs2457660","rs10496091","rs4851251","rs12987662",
"rs10930008","rs301800","rs2568955","rs61787263","rs2992632",
"rs11689269","rs11690172")
d <- copy(b)
data.table::setnames(d,"BETA1","BETA")
d_for <- MungeSumstats::format_sumstats(d[SNP %in% snps,], return_data = TRUE,
d_for <- MungeSumstats::format_sumstats(d[!SNP %in% snps,], return_data = TRUE,
on_ref_genome = TRUE,
#all just make MSS run faster
ref_genome = 'GRCh37',
Expand All @@ -82,7 +82,7 @@ test_that("Test infer effect column function works", {
data.table::setkey(b_renamed_for,"SNP")
data.table::setkey(d_for,"SNP")
testthat::expect_equal(
all.equal(b_renamed_for[SNP %in% snps], d_for,
all.equal(b_renamed_for[!SNP %in% snps], d_for,
ignore.col.order = TRUE),
TRUE
)
Expand Down
2 changes: 1 addition & 1 deletion man/infer_effect_column.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e1d6289

Please sign in to comment.