Skip to content

Commit

Permalink
Merge pull request #182 from rmgpanw/dev_check_missing_data_cols
Browse files Browse the repository at this point in the history
add `drop_na_cols` parameter to `format_sumstats()`
  • Loading branch information
Al-Murphy authored Apr 24, 2024
2 parents f7bd333 + b88a70d commit 588a898
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 3 deletions.
14 changes: 12 additions & 2 deletions R/check_miss_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#' @keywords internal
#' @importFrom stats complete.cases
check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out,
tabix_index, nThread, log_files) {
tabix_index, nThread, log_files,
drop_na_cols) {
message("Checking for missing data.")
col_headers <- names(sumstats_dt)
# use data table for speed
Expand All @@ -20,7 +21,16 @@ check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out,
col_headers[grepl("^convert_", col_headers)],
"SNP_INFO"["SNP_INFO"%in% col_headers]
)
incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols]

if (!is.null(drop_na_cols)) {
drop_na_cols_in_sumstats <-
c(drop_na_cols)[drop_na_cols %in% names(sumstats_dt)]
incl_cols <-
c(drop_na_cols_in_sumstats)[!drop_na_cols_in_sumstats %in% ignore_cols]
} else {
incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols]
}

if (nrow(sumstats_dt[!complete.cases(sumstats_dt[, incl_cols,
with = FALSE
]), ]) > 0) {
Expand Down
10 changes: 9 additions & 1 deletion R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,11 @@
#' give is incorrect you can supply your own mapping file. Must be a 2 column
#' dataframe with column names "Uncorrected" and "Corrected". See
#' data(sumstatsColHeaders) for default mapping and necessary format.
#' @param drop_na_cols A character vector of column names to be checked for
#' missing values. Rows with missing values in any of these columns (if present
#' in the dataset) will be dropped. If `NULL`, all columns will be checked for
#' missing values. Default columns are SNP, chromosome, position, allele 1,
#' allele 2, frequency, beta, standard error, p value and N columns.
#'
#' @importFrom data.table fread
#' @importFrom data.table fwrite
Expand Down Expand Up @@ -284,6 +289,7 @@ format_sumstats <- function(path,
imputation_ind = FALSE,
force_new = FALSE,
mapping_file = sumstatsColHeaders,
drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"),
#deprecated parameters
rmv_chrPrefix = NULL
) {
Expand Down Expand Up @@ -367,6 +373,7 @@ format_sumstats <- function(path,
mapping_file = mapping_file,
tabix_index = tabix_index,
chain_source = chain_source,
drop_na_cols = drop_na_cols,
#deprecated parameters
rmv_chrPrefix = rmv_chrPrefix
)
Expand Down Expand Up @@ -773,7 +780,8 @@ format_sumstats <- function(path,
check_save_out = check_save_out,
tabix_index = tabix_index,
nThread = nThread,
log_files = log_files
log_files = log_files,
drop_na_cols = drop_na_cols
)
# update values
log_files <- sumstats_return$log_files
Expand Down
10 changes: 10 additions & 0 deletions R/validate_parameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ validate_parameters <- function(path,
mapping_file,
tabix_index,
chain_source,
drop_na_cols,
#deprecated parameters
rmv_chrPrefix) {
# Checking if the file exists should happen first -
Expand Down Expand Up @@ -406,6 +407,15 @@ validate_parameters <- function(path,
stop(tbx_msg)
}

# validate drop_na_cols
if (!is.character(drop_na_cols)) {
if (!is.null(drop_na_cols)) {
stop(
"Parameter `drop_na_cols` should be either a character vector of column names, or `NULL`"
)
}
}

#deprecated parameters
if (!is.null(rmv_chrPrefix)) {
dep_msg <- paste0(
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/test-missing_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,38 @@ test_that("Handle missing data", {
)
reformatted_lines <- readLines(reformatted)
expect_equal(reformatted_lines, org_lines)

# set `drop_na_cols` to `NULL`
miss_extra_col <- miss
miss_extra_col$extra <- NA

expect_error(MungeSumstats::format_sumstats(
miss_extra_col,
ref_genome = "GRCh37",
on_ref_genome = FALSE,
strand_ambig_filter = FALSE,
bi_allelic_filter = FALSE,
allele_flip_check = FALSE,
sort_coordinates = FALSE,
dbSNP = 144,
drop_na_cols = NULL
),
regexp = "All SNPs have been filtered out of your summary statistics dataset")

reformatted_extra_col <- MungeSumstats::format_sumstats(
miss_extra_col,
ref_genome = "GRCh37",
on_ref_genome = FALSE,
strand_ambig_filter = FALSE,
bi_allelic_filter = FALSE,
allele_flip_check = FALSE,
sort_coordinates = FALSE,
dbSNP = 144,
drop_na_cols = c("CHR", "POS")
)

reformatted_extra_col_lines <- readLines(reformatted_extra_col)
expect_equal(length(reformatted_extra_col_lines), length(org_lines))
}
else{
expect_equal(is_32bit_windows, TRUE)
Expand Down

0 comments on commit 588a898

Please sign in to comment.