diff --git a/DESCRIPTION b/DESCRIPTION index a531936..45be2e5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.11.8 +Version: 1.11.9 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NEWS.md b/NEWS.md index c5b35ed..253e702 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +## CHANGES IN VERSION 1.11.9 + +### New features +* Can now control which columns are checked for missing data (`drop_na_cols` in +`format_sumstats()`). By default, the SNP, CHR, BP, A1, A2, effect and P/N +columns are checked. Set to `NULL` to check all columns or choose specific columns. + ## CHANGES IN VERSION 1.11.7 ### Bug fix diff --git a/R/check_miss_data.R b/R/check_miss_data.R index b24bf61..630a7d9 100644 --- a/R/check_miss_data.R +++ b/R/check_miss_data.R @@ -27,6 +27,17 @@ check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out, c(drop_na_cols)[drop_na_cols %in% names(sumstats_dt)] incl_cols <- c(drop_na_cols_in_sumstats)[!drop_na_cols_in_sumstats %in% ignore_cols] + if(length(incl_cols)<1){ + msg <- paste0( + "WARNING: None of the inputted columns:\n", + paste(drop_na_cols,collapse=" "),"\n", + "To be checked for missing data were found in the sumstats. Sumstats", + " columns:\n", + paste(names(sumstats_dt),collapse=" "),"\n", + "This check will not be run." + ) + message(msg) + } } else { incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols] } diff --git a/R/format_sumstats.R b/R/format_sumstats.R index 212e11f..e66c8a5 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -169,6 +169,12 @@ #' dropped? These can not be checked against a reference dataset and will have #' the same RS ID and position as SNPs which can affect downstream analysis. #' Default is False. +#' @param drop_na_cols A character vector of column names to be checked for +#' missing values. 
Rows with missing values in any of these columns (if present +#' in the dataset) will be dropped. If `NULL`, all columns will be checked for +#' missing values. Default columns are SNP, chromosome, position, allele 1, +#' allele 2, effect columns (frequency, beta, Z-score, standard error, log odds, +#' signed sumstats, odds ratio), p value and N columns. #' @param dbSNP version of dbSNP to be used for imputation (144 or 155). #' @param check_dups whether to check for duplicates - if formatting QTL #' datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE. @@ -221,11 +227,6 @@ #' give is incorrect you can supply your own mapping file. Must be a 2 column #' dataframe with column names "Uncorrected" and "Corrected". See #' data(sumstatsColHeaders) for default mapping and necessary format. -#' @param drop_na_cols A character vector of column names to be checked for missing values. -#' Rows with missing values in any of these columns (if present in the dataset) will be dropped. If `NULL`, -#' all columns will be checked for missing values. Default columns are SNP, -#' chromosome, position, allele 1, allele2, frequency, beta, standard error, p -#' value and N columns. 
#' #' @importFrom data.table fread #' @importFrom data.table fwrite @@ -272,6 +273,10 @@ format_sumstats <- function(path, frq_is_maf = TRUE, indels = TRUE, drop_indels = FALSE, + drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", + "FRQ", "BETA", "Z", "OR", + "LOG_ODDS", "SIGNED_SUMSTAT", "SE", + "P", "N"), dbSNP = 155, check_dups = TRUE, sort_coordinates = TRUE, @@ -289,7 +294,6 @@ format_sumstats <- function(path, imputation_ind = FALSE, force_new = FALSE, mapping_file = sumstatsColHeaders, - drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"), #deprecated parameters rmv_chrPrefix = NULL ) { diff --git a/README.Rmd b/README.Rmd index 24a5c07..e206337 100644 --- a/README.Rmd +++ b/README.Rmd @@ -142,5 +142,6 @@ development: * [Jonathan Griffiths](https://github.com/jonathangriffiths) * [Kitty Murphy](https://github.com/KittyMurphy) * [Mykhaylo Malakhov](https://github.com/MykMal) + * [Alasdair Warwick](https://github.com/rmgpanw) # References diff --git a/README.md b/README.md index 50ba30a..7549280 100644 --- a/README.md +++ b/README.md @@ -4,19 +4,19 @@ Authors: Alan Murphy, Brian Schilder and Nathan Skene
-Updated: Jan-15-2024 +Updated: Apr-24-2024
[![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats) -[![](https://img.shields.io/badge/devel%20version-1.11.3-black.svg)](https://github.com/neurogenomics/MungeSumstats) +[![](https://img.shields.io/badge/devel%20version-1.11.9-black.svg)](https://github.com/neurogenomics/MungeSumstats) [![R build status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions) [![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master) [![](https://codecov.io/gh/neurogenomics/MungeSumstats/branch/master/graph/badge.svg)](https://codecov.io/gh/neurogenomics/MungeSumstats) -[![](https://img.shields.io/badge/download-11379/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats) +[![](https://img.shields.io/badge/download-15314/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats) [![License: Artistic-2.0](https://img.shields.io/badge/license-Artistic--2.0-blue.svg)](https://cran.r-project.org/web/licenses/Artistic-2.0) [![](https://img.shields.io/badge/doi-https://doi.org/10.1093/bioinformatics/btab665-blue.svg)](https://doi.org/https://doi.org/10.1093/bioinformatics/btab665) @@ -150,6 +150,7 @@ We would like to acknowledge all those who have contributed to - [Jonathan Griffiths](https://github.com/jonathangriffiths) - [Kitty Murphy](https://github.com/KittyMurphy) - [Mykhaylo Malakhov](https://github.com/MykMal) +- [Alasdair Warwick](https://github.com/rmgpanw) # References diff --git a/man/check_miss_data.Rd b/man/check_miss_data.Rd index a0a39d3..5125c0c 100644 --- a/man/check_miss_data.Rd +++ b/man/check_miss_data.Rd @@ -11,7 +11,8 @@ check_miss_data( check_save_out, tabix_index, nThread, - log_files + log_files, + drop_na_cols ) } \arguments{ @@ -31,6 +32,13 @@ FALSE.} \item{nThread}{Number of threads to 
use for parallel processes.} \item{log_files}{list of log file locations} + +\item{drop_na_cols}{A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If \code{NULL}, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.} } \value{ list containing sumstats_dt, the modified summary statistics data diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd index 86dac05..b5eabe0 100644 --- a/man/format_sumstats.Rd +++ b/man/format_sumstats.Rd @@ -43,6 +43,8 @@ format_sumstats( frq_is_maf = TRUE, indels = TRUE, drop_indels = FALSE, + drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "Z", "OR", "LOG_ODDS", + "SIGNED_SUMSTAT", "SE", "P", "N"), dbSNP = 155, check_dups = TRUE, sort_coordinates = TRUE, @@ -236,6 +238,13 @@ dropped? These can not be checked against a reference dataset and will have the same RS ID and position as SNPs which can affect downstream analysis. Default is False.} +\item{drop_na_cols}{A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If \code{NULL}, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.} + \item{dbSNP}{version of dbSNP to be used for imputation (144 or 155).} \item{check_dups}{whether to check for duplicates - if formatting QTL diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd index 4083cd3..44a3b3f 100644 --- a/man/import_sumstats.Rd +++ b/man/import_sumstats.Rd @@ -193,6 +193,12 @@ value is TRUE. 
Default is TRUE.} dropped? These can not be checked against a reference dataset and will have the same RS ID and position as SNPs which can affect downstream analysis. Default is False.} + \item{\code{drop_na_cols}}{A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If \code{NULL}, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.} \item{\code{dbSNP}}{version of dbSNP to be used for imputation (144 or 155).} \item{\code{check_dups}}{whether to check for duplicates - if formatting QTL datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.} diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd index 91c2cc4..f6bcaac 100644 --- a/man/validate_parameters.Rd +++ b/man/validate_parameters.Rd @@ -48,6 +48,7 @@ validate_parameters( mapping_file, tabix_index, chain_source, + drop_na_cols, rmv_chrPrefix ) } @@ -236,6 +237,13 @@ data(sumstatsColHeaders) for default mapping and necessary format.} genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a license for commercial use. The Ensembl chain is used by default ("ensembl").} +\item{drop_na_cols}{A character vector of column names to be checked for +missing values. Rows with missing values in any of these columns (if present +in the dataset) will be dropped. If \code{NULL}, all columns will be checked for +missing values. Default columns are SNP, chromosome, position, allele 1, +allele2, effect columns (frequency, beta, Z-score, standard error, log odds, +signed sumstats, odds ratio), p value and N columns.} + \item{rmv_chrPrefix}{Is now deprecated, do. not use. 
Use chr_style instead - chr_style = 'Ensembl' will give the same result as rmv_chrPrefix=TRUE used to give.} diff --git a/tests/testthat/test-missing_data.R b/tests/testthat/test-missing_data.R index 771c159..f2c7002 100644 --- a/tests/testthat/test-missing_data.R +++ b/tests/testthat/test-missing_data.R @@ -59,13 +59,13 @@ test_that("Handle missing data", { dbSNP=144 ) reformatted_lines <- readLines(reformatted) - expect_equal(reformatted_lines, org_lines) + testthat::expect_equal(reformatted_lines, org_lines) # set `drop_na_cols` to `NULL` miss_extra_col <- miss miss_extra_col$extra <- NA - expect_error(MungeSumstats::format_sumstats( + testthat::expect_error(MungeSumstats::format_sumstats( miss_extra_col, ref_genome = "GRCh37", on_ref_genome = FALSE, @@ -87,7 +87,7 @@ test_that("Handle missing data", { allele_flip_check = FALSE, sort_coordinates = FALSE, dbSNP = 144, - drop_na_cols = c("CHR", "POS") + drop_na_cols = c("CHRA", "APOS") ) reformatted_extra_col_lines <- readLines(reformatted_extra_col) diff --git a/vignettes/MungeSumstats.Rmd b/vignettes/MungeSumstats.Rmd index c90e305..7d41868 100644 --- a/vignettes/MungeSumstats.Rmd +++ b/vignettes/MungeSumstats.Rmd @@ -406,6 +406,12 @@ conducted by *MungeSumstats* are: dropped? These can not be checked against a reference dataset and will have the same RS ID and position as SNPs which can affect downstream analysis. Default is False. +- **drop_na_cols** A character vector of column names to be checked for + missing values. Rows with missing values in any of these columns (if present + in the dataset) will be dropped. If `NULL`, all columns will be checked for + missing values. Default columns are SNP, chromosome, position, allele 1, + allele 2, effect columns (frequency, beta, Z-score, standard error, + log odds, signed sumstats, odds ratio), p value and N columns. - **dbSNP** The dbSNP version to use as a reference - defaults to the most recent version available (155). 
Note that with the 9x more SNPs in dbSNP 155 vs 144, run times will increase.