diff --git a/DESCRIPTION b/DESCRIPTION
index a531936..45be2e5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
-Version: 1.11.8
+Version: 1.11.9
Authors@R:
c(person(given = "Alan",
family = "Murphy",
diff --git a/NEWS.md b/NEWS.md
index c5b35ed..253e702 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+## CHANGES IN VERSION 1.11.9
+
+### New features
+* Can now control which columns are checked for missing data (`drop_na_cols` in
+`format_sumstats()`). By default, the SNP, CHR, BP, A1 and A2 columns, the
+effect columns (FRQ, BETA, Z, OR, LOG_ODDS, SIGNED_SUMSTAT, SE) and the P and N
+columns are checked. Set to `NULL` to check all columns, or supply specific
+column names.
+
## CHANGES IN VERSION 1.11.7
### Bug fix
diff --git a/R/check_miss_data.R b/R/check_miss_data.R
index b24bf61..630a7d9 100644
--- a/R/check_miss_data.R
+++ b/R/check_miss_data.R
@@ -27,6 +27,17 @@ check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out,
c(drop_na_cols)[drop_na_cols %in% names(sumstats_dt)]
incl_cols <-
c(drop_na_cols_in_sumstats)[!drop_na_cols_in_sumstats %in% ignore_cols]
+ if(length(incl_cols)<1){
+ msg <- paste0(
+ "WARNING: None of the inputted columns:\n",
+ paste(drop_na_cols,collapse=" "),"\n",
+ "To be checked for missing data were found in the sumstats. Sumstats",
+ " columns:\n",
+ paste(names(sumstats_dt),collapse=" "),"\n",
+ "This check will not be run."
+ )
+ message(msg)
+ }
} else {
incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols]
}
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index 212e11f..e66c8a5 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -169,6 +169,12 @@
#' dropped? These can not be checked against a reference dataset and will have
#' the same RS ID and position as SNPs which can affect downstream analysis.
#' Default is False.
+#' @param drop_na_cols A character vector of column names to be checked for
+#' missing values. Rows with missing values in any of these columns (if present
+#' in the dataset) will be dropped. If `NULL`, all columns will be checked for
+#' missing values. Default columns are SNP, chromosome, position, allele 1,
+#' allele 2, effect columns (frequency, beta, Z-score, standard error, log odds,
+#' signed sumstats, odds ratio), p value and N columns.
#' @param dbSNP version of dbSNP to be used for imputation (144 or 155).
#' @param check_dups whether to check for duplicates - if formatting QTL
#' datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.
@@ -221,11 +227,6 @@
#' give is incorrect you can supply your own mapping file. Must be a 2 column
#' dataframe with column names "Uncorrected" and "Corrected". See
#' data(sumstatsColHeaders) for default mapping and necessary format.
-#' @param drop_na_cols A character vector of column names to be checked for missing values.
-#' Rows with missing values in any of these columns (if present in the dataset) will be dropped. If `NULL`,
-#' all columns will be checked for missing values. Default columns are SNP,
-#' chromosome, position, allele 1, allele2, frequency, beta, standard error, p
-#' value and N columns.
#'
#' @importFrom data.table fread
#' @importFrom data.table fwrite
@@ -272,6 +273,10 @@ format_sumstats <- function(path,
frq_is_maf = TRUE,
indels = TRUE,
drop_indels = FALSE,
+ drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2",
+ "FRQ", "BETA", "Z", "OR",
+ "LOG_ODDS", "SIGNED_SUMSTAT", "SE",
+ "P", "N"),
dbSNP = 155,
check_dups = TRUE,
sort_coordinates = TRUE,
@@ -289,7 +294,6 @@ format_sumstats <- function(path,
imputation_ind = FALSE,
force_new = FALSE,
mapping_file = sumstatsColHeaders,
- drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"),
#deprecated parameters
rmv_chrPrefix = NULL
) {
diff --git a/README.Rmd b/README.Rmd
index 24a5c07..e206337 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -142,5 +142,6 @@ development:
* [Jonathan Griffiths](https://github.com/jonathangriffiths)
* [Kitty Murphy](https://github.com/KittyMurphy)
* [Mykhaylo Malakhov](https://github.com/MykMal)
+ * [Alasdair Warwick](https://github.com/rmgpanw)
# References
diff --git a/README.md b/README.md
index 50ba30a..7549280 100644
--- a/README.md
+++ b/README.md
@@ -4,19 +4,19 @@
Authors: Alan Murphy, Brian Schilder and Nathan Skene
-Updated: Jan-15-2024
+Updated: Apr-24-2024
[![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats)
-[![](https://img.shields.io/badge/devel%20version-1.11.3-black.svg)](https://github.com/neurogenomics/MungeSumstats)
+[![](https://img.shields.io/badge/devel%20version-1.11.9-black.svg)](https://github.com/neurogenomics/MungeSumstats)
[![R build
status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions)
[![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master)
[![](https://codecov.io/gh/neurogenomics/MungeSumstats/branch/master/graph/badge.svg)](https://codecov.io/gh/neurogenomics/MungeSumstats)
-[![](https://img.shields.io/badge/download-11379/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
+[![](https://img.shields.io/badge/download-15314/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
[![License:
Artistic-2.0](https://img.shields.io/badge/license-Artistic--2.0-blue.svg)](https://cran.r-project.org/web/licenses/Artistic-2.0)
[![](https://img.shields.io/badge/doi-https://doi.org/10.1093/bioinformatics/btab665-blue.svg)](https://doi.org/https://doi.org/10.1093/bioinformatics/btab665)
@@ -150,6 +150,7 @@ We would like to acknowledge all those who have contributed to
- [Jonathan Griffiths](https://github.com/jonathangriffiths)
- [Kitty Murphy](https://github.com/KittyMurphy)
- [Mykhaylo Malakhov](https://github.com/MykMal)
+- [Alasdair Warwick](https://github.com/rmgpanw)
# References
diff --git a/man/check_miss_data.Rd b/man/check_miss_data.Rd
index a0a39d3..5125c0c 100644
--- a/man/check_miss_data.Rd
+++ b/man/check_miss_data.Rd
@@ -11,7 +11,8 @@ check_miss_data(
check_save_out,
tabix_index,
nThread,
- log_files
+ log_files,
+ drop_na_cols
)
}
\arguments{
@@ -31,6 +32,13 @@ FALSE.}
\item{nThread}{Number of threads to use for parallel processes.}
\item{log_files}{list of log file locations}
+
+\item{drop_na_cols}{A character vector of column names to be checked for
+missing values. Rows with missing values in any of these columns (if present
+in the dataset) will be dropped. If \code{NULL}, all columns will be checked for
+missing values. Default columns are SNP, chromosome, position, allele 1,
+allele2, effect columns (frequency, beta, Z-score, standard error, log odds,
+signed sumstats, odds ratio), p value and N columns.}
}
\value{
list containing sumstats_dt, the modified summary statistics data
diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd
index 86dac05..b5eabe0 100644
--- a/man/format_sumstats.Rd
+++ b/man/format_sumstats.Rd
@@ -43,6 +43,8 @@ format_sumstats(
frq_is_maf = TRUE,
indels = TRUE,
drop_indels = FALSE,
+ drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "Z", "OR", "LOG_ODDS",
+ "SIGNED_SUMSTAT", "SE", "P", "N"),
dbSNP = 155,
check_dups = TRUE,
sort_coordinates = TRUE,
@@ -236,6 +238,13 @@ dropped? These can not be checked against a reference dataset and will have
the same RS ID and position as SNPs which can affect downstream analysis.
Default is False.}
+\item{drop_na_cols}{A character vector of column names to be checked for
+missing values. Rows with missing values in any of these columns (if present
+in the dataset) will be dropped. If \code{NULL}, all columns will be checked for
+missing values. Default columns are SNP, chromosome, position, allele 1,
+allele2, effect columns (frequency, beta, Z-score, standard error, log odds,
+signed sumstats, odds ratio), p value and N columns.}
+
\item{dbSNP}{version of dbSNP to be used for imputation (144 or 155).}
\item{check_dups}{whether to check for duplicates - if formatting QTL
diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd
index 4083cd3..44a3b3f 100644
--- a/man/import_sumstats.Rd
+++ b/man/import_sumstats.Rd
@@ -193,6 +193,12 @@ value is TRUE. Default is TRUE.}
dropped? These can not be checked against a reference dataset and will have
the same RS ID and position as SNPs which can affect downstream analysis.
Default is False.}
+ \item{\code{drop_na_cols}}{A character vector of column names to be checked for
+missing values. Rows with missing values in any of these columns (if present
+in the dataset) will be dropped. If \code{NULL}, all columns will be checked for
+missing values. Default columns are SNP, chromosome, position, allele 1,
+allele2, effect columns (frequency, beta, Z-score, standard error, log odds,
+signed sumstats, odds ratio), p value and N columns.}
\item{\code{dbSNP}}{version of dbSNP to be used for imputation (144 or 155).}
\item{\code{check_dups}}{whether to check for duplicates - if formatting QTL
datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.}
diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd
index 91c2cc4..f6bcaac 100644
--- a/man/validate_parameters.Rd
+++ b/man/validate_parameters.Rd
@@ -48,6 +48,7 @@ validate_parameters(
mapping_file,
tabix_index,
chain_source,
+ drop_na_cols,
rmv_chrPrefix
)
}
@@ -236,6 +237,13 @@ data(sumstatsColHeaders) for default mapping and necessary format.}
genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a
license for commercial use. The Ensembl chain is used by default ("ensembl").}
+\item{drop_na_cols}{A character vector of column names to be checked for
+missing values. Rows with missing values in any of these columns (if present
+in the dataset) will be dropped. If \code{NULL}, all columns will be checked for
+missing values. Default columns are SNP, chromosome, position, allele 1,
+allele2, effect columns (frequency, beta, Z-score, standard error, log odds,
+signed sumstats, odds ratio), p value and N columns.}
+
\item{rmv_chrPrefix}{Is now deprecated, do. not use. Use chr_style instead -
chr_style = 'Ensembl' will give the same result as rmv_chrPrefix=TRUE used to
give.}
diff --git a/tests/testthat/test-missing_data.R b/tests/testthat/test-missing_data.R
index 771c159..f2c7002 100644
--- a/tests/testthat/test-missing_data.R
+++ b/tests/testthat/test-missing_data.R
@@ -59,13 +59,13 @@ test_that("Handle missing data", {
dbSNP=144
)
reformatted_lines <- readLines(reformatted)
- expect_equal(reformatted_lines, org_lines)
+ testthat::expect_equal(reformatted_lines, org_lines)
# set `drop_na_cols` to `NULL`
miss_extra_col <- miss
miss_extra_col$extra <- NA
- expect_error(MungeSumstats::format_sumstats(
+ testthat::expect_error(MungeSumstats::format_sumstats(
miss_extra_col,
ref_genome = "GRCh37",
on_ref_genome = FALSE,
@@ -87,7 +87,7 @@ test_that("Handle missing data", {
allele_flip_check = FALSE,
sort_coordinates = FALSE,
dbSNP = 144,
- drop_na_cols = c("CHR", "POS")
+ drop_na_cols = c("CHRA", "APOS")
)
reformatted_extra_col_lines <- readLines(reformatted_extra_col)
diff --git a/vignettes/MungeSumstats.Rmd b/vignettes/MungeSumstats.Rmd
index c90e305..7d41868 100644
--- a/vignettes/MungeSumstats.Rmd
+++ b/vignettes/MungeSumstats.Rmd
@@ -406,6 +406,12 @@ conducted by *MungeSumstats* are:
dropped? These can not be checked against a reference dataset and will have
the same RS ID and position as SNPs which can affect downstream analysis.
Default is False.
+- **drop_na_cols** A character vector of column names to be checked for
+ missing values. Rows with missing values in any of these columns (if present
+ in the dataset) will be dropped. If `NULL`, all columns will be checked for
+ missing values. Default columns are SNP, chromosome, position, allele 1,
+ allele 2, effect columns (frequency, beta, Z-score, standard error,
+ log odds, signed sumstats, odds ratio), p value and N columns.
- **dbSNP** The dbSNP version to use as a reference - defaults to the most
recent version available (155). Note that with the 9x more SNPs in dbSNP
155 vs 144, run times will increase.