drop miss dat select cols

Al-Murphy · Apr 24, 2024 · bf83eaa · bf83eaa
1 parent 588a898
commit bf83eaa
Show file tree

Hide file tree

Showing 12 changed files with 75 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.11.8
+Version: 1.11.9
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+## CHANGES IN VERSION 1.11.9
+
+### New features
+* Can now control what columns are checked for missing data (`drop_na_cols` in
+`format_sumstats()`). By default, SNP, effect columns and P/N columns are 
+checked. Set to `NULL` to check all columns or choose specific columns.
+
 ## CHANGES IN VERSION 1.11.7
 
 ### Bug fix

diff --git a/R/check_miss_data.R b/R/check_miss_data.R
@@ -27,6 +27,17 @@ check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out,
         c(drop_na_cols)[drop_na_cols %in% names(sumstats_dt)]
       incl_cols <-
         c(drop_na_cols_in_sumstats)[!drop_na_cols_in_sumstats %in% ignore_cols]
+      if(length(incl_cols)<1){
+        msg <- paste0(
+          "WARNING: None of the inputted columns:\n",
+          paste(drop_na_cols,collapse=" "),"\n",
+          "To be checked for missing data were found in the sumstats. Sumstats",
+          " columns:\n",
+          paste(names(sumstats_dt),collapse=" "),"\n",
+          "This check will not be run."
+        )
+        message(msg)
+      }
     } else {
       incl_cols <- names(sumstats_dt)[!names(sumstats_dt) %in% ignore_cols]  
     }

diff --git a/R/format_sumstats.R b/R/format_sumstats.R
@@ -169,6 +169,12 @@
 #' dropped? These can not be checked against a reference dataset and will have
 #' the same RS ID and position as SNPs which can affect downstream analysis.
 #' Default is False.
+#' @param drop_na_cols A character vector of column names to be checked for 
+#' missing values. Rows with missing values in any of these columns (if present 
+#' in the dataset) will be dropped. If `NULL`, all columns will be checked for 
+#' missing values. Default columns are SNP, chromosome, position, allele 1, 
+#' allele 2, effect columns (frequency, beta, Z-score, standard error, log odds,
+#' signed sumstats, odds ratio), p value and N columns. 
 #' @param dbSNP version of dbSNP to be used for imputation (144 or 155).
 #' @param check_dups whether to check for duplicates - if formatting QTL
 #' datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.
@@ -221,11 +227,6 @@
 #' give is incorrect you can supply your own mapping file. Must be a 2 column
 #' dataframe with column names "Uncorrected" and "Corrected". See
 #' data(sumstatsColHeaders) for default mapping and necessary format.
-#' @param drop_na_cols A character vector of column names to be checked for missing values. 
-#' Rows with missing values in any of these columns (if present in the dataset) will be dropped. If `NULL`, 
-#' all columns will be checked for missing values. Default columns are SNP, 
-#' chromosome, position, allele 1, allele2, frequency, beta, standard error, p 
-#' value and N columns.
 #'
 #' @importFrom data.table fread
 #' @importFrom data.table fwrite
@@ -272,6 +273,10 @@ format_sumstats <- function(path,
                             frq_is_maf = TRUE,
                             indels = TRUE,
                             drop_indels  = FALSE,
+                            drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", 
+                                             "FRQ", "BETA", "Z", "OR", 
+                                             "LOG_ODDS", "SIGNED_SUMSTAT", "SE", 
+                                             "P", "N"),
                             dbSNP = 155,
                             check_dups = TRUE,
                             sort_coordinates = TRUE,
@@ -289,7 +294,6 @@ format_sumstats <- function(path,
                             imputation_ind = FALSE,
                             force_new = FALSE,
                             mapping_file = sumstatsColHeaders,
-                            drop_na_cols = c("SNP", "CHR", "BP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"),
                             #deprecated parameters
                             rmv_chrPrefix = NULL
                             ) {

diff --git a/README.Rmd b/README.Rmd
@@ -142,5 +142,6 @@ development:
  * [Jonathan Griffiths](https://github.com/jonathangriffiths)
  * [Kitty Murphy](https://github.com/KittyMurphy)
  * [Mykhaylo Malakhov](https://github.com/MykMal)
+ * [Alasdair Warwick](https://github.com/rmgpanw)
 
 # References
diff --git a/README.md b/README.md
@@ -4,19 +4,19 @@
 <i>Authors</i>: Alan Murphy, Brian Schilder and Nathan Skene  
 </h5>
 <h5>  
-<i>Updated</i>: Jan-15-2024  
+<i>Updated</i>: Apr-24-2024  
 </h5>
 
 <!-- Readme.md is generated from Readme.Rmd. Please edit that file -->
 <!-- badges: start -->
 
 [![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats)
-[![](https://img.shields.io/badge/devel%20version-1.11.3-black.svg)](https://github.com/neurogenomics/MungeSumstats)
+[![](https://img.shields.io/badge/devel%20version-1.11.9-black.svg)](https://github.com/neurogenomics/MungeSumstats)
 [![R build
 status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions)
 [![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master)
 [![](https://codecov.io/gh/neurogenomics/MungeSumstats/branch/master/graph/badge.svg)](https://codecov.io/gh/neurogenomics/MungeSumstats)
-[![](https://img.shields.io/badge/download-11379/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
+[![](https://img.shields.io/badge/download-15314/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
 [![License:
 Artistic-2.0](https://img.shields.io/badge/license-Artistic--2.0-blue.svg)](https://cran.r-project.org/web/licenses/Artistic-2.0)
 [![](https://img.shields.io/badge/doi-https://doi.org/10.1093/bioinformatics/btab665-blue.svg)](https://doi.org/https://doi.org/10.1093/bioinformatics/btab665)
@@ -150,6 +150,7 @@ We would like to acknowledge all those who have contributed to
 - [Jonathan Griffiths](https://github.com/jonathangriffiths)
 - [Kitty Murphy](https://github.com/KittyMurphy)
 - [Mykhaylo Malakhov](https://github.com/MykMal)
+- [Alasdair Warwick](https://github.com/rmgpanw)
 
 # References
 

diff --git a/man/check_miss_data.Rd b/man/check_miss_data.Rd
diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd
diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd
diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd
diff --git a/tests/testthat/test-missing_data.R b/tests/testthat/test-missing_data.R
@@ -59,13 +59,13 @@ test_that("Handle missing data", {
                                             dbSNP=144
         )
         reformatted_lines <- readLines(reformatted)
-        expect_equal(reformatted_lines, org_lines)
+        testthat::expect_equal(reformatted_lines, org_lines)
 
         # set `drop_na_cols` to `NULL`
         miss_extra_col <- miss
         miss_extra_col$extra <- NA
 
-        expect_error(MungeSumstats::format_sumstats(
+        testthat::expect_error(MungeSumstats::format_sumstats(
           miss_extra_col,
           ref_genome = "GRCh37",
           on_ref_genome = FALSE,
@@ -87,7 +87,7 @@ test_that("Handle missing data", {
           allele_flip_check = FALSE,
           sort_coordinates = FALSE,
           dbSNP = 144, 
-          drop_na_cols = c("CHR", "POS")
+          drop_na_cols = c("CHRA", "APOS")
         )
 
         reformatted_extra_col_lines <- readLines(reformatted_extra_col)

diff --git a/vignettes/MungeSumstats.Rmd b/vignettes/MungeSumstats.Rmd
@@ -406,6 +406,12 @@ conducted by *MungeSumstats* are:
     dropped? These can not be checked against a reference dataset and will have 
     the same RS ID and position as SNPs which can affect downstream analysis. 
     Default is False.    
+-   **drop_na_cols** A character vector of column names to be checked for 
+    missing values. Rows with missing values in any of these columns (if present 
+    in the dataset) will be dropped. If `NULL`, all columns will be checked for 
+    missing values. Default columns are SNP, chromosome, position, allele 1, 
+    allele 2, effect columns (frequency, beta, Z-score, standard error, 
+    log odds, signed sumstats, odds ratio), p value and N columns.
 -   **dbSNP** The dbSNP version to use as a reference - defaults to the most 
     recent version available (155). Note that with the 9x more SNPs in dbSNP 
     155 vs 144, run times will increase.