From 6ee20d2873bd2530cb8bbf57ae5a22c58c13a1f1 Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sat, 29 Jan 2022 22:56:58 +0800 Subject: [PATCH 1/2] Fix issue 33 --- R/na_summary.R | 60 +++++++++++++++++++++++++++-------------------- man/na_summary.Rd | 19 ++++++++++----- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/R/na_summary.R b/R/na_summary.R index 47623a7..1713fa8 100644 --- a/R/na_summary.R +++ b/R/na_summary.R @@ -4,39 +4,45 @@ #' @param round_to Number of places to round 2. Defaults to user digits option. #' @param pattern_type A regular expression type. One of "starts_with", #' "contains", or "regex". Defaults to NULL. Only use for selective inclusion. -#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude -#' columns using regular expressions. -#' @param pattern Pattern to use for exclusion or inclusion. -#' column inclusion criteria. +#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude +#' columns using regular expressions. +#' @param pattern Pattern to use for exclusion or inclusion. +#' column inclusion criteria. +#' @param reset_rownames Should the rownames be reset in the output? 
defaults to FALSE #' @importFrom stats "aggregate" "as.formula" "na.pass" #' @examples #' na_summary(airquality) #' # grouping #' test2 <- data.frame(ID= c("A","A","B","A","B"),Vals = c(rep(NA,4),"No"), #' ID2 = c("E","E","D","E","D")) +#' df <- data.frame(A=1:5,B=c(NA,NA,25,24,53), C=c(NA,1,2,3,4)) +#' #' na_summary(test2,grouping_cols = c("ID","ID2")) #' # sort summary #' na_summary(airquality,sort_by = "percent_missing",descending = TRUE) #' na_summary(airquality,sort_by = "percent_complete") #' # Include only via a regular expression -#' na_summary(mtcars, pattern_type = "contains", +#' na_summary(mtcars, pattern_type = "contains", #' pattern = "mpg|disp|wt", regex_kind = "inclusion") -#' na_summary(airquality, pattern_type = "starts_with", +#' na_summary(airquality, pattern_type = "starts_with", #' pattern = "ozone", regex_kind = "inclusion") -#' # exclusion via a regex +#' # exclusion via a regex #' na_summary(airquality, pattern_type = "starts_with", #' pattern = "oz|Sol", regex_kind = "exclusion") +#' # reset rownames when sorting by variable +#' na_summary(df,sort_by="variable",descending=TRUE, reset_rownames = TRUE) #' @export na_summary <- function(df,grouping_cols=NULL, sort_by=NULL, - descending=FALSE, + descending=FALSE, exclude_cols = NULL, pattern = NULL, pattern_type = NULL, - regex_kind = "exclusion", - round_to = NULL){ + regex_kind = "exclusion", + round_to = NULL, + reset_rownames = FALSE){ UseMethod("na_summary") @@ -46,28 +52,29 @@ na_summary <- function(df,grouping_cols=NULL, na_summary.data.frame <- function(df,grouping_cols=NULL, sort_by=NULL, - descending=FALSE, + descending=FALSE, exclude_cols = NULL, pattern = NULL, pattern_type = NULL, - regex_kind = "exclusion", - round_to = NULL){ + regex_kind = "exclusion", + round_to = NULL, + reset_rownames = FALSE){ # Round percents to chosen round round_to = ifelse(is.null(round_to), options("digits")[[1]], round_to) if(all(!is.null(exclude_cols), !is.null(pattern_type))){ stop("Use either 
exclude_cols or pattern_type, not both.") - + } - + if(!is.null(pattern_type)){ if(is.null(pattern)) stop("Please provide a pattern to use.") if(!regex_kind %in% c("inclusion", "exclusion")) stop(paste0("Use either inclusion or exclusion not ", regex_kind)) df<-switch(regex_kind, - inclusion =df[recode_selectors(df, + inclusion =df[recode_selectors(df, pattern_type = pattern_type, pattern = pattern)], - exclusion = df[-recode_selectors(df, + exclusion = df[-recode_selectors(df, pattern_type =pattern_type, pattern = pattern)] ) @@ -75,8 +82,8 @@ na_summary.data.frame <- function(df,grouping_cols=NULL, if(!is.null(exclude_cols)){ exclude_cols_indices <- which(names(df) %in% exclude_cols) - df <- df[-exclude_cols_indices] - } + df <- df[-exclude_cols_indices] + } if(is.null(grouping_cols)){ # stick to(with?) base as much as possible # get total NAs columnwise @@ -84,7 +91,7 @@ if(is.null(grouping_cols)){ all_percents <- stack(percent_missing(df)) all_percents$values <- round(all_percents$values, digits=round_to) - + names(all_counts) <- c("missing","variable") names(all_percents) <- c("percent_missing","variable") @@ -114,7 +121,7 @@ if(length(non_grouping) > 1) warning("All non grouping values used. Using select agg_formula <- as.formula(paste0(".~", grouping_cols_formula)) res<-do.call(data.frame,aggregate(agg_formula,data=df, - function(x) c(missing = sum(is.na(x)), + function(x) c(missing = sum(is.na(x)), complete = length(x) - sum(is.na(x)), percent_complete = mean(!is.na(x)) * 100, percent_missing = mean(is.na(x)) * 100 @@ -123,27 +130,28 @@ if(length(non_grouping) > 1) warning("All non grouping values used. 
Using select tidyr::separate(name,c("variable","metric"), sep="\\.(?=percent|miss|complete)") %>% tidyr::pivot_wider(names_from=metric,values_from=value) - - + + res$percent_complete=round(res$percent_complete, digits = round_to) res$percent_missing=round(res$percent_missing, digits = round_to) } if(!is.null(sort_by)){ -stopifnot("sort_by should be a valid name in the output of na_summary" = +stopifnot("sort_by should be a valid name in the output of na_summary" = sort_by %in% names(res)) - + # Get the value to sort by target_column <- res[[sort_by]] # Check class of this value and use appropriate sorting if (is.factor(target_column)) target_column <- as.character(target_column) -res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],] +res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],] } +if(reset_rownames) rownames(res) <- NULL res } diff --git a/man/na_summary.Rd b/man/na_summary.Rd index 9ac0b78..908f702 100644 --- a/man/na_summary.Rd +++ b/man/na_summary.Rd @@ -13,7 +13,8 @@ na_summary( pattern = NULL, pattern_type = NULL, regex_kind = "exclusion", - round_to = NULL + round_to = NULL, + reset_rownames = FALSE ) } \arguments{ @@ -30,16 +31,18 @@ data.} \item{exclude_cols}{A character vector indicating columns to exclude when returning results.} -\item{pattern}{Pattern to use for exclusion or inclusion. +\item{pattern}{Pattern to use for exclusion or inclusion. column inclusion criteria.} \item{pattern_type}{A regular expression type. One of "starts_with", "contains", or "regex". Defaults to NULL. Only use for selective inclusion.} -\item{regex_kind}{One of inclusion or exclusion. Defaults to exclusion to exclude +\item{regex_kind}{One of inclusion or exclusion. Defaults to exclusion to exclude columns using regular expressions.} \item{round_to}{Number of places to round 2. Defaults to user digits option.} + +\item{reset_rownames}{Should the rownames be reset in the output? 
defaults to FALSE} } \description{ An all-in-one missingness report @@ -49,16 +52,20 @@ na_summary(airquality) # grouping test2 <- data.frame(ID= c("A","A","B","A","B"),Vals = c(rep(NA,4),"No"), ID2 = c("E","E","D","E","D")) +df <- data.frame(A=1:5,B=c(NA,NA,25,24,53), C=c(NA,1,2,3,4)) + na_summary(test2,grouping_cols = c("ID","ID2")) # sort summary na_summary(airquality,sort_by = "percent_missing",descending = TRUE) na_summary(airquality,sort_by = "percent_complete") # Include only via a regular expression -na_summary(mtcars, pattern_type = "contains", +na_summary(mtcars, pattern_type = "contains", pattern = "mpg|disp|wt", regex_kind = "inclusion") -na_summary(airquality, pattern_type = "starts_with", +na_summary(airquality, pattern_type = "starts_with", pattern = "ozone", regex_kind = "inclusion") -# exclusion via a regex +# exclusion via a regex na_summary(airquality, pattern_type = "starts_with", pattern = "oz|Sol", regex_kind = "exclusion") +# reset rownames when sorting by variable +na_summary(df,sort_by="variable",descending=TRUE, reset_rownames = TRUE) } From 0f0b2c0e3fe2d930667a9f4e876f4ed5b210245d Mon Sep 17 00:00:00 2001 From: Ronak Shah Date: Sun, 30 Jan 2022 08:18:10 +0800 Subject: [PATCH 2/2] add new tests --- NEWS.md | 3 ++ tests/testthat/test_na_summary.R | 53 ++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index efb2ae5..6140609 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,9 @@ output: html_document * `na_counts` and `percent_na` are new vector focused functions to allow `get`ting `na_counts` and percent missingness for objects of classes like `POSIXct`. +* `na_summary` gains a `reset_rownames` argument (default `FALSE`) that resets the row names of the output, e.g. after sorting. Fixes +[#33](https://github.com/Nelson-Gon/mde/issues/33). + # mde 0.3.1 * Fixed a bug that made `exclude_cols` result in non-exclusion in grouped `na.summary` outputs. 
diff --git a/tests/testthat/test_na_summary.R b/tests/testthat/test_na_summary.R index 08c44ac..0004630 100644 --- a/tests/testthat/test_na_summary.R +++ b/tests/testthat/test_na_summary.R @@ -3,57 +3,64 @@ testthat::test_that(desc="Test na_summary", skip_on_oldrel() - + # expect that columns are excluded excluded <- na_summary(mtcars, exclude_cols = "mpg") expect_false( "mpg" %in% excluded$variable) - + expect_true(na_summary(airquality,sort_by = "percent_missing")[1,1]=="Day") expect_true(na_summary(airquality,sort_by = "percent_missing", descending = TRUE)[1,1]=="Ozone") expect_equal(ncol(na_summary(airquality)),5) # Check that string sorting works - expect_true(na_summary(airquality,sort_by="variable", + expect_true(na_summary(airquality,sort_by="variable", descending = TRUE)[[1]][1] == "Wind") - expect_true(na_summary(airquality,sort_by="variable", + expect_true(na_summary(airquality,sort_by="variable", descending = FALSE)[[1]][1] == "Day") - - # Check that rounding works as expected - expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing", + + # Check that rounding works as expected + expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing", descending = TRUE)[1,"percent_missing"] == 24.18) - + # Check that we can use inclusion patterns - only_ozone <- na_summary(airquality, - pattern_type = "starts_with", + only_ozone <- na_summary(airquality, + pattern_type = "starts_with", pattern = "ozone", regex_kind = "inclusion") - # we expect this to have the same number of missing values as no inclusion. + # we expect this to have the same number of missing values as no inclusion. 
expect_true(na_summary(airquality)[3,"missing"] == only_ozone[["missing"]]) - # Check that we can exclude via a regex - expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality, + # Check that we can exclude via a regex + expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality, pattern_type = "starts_with", - pattern = "oz|Sol", + pattern = "oz|Sol", regex_kind = "exclusion")[["variable"]])) expect_snapshot( - + { - # Error if a user provides unexpected args - expect_error(na_summary(airquality, + # Error if a user provides unexpected args + expect_error(na_summary(airquality, pattern_type = "starts_with", - pattern = "oz|Sol", + pattern = "oz|Sol", regex_kind = "random")) - expect_error(na_summary(airquality, + expect_error(na_summary(airquality, pattern_type = "starts_with", - pattern = "oz|Sol", + pattern = "oz|Sol", exclude_cols = "Solar.R")) expect_error(na_summary(airquality,sort_by = "not_in")) expect_warning(na_summary(airquality, grouping_cols=c("Month","Day"))) expect_error(na_summary(airquality,grouping_cols="gibberish")) - + } ) - - + + + }) + + +testthat::test_that("test to check reset_rownames in na_summary", + { + tmp <- na_summary(airquality, sort_by = "variable", descending = TRUE, reset_rownames = TRUE) + expect_equal(rownames(tmp), as.character(1:6)) })