Skip to content

Commit

Permalink
Merge pull request #43 from shahronak47/fix_issue_33
Browse files Browse the repository at this point in the history
Fix #33
  • Loading branch information
Nelson-Gon authored Jan 30, 2022
2 parents 8447bab + 0f0b2c0 commit 21b783e
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 55 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ output: html_document

* `na_counts` and `percent_na` are new vector focused functions to allow `get`ting `na_counts` and percent missingness for objects of classes like `POSIXct`.

* `na_summary` gains a new `reset_rownames` argument to reset row names in the output. Fixes
[#33](https://github.com/Nelson-Gon/mde/issues/33).

# mde 0.3.1

* Fixed a bug that made `exclude_cols` result in non-exclusion in grouped `na.summary` outputs.
Expand Down
60 changes: 34 additions & 26 deletions R/na_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,45 @@
#' @param round_to Number of places to round to. Defaults to user digits option.
#' @param pattern_type A regular expression type. One of "starts_with",
#' "contains", or "regex". Defaults to NULL. Only use for selective inclusion.
#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude
#' columns using regular expressions.
#' @param pattern Pattern to use for exclusion or inclusion.
#' column inclusion criteria.
#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude
#' columns using regular expressions.
#' @param pattern Pattern to use for exclusion or inclusion.
#' column inclusion criteria.
#' @param reset_rownames Should the rownames be reset in the output? Defaults to FALSE.
#' @importFrom stats "aggregate" "as.formula" "na.pass"
#' @examples
#' na_summary(airquality)
#' # grouping
#' test2 <- data.frame(ID= c("A","A","B","A","B"),Vals = c(rep(NA,4),"No"),
#' ID2 = c("E","E","D","E","D"))
#' df <- data.frame(A=1:5,B=c(NA,NA,25,24,53), C=c(NA,1,2,3,4))
#'
#' na_summary(test2,grouping_cols = c("ID","ID2"))
#' # sort summary
#' na_summary(airquality,sort_by = "percent_missing",descending = TRUE)

#' na_summary(airquality,sort_by = "percent_complete")
#' # Include only via a regular expression
#' na_summary(mtcars, pattern_type = "contains",
#' na_summary(mtcars, pattern_type = "contains",
#' pattern = "mpg|disp|wt", regex_kind = "inclusion")
#' na_summary(airquality, pattern_type = "starts_with",
#' na_summary(airquality, pattern_type = "starts_with",
#' pattern = "ozone", regex_kind = "inclusion")
#' # exclusion via a regex
#' # exclusion via a regex
#' na_summary(airquality, pattern_type = "starts_with",
#' pattern = "oz|Sol", regex_kind = "exclusion")
#' # reset rownames when sorting by variable
#' na_summary(df,sort_by="variable",descending=TRUE, reset_rownames = TRUE)
#' @export

na_summary <- function(df,grouping_cols=NULL,
sort_by=NULL,
descending=FALSE,
descending=FALSE,
exclude_cols = NULL,
pattern = NULL,
pattern_type = NULL,
regex_kind = "exclusion",
round_to = NULL){
regex_kind = "exclusion",
round_to = NULL,
reset_rownames = FALSE){
UseMethod("na_summary")


Expand All @@ -46,45 +52,46 @@ na_summary <- function(df,grouping_cols=NULL,

na_summary.data.frame <- function(df,grouping_cols=NULL,
sort_by=NULL,
descending=FALSE,
descending=FALSE,
exclude_cols = NULL,
pattern = NULL,
pattern_type = NULL,
regex_kind = "exclusion",
round_to = NULL){
regex_kind = "exclusion",
round_to = NULL,
reset_rownames = FALSE){
# Round percents to chosen round
round_to = ifelse(is.null(round_to),
options("digits")[[1]], round_to)
if(all(!is.null(exclude_cols), !is.null(pattern_type))){
stop("Use either exclude_cols or pattern_type, not both.")

}

if(!is.null(pattern_type)){
if(is.null(pattern)) stop("Please provide a pattern to use.")
if(!regex_kind %in% c("inclusion", "exclusion")) stop(paste0("Use either inclusion or exclusion not ", regex_kind))
df<-switch(regex_kind,
inclusion =df[recode_selectors(df,
inclusion =df[recode_selectors(df,
pattern_type = pattern_type,
pattern = pattern)],
exclusion = df[-recode_selectors(df,
exclusion = df[-recode_selectors(df,
pattern_type =pattern_type,
pattern = pattern)]
)
}

if(!is.null(exclude_cols)){
exclude_cols_indices <- which(names(df) %in% exclude_cols)
df <- df[-exclude_cols_indices]
}
df <- df[-exclude_cols_indices]
}
if(is.null(grouping_cols)){
# stick to(with?) base as much as possible
# get total NAs columnwise
all_counts <-stack(get_na_counts(df))
all_percents <- stack(percent_missing(df))

all_percents$values <- round(all_percents$values, digits=round_to)

names(all_counts) <- c("missing","variable")
names(all_percents) <- c("percent_missing","variable")

Expand Down Expand Up @@ -114,7 +121,7 @@ if(length(non_grouping) > 1) warning("All non grouping values used. Using select
agg_formula <- as.formula(paste0(".~",
grouping_cols_formula))
res<-do.call(data.frame,aggregate(agg_formula,data=df,
function(x) c(missing = sum(is.na(x)),
function(x) c(missing = sum(is.na(x)),
complete = length(x) - sum(is.na(x)),
percent_complete = mean(!is.na(x)) * 100,
percent_missing = mean(is.na(x)) * 100
Expand All @@ -123,27 +130,28 @@ if(length(non_grouping) > 1) warning("All non grouping values used. Using select
tidyr::separate(name,c("variable","metric"),
sep="\\.(?=percent|miss|complete)") %>%
tidyr::pivot_wider(names_from=metric,values_from=value)


res$percent_complete=round(res$percent_complete, digits = round_to)
res$percent_missing=round(res$percent_missing, digits = round_to)


}
if(!is.null(sort_by)){
stopifnot("sort_by should be a valid name in the output of na_summary" =
stopifnot("sort_by should be a valid name in the output of na_summary" =
sort_by %in% names(res))


# Get the value to sort by
target_column <- res[[sort_by]]
# Check class of this value and use appropriate sorting
if (is.factor(target_column)) target_column <- as.character(target_column)

res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],]
res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],]


}
if(reset_rownames) rownames(res) <- NULL
res
}

Expand Down
19 changes: 13 additions & 6 deletions man/na_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 30 additions & 23 deletions tests/testthat/test_na_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,57 +3,64 @@ testthat::test_that(desc="Test na_summary",

skip_on_oldrel()


# expect that columns are excluded
excluded <- na_summary(mtcars, exclude_cols = "mpg")

expect_false( "mpg" %in% excluded$variable)

expect_true(na_summary(airquality,sort_by = "percent_missing")[1,1]=="Day")
expect_true(na_summary(airquality,sort_by = "percent_missing",
descending = TRUE)[1,1]=="Ozone")
expect_equal(ncol(na_summary(airquality)),5)
# Check that string sorting works
expect_true(na_summary(airquality,sort_by="variable",
expect_true(na_summary(airquality,sort_by="variable",
descending = TRUE)[[1]][1] == "Wind")

expect_true(na_summary(airquality,sort_by="variable",
expect_true(na_summary(airquality,sort_by="variable",
descending = FALSE)[[1]][1] == "Day")
# Check that rounding works as expected
expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing",

# Check that rounding works as expected
expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing",
descending = TRUE)[1,"percent_missing"] == 24.18)

# Check that we can use inclusion patterns
only_ozone <- na_summary(airquality,
pattern_type = "starts_with",
only_ozone <- na_summary(airquality,
pattern_type = "starts_with",
pattern = "ozone",
regex_kind = "inclusion")
# we expect this to have the same number of missing values as no inclusion.
# we expect this to have the same number of missing values as no inclusion.
expect_true(na_summary(airquality)[3,"missing"] == only_ozone[["missing"]])
# Check that we can exclude via a regex
expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality,
# Check that we can exclude via a regex
expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality,
pattern_type = "starts_with",
pattern = "oz|Sol",
pattern = "oz|Sol",
regex_kind = "exclusion")[["variable"]]))
expect_snapshot(

{
# Error if a user provides unexpected args
expect_error(na_summary(airquality,
# Error if a user provides unexpected args
expect_error(na_summary(airquality,
pattern_type = "starts_with",
pattern = "oz|Sol",
pattern = "oz|Sol",
regex_kind = "random"))
expect_error(na_summary(airquality,
expect_error(na_summary(airquality,
pattern_type = "starts_with",
pattern = "oz|Sol",
pattern = "oz|Sol",
exclude_cols = "Solar.R"))
expect_error(na_summary(airquality,sort_by = "not_in"))
expect_warning(na_summary(airquality, grouping_cols=c("Month","Day")))
expect_error(na_summary(airquality,grouping_cols="gibberish"))

}
)




})


# With reset_rownames = TRUE, the sorted summary should carry default
# sequential row names rather than the pre-sort row indices.
testthat::test_that("test to check reset_rownames in na_summary", {
  sorted_summary <- na_summary(
    airquality,
    sort_by = "variable",
    descending = TRUE,
    reset_rownames = TRUE
  )
  expected_names <- as.character(1:6)
  expect_equal(rownames(sorted_summary), expected_names)
})

0 comments on commit 21b783e

Please sign in to comment.