Merge pull request #43 from shahronak47/fix_issue_33

Fix #33
Nelson-Gon · Jan 30, 2022 · 21b783e · 21b783e
2 parents 8447bab + 0f0b2c0
commit 21b783e
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 55 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -24,6 +24,9 @@ output: html_document
 
 * `na_counts` and `percent_na` are new vector-focused functions to allow `get`ting `na_counts` and percent missingness for objects of classes like `POSIXct`.
 
+* Added a new argument, `reset_rownames`, to reset rownames in `na_summary`. Fixes
+[#33](https://github.com/Nelson-Gon/mde/issues/33).
+
 # mde 0.3.1
 
 * Fixed a bug that made `exclude_cols` result in non-exclusion in grouped `na.summary` outputs. 

diff --git a/R/na_summary.R b/R/na_summary.R
@@ -4,39 +4,45 @@
 #' @param round_to Number of places to round to. Defaults to user digits option.
 #' @param pattern_type A regular expression type. One of "starts_with",
 #' "contains", or "regex". Defaults to NULL. Only use for selective inclusion.
-#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude 
-#' columns using regular expressions. 
-#' @param pattern Pattern to use for exclusion or inclusion. 
-#' column inclusion criteria. 
+#' @param regex_kind One of inclusion or exclusion. Defaults to exclusion to exclude
+#' columns using regular expressions.
+#' @param pattern Pattern to use as the column exclusion or inclusion
+#' criteria.
+#' @param reset_rownames Should the rownames be reset in the output? Defaults to FALSE.
 #' @importFrom stats "aggregate" "as.formula" "na.pass"
 #' @examples
 #' na_summary(airquality)
 #' # grouping
 #' test2 <- data.frame(ID= c("A","A","B","A","B"),Vals = c(rep(NA,4),"No"),
 #' ID2 = c("E","E","D","E","D"))
+#' df <- data.frame(A=1:5,B=c(NA,NA,25,24,53), C=c(NA,1,2,3,4))
+#'
 #' na_summary(test2,grouping_cols = c("ID","ID2"))
 #' # sort summary
 #' na_summary(airquality,sort_by = "percent_missing",descending = TRUE)
 
 #' na_summary(airquality,sort_by = "percent_complete")
 #' # Include only via a regular expression
-#' na_summary(mtcars, pattern_type = "contains", 
+#' na_summary(mtcars, pattern_type = "contains",
 #' pattern = "mpg|disp|wt", regex_kind = "inclusion")
-#' na_summary(airquality, pattern_type = "starts_with", 
+#' na_summary(airquality, pattern_type = "starts_with",
 #' pattern = "ozone", regex_kind = "inclusion")
-#' # exclusion via a regex 
+#' # exclusion via a regex
 #' na_summary(airquality, pattern_type = "starts_with",
 #' pattern = "oz|Sol", regex_kind = "exclusion")
+#' # reset rownames when sorting by variable
+#' na_summary(df,sort_by="variable",descending=TRUE, reset_rownames = TRUE)
 #' @export
 
 na_summary <- function(df,grouping_cols=NULL,
                        sort_by=NULL,
-                       descending=FALSE, 
+                       descending=FALSE,
                        exclude_cols = NULL,
                        pattern = NULL,
                        pattern_type = NULL,
-                       regex_kind = "exclusion", 
-                       round_to = NULL){
+                       regex_kind = "exclusion",
+                       round_to = NULL,
+                       reset_rownames = FALSE){
   UseMethod("na_summary")
 
 
@@ -46,45 +52,46 @@ na_summary <- function(df,grouping_cols=NULL,
 
 na_summary.data.frame <- function(df,grouping_cols=NULL,
                                    sort_by=NULL,
-                                   descending=FALSE, 
+                                   descending=FALSE,
                                    exclude_cols = NULL,
                                    pattern = NULL,
                                    pattern_type = NULL,
-                                   regex_kind = "exclusion", 
-                                   round_to = NULL){
+                                   regex_kind = "exclusion",
+                                   round_to = NULL,
+                                   reset_rownames = FALSE){
   # Round percents to chosen round
   round_to = ifelse(is.null(round_to),
                     options("digits")[[1]], round_to)
   if(all(!is.null(exclude_cols), !is.null(pattern_type))){
     stop("Use either exclude_cols or pattern_type, not both.")
-    
+
   }
-  
+
   if(!is.null(pattern_type)){
     if(is.null(pattern)) stop("Please provide a pattern to use.")
     if(!regex_kind %in% c("inclusion", "exclusion")) stop(paste0("Use either inclusion or exclusion not ", regex_kind))
     df<-switch(regex_kind,
-           inclusion =df[recode_selectors(df, 
+           inclusion =df[recode_selectors(df,
                               pattern_type = pattern_type,
                               pattern = pattern)],
-           exclusion =  df[-recode_selectors(df, 
+           exclusion =  df[-recode_selectors(df,
                                   pattern_type =pattern_type,
                                    pattern = pattern)]
     )
   }
 
   if(!is.null(exclude_cols)){
     exclude_cols_indices <- which(names(df) %in% exclude_cols)
-    df <- df[-exclude_cols_indices] 
-  }  
+    df <- df[-exclude_cols_indices]
+  }
 if(is.null(grouping_cols)){
   # stick to(with?) base as much as possible
   # get total NAs columnwise
   all_counts <-stack(get_na_counts(df))
   all_percents <- stack(percent_missing(df))
 
   all_percents$values <- round(all_percents$values, digits=round_to)
-  
+
   names(all_counts) <- c("missing","variable")
   names(all_percents) <- c("percent_missing","variable")
 
@@ -114,7 +121,7 @@ if(length(non_grouping) > 1) warning("All non grouping values used. Using select
     agg_formula <- as.formula(paste0(".~",
                                      grouping_cols_formula))
     res<-do.call(data.frame,aggregate(agg_formula,data=df,
-                                      function(x) c(missing = sum(is.na(x)), 
+                                      function(x) c(missing = sum(is.na(x)),
                                       complete = length(x) - sum(is.na(x)),
                              percent_complete = mean(!is.na(x)) * 100,
                              percent_missing = mean(is.na(x)) * 100
@@ -123,27 +130,28 @@ if(length(non_grouping) > 1) warning("All non grouping values used. Using select
       tidyr::separate(name,c("variable","metric"),
                       sep="\\.(?=percent|miss|complete)")  %>%
       tidyr::pivot_wider(names_from=metric,values_from=value)
-    
-    
+
+
     res$percent_complete=round(res$percent_complete, digits = round_to)
     res$percent_missing=round(res$percent_missing, digits = round_to)
 
 
 }
 if(!is.null(sort_by)){
-stopifnot("sort_by should be a valid name in the output of na_summary" = 
+stopifnot("sort_by should be a valid name in the output of na_summary" =
             sort_by %in% names(res))
-  
+
 
 # Get the value to sort by
 target_column <- res[[sort_by]]
   # Check class of this value and use appropriate sorting
 if (is.factor(target_column)) target_column <- as.character(target_column)
 
-res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],]  
+res <- res[sort(target_column,decreasing=descending,index.return=TRUE)[[2]],]
 
 
 }
+if(reset_rownames) rownames(res) <- NULL
 res
 }
 

diff --git a/man/na_summary.Rd b/man/na_summary.Rd
diff --git a/tests/testthat/test_na_summary.R b/tests/testthat/test_na_summary.R
@@ -3,57 +3,64 @@ testthat::test_that(desc="Test na_summary",
 
                       skip_on_oldrel()
 
-    
+
     # expect that columns are excluded
     excluded <- na_summary(mtcars, exclude_cols = "mpg")
 
     expect_false( "mpg" %in% excluded$variable)
-   
+
    expect_true(na_summary(airquality,sort_by = "percent_missing")[1,1]=="Day")
    expect_true(na_summary(airquality,sort_by = "percent_missing",
                           descending = TRUE)[1,1]=="Ozone")
    expect_equal(ncol(na_summary(airquality)),5)
     # Check that string sorting works
-   expect_true(na_summary(airquality,sort_by="variable", 
+   expect_true(na_summary(airquality,sort_by="variable",
                           descending = TRUE)[[1]][1] == "Wind")
 
-   expect_true(na_summary(airquality,sort_by="variable", 
+   expect_true(na_summary(airquality,sort_by="variable",
                           descending = FALSE)[[1]][1] == "Day")
-   
-   # Check that rounding works as expected 
-   expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing", 
+
+   # Check that rounding works as expected
+   expect_true(na_summary(airquality, round_to=2, sort_by="percent_missing",
               descending = TRUE)[1,"percent_missing"] == 24.18)
-   
+
    # Check that we can use inclusion patterns
-   only_ozone <- na_summary(airquality, 
-                            pattern_type = "starts_with", 
+   only_ozone <- na_summary(airquality,
+                            pattern_type = "starts_with",
                             pattern = "ozone",
                             regex_kind = "inclusion")
-   # we expect this to have the same number of missing values as no inclusion. 
+   # we expect this to have the same number of missing values as no inclusion.
    expect_true(na_summary(airquality)[3,"missing"] == only_ozone[["missing"]])
-   # Check that we can exclude via a regex 
-   expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality, 
+   # Check that we can exclude via a regex
+   expect_false(any(c("Ozone", "Solar.R") %in% na_summary(airquality,
                      pattern_type = "starts_with",
-              pattern = "oz|Sol", 
+              pattern = "oz|Sol",
               regex_kind = "exclusion")[["variable"]]))
    expect_snapshot(
-     
+
      {
-       # Error if a user provides unexpected args 
-       expect_error(na_summary(airquality, 
+       # Error if a user provides unexpected args
+       expect_error(na_summary(airquality,
                                pattern_type = "starts_with",
-                               pattern = "oz|Sol", 
+                               pattern = "oz|Sol",
                                regex_kind = "random"))
-       expect_error(na_summary(airquality, 
+       expect_error(na_summary(airquality,
                                pattern_type = "starts_with",
-                               pattern = "oz|Sol", 
+                               pattern = "oz|Sol",
                                exclude_cols = "Solar.R"))
        expect_error(na_summary(airquality,sort_by = "not_in"))
        expect_warning(na_summary(airquality, grouping_cols=c("Month","Day")))
        expect_error(na_summary(airquality,grouping_cols="gibberish"))
-       
+
      }
    )
-
-
+
+
+                    })
+
+
+testthat::test_that("test to check reset_rownames in na_summary",
+                    {
+                      tmp <- na_summary(airquality, sort_by = "variable", descending = TRUE, reset_rownames = TRUE)
+                      expect_equal(rownames(tmp), as.character(1:6))
                     })