diff --git a/README.md b/README.md index 9947753..8add4a1 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ mde: Missing Data Explorer ================ -2021-09-19 +2022-01-31 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3890659.svg)](https://doi.org/10.5281/zenodo.3890659) -[![CRAN\_Status\_Badge](https://r-pkg.org/badges/version/mde)](https://cran.r-project.org/package=mde) -[![CRAN\_Release\_Badge](https://www.r-pkg.org/badges/version-ago/mde)](https://CRAN.R-project.org/package=mde) +[![CRAN_Status_Badge](https://r-pkg.org/badges/version/mde)](https://cran.r-project.org/package=mde) +[![CRAN_Release_Badge](https://www.r-pkg.org/badges/version-ago/mde)](https://CRAN.R-project.org/package=mde) [![Codecov test coverage](https://codecov.io/gh/Nelson-Gon/mde/branch/master/graph/badge.svg)](https://codecov.io/gh/Nelson-Gon/mde?branch=master) [![R-CMD-check](https://github.com/Nelson-Gon/mde/actions/workflows/devel-check.yaml/badge.svg)](https://github.com/Nelson-Gon/mde/actions/workflows/devel-check.yaml) @@ -93,6 +93,21 @@ na_summary(airquality,sort_by = "percent_complete") #> 6 Wind 0 153 100.00000 0.000000 ``` +If one would like to reset (drop) row names, then one can set +`reset_rownames` to `TRUE`. This may especially be useful in cases where +`rownames` are simply numeric and do not have much additional use. 
+ ``` r +na_summary(airquality,sort_by = "percent_complete", reset_rownames = TRUE) +#> variable missing complete percent_complete percent_missing +#> 1 Ozone 37 116 75.81699 24.183007 +#> 2 Solar.R 7 146 95.42484 4.575163 +#> 3 Day 0 153 100.00000 0.000000 +#> 4 Month 0 153 100.00000 0.000000 +#> 5 Temp 0 153 100.00000 0.000000 +#> 6 Wind 0 153 100.00000 0.000000 +``` + To sort by `percent_missing` instead: ``` r diff --git a/README.rmd b/README.rmd index 50edf37..6a49d5e 100644 --- a/README.rmd +++ b/README.rmd @@ -86,7 +86,7 @@ devtools::install_github("Nelson-Gon/mde@develop") **Loading the package** -```{r} +```{r pkg_load} library(mde) @@ -105,7 +105,7 @@ library(mde) To get a simple missingness report, use `na_summary`: -```{r} +```{r na_summary_default} na_summary(airquality) @@ -114,10 +114,20 @@ na_summary(airquality) To sort this summary by a given column : -```{r} +```{r column_sort_na_summary} na_summary(airquality,sort_by = "percent_complete") +``` +If one would like to reset (drop) row names, then one can set `reset_rownames` to +`TRUE`. This may especially be useful in cases where `rownames` are simply +numeric and do not have much additional use. 
+ + +```{r reset_rownames} + +na_summary(airquality,sort_by = "percent_complete", reset_rownames = TRUE) + ``` To sort by `percent_missing` instead: @@ -133,6 +143,7 @@ To sort the above in descending order: na_summary(airquality, sort_by="percent_missing", descending = TRUE) ``` + To exclude certain columns from the analysis: ```{r} @@ -147,7 +158,9 @@ To include or exclude via regex match: na_summary(airquality, regex_kind = "inclusion",pattern_type = "starts_with", pattern = "O|S") ``` ```{r} + na_summary(airquality, regex_kind = "exclusion",pattern_type = "regex", pattern = "^[O|S]") + ``` To get this summary by group: diff --git a/docs/articles/mde_vignette.html b/docs/articles/mde_vignette.html index 6832c6f..6e8dad1 100644 --- a/docs/articles/mde_vignette.html +++ b/docs/articles/mde_vignette.html @@ -86,7 +86,7 @@
vignettes/mde_vignette.rmd
mde_vignette.rmd
vignettes/missingness.rmd
missingness.rmd
To sort by percent_missing
instead:
If one would like to reset (drop) row names, then one can set reset_rownames
to TRUE.
This may especially be useful in cases where rownames
are simply numeric and do not have much additional use.
+
+na_summary(airquality,sort_by = "percent_complete", reset_rownames = TRUE)
+#> variable missing complete percent_complete percent_missing
+#> 1 Ozone 37 116 75.81699 24.183007
+#> 2 Solar.R 7 146 95.42484 4.575163
+#> 3 Day 0 153 100.00000 0.000000
+#> 4 Month 0 153 100.00000 0.000000
+#> 5 Temp 0 153 100.00000 0.000000
+#> 6 Wind 0 153 100.00000 0.000000
To sort by percent_missing
instead:
na_summary(airquality, sort_by = "percent_missing")
#> variable missing complete percent_complete percent_missing
#> 1 Day 0 153 100.00000 0.000000
@@ -137,7 +148,7 @@ 2022-01-19
#> 4 Solar.R 7 146 95.42484 4.575163
#> 3 Ozone 37 116 75.81699 24.183007
To sort the above in descending order:
-+na_summary(airquality, sort_by="percent_missing", descending = TRUE) #> variable missing complete percent_complete percent_missing #> 3 Ozone 37 116 75.81699 24.183007 @@ -147,7 +158,7 @@
2022-01-19
#> 5 Temp 0 153 100.00000 0.000000 #> 6 Wind 0 153 100.00000 0.000000To exclude certain columns from the analysis:
-+na_summary(airquality, exclude_cols = c("Day", "Wind")) #> variable missing complete percent_complete percent_missing @@ -156,12 +167,12 @@
2022-01-19
#> 3 Solar.R 7 146 95.42484 4.575163 #> 4 Temp 0 153 100.00000 0.000000To include or exclude via regex match:
-+-na_summary(airquality, regex_kind = "inclusion",pattern_type = "starts_with", pattern = "O|S") #> variable missing complete percent_complete percent_missing #> 1 Ozone 37 116 75.81699 24.183007 #> 2 Solar.R 7 146 95.42484 4.575163
+na_summary(airquality, regex_kind = "exclusion",pattern_type = "regex", pattern = "^[O|S]") #> variable missing complete percent_complete percent_missing #> 1 Day 0 153 100 0 @@ -169,7 +180,7 @@
2022-01-19
#> 3 Temp 0 153 100 0 #> 4 Wind 0 153 100 0To get this summary by group:
-+-test2 <- data.frame(ID= c("A","A","B","A","B"), Vals = c(rep(NA,4),"No"),ID2 = c("E","E","D","E","D")) @@ -179,7 +190,7 @@
2022-01-19
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> #> 1 B D Vals 1 1 50 50 #> 2 A E Vals 3 0 0 100+na_summary(test2, grouping_cols="ID") #> Warning in na_summary.data.frame(test2, grouping_cols = "ID"): All non grouping @@ -196,13 +207,13 @@
2022-01-19
This provides a convenient way to show the number of missing values column-wise. It is relatively fast (tests on about 400,000 rows took a few microseconds).
To get the number of missing values in each column of
-airquality
, we can use the function as follows:+get_na_counts(airquality) #> Ozone Solar.R Wind Temp Month Day #> 1 37 7 0 0 0 0
The above might be less useful if one would like to get the results by group. In that case, one can provide a grouping vector of names in
-grouping_cols
.+test <- structure(list(Subject = structure(c(1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), res = c(NA, 1, 2, 3), ID = structure(c(1L, @@ -219,14 +230,14 @@
2022-01-19
percent_missing
This is a very simple and quick way to take a look at the percentage of data that is missing column-wise.
-+percent_missing(airquality) #> Ozone Solar.R Wind Temp Month Day #> 1 24.18301 4.575163 0 0 0 0
We can get the results by group by providing an optional
-grouping_cols
character vector.+percent_missing(test, grouping_cols = "Subject") #> # A tibble: 2 x 3 @@ -235,7 +246,7 @@
2022-01-19
#> 1 A 50 0 #> 2 B 0 0To exclude some columns from the above exploration, one can provide an optional character vector in
-exclude_cols
.+percent_missing(airquality,exclude_cols = c("Day","Temp")) #> Ozone Solar.R Wind Month @@ -245,7 +256,7 @@
2022-01-19
This provides a very simple but relatively fast way to sort variables by missingness. Unless otherwise stated, this does not currently support arranging grouped percents.
Usage:
-+sort_by_missingness(airquality, sort_by = "counts") @@ -257,7 +268,7 @@
2022-01-19
#> 5 Solar.R 7 #> 6 Ozone 37To sort in descending order:
-+sort_by_missingness(airquality, sort_by = "counts", descend = TRUE) #> variable percent @@ -268,7 +279,7 @@
2022-01-19
#> 5 Month 0 #> 6 Day 0To use percentages instead:
-++sort_by_missingness(airquality, sort_by = "percents") #> variable percent diff --git a/docs/articles/recoding.html b/docs/articles/recoding.html index 699180e..a116ca6 100644 --- a/docs/articles/recoding.html +++ b/docs/articles/recoding.html @@ -86,7 +86,7 @@
+Recoding to and from NA with mde
-2022-01-19
+2022-01-31
Source:vignettes/recoding.rmd
@@ -514,6 +514,25 @@recoding.rmd
Dropping Missing Values#> 4 18 313 11.5 62 5 4 #> 5 42 520 14.3 56 5 5 #> 6 28 520 14.9 66 5 6
+General Recoding +
++
+- For general recoding of values, one can use
+recode_as_value
for example as shown below+++head(recode_as_value(airquality, value=c(67,118),replacement=NA, + pattern_type="starts_with",pattern="S|O")) +#> Ozone Solar.R Wind Temp Month Day +#> 1 41 190 7.4 67 5 1 +#> 2 36 NA 8.0 72 5 2 +#> 3 12 149 12.6 74 5 3 +#> 4 18 313 11.5 62 5 4 +#> 5 NA NA 14.3 56 5 5 +#> 6 28 NA 14.9 66 5 6
The above is a more general function that can do what the other functions do and may be more useful for development purposes.
Please note that the
mde
project is released with a Contributor Code of Conduct. By contributing to this project, you agree to abide by its terms.For further exploration, please
diff --git a/docs/index.html b/docs/index.html index 9d7d770..fec8840 100644 --- a/docs/index.html +++ b/docs/index.html @@ -88,7 +88,7 @@browseVignettes("mde")
.--2021-09-19
+2022-01-31
@@ -136,8 +136,18 @@Exploring missingness#> 2 Month 0 153 100.00000 0.000000 #> 5 Temp 0 153 100.00000 0.000000 #> 6 Wind 0 153 100.00000 0.000000
To sort by
+percent_missing
instead:If one would like to reset (drop) row names, then one can set
reset_rownames
to TRUE.
This may especially be useful in cases where rownames
are simply numeric and do not have much additional use.++na_summary(airquality,sort_by = "percent_complete", reset_rownames = TRUE) +#> variable missing complete percent_complete percent_missing +#> 1 Ozone 37 116 75.81699 24.183007 +#> 2 Solar.R 7 146 95.42484 4.575163 +#> 3 Day 0 153 100.00000 0.000000 +#> 4 Month 0 153 100.00000 0.000000 +#> 5 Temp 0 153 100.00000 0.000000 +#> 6 Wind 0 153 100.00000 0.000000
To sort by
+percent_missing
instead:na_summary(airquality, sort_by = "percent_missing") #> variable missing complete percent_complete percent_missing #> 1 Day 0 153 100.00000 0.000000 @@ -147,7 +157,7 @@
Exploring missingness#> 4 Solar.R 7 146 95.42484 4.575163 #> 3 Ozone 37 116 75.81699 24.183007
To sort the above in descending order:
-+na_summary(airquality, sort_by="percent_missing", descending = TRUE) #> variable missing complete percent_complete percent_missing #> 3 Ozone 37 116 75.81699 24.183007 @@ -157,7 +167,7 @@
Exploring missingness#> 5 Temp 0 153 100.00000 0.000000 #> 6 Wind 0 153 100.00000 0.000000
To exclude certain columns from the analysis:
-+na_summary(airquality, exclude_cols = c("Day", "Wind")) #> variable missing complete percent_complete percent_missing #> 1 Month 0 153 100.00000 0.000000 @@ -165,12 +175,12 @@
Exploring missingness#> 3 Solar.R 7 146 95.42484 4.575163 #> 4 Temp 0 153 100.00000 0.000000
To include or exclude via regex match:
-+-na_summary(airquality, regex_kind = "inclusion",pattern_type = "starts_with", pattern = "O|S") #> variable missing complete percent_complete percent_missing #> 1 Ozone 37 116 75.81699 24.183007 #> 2 Solar.R 7 146 95.42484 4.575163
+na_summary(airquality, regex_kind = "exclusion",pattern_type = "regex", pattern = "^[O|S]") #> variable missing complete percent_complete percent_missing #> 1 Day 0 153 100 0 @@ -178,7 +188,7 @@
Exploring missingness#> 3 Temp 0 153 100 0 #> 4 Wind 0 153 100 0
To get this summary by group:
-+-test2 <- data.frame(ID= c("A","A","B","A","B"), Vals = c(rep(NA,4),"No"),ID2 = c("E","E","D","E","D")) na_summary(test2,grouping_cols = c("ID","ID2")) @@ -187,7 +197,7 @@
Exploring missingness#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> #> 1 B D Vals 1 1 50 50 #> 2 A E Vals 3 0 0 100
+na_summary(test2, grouping_cols="ID") #> Warning in na_summary.data.frame(test2, grouping_cols = "ID"): All non grouping #> values used. Using select non groups is currently not supported @@ -203,12 +213,12 @@
Exploring missingness
This provides a convenient way to show the number of missing values column-wise. It is relatively fast (tests on about 400,000 rows took a few microseconds).
To get the number of missing values in each column of
-airquality
, we can use the function as follows:+get_na_counts(airquality) #> Ozone Solar.R Wind Temp Month Day #> 1 37 7 0 0 0 0
The above might be less useful if one would like to get the results by group. In that case, one can provide a grouping vector of names in
-grouping_cols
.+test <- structure(list(Subject = structure(c(1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), res = c(NA, 1, 2, 3), ID = structure(c(1L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")), class = "data.frame", row.names = c(NA, @@ -224,13 +234,13 @@
Exploring missingness
percent_missing
This is a very simple and quick way to take a look at the percentage of data that is missing column-wise.
-+percent_missing(airquality) #> Ozone Solar.R Wind Temp Month Day #> 1 24.18301 4.575163 0 0 0 0
We can get the results by group by providing an optional
-grouping_cols
character vector.+percent_missing(test, grouping_cols = "Subject") #> # A tibble: 2 x 3 #> Subject res ID @@ -238,7 +248,7 @@
Exploring missingness#> 1 A 50 0 #> 2 B 0 0
To exclude some columns from the above exploration, one can provide an optional character vector in
-exclude_cols
.+@@ -247,7 +257,7 @@percent_missing(airquality,exclude_cols = c("Day","Temp")) #> Ozone Solar.R Wind Month #> 1 24.18301 4.575163 0 0
Exploring missingness
This provides a very simple but relatively fast way to sort variables by missingness. Unless otherwise stated, this does not currently support arranging grouped percents.
Usage:
-+sort_by_missingness(airquality, sort_by = "counts") #> variable percent @@ -258,7 +268,7 @@
Exploring missingness#> 5 Solar.R 7 #> 6 Ozone 37
To sort in descending order:
-+sort_by_missingness(airquality, sort_by = "counts", descend = TRUE) #> variable percent #> 1 Ozone 37 @@ -268,7 +278,7 @@
Exploring missingness#> 5 Month 0 #> 6 Day 0
To use percentages instead:
-+sort_by_missingness(airquality, sort_by = "percents") #> variable percent #> 1 Wind 0.000000 @@ -286,7 +296,7 @@
Recoding as NA
As the name might imply, this converts any value or vector of values to
NA
i.e. we take a value such as “missing” or “NA” (not a realNA
according toR
) and convert it to R’s known handler for missing values (NA
).To use the function out of the box (with default arguments), one simply does something like:
-+dummy_test <- data.frame(ID = c("A","B","B","A"), values = c("n/a",NA,"Yes","No")) # Convert n/a and no to NA @@ -297,7 +307,7 @@
Recoding as NA#> 3 B Yes #> 4 A <NA>
Great, but I want to do so for specific columns not the entire dataset. You can do this by providing column names to
-subset_cols
.+another_dummy <- data.frame(ID = 1:5, Subject = 7:11, Change = c("missing","n/a",2:4 )) @@ -310,7 +320,7 @@
Recoding as NA#> 4 4 10 3 #> 5 5 11 4
To recode columns using RegEx, one can provide
-pattern_type
and a targetpattern
. Currently supportedpattern_types
arestarts_with
,ends_with
,contains
andregex
. See docs for more details.:+-# only change at columns that start with Solar head(recode_as_na(airquality,value=190,pattern_type="starts_with",pattern="Solar")) #> Ozone Solar.R Wind Temp Month Day @@ -320,7 +330,7 @@
Recoding as NA#> 4 18 313 11.5 62 5 4 #> 5 NA NA 14.3 56 5 5 #> 6 28 NA 14.9 66 5 6
+-# recode at columns that start with O or S(case sensitive) head(recode_as_na(airquality,value=c(67,118),pattern_type="starts_with",pattern="S|O")) #> Ozone Solar.R Wind Temp Month Day @@ -330,7 +340,7 @@
Recoding as NA#> 4 18 313 11.5 62 5 4 #> 5 NA NA 14.3 56 5 5 #> 6 28 NA 14.9 66 5 6
+# use my own RegEx head(recode_as_na(airquality,value=c(67,118),pattern_type="regex",pattern="(?i)^(s|o)")) #> Ozone Solar.R Wind Temp Month Day @@ -344,7 +354,7 @@
Recoding as NA
recode_as_na_if
This function allows one to deliberately introduce missing values if a column meets a certain threshold of missing values. This is similar to
-amputation
but is much more basic. It is only provided here because it is hoped it may be useful to someone for whatever reason.+head(recode_as_na_if(airquality,sign="gt", percent_na=20)) #> Ozone Solar.R Wind Temp Month Day #> 1 NA 190 7.4 67 5 1 @@ -357,7 +367,7 @@
Recoding as NA
recode_as_na_str
This allows recoding as
-NA
based on a string match.+partial_match <- data.frame(A=c("Hi","match_me","nope"), B=c(NA, "not_me","nah")) recode_as_na_str(partial_match,"ends_with","ME", case_sensitive=FALSE) @@ -370,7 +380,7 @@
Recoding as NA
For all values greater/less/less or equal/greater or equal than some value, can I convert them to
NA
?!Yes You Can! All we have to do is use
-recode_as_na_for
:+head(recode_as_na_for(airquality,criteria="gt",value=25)) #> Ozone Solar.R Wind Temp Month Day #> 1 NA NA 7.4 NA 5 1 @@ -380,7 +390,7 @@
Recoding as NA#> 5 NA NA 14.3 NA 5 5 #> 6 NA NA 14.9 NA 5 6
To do so at specific columns, pass an optional
-subset_cols
character vector:+head(recode_as_na_for(airquality, value=40,subset_cols=c("Solar.R","Ozone"), criteria="gt")) #> Ozone Solar.R Wind Temp Month Day #> 1 NA NA 7.4 67 5 1 @@ -397,7 +407,7 @@
Recoding NA as
recode_na_as
Sometimes, for whatever reason, one would like to replace
-NA
s with whatever value they would like.recode_na_as
provides a very simple way to do just that.+head(recode_na_as(airquality)) #> Ozone Solar.R Wind Temp Month Day #> 1 41 190 7.4 67 5 1 @@ -418,7 +428,7 @@
Recoding NA as#> 5 NaN NaN 14.3 56 5 5 #> 6 28 NaN 14.9 66 5 6
As a “bonus”, you can manipulate the data only at specific columns as shown here:
-+head(recode_na_as(airquality, value=0, subset_cols="Ozone")) #> Ozone Solar.R Wind Temp Month Day #> 1 41 190 7.4 67 5 1 @@ -428,7 +438,7 @@
Recoding NA as#> 5 0 NA 14.3 56 5 5 #> 6 28 NA 14.9 66 5 6
The above also supports custom recoding similar to
-recode_na_as
:+head(mde::recode_na_as(airquality, value=0, pattern_type="starts_with",pattern="Solar")) #> Ozone Solar.R Wind Temp Month Day #> 1 41 190 7.4 67 5 1 @@ -441,7 +451,7 @@
Recoding NA as
column_based_recode
Ever needed to change values in a given column based on the proportions of
-NA
s in other columns(row-wise)?!. The goal ofcolumn_based_recode
is to achieve just that. Let’s see how we could do this with a simple example:+head(column_based_recode(airquality, values_from = "Wind", values_to="Wind", pattern_type = "regex", pattern = "Solar|Ozone")) #> Ozone Solar.R Wind Temp Month Day @@ -456,7 +466,7 @@
Recoding NA as
This allows recoding
NA
values with common stats functions such asmean
,max
,min
,sd
.To use default values:
-+head(custom_na_recode(airquality)) #> Ozone Solar.R Wind Temp Month Day #> 1 41.00000 190.0000 7.4 67 5 1 @@ -466,7 +476,7 @@
Recoding NA as#> 5 42.12931 185.9315 14.3 56 5 5 #> 6 28.00000 185.9315 14.9 66 5 6
To use select columns:
-+head(custom_na_recode(airquality,func="mean",across_columns=c("Solar.R","Ozone"))) @@ -479,7 +489,7 @@
Recoding NA as#> 6 28.00000 185.9315 14.9 66 5 6
To use a function from another package to perform replacements:
To perform a forward fill with
-dplyr
’slead
:+# use lag for a backfill head(custom_na_recode(airquality,func=dplyr::lead )) #> Ozone Solar.R Wind Temp Month Day @@ -490,7 +500,7 @@
Recoding NA as#> 5 23 99 14.3 56 5 5 #> 6 28 19 14.9 66 5 6
To perform replacement by group:
-+some_data <- data.frame(ID=c("A1","A1","A1","A2","A2", "A2"),A=c(5,NA,0,8,3,4),B=c(10,0,0,NA,5,6),C=c(1,NA,NA,25,7,8)) head(custom_na_recode(some_data,func = "mean", grouping_cols = "ID")) @@ -504,7 +514,7 @@
Recoding NA as#> 5 A2 3 5 7 #> 6 A2 4 6 8
Across specific columns:
-+head(custom_na_recode(some_data,func = "mean", grouping_cols = "ID", across_columns = c("C", "A"))) #> # A tibble: 6 x 4 #> ID A B C @@ -519,7 +529,7 @@
Recoding NA as
recode_na_if
Given a
-data.frame
object, one can recodeNA
s as another value based on a grouping variable. In the example below, we replace allNA
s in all columns with 0s if the ID isA2
orA3
+some_data <- data.frame(ID=c("A1","A2","A3", "A4"), A=c(5,NA,0,8), B=c(10,0,0,1), C=c(1,NA,NA,25)) @@ -542,7 +552,7 @@
Dropping NAs
+head(drop_na_if(airquality, sign="gteq",percent_na = 24)) #> Solar.R Wind Temp Month Day #> 1 190 7.4 67 5 1 @@ -553,7 +563,7 @@
Dropping NAs#> 6 NA 14.9 66 5 6
The above also supports less than or equal to(
lteq
), equal to(eq
), greater than(gt
) and less than(lt
).To keep certain columns despite fitting the target
-percent_na
criteria, one can provide an optionalkeep_columns
character vector.+head(drop_na_if(airquality, percent_na = 24, keep_columns = "Ozone")) #> Ozone Solar.R Wind Temp Month Day @@ -564,7 +574,7 @@
Dropping NAs#> 5 NA NA 14.3 56 5 5 #> 6 28 NA 14.9 66 5 6
Compare the above result to the following:
-+head(drop_na_if(airquality, percent_na = 24)) #> Solar.R Wind Temp Month Day #> 1 190 7.4 67 5 1 @@ -574,7 +584,7 @@
Dropping NAs#> 5 NA 14.3 56 5 5 #> 6 NA 14.9 66 5 6
To drop groups that meet a set missingness criterion, we proceed as follows.
-+grouped_drop <- structure(list(ID = c("A", "A", "B", "A", "B"), Vals = c(4, NA, NA, NA, NA), Values = c(5, 6, 7, 8, NA)), row.names = c(NA, -5L), class = "data.frame") @@ -592,7 +602,7 @@
Dropping NAs
+# Drop rows with at least two NAs head(drop_row_if(airquality, sign="gteq", type="count" , value = 2)) #> Dropped 2 rows. @@ -604,7 +614,7 @@
Dropping NAs#> 6 28 NA 14.9 66 5 6 #> 7 23 299 8.6 65 5 7
To drop based on percentages:
+diff --git a/vignettes/missingness.rmd b/vignettes/missingness.rmd index 1221442..b7a56f8 100644 --- a/vignettes/missingness.rmd +++ b/vignettes/missingness.rmd @@ -55,6 +55,20 @@ na_summary(airquality,sort_by = "percent_complete") ``` + + +If one would like to reset (drop) row names, then one can set `reset_rownames` to +`TRUE`. This may especially be useful in cases where `rownames` are simply +numeric and do not have much additional use. + + +```{r reset_rownames} + +na_summary(airquality,sort_by = "percent_complete", reset_rownames = TRUE) + +``` + + To sort by `percent_missing` instead: ```{r} diff --git a/vignettes/recoding.rmd b/vignettes/recoding.rmd index 9a034ad..bf7f7f5 100644 --- a/vignettes/recoding.rmd +++ b/vignettes/recoding.rmd @@ -35,6 +35,8 @@ library(mde) ``` + + # Recoding as NA * `recode_as_na` @@ -377,7 +379,20 @@ head(dict_recode(airquality, use_func="recode_na_as", pattern_type="starts_with", values = c(520,42))) ``` +# General Recoding + + +* For general recoding of values, one can use `recode_as_value` for +example as shown below + + +```{r} + +head(recode_as_value(airquality, value=c(67,118),replacement=NA, + pattern_type="starts_with",pattern="S|O")) +``` +The above is a more general function that can do what the other functions do and may be more useful for development purposes. ---# Drops 42 rows head(drop_row_if(airquality, type="percent", value=16, sign="gteq", as_percent=TRUE)) @@ -621,7 +631,7 @@
Dropping NAs
+head(drop_na_at(airquality,pattern_type = "starts_with","O")) #> Ozone #> 1 41 @@ -634,7 +644,7 @@
Dropping NAs
+test2 <- data.frame(ID= c("A","A","B","A","B"), Vals = c(4,rep(NA, 4))) drop_all_na(test2, grouping_cols="ID") #> # A tibble: 3 x 2 @@ -644,7 +654,7 @@
Dropping NAs#> 2 A NA #> 3 A NA
Alternatively, we can drop groups where all variables are all NA.
-+@@ -98,16 +99,18 @@test2 <- data.frame(ID= c("A","A","B","A","B"), Vals = rep(NA, 5)) head(drop_all_na(test, grouping_cols = "ID")) @@ -659,7 +669,7 @@
Dropping NAs
+head(dict_recode(airquality, use_func="recode_na_as", patterns = c("solar", "ozone"), pattern_type="starts_with", values = c(520,42))) diff --git a/docs/news/index.html b/docs/news/index.html index 326e64e..9830116 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -73,6 +73,7 @@
.mde 0.3.2dplyr::select
get_na_means
andpercent_missing
now supportPOSIXct
.+
na_counts
andpercent_na
are new vector focused functions to allowget
tingna_counts
and percent missingness for objects of classes likePOSIXct
.include a new argument to reset rownames in
na_summary
. Fixes
#33.mde 0.3.12021-08-17
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index c6c4568..c0a83e1 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -5,7 +5,7 @@ articles: mde_vignette: mde_vignette.html missingness: missingness.html recoding: recoding.html -last_built: 2022-01-19T20:43Z +last_built: 2022-01-31T20:54Z urls: reference: https://nelson-gon.github.io/mde/reference article: https://nelson-gon.github.io/mde/articles diff --git a/docs/reference/na_summary.html b/docs/reference/na_summary.html index bda6e96..fc6350e 100644 --- a/docs/reference/na_summary.html +++ b/docs/reference/na_summary.html @@ -78,7 +78,8 @@An all-in-one missingness report
pattern = NULL, pattern_type = NULL, regex_kind = "exclusion", - round_to = NULL + round_to = NULL, + reset_rownames = FALSE )Arguments
A character vector indicating columns to exclude when returning results.
pattern -Pattern to use for exclusion or inclusion. +
Pattern to use for exclusion or inclusion. column inclusion criteria.
pattern_type A regular expression type. One of "starts_with", "contains", or "regex". Defaults to NULL. Only use for selective inclusion.
regex_kind -One of inclusion or exclusion. Defaults to exclusion to exclude +
One of inclusion or exclusion. Defaults to exclusion to exclude columns using regular expressions.
round_to + Number of places to round to. Defaults to user digits option.
reset_rownames +Should the rownames be reset in the output? Defaults to FALSE.
@@ -123,6 +126,8 @@Examples
# grouping test2 <- data.frame(ID= c("A","A","B","A","B"),Vals = c(rep(NA,4),"No"), ID2 = c("E","E","D","E","D")) +df <- data.frame(A=1:5,B=c(NA,NA,25,24,53), C=c(NA,1,2,3,4)) + na_summary(test2,grouping_cols = c("ID","ID2")) #> # A tibble: 2 x 7 #> ID ID2 variable missing complete percent_complete percent_missing @@ -147,17 +152,17 @@Examples
#> 5 Temp 0 153 100.00000 0.000000 #> 6 Wind 0 153 100.00000 0.000000 # Include only via a regular expression -na_summary(mtcars, pattern_type = "contains", +na_summary(mtcars, pattern_type = "contains", pattern = "mpg|disp|wt", regex_kind = "inclusion") #> variable missing complete percent_complete percent_missing #> 1 disp 0 32 100 0 #> 2 mpg 0 32 100 0 #> 3 wt 0 32 100 0 -na_summary(airquality, pattern_type = "starts_with", +na_summary(airquality, pattern_type = "starts_with", pattern = "ozone", regex_kind = "inclusion") #> variable missing complete percent_complete percent_missing #> 1 Ozone 37 116 75.81699 24.18301 -# exclusion via a regex +# exclusion via a regex na_summary(airquality, pattern_type = "starts_with", pattern = "oz|Sol", regex_kind = "exclusion") #> variable missing complete percent_complete percent_missing @@ -165,6 +170,12 @@Examples
#> 2 Month 0 153 100 0 #> 3 Temp 0 153 100 0 #> 4 Wind 0 153 100 0 +# reset rownames when sorting by variable +na_summary(df,sort_by="variable",descending=TRUE, reset_rownames = TRUE) +#> variable missing complete percent_complete percent_missing +#> 1 C 1 4 80 20 +#> 2 B 2 3 60 40 +#> 3 A 0 5 100 0