Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plot missing data indicator matrix #123

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Imports:
mice,
purrr,
rlang,
scales,
stats,
stringr,
tidyr,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ export(densityplot)
export(ggmice)
export(plot_corr)
export(plot_flux)
export(plot_miss)
export(plot_pattern)
export(plot_pred)
export(plot_trace)
export(plot_variance)
hanneoberman marked this conversation as resolved.
Show resolved Hide resolved
export(stripplot)
export(xyplot)
importFrom(magrittr,"%>%")
Expand Down
125 changes: 125 additions & 0 deletions R/plot_miss.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#' Plot missingness in a dataset
#'
#' @param data An incomplete dataset of class `data.frame` or `matrix`.
#' @param vrb String, vector, or unquoted expression with variable name(s), default is "all".
#' @param grid Logical indicating whether borders should be present between tiles.
#' @param ordered Logical indicating whether rows should be ordered according to their pattern.
#' @param square Logical indicating whether the plot tiles should be squares, defaults to squares.
#'
#' @return An object of class [ggplot2::ggplot].
#'
#' @examples
#' plot_miss(mice::nhanes)
#' @export

plot_miss <-
function(data,
vrb = "all",
grid = FALSE,
square = FALSE,
ordered = FALSE) {
# input processing
if (is.matrix(data) && ncol(data) > 1) {
data <- as.data.frame(data)
}
verify_data(data, df = TRUE)
vrb <- substitute(vrb)
if (vrb[1] == "all") {
vrb <- names(data)
} else {
vrb <- names(dplyr::select(as.data.frame(data), {{vrb}}))
}
if (".x" %in% vrb || ".y" %in% vrb) {
cli::cli_abort(
c(
"The variable names '.x' and '.y' are used internally to produce the missing data pattern plot.",
"i" = "Please exclude or rename your variable(s)."
)
)
}
if (ordered) {
# extract md.pattern matrix
mdpat <- utils::head(mice::md.pattern(data, plot = FALSE), -1)
# save frequency of patterns
freq.pat <- rownames(mdpat) %>%

Check warning on line 44 in R/plot_miss.R

View workflow job for this annotation

GitHub Actions / lint

file=R/plot_miss.R,line=44,col=7,[object_name_linter] Variable and function name style should match snake_case or symbols.
as.numeric()

na.mat <- mdpat %>%

Check warning on line 47 in R/plot_miss.R

View workflow job for this annotation

GitHub Actions / lint

file=R/plot_miss.R,line=47,col=7,[object_name_linter] Variable and function name style should match snake_case or symbols.
as.data.frame() %>%
dplyr::select(-ncol(.data)) %>%
dplyr::mutate(nmis = freq.pat) %>%
tidyr::uncount(nmis)
} else {
# Create missingness indicator matrix
na.mat <-

Check warning on line 54 in R/plot_miss.R

View workflow job for this annotation

GitHub Actions / lint

file=R/plot_miss.R,line=54,col=7,[object_name_linter] Variable and function name style should match snake_case or symbols.
purrr::map_df(data[, vrb], function(y)

Check warning on line 55 in R/plot_miss.R

View workflow job for this annotation

GitHub Actions / lint

file=R/plot_miss.R,line=55,col=36,[brace_linter] Any function spanning multiple lines should use curly braces.
as.numeric(!is.na(y)))

Check warning on line 56 in R/plot_miss.R

View workflow job for this annotation

GitHub Actions / lint

file=R/plot_miss.R,line=56,col=10,[indentation_linter] Indentation should be 24 spaces but is 10 spaces.
}
# extract pattern info
vrb <- colnames(na.mat)
rws <- nrow(na.mat)
cls <- ncol(na.mat)

# transform to long format
long <-
as.data.frame(cbind(.y = 1:rws, na.mat)) %>%
tidyr::pivot_longer(
cols = tidyselect::all_of(vrb),
names_to = "x",
values_to = ".where"
) %>%
dplyr::mutate(.x = as.numeric(factor(
.data$x,
levels = vrb, ordered = TRUE
)),
.where = factor(
.data$.where,
levels = c(0, 1),
labels = c("missing", "observed")
))
gg <-
ggplot2::ggplot(long,
ggplot2::aes(.data$.x,
as.numeric(.data$.y),
fill = .data$.where)) +
ggplot2::scale_fill_manual(values = c(
"observed" = "#006CC2B3",
"missing" = "#B61A51B3"
)) +
ggplot2::scale_alpha_continuous(limits = c(0, 1), guide = "none") +
ggplot2::scale_x_continuous(breaks = 1:cls,
labels = vrb) +
ggplot2::scale_y_reverse(breaks = \(y) {
eb = scales::extended_breaks()(y)
eb[1] = min(long$.y)
eb[length(eb)] = max(long$.y)
eb
}) +
ggplot2::labs(
x = "Column name",
y = "Row number",
fill = "",
alpha = ""
) +
theme_minimice()
# additional arguments
if (grid) {
gg <- gg + ggplot2::geom_tile(color = "black")
} else{
gg <- gg + ggplot2::geom_tile()
}
if (square) {
gg <- gg + ggplot2::coord_fixed(expand = FALSE)
} else {
gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
}
if (ordered) {
gg <- gg +
ggplot2::theme(
axis.text.y = ggplot2::element_blank(),
axis.ticks.y = ggplot2::element_blank()
)
}
return(gg)
}

71 changes: 71 additions & 0 deletions R/plot_variance.R
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 7b107d9

Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#' Plot the scaled between imputation variance for every cell as a heatmap
#'
#' This function plots the cell-level between imputation variance. The function
#' scales the variances column-wise, without centering cf. `base::scale(center = FALSE)`
#' and plots the data image as a heatmap. Darker red cells indicate more variance,
#' lighter cells indicate less variance. White cells represent observed cells or unobserved cells with zero between
#' imputation variance.
#'
#' @param data A package `mice` generated multiply imputed data set of class
#' `mids`. Non-`mids` objects that have not been generated with `mice::mice()`
#' can be converted through a pipeline with `mice::as.mids()`.
#' @param grid Logical indicating whether grid lines should be displayed.
#'
#' @return An object of class `ggplot`.
#' @examples
#' imp <- mice::mice(mice::nhanes, printFlag = FALSE)
#' plot_variance(imp)
#' @export
plot_variance <- function(data, grid = TRUE) {
verify_data(data, imp = TRUE)
if (data$m < 2) {
cli::cli_abort(
c(
"The between imputation variance cannot be computed if there are fewer than two imputations (m < 2).",
"i" = "Please provide an object with 2 or more imputations"
)
)
}
if (grid) {
gridcol <- "black"
} else {
gridcol <- NA
}

gg <- mice::complete(data, "long") %>%
dplyr::mutate(dplyr::across(where(is.factor), as.numeric)) %>%
dplyr::select(-.imp) %>%
dplyr::group_by(.id) %>%
dplyr::summarise(dplyr::across(dplyr::everything(), stats::var)) %>%
dplyr::ungroup() %>%
dplyr::mutate(dplyr::across(.cols = -.id, ~ scale_above_zero(.))) %>%
tidyr::pivot_longer(cols = -.id) %>%
ggplot2::ggplot(ggplot2::aes(name, .id, fill = value)) +
ggplot2::geom_tile(color = gridcol) +
ggplot2::scale_fill_gradient(low = "white", high = mice::mdc(2)) +
ggplot2::labs(
x = "Column name",
y = "Row number",
fill = "Imputation variability*
",
caption = "*scaled cell-level between imputation variance"
) + # "Cell-level between imputation\nvariance (scaled)\n\n"
ggplot2::scale_x_discrete(position = "top", expand = c(0, 0)) +
ggplot2::scale_y_continuous(trans = "reverse", expand = c(0, 0)) +
theme_minimice()

if (!grid) {
gg <-
gg + ggplot2::theme(panel.border = ggplot2::element_rect(fill = NA))
}

# return the ggplot object
return(gg)
}

# function to scale only non-zero values without centering
scale_above_zero <- function(x) {
x <- as.matrix(x)
x[x != 0] <- scale(x[x != 0], center = FALSE)
return(x)
}
2 changes: 1 addition & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
NULL

# suppress undefined global functions or variables note
utils::globalVariables(c(".id", ".imp", ".where", ".id", "where", "name", "value"))
utils::globalVariables(c(".id", ".imp", ".where", ".id", "where", "name", "value", "nmis"))

# Alias a function with `foo <- function(...) pkgB::blah(...)`

Expand Down
28 changes: 28 additions & 0 deletions man/plot_miss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/plot_variance.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions tests/testthat/test-plot_miss.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# create test objects
dat <- mice::nhanes

# tests
test_that("plot_miss produces plot", {
expect_s3_class(plot_miss(dat), "ggplot")
expect_s3_class(plot_miss(dat, grid = TRUE, ordered = TRUE, square = TRUE), "ggplot")
expect_s3_class(plot_miss(cbind(dat, "testvar" = NA)), "ggplot")
})

test_that("plot_miss works with different inputs", {
expect_s3_class(plot_miss(dat, c("age", "bmi")), "ggplot")
expect_s3_class(plot_miss(dat, c(age, bmi)), "ggplot")
expect_s3_class(plot_miss(data.frame(age = dat$age, testvar = NA)), "ggplot")
expect_s3_class(plot_miss(cbind(dat, "with space" = NA)), "ggplot")
})


test_that("plot_miss with incorrect argument(s)", {
expect_s3_class(plot_miss(na.omit(dat)), "ggplot")
expect_error(plot_miss("test"))
expect_error(plot_miss(dat, vrb = "test"))
expect_error(plot_miss(cbind(dat, .x = NA)))
})
Loading