From a95b6c6852b1b8ce4e22ec8d1c827a341f98c12e Mon Sep 17 00:00:00 2001 From: Kyle Ward Date: Mon, 9 Sep 2019 11:25:33 -0400 Subject: [PATCH] closes #2 It turns out I already had pre-processing code to handle the contrast error for factors with one level, but it was buggy. I updated this code and also consolidated it into a helper function. Also added a testthat test to make sure this doesn't break in the future. --- NEWS.md | 2 + R/ipu.R | 93 +++++++++++++++++++++--------------- README.md | 7 +-- man/ipf.Rd | 2 +- man/ipu.Rd | 4 +- man/ipu_nr.Rd | 6 +-- man/process_seed_table.Rd | 24 ++++++++++ man/setup_arizona.Rd | 8 ++-- tests/testthat/test-basics.R | 18 +++++++ 9 files changed, 112 insertions(+), 52 deletions(-) create mode 100644 man/process_seed_table.Rd diff --git a/NEWS.md b/NEWS.md index 1e7f8e6..0bf66b7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # ipfr (development version) + * Fixed bug when marginal target only had 1 category (#2) + # ipfr 1.0.0 (2019-08-21) * First release of the package. \ No newline at end of file diff --git a/R/ipu.R b/R/ipu.R index cfcc0b1..f676692 100644 --- a/R/ipu.R +++ b/R/ipu.R @@ -137,48 +137,21 @@ ipu <- function(primary_seed, primary_targets, # to be used as needed. geo_equiv <- primary_seed %>% dplyr::select(dplyr::starts_with("geo_"), primary_id, "weight") - primary_seed_mod <- primary_seed %>% - dplyr::select(-dplyr::starts_with("geo_")) - - # Remove any fields that aren't in the target list and change the ones - # that are to factors. - col_names <- names(primary_targets) - primary_seed_mod <- primary_seed_mod %>% - # Keep only the fields of interest (marginal columns and id) - dplyr::select(dplyr::one_of(c(col_names, primary_id))) %>% - # Convert to factors and then to dummy columns if the column has more - # than one category. - dplyr::mutate_at( - .vars = col_names, - .funs = list(~as.factor(.)) - ) - # If one of the columns has only one value, it cannot be a factor. The name - # must also be changed to match what the rest will be after one-hot encoding. - for (name in col_names){ - if (length(unique(primary_seed_mod[[name]])) == 1) { - # unfactor - primary_seed_mod[[name]] <- type.convert(as.character(primary_seed_mod[[name]])) - # change name - value = primary_seed_mod[[name]][1] - new_name <- paste0(name, ".", value) - names(primary_seed_mod)[names(primary_seed_mod) == name] <- new_name - } - } - # Use one-hot encoding to convert the remaining factor fields to dummies - primary_seed_mod <- primary_seed_mod %>% - mlr::createDummyFeatures() + # primary_seed_mod <- primary_seed %>% + # dplyr::select(-dplyr::starts_with("geo_")) + + # Process the seed table into dummy variables (one-hot encoding) + marginal_columns <- names(primary_targets) + primary_seed_mod <- process_seed_table( + primary_seed, primary_id, marginal_columns + ) if (!is.null(secondary_seed)) { # Modify the person seed table the same way, but sum by primary ID - col_names <- names(secondary_targets_mod) - secondary_seed_mod <- secondary_seed %>% - # Keep only the fields of interest - dplyr::select(dplyr::one_of(col_names), primary_id) %>% - dplyr::mutate_at( - .vars = col_names, - .funs = list(~as.factor(.)) - ) %>% - mlr::createDummyFeatures() %>% + marginal_columns <- names(secondary_targets_mod) + secondary_seed_mod <- process_seed_table( + secondary_seed, primary_id, marginal_columns + ) %>% dplyr::group_by(!!as.name(primary_id)) %>% dplyr::summarize_all( .funs = sum @@ -893,4 +866,46 @@ ipu_matrix <- function(mtx, row_targets, column_targets, ...) { return(final) } +#' Helper function to process a seed table +#' +#' Helper for \code{ipu()}. Strips columns from seed table except for the +#' primary id and marginal column (as reflected in the targets tables). Also +#' identifies factor columns with one level and processes them before +#' \code{mlr::createDummyFeatures()} is called. +#' +#' @param df the \code{data.frame} as processed by \code{ipu()} before this +#' function is called. +#' @param primary_id the name of the primary ID column. +#' @param marginal_columns The vector of column names in the seed table that +#' have matching targets. +#' @keywords internal +process_seed_table <- function(df, primary_id, marginal_columns){ + df <- df %>% + dplyr::select(-dplyr::starts_with("geo_")) %>% + dplyr::select(dplyr::one_of(c(marginal_columns, primary_id))) %>% + dplyr::mutate_at( + .vars = marginal_columns, + .funs = list(~as.factor(.)) + ) + + # handle any factors with only 1 level + for (name in marginal_columns){ + if (length(unique(df[[name]])) == 1) { + # unfactor + df[[name]] <- type.convert( + as.character(df[[name]]), + as.is = TRUE + ) + # change name + value = df[[name]][1] + new_name <- paste0(name, ".", value) + names(df)[names(df) == name] <- new_name + # change value + df[[new_name]] <- 1 + } + } + df <- df %>% + mlr::createDummyFeatures() + return(df) +} \ No newline at end of file diff --git a/README.md b/README.md index e5c91d0..a1e0c41 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,9 @@ status](https://www.r-pkg.org/badges/version/ipfr)](https://cran.r-project.org/p # ipfr -A package for iterative proportional fitting on multiple -marginal distributions in R. +A package for iterative proportional fitting on multiple marginal distributions +in R. The goal of this package is to make survey raking, matrix balancing, and +population synthesis easier. ## Installation Install the latest official version from CRAN: @@ -26,7 +27,7 @@ install_github("dkyleward/ipfr", build_vignettes = TRUE) ## Basic Usage -(More in the vignettes) +(See vignettes at the bottom for advanced topics.) A basic matrix balance task: diff --git a/man/ipf.Rd b/man/ipf.Rd index 89a12df..4e9ea8d 100644 --- a/man/ipf.Rd +++ b/man/ipf.Rd @@ -5,7 +5,7 @@ \title{Re-weight a Seed Table to Marginal Controls} \usage{ ipf(seed, targets, relative_gap = 0.01, absolute_gap = 1, - max_iterations = 50, min_weight = 0.0001, verbose = FALSE) + max_iterations = 50, min_weight = 1e-04, verbose = FALSE) } \arguments{ \item{seed}{A \code{data frame} including a \code{weight} field and necessary diff --git a/man/ipu.Rd b/man/ipu.Rd index 591ab31..cdf695c 100644 --- a/man/ipu.Rd +++ b/man/ipu.Rd @@ -7,8 +7,8 @@ ipu(primary_seed, primary_targets, secondary_seed = NULL, secondary_targets = NULL, primary_id = "id", secondary_importance = 1, relative_gap = 0.01, - max_iterations = 100, absolute_diff = 10, weight_floor = 0.00001, - verbose = FALSE, max_ratio = 10000, min_ratio = 0.0001) + max_iterations = 100, absolute_diff = 10, weight_floor = 1e-05, + verbose = FALSE, max_ratio = 10000, min_ratio = 1e-04) } \arguments{ \item{primary_seed}{In population synthesis or household survey expansion, diff --git a/man/ipu_nr.Rd b/man/ipu_nr.Rd index a0a441f..de33ff6 100644 --- a/man/ipu_nr.Rd +++ b/man/ipu_nr.Rd @@ -5,10 +5,10 @@ \title{Iterative Proportional Updating (Newton-Raphson)} \usage{ ipu_nr(primary_seed, primary_targets, secondary_seed = NULL, - secondary_targets = NULL, target_priority = 10000000, + secondary_targets = NULL, target_priority = 1e+07, relative_gap = 0.01, max_iterations = 100, absolute_diff = 10, - weight_floor = 0.00001, verbose = FALSE, max_ratio = 10000, - min_ratio = 0.0001) + weight_floor = 1e-05, verbose = FALSE, max_ratio = 10000, + min_ratio = 1e-04) } \arguments{ \item{primary_seed}{In population synthesis or household survey expansion, diff --git a/man/process_seed_table.Rd b/man/process_seed_table.Rd new file mode 100644 index 0000000..70f706e --- /dev/null +++ b/man/process_seed_table.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ipu.R +\name{process_seed_table} +\alias{process_seed_table} +\title{Helper function to process a seed table} +\usage{ +process_seed_table(df, primary_id, marginal_columns) +} +\arguments{ +\item{df}{the \code{data.frame} as processed by \code{ipu()} before this +function is called.} + +\item{primary_id}{the name of the primary ID column.} + +\item{marginal_columns}{The vector of column names in the seed table that +have matching targets.} +} +\description{ +Helper for \code{ipu()}. Strips columns from seed table except for the +primary id and marginal column (as reflected in the targets tables). Also +identifies factor columns with one level and processes them before +\code{mlr::createDummyFeatures()} is called. +} +\keyword{internal} diff --git a/man/setup_arizona.Rd b/man/setup_arizona.Rd index 112b321..9f68a43 100644 --- a/man/setup_arizona.Rd +++ b/man/setup_arizona.Rd @@ -2,18 +2,18 @@ % Please edit documentation in R/utils.R \name{setup_arizona} \alias{setup_arizona} -\title{Setup the ASU example} +\title{Create the ASU example} \usage{ setup_arizona() } \value{ -When run, it creates four variables in the calling environment: +A list of four variables: hh_seed, hh_targets, per_seed, and per_targets. These can be used directly by \code{\link{ipu}}. } \description{ -This code sets up the Arizona example IPU problem and avoids repeating this -code in multiple places throughout the package (vignettes/tests). +Sets up the Arizona example IPU problem and is used in multiple places +throughout the package (vignettes/tests). } \examples{ setup_arizona() diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R index deaed3d..4ca9da8 100644 --- a/tests/testthat/test-basics.R +++ b/tests/testthat/test-basics.R @@ -42,6 +42,24 @@ test_that("basic ipu works", { ) }) +test_that("single marginal targets work", { + result <- setup_arizona() + hh_seed <- result$hh_seed + hh_targets <- result$hh_targets + per_seed <- result$per_seed + per_targets <- result$per_targets + + # Modify if only a regional person count is known + per_seed <- per_seed %>% + mutate(pertype = "any") + per_targets$pertype <- tibble( + any = 260 + ) + + result <- ipu(hh_seed, hh_targets, per_seed, per_targets, max_iterations = 1) + expect_equal(result$secondary_comp$category[[1]], "pertype_any") +}) + test_that("weight constraint works", { result <- setup_arizona() hh_seed <- result$hh_seed