closes #2

It turns out I already had pre-processing code to handle the contrast error for factors with one level, but it was buggy. I updated this code and also consolidated it into a helper function. Also added a testthat test to make sure this doesn't break in the future.
dkyleward · Sep 9, 2019 · a95b6c6 · a95b6c6
1 parent f4ac3a0
commit a95b6c6
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 52 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # ipfr (development version)
 
+  * Fixed bug when marginal target only had 1 category (#2)
+
 # ipfr 1.0.0 (2019-08-21)
 
   * First release of the package.
diff --git a/R/ipu.R b/R/ipu.R
@@ -137,48 +137,21 @@ ipu <- function(primary_seed, primary_targets,
   # to be used as needed.
   geo_equiv <- primary_seed %>%
     dplyr::select(dplyr::starts_with("geo_"), primary_id, "weight")
-  primary_seed_mod <- primary_seed %>%
-    dplyr::select(-dplyr::starts_with("geo_"))
-
-  # Remove any fields that aren't in the target list and change the ones
-  # that are to factors.
-  col_names <- names(primary_targets)
-  primary_seed_mod <- primary_seed_mod %>%
-    # Keep only the fields of interest (marginal columns and id)
-    dplyr::select(dplyr::one_of(c(col_names, primary_id))) %>%
-    # Convert to factors and then to dummy columns if the column has more
-    # than one category.
-    dplyr::mutate_at(
-      .vars = col_names,
-      .funs = list(~as.factor(.))
-    )
-  # If one of the columns has only one value, it cannot be a factor. The name
-  # must also be changed to match what the rest will be after one-hot encoding.
-  for (name in col_names){
-    if (length(unique(primary_seed_mod[[name]])) == 1) {
-      # unfactor
-      primary_seed_mod[[name]] <- type.convert(as.character(primary_seed_mod[[name]]))
-      # change name
-      value = primary_seed_mod[[name]][1]
-      new_name <- paste0(name, ".", value)
-      names(primary_seed_mod)[names(primary_seed_mod) == name] <- new_name
-    }
-  }
-  # Use one-hot encoding to convert the remaining factor fields to dummies
-  primary_seed_mod <- primary_seed_mod %>%
-    mlr::createDummyFeatures()
+  # primary_seed_mod <- primary_seed %>%
+  #   dplyr::select(-dplyr::starts_with("geo_"))
+
+  # Process the seed table into dummy variables (one-hot encoding)
+  marginal_columns <- names(primary_targets)
+  primary_seed_mod <- process_seed_table(
+    primary_seed, primary_id, marginal_columns
+  )
 
   if (!is.null(secondary_seed)) {
     # Modify the person seed table the same way, but sum by primary ID
-    col_names <- names(secondary_targets_mod)
-    secondary_seed_mod <- secondary_seed %>%
-      # Keep only the fields of interest
-      dplyr::select(dplyr::one_of(col_names), primary_id) %>%
-      dplyr::mutate_at(
-        .vars = col_names,
-        .funs = list(~as.factor(.))
-      ) %>%
-      mlr::createDummyFeatures() %>%
+    marginal_columns <- names(secondary_targets_mod)
+    secondary_seed_mod <- process_seed_table(
+      secondary_seed, primary_id, marginal_columns
+    ) %>%
       dplyr::group_by(!!as.name(primary_id)) %>%
       dplyr::summarize_all(
         .funs = sum
@@ -893,4 +866,46 @@ ipu_matrix <- function(mtx, row_targets, column_targets, ...) {
   return(final)
 }
 
+#' Convert a seed table into one-hot encoded dummy columns
+#' 
+#' Helper for \code{ipu()}. Drops \code{geo_} columns and keeps only the primary
+#' ID and marginal columns, converting each marginal column to a factor. A
+#' marginal column with a single unique value is renamed to \code{name.value}
+#' and set to 1 before \code{mlr::createDummyFeatures()} encodes the rest.
+#' 
+#' @param df The seed table (\code{data.frame}) passed in by \code{ipu()}.
+#' @param primary_id The name of the primary ID column.
+#' @param marginal_columns Vector of column names in the seed table that
+#'   have matching targets.
+#' @return The result of \code{mlr::createDummyFeatures()} on the processed table.
+#' @keywords internal
 
+process_seed_table <- function(df, primary_id, marginal_columns){
+  df <- df %>%
+    dplyr::select(-dplyr::starts_with("geo_")) %>%
+    dplyr::select(dplyr::one_of(c(marginal_columns, primary_id))) %>%
+    dplyr::mutate_at(
+      .vars = marginal_columns,
+      .funs = list(~as.factor(.))
+    )
+
+  # Columns with a single unique value are encoded by hand, not as factors
+  for (name in marginal_columns){
+    if (length(unique(df[[name]])) == 1) {
+      # Convert the factor back to its underlying (non-factor) type
+      df[[name]] <- type.convert(
+        as.character(df[[name]]),
+        as.is = TRUE
+      )
+      # Rename to "<name>.<value>", the name one-hot encoding would produce
+      value = df[[name]][1]
+      new_name <- paste0(name, ".", value)
+      names(df)[names(df) == name] <- new_name
+      # Every row belongs to the single category, so the dummy is always 1
+      df[[new_name]] <- 1
+    }
+  }
+  df <- df  %>%
+    mlr::createDummyFeatures()
+  return(df)
+}
diff --git a/README.md b/README.md
@@ -7,8 +7,9 @@ status](https://www.r-pkg.org/badges/version/ipfr)](https://cran.r-project.org/p
 
 # ipfr
 
-A package for iterative proportional fitting on multiple
-marginal distributions in R.
+A package for iterative proportional fitting on multiple marginal distributions
+in R. The goal of this package is to make survey raking, matrix balancing, and
+population synthesis easier.
 
 ## Installation
 Install the latest official version from CRAN:
@@ -26,7 +27,7 @@ install_github("dkyleward/ipfr", build_vignettes = TRUE)
 
 ## Basic Usage
 
-(More in the vignettes) 
+(See vignettes at the bottom for advanced topics.)
 
 A basic matrix balance task:
 

diff --git a/man/ipf.Rd b/man/ipf.Rd
diff --git a/man/ipu.Rd b/man/ipu.Rd
diff --git a/man/ipu_nr.Rd b/man/ipu_nr.Rd
diff --git a/man/process_seed_table.Rd b/man/process_seed_table.Rd
diff --git a/man/setup_arizona.Rd b/man/setup_arizona.Rd
diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R
@@ -42,6 +42,24 @@ test_that("basic ipu works", {
   )
 })
 
+test_that("single marginal targets work", {
+  result <- setup_arizona()
+  hh_seed <- result$hh_seed
+  hh_targets <- result$hh_targets
+  per_seed <- result$per_seed
+  per_targets <- result$per_targets
+
+  # Give pertype a single category ("any"), as if only a regional person count is known
+  per_seed <- per_seed %>%
+    mutate(pertype = "any")
+  per_targets$pertype <- tibble(
+    any = 260
+  )
+
+  result <- ipu(hh_seed, hh_targets, per_seed, per_targets, max_iterations = 1)
+  expect_equal(result$secondary_comp$category[[1]], "pertype_any")
+})
+
 test_that("weight constraint works", {
   result <- setup_arizona()
   hh_seed <- result$hh_seed