From a95b6c6852b1b8ce4e22ec8d1c827a341f98c12e Mon Sep 17 00:00:00 2001
From: Kyle Ward <kyle@caliper.com>
Date: Mon, 9 Sep 2019 11:25:33 -0400
Subject: [PATCH] closes #2

It turns out I already had pre-processing code to handle the contrast
error for factors with one level, but it was buggy. I updated this code
and also consolidated it into a helper function. Also added a testthat
test to make sure this doesn't break in the future.
---
 NEWS.md                      |  2 +
 R/ipu.R                      | 93 +++++++++++++++++++++---------------
 README.md                    |  7 +--
 man/ipf.Rd                   |  2 +-
 man/ipu.Rd                   |  4 +-
 man/ipu_nr.Rd                |  6 +--
 man/process_seed_table.Rd    | 24 ++++++++++
 man/setup_arizona.Rd         |  8 ++--
 tests/testthat/test-basics.R | 18 +++++++
 9 files changed, 112 insertions(+), 52 deletions(-)
 create mode 100644 man/process_seed_table.Rd

diff --git a/NEWS.md b/NEWS.md
index 1e7f8e6..0bf66b7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # ipfr (development version)
 
+  * Fixed bug when marginal target only had 1 category (#2)
+
 # ipfr 1.0.0 (2019-08-21)
 
   * First release of the package.
\ No newline at end of file
diff --git a/R/ipu.R b/R/ipu.R
index cfcc0b1..f676692 100644
--- a/R/ipu.R
+++ b/R/ipu.R
@@ -137,48 +137,21 @@ ipu <- function(primary_seed, primary_targets,
   # to be used as needed.
   geo_equiv <- primary_seed %>%
     dplyr::select(dplyr::starts_with("geo_"), primary_id, "weight")
-  primary_seed_mod <- primary_seed %>%
-    dplyr::select(-dplyr::starts_with("geo_"))
-
-  # Remove any fields that aren't in the target list and change the ones
-  # that are to factors.
-  col_names <- names(primary_targets)
-  primary_seed_mod <- primary_seed_mod %>%
-    # Keep only the fields of interest (marginal columns and id)
-    dplyr::select(dplyr::one_of(c(col_names, primary_id))) %>%
-    # Convert to factors and then to dummy columns if the column has more
-    # than one category.
-    dplyr::mutate_at(
-      .vars = col_names,
-      .funs = list(~as.factor(.))
-    )
-  # If one of the columns has only one value, it cannot be a factor. The name
-  # must also be changed to match what the rest will be after one-hot encoding.
-  for (name in col_names){
-    if (length(unique(primary_seed_mod[[name]])) == 1) {
-      # unfactor
-      primary_seed_mod[[name]] <- type.convert(as.character(primary_seed_mod[[name]]))
-      # change name
-      value = primary_seed_mod[[name]][1]
-      new_name <- paste0(name, ".", value)
-      names(primary_seed_mod)[names(primary_seed_mod) == name] <- new_name
-    }
-  }
-  # Use one-hot encoding to convert the remaining factor fields to dummies
-  primary_seed_mod <- primary_seed_mod %>%
-    mlr::createDummyFeatures()
+  # primary_seed_mod <- primary_seed %>%
+  #   dplyr::select(-dplyr::starts_with("geo_"))
+
+  # Process the seed table into dummy variables (one-hot encoding)
+  marginal_columns <- names(primary_targets)
+  primary_seed_mod <- process_seed_table(
+    primary_seed, primary_id, marginal_columns
+  )
   
   if (!is.null(secondary_seed)) {
     # Modify the person seed table the same way, but sum by primary ID
-    col_names <- names(secondary_targets_mod)
-    secondary_seed_mod <- secondary_seed %>%
-      # Keep only the fields of interest
-      dplyr::select(dplyr::one_of(col_names), primary_id) %>%
-      dplyr::mutate_at(
-        .vars = col_names,
-        .funs = list(~as.factor(.))
-      ) %>%
-      mlr::createDummyFeatures() %>%
+    marginal_columns <- names(secondary_targets_mod)
+    secondary_seed_mod <- process_seed_table(
+      secondary_seed, primary_id, marginal_columns
+    ) %>%
       dplyr::group_by(!!as.name(primary_id)) %>%
       dplyr::summarize_all(
         .funs = sum
@@ -893,4 +866,46 @@ ipu_matrix <- function(mtx, row_targets, column_targets, ...) {
   return(final)
 }
 
+#' Helper function to process a seed table
+#' 
+#' Helper for \code{ipu()}. Strips columns from seed table except for the
+#' primary id and marginal column (as reflected in the targets tables). Also
+#' identifies factor columns with one level and processes them before
+#' \code{mlr::createDummyFeatures()} is called.
+#' 
+#' @param df the \code{data.frame} as processed by \code{ipu()} before this
+#'   function is called.
+#' @param primary_id the name of the primary ID column.
+#' @param marginal_columns The vector of column names in the seed table that
+#'   have matching targets.
+#' @keywords internal
 
+process_seed_table <- function(df, primary_id, marginal_columns){
+  df <- df %>%
+    dplyr::select(-dplyr::starts_with("geo_")) %>%
+    dplyr::select(dplyr::one_of(c(marginal_columns, primary_id))) %>%
+    dplyr::mutate_at(
+      .vars = marginal_columns,
+      .funs = list(~as.factor(.))
+    )
+  
+  # handle any factors with only 1 level
+  for (name in marginal_columns){
+    if (length(unique(df[[name]])) == 1) {
+      # unfactor
+      df[[name]] <- type.convert(
+        as.character(df[[name]]),
+        as.is = TRUE
+      )
+      # change name
+      value = df[[name]][1]
+      new_name <- paste0(name, ".", value)
+      names(df)[names(df) == name] <- new_name
+      # change value
+      df[[new_name]] <- 1
+    }
+  }
+  df <- df  %>%
+    mlr::createDummyFeatures()
+  return(df)
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index e5c91d0..a1e0c41 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,9 @@ status](https://www.r-pkg.org/badges/version/ipfr)](https://cran.r-project.org/p
 
 # ipfr
 
-A package for iterative proportional fitting on multiple
-marginal distributions in R.
+A package for iterative proportional fitting on multiple marginal distributions
+in R. The goal of this package is to make survey raking, matrix balancing, and
+population synthesis easier.
 
 ## Installation
 Install the latest official version from CRAN:
@@ -26,7 +27,7 @@ install_github("dkyleward/ipfr", build_vignettes = TRUE)
 
 ## Basic Usage
 
-(More in the vignettes) 
+(See vignettes at the bottom for advanced topics.)
 
 A basic matrix balance task:
 
diff --git a/man/ipf.Rd b/man/ipf.Rd
index 89a12df..4e9ea8d 100644
--- a/man/ipf.Rd
+++ b/man/ipf.Rd
@@ -5,7 +5,7 @@
 \title{Re-weight a Seed Table to Marginal Controls}
 \usage{
 ipf(seed, targets, relative_gap = 0.01, absolute_gap = 1,
-  max_iterations = 50, min_weight = 0.0001, verbose = FALSE)
+  max_iterations = 50, min_weight = 1e-04, verbose = FALSE)
 }
 \arguments{
 \item{seed}{A \code{data frame} including a \code{weight} field and necessary
diff --git a/man/ipu.Rd b/man/ipu.Rd
index 591ab31..cdf695c 100644
--- a/man/ipu.Rd
+++ b/man/ipu.Rd
@@ -7,8 +7,8 @@
 ipu(primary_seed, primary_targets, secondary_seed = NULL,
   secondary_targets = NULL, primary_id = "id",
   secondary_importance = 1, relative_gap = 0.01,
-  max_iterations = 100, absolute_diff = 10, weight_floor = 0.00001,
-  verbose = FALSE, max_ratio = 10000, min_ratio = 0.0001)
+  max_iterations = 100, absolute_diff = 10, weight_floor = 1e-05,
+  verbose = FALSE, max_ratio = 10000, min_ratio = 1e-04)
 }
 \arguments{
 \item{primary_seed}{In population synthesis or household survey expansion, 
diff --git a/man/ipu_nr.Rd b/man/ipu_nr.Rd
index a0a441f..de33ff6 100644
--- a/man/ipu_nr.Rd
+++ b/man/ipu_nr.Rd
@@ -5,10 +5,10 @@
 \title{Iterative Proportional Updating (Newton-Raphson)}
 \usage{
 ipu_nr(primary_seed, primary_targets, secondary_seed = NULL,
-  secondary_targets = NULL, target_priority = 10000000,
+  secondary_targets = NULL, target_priority = 1e+07,
   relative_gap = 0.01, max_iterations = 100, absolute_diff = 10,
-  weight_floor = 0.00001, verbose = FALSE, max_ratio = 10000,
-  min_ratio = 0.0001)
+  weight_floor = 1e-05, verbose = FALSE, max_ratio = 10000,
+  min_ratio = 1e-04)
 }
 \arguments{
 \item{primary_seed}{In population synthesis or household survey expansion, 
diff --git a/man/process_seed_table.Rd b/man/process_seed_table.Rd
new file mode 100644
index 0000000..70f706e
--- /dev/null
+++ b/man/process_seed_table.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ipu.R
+\name{process_seed_table}
+\alias{process_seed_table}
+\title{Helper function to process a seed table}
+\usage{
+process_seed_table(df, primary_id, marginal_columns)
+}
+\arguments{
+\item{df}{the \code{data.frame} as processed by \code{ipu()} before this
+function is called.}
+
+\item{primary_id}{the name of the primary ID column.}
+
+\item{marginal_columns}{The vector of column names in the seed table that
+have matching targets.}
+}
+\description{
+Helper for \code{ipu()}. Strips columns from seed table except for the
+primary id and marginal column (as reflected in the targets tables). Also
+identifies factor columns with one level and processes them before
+\code{mlr::createDummyFeatures()} is called.
+}
+\keyword{internal}
diff --git a/man/setup_arizona.Rd b/man/setup_arizona.Rd
index 112b321..9f68a43 100644
--- a/man/setup_arizona.Rd
+++ b/man/setup_arizona.Rd
@@ -2,18 +2,18 @@
 % Please edit documentation in R/utils.R
 \name{setup_arizona}
 \alias{setup_arizona}
-\title{Setup the ASU example}
+\title{Create the ASU example}
 \usage{
 setup_arizona()
 }
 \value{
-When run, it creates four variables in the calling environment:
+A list of four variables:
   hh_seed, hh_targets, per_seed, and per_targets. These can be used directly
   by \code{\link{ipu}}.
 }
 \description{
-This code sets up the Arizona example IPU problem and avoids repeating this
-code in multiple places throughout the package (vignettes/tests).
+Sets up the Arizona example IPU problem and is used in multiple places
+throughout the package (vignettes/tests).
 }
 \examples{
 setup_arizona()
diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R
index deaed3d..4ca9da8 100644
--- a/tests/testthat/test-basics.R
+++ b/tests/testthat/test-basics.R
@@ -42,6 +42,24 @@ test_that("basic ipu works", {
   )
 })
 
+test_that("single marginal targets work", {
+  result <- setup_arizona()
+  hh_seed <- result$hh_seed
+  hh_targets <- result$hh_targets
+  per_seed <- result$per_seed
+  per_targets <- result$per_targets
+  
+  # Modify if only a regional person count is known
+  per_seed <- per_seed %>%
+    mutate(pertype = "any")
+  per_targets$pertype <- tibble(
+    any = 260
+  )
+  
+  result <- ipu(hh_seed, hh_targets, per_seed, per_targets, max_iterations = 1)
+  expect_equal(result$secondary_comp$category[[1]], "pertype_any")
+})
+
 test_that("weight constraint works", {
   result <- setup_arizona()
   hh_seed <- result$hh_seed