Merge branch 'main' into add-degf-option

gergness · Jul 14, 2024 · e676e2e · e676e2e
2 parents 003d4fb + 43d2072
commit e676e2e
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -4,7 +4,7 @@ Title: 'dplyr'-Like Syntax for Summary Statistics of Survey Data
 Description: Use piping, verbs like 'group_by' and 'summarize', and other
     'dplyr' inspired syntactic style when calculating summary statistics on survey
     data using functions from the 'survey' package.
-Version: 1.2.0
+Version: 1.2.0.9000
 Date: 2023-02-20
 Authors@R: c(person("Greg", "Freedman Ellis", email = "[email protected]", role = c("aut", "cre")),
   person("Thomas", "Lumley", role = "ctb"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,4 @@
-# (development)
+# srvyr (development version)
 * `as_survey_rep()` now has an argument `degf`, corresponding to the same argument in the survey function `svrepdesign()`. This argument can be useful for large data sets, since specifying a value for `degf` avoids a calculation which can be slow for very large data sets.
 
 # srvyr 1.2.0

diff --git a/R/srvyr.r b/R/srvyr.r
@@ -46,6 +46,5 @@
 #' Within summarise, you can also use \code{\link{unweighted}}, which calculates
 #' a function without taking into consideration the survey weighting.
 #'
-#' @docType package
 #' @name srvyr
-NULL
+"_PACKAGE"
diff --git a/R/subset_svy_vars.R b/R/subset_svy_vars.R
@@ -12,7 +12,11 @@ subset_svy_vars.survey.design2 <- function(x, ..., .preserve = FALSE) {
   if (is.calibrated(x) || is.pps(x)){
     ## Set weights to zero: no memory saving possible
     ## Will always be numeric because srvyr's construction
-    x$prob[-row_numbers] <- Inf
+    if (length(row_numbers) == 0) {
+      x$prob <- rep(Inf, length(x$prob))
+    } else {
+      x$prob[-row_numbers] <- Inf
+    }
 
     index <- is.finite(x$prob)
     psu <- !duplicated(x$cluster[index, 1])
@@ -68,9 +72,18 @@ subset_svy_vars.twophase2 <- function(x, ..., .preserve = FALSE) {
 
   ## Set weights to zero:  don't try to save memory
   ## Will always have numeric because of srvyr's structure
-  x$prob[-row_numbers] <- Inf
-  x$phase2$prob[-row_numbers] <- Inf
-  x$dcheck <- lapply(x$dcheck, function(m) {m[-row_numbers, -row_numbers] <- 0; m})
+  if (length(row_numbers) == 0) {
+    x$prob <- rep(Inf, length(x$prob))
+    x$phase2$prob <- rep(Inf, length(x$phase2$prob))
+    x$dcheck <- lapply(x$dcheck, function(m) {
+      m[seq_len(nrow(m)), seq_len(ncol(m))] <- 0
+      m
+    })
+  } else {
+    x$prob[-row_numbers] <- Inf
+    x$phase2$prob[-row_numbers] <- Inf
+    x$dcheck <- lapply(x$dcheck, function(m) {m[-row_numbers, -row_numbers] <- 0; m})
+  }
 
   index <- is.finite(x$prob)
   psu <- !duplicated(x$phase2$cluster[index, 1])

diff --git a/R/survey_statistics_helpers.R b/R/survey_statistics_helpers.R
@@ -43,7 +43,7 @@ set_survey_vars <- function(
     out$phase1$sample$variables[[name]] <- x
   } else {
     if (!add) {
-      out$variables <- select(out$variables, dplyr::one_of(group_vars(out)))
+      out$variables <- out$variables[, group_vars(out), drop = FALSE]
     }
     out$variables[[name]] <- x
   }
@@ -83,7 +83,7 @@ get_var_est <- function(
       se <- survey::SE(stat)
       # Needed for grouped quantile
       if (!inherits(se, "data.frame")) {
-        se <- data.frame(matrix(se, ncol = out_width))
+        se <- as.data.frame(se)
       }
       names(se) <- "_se"
       se
@@ -122,7 +122,7 @@ get_var_est <- function(
     }
   })
 
-  coef <- data.frame(matrix(coef(stat), ncol = out_width))
+  coef <- as.data.frame(unclass(coef(stat)))
   names(coef) <- "coef"
   out <- c(list(coef), out)
 
@@ -136,7 +136,7 @@ get_var_est <- function(
     out <- c(out, list(deff))
   }
 
-  as_srvyr_result_df(dplyr::bind_cols(out))
+  as_srvyr_result_df(do.call(cbind, out))
 }
 
 # Largely the same as get_var_est(), but need to handle the fact that there can be

diff --git a/man/srvyr.Rd b/man/srvyr.Rd
diff --git a/tests/testthat/test_survey_statistics.r b/tests/testthat/test_survey_statistics.r
@@ -729,6 +729,8 @@ test_that("unweighted allows named arguments",  {
 
 test_that(
   "unweighted works with filtered data in calibrated or PPS designs", {
+
+    # First check for calibrated designs
     data(api, package = "survey")
     dclus1 <- as_survey_design(apiclus1, id = dnum, weights = pw, fpc = fpc)
 
@@ -739,6 +741,7 @@ test_that(
                          sample.margins = list(~stype,~sch.wide),
                          population.margins = list(pop.types, pop.schwide))
 
+    # Check when filtering returns at least one row
     out_calib <- raked_design %>%
       filter(sch.wide == "Yes") %>%
       group_by(stype) %>%
@@ -752,11 +755,28 @@ test_that(
     expect_equal(out_calib[['sample_size']],
                  out_noncalib[['sample_size']])
 
+    # Check when filtering returns zero rows
+
+    out_calib <- raked_design %>%
+      filter(sch.wide == "Fake Category") %>%
+      summarize(sample_size = unweighted(n()))
+
+    out_noncalib <- dclus1 %>%
+      filter(sch.wide == "Fake Category") %>%
+      summarize(sample_size = unweighted(n()))
+
+    expect_equal(out_calib[['sample_size']],
+                 expected = 0)
+    expect_equal(out_noncalib[['sample_size']],
+                 expected = 0)
+
+    # Next check for PPS design
     data(election, package = "survey")
 
     non_pps_design <- as_survey_design(election_pps, id = 1)
     pps_design <- as_survey_design(election_pps, id = 1, fpc = p, pps = "brewer")
 
+    # Check correct results when filtering returns at least one row
     out_nonpps <- non_pps_design %>%
       filter(County == "Los Angeles") %>%
       summarize(n_rows = unweighted(n()))
@@ -767,6 +787,20 @@ test_that(
 
     expect_equal(out_pps[['n_rows']],
                  out_nonpps[['n_rows']])
+
+    # Check correct results when filtering returns zero rows
+    out_nonpps <- non_pps_design %>%
+      filter(County == "Fake Category") %>%
+      summarize(n_rows = unweighted(n()))
+
+    out_pps <- pps_design %>%
+      filter(County == "Fake Category") %>%
+      summarize(n_rows = unweighted(n()))
+
+    expect_equal(out_pps[['n_rows']],
+                 expected = 0)
+    expect_equal(out_nonpps[['n_rows']],
+                 expected = 0)
   }
 )