From e631c4c22dea24ce8b19116defd85eed99bd53db Mon Sep 17 00:00:00 2001
From: Lorenzo Gaborini <lorenzo.gaborini@gmail.com>
Date: Fri, 31 May 2019 14:50:30 +0200
Subject: [PATCH] Fix bug in prior elicitation when background item is seen
 only once

---
 DESCRIPTION                                |  2 +-
 NEWS.md                                    |  5 +++
 R/two_level_functions.R                    | 36 +++++++++++++++++-----
 man/inv_pd.Rd                              |  2 +-
 man/samesource_C.Rd                        |  2 +-
 man/two.level.multivariate.calculate.UC.Rd | 10 ++++--
 6 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2331c92..255fa7e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bayessource
 Type: Package
 Title: Bayesian approach to evaluate whether two datasets come from the same population
-Version: 0.3
+Version: 0.3.1
 Authors@R: c(person("Lorenzo", "Gaborini", role = c("aut", "cre"),
                      email = "lorenzo.gaborini@gmail.com"),
               person("Silvia", "Bozza", role = c("ctb", "cph")))
diff --git a/NEWS.md b/NEWS.md
index d940d0b..7e53911 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# bayessource 0.3.1
+
+* Fig bug in `make_priors_and_init` when a background item is seen only once.    
+  Now warns or fails gracefully.
+
 # bayessource 0.3
 
 * Add prior and init elicitation functions.
diff --git a/R/two_level_functions.R b/R/two_level_functions.R
index dfa360c..344f503 100644
--- a/R/two_level_functions.R
+++ b/R/two_level_functions.R
@@ -9,14 +9,16 @@
 #'
 #' Two-level function: item is chosen with the third parameter.
 #'
+#' Items who are observed only once cannot contribute to within covariance estimation.
+#' In that case, either we skip its contribute (default) or fail.
+#'
 #' Multivariate functions supporting paper:
 #' Bozza, A., Taroni, F., Marquis, R., M. Schmittbuhl
 #'
-#' This function is RNG-free.
-#'
 #' @param population matrix or dataframe containing data columns, item column and other columns
 #' @param idx.variables data columns (indexes or character vectors)
 #' @param idx.item item column (index or character vector)
+#' @param singular.action action to take when one of the items is observed once; one of `'skip'` (default) or `'fail'`
 #' @importFrom stats cov
 #' @importFrom utils modifyList
 #' @return list with items:
@@ -25,7 +27,9 @@
 #' - `W` (the within covariance matrix)
 #' - `B` (the between covariance matrix)
 #' @keywords internal
-two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.item) {
+two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.item, singular.action = 'skip') {
+
+   stopifnot(singular.action %in% c('skip', 'fail'))
 
    # Convert to matrix
    mtx_population <- as.matrix(population[, idx.variables])
@@ -37,10 +41,14 @@ two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.i
    all.items <- as.vector(unique(item_population))
    n.items <- length(all.items)
 
+   if (n.items == 1) {
+      stop('two.level.multivariate.calculate.UC: only one item in population')
+   }
+
    variable.names <- colnames(population)[idx.variables]
    n.variables <- length(idx.variables)
    if (n.variables < 2) {
-      stop("cannot handle fewer than two variables with this function")
+      stop("two.level.multivariate.calculate.UC: cannot handle fewer than two variables with this function")
    }
 
    # How many replicates per item
@@ -64,6 +72,7 @@ two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.i
    colnames(group.means) <- variable.names
 
    n.pop <- dim(population)[1]
+   n.items.skip <- 0
 
    # for each unique item
    for (i.item in 1:n.items) {
@@ -75,12 +84,22 @@ two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.i
       n.replicates <- length(idx.replicates)
 
       # pick out the measurements for the item
-      tempdat <- mtx_population[idx.replicates, ]
+      tempdat <- mtx_population[idx.replicates, , drop = FALSE]
+
 
       # assign values to the group means for the population
       means <- colMeans(tempdat)
       group.means[i.item, ] <- means
 
+      if (n.replicates == 1) {
+         if (identical(singular.action, 'skip')) {
+            message(paste0('two.level.multivariate.calculate.UC: item "', i.item, '" has 1 replicate: cannot contribute to within covariance matrix estimation. Skipping.'))
+            n.items.skip <- n.items.skip + 1
+            next
+         }
+         stop(paste0('two.level.multivariate.calculate.UC: item "', i.item, '" has 1 replicate: cannot contribute to within covariance matrix estimation. Failing.'))
+      }
+
       # sum the within and between item means
       S.this <- t(means - all.means) %*% (means - all.means)
       S <- S + S.this
@@ -93,11 +112,14 @@ two.level.multivariate.calculate.UC <- function(population, idx.variables, idx.i
       Sw <- Sw + Cov.this
    }
 
+   # Only these items have contributed to S, Sw
+   n.items.contributing <- n.items - n.items.skip
+
    # convert matrix Sw to matrix U
-   W <- Sw/(n.pop - n.items)
+   W <- Sw/(n.pop - n.items.contributing)
 
    # convert matrix S to matrix C
-   B <- S/(n.items - 1)
+   B <- S/(n.items.contributing - 1)
 
    op <- list(group.means = group.means, all.means = all.means, W = W, B = B)
    return(op)
diff --git a/man/inv_pd.Rd b/man/inv_pd.Rd
index 6bd9083..bddaeeb 100644
--- a/man/inv_pd.Rd
+++ b/man/inv_pd.Rd
@@ -13,5 +13,5 @@ inv_pd(X)
 the inverse of X
 }
 \description{
-Compute the inverse of a positive-definite matrix using Cholesky composition.
+Compute the inverse of a positive-definite matrix using Cholesky decomposition.
 }
diff --git a/man/samesource_C.Rd b/man/samesource_C.Rd
index 86b4c18..833cb40 100644
--- a/man/samesource_C.Rd
+++ b/man/samesource_C.Rd
@@ -33,7 +33,7 @@ samesource_C(quest, ref, n.iter, B.inv, W.inv.1, W.inv.2, U, nw, mu,
 \item{marginals}{return the marginal likelihoods in the LR formula (default: FALSE)}
 }
 \value{
-the log-LR value, or a list with the marginals: \code{list(value, log_ml_Hp, log_ml_Hd_ref, log_ml_Hd_quest)}
+the log-BF value (base e), or a list with the log-BF and the marginals: \code{list(value, log_ml_Hp, log_ml_Hd_ref, log_ml_Hd_quest)}
 }
 \description{
 Implemented in C.
diff --git a/man/two.level.multivariate.calculate.UC.Rd b/man/two.level.multivariate.calculate.UC.Rd
index 4c90e18..1e84f00 100644
--- a/man/two.level.multivariate.calculate.UC.Rd
+++ b/man/two.level.multivariate.calculate.UC.Rd
@@ -4,7 +4,8 @@
 \alias{two.level.multivariate.calculate.UC}
 \title{Calculate Between/Within covariance matrices}
 \usage{
-two.level.multivariate.calculate.UC(population, idx.variables, idx.item)
+two.level.multivariate.calculate.UC(population, idx.variables, idx.item,
+  singular.action = "skip")
 }
 \arguments{
 \item{population}{matrix or dataframe containing data columns, item column and other columns}
@@ -12,6 +13,8 @@ two.level.multivariate.calculate.UC(population, idx.variables, idx.item)
 \item{idx.variables}{data columns (indexes or character vectors)}
 
 \item{idx.item}{item column (index or character vector)}
+
+\item{singular.action}{action to take when one of the items is observed once; one of \code{'skip'} (default) or \code{'fail'}}
 }
 \value{
 list with items:
@@ -26,9 +29,10 @@ list with items:
 Two-level function: item is chosen with the third parameter.
 }
 \details{
+Items who are observed only once cannot contribute to within covariance estimation.
+In that case, either we skip its contribute (default) or fail.
+
 Multivariate functions supporting paper:
 Bozza, A., Taroni, F., Marquis, R., M. Schmittbuhl
-
-This function is RNG-free.
 }
 \keyword{internal}