allow user to set intercept type

MarioniLab · Sep 11, 2024 · 58d5661 · 58d5661
1 parent c9024f7
commit 58d5661
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 38 deletions.
diff --git a/R/glmm.R b/R/glmm.R
@@ -15,6 +15,11 @@
 #' @param geno.only A logical value that flags the model to use either just the \code{matrix} `Kin` or the supplied random effects.
 #' @param solver a character value that determines which optimisation algorithm is used for the variance components. Must be either
 #' HE (Haseman-Elston regression) or Fisher (Fisher scoring).
+#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
+#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
+#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
+#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
+#' set automatically.
 #'
 #' @details
 #' This function runs a negative binomial generalised linear mixed effects model. If mixed effects are detected in testNhoods,
@@ -81,6 +86,7 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
                                       init.sigma=NULL, init.beta=NULL,
                                       init.u=NULL, solver=NULL),
                     dispersion = 1, geno.only=FALSE,
+                    intercept.type="fixed",
                     solver=NULL){
 
     if(!is.null(solver)){
@@ -273,8 +279,12 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
             diag(eyeN) <- 1
             errMat <- (e0 %*% t(e0))/sig0 - eyeN
 
-            exp.levels <- random.levels
-            exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
+            if(intercept.type == "random"){
+                exp.levels <- random.levels
+                exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
+            } else{
+                exp.levels <- random.levels
+            }
 
             curr_sigma.vec <- sapply(exp.levels, FUN=function(lvls, bigZ, errVec){
                 ijZ <- bigZ[, lvls]
@@ -291,8 +301,11 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
             }, bigZ=full.Z, errVec=errMat)
 
             # add the residual variance
-            res.var <- abs(sig0 - sum(curr_sigma.vec))
-            curr_sigma.vec <- c(curr_sigma.vec, res.var)
+            if(intercept.type == "random"){
+                res.var <- abs(sig0 - sum(curr_sigma.vec))
+                curr_sigma.vec <- c(curr_sigma.vec, res.var)
+            }
+
             curr_sigma <- Matrix(abs(curr_sigma.vec), ncol=1, sparse=TRUE)
             # curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), ncol(Z)), ncol=1, sparse=TRUE) # split evenly
             # curr_sigma <- Matrix(runif(ncol(Z), 0, 1), ncol=1, sparse = TRUE)

diff --git a/R/testNhoods.R b/R/testNhoods.R
@@ -55,6 +55,11 @@
 #' these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
 #' to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
 #' an error will be produced.
+#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
+#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
+#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
+#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
+#' set automatically.
 #' @param fail.on.error A logical scalar the determines the behaviour of the error reporting. Used for debugging only.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default
 #' this will evaluate using \code{SerialParam()}. See \code{details}on how to use parallelisation in \code{testNhoods}.
@@ -163,10 +168,15 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
                        min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA", REML=TRUE,
                        norm.method=c("TMM", "RLE", "logMS"), cell.sizes=NULL,
                        max.iters = 50, max.tol = 1e-5, glmm.solver=NULL,
-                       subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam(),
-                       force=FALSE){
+                       subset.nhoods=NULL, intercept.type=c("fixed", "random"),
+                       fail.on.error=FALSE, BPPARAM=SerialParam(), force=FALSE){
     is.lmm <- FALSE
     geno.only <- FALSE
+
+    if(!any(intercept.type %in% c("fixed", "random"))){
+        stop("intercept.type: ", intercept.type, " not recognised, must be either 'fixed' or 'random'")
+    }
+
     if(is(design, "formula")){
         # parse to find random and fixed effects - need to double check the formula is valid
         parse <- unlist(strsplit(gsub(design, pattern="~", replacement=""), split= "+", fixed=TRUE))
@@ -185,14 +195,25 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
             is.lmm <- TRUE
             if(find_re | is.null(kinship)){
                 # make model matrices for fixed and random effects
-                z.model <- .parse_formula(design, design.df, vtype="re")
+                if(length(intercept.type) > 1){
+                    intercept.type <- intercept.type[1]
+                }
+
+                if(intercept.type == "random"){
+                    rand.int <- TRUE
+                } else{
+                    rand.int <- FALSE
+                }
+
+                z.model <- .parse_formula(design, design.df, vtype="re", add.int=rand.int)
                 rownames(z.model) <- rownames(design.df)
             } else if(find_re & !is.null(kinship)){
                 if(!all(rownames(kinship) == rownames(design.df))){
                     stop("Genotype rownames do not match design.df rownames")
                 }
 
-                z.model <- .parse_formula(design, design.df, vtype="re")
+                # genetic model MUST have fixed intercept
+                z.model <- .parse_formula(design, design.df, vtype="re", add.int=FALSE)
                 rownames(z.model) <- rownames(design.df)
             } else if(!find_re & !is.null(kinship)){
                 z.model <- diag(nrow(kinship))
@@ -203,7 +224,13 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
 
             # this will always implicitly include a fixed intercept term - perhaps
             # this shouldn't be the case?
-            x.model <- .parse_formula(design, design.df, vtype="fe")
+            if(intercept.type == "fixed"){
+                fixed.int <- TRUE
+            } else{
+                fixed.int <- FALSE
+            }
+
+            x.model <- .parse_formula(design, design.df, vtype="fe", add.int=fixed.int)
             rownames(x.model) <- rownames(design.df)
             max.iters <- max.iters
 
@@ -459,22 +486,24 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
 
         #wrapper function is the same for all analyses
         glmmWrapper <- function(Y, disper, Xmodel, Zmodel, off.sets, randlevels,
-                                reml, glmm.contr, genonly=FALSE, kin.ship=NULL,
+                                reml, glmm.contr, int.type, genonly=FALSE, kin.ship=NULL,
                                 BPPARAM=BPPARAM, error.fail=FALSE){
             #bp.list <- NULL
             # this needs to be able to run with BiocParallel
             bp.list <- bptry({bplapply(seq_len(nrow(Y)), BPOPTIONS=bpoptions(stop.on.error = error.fail),
                                          FUN=function(i, Xmodel, Zmodel, Y, off.sets,
                                                       randlevels, disper, genonly,
-                                                      kin.ship, glmm.contr, reml){
+                                                      kin.ship, glmm.contr, reml, int.type){
                                              fitGLMM(X=Xmodel, Z=Zmodel, y=Y[i, ], offsets=off.sets,
                                                      random.levels=randlevels, REML = reml,
                                                      dispersion=disper[i], geno.only=genonly,
-                                                     Kin=kinship, glmm.control=glmm.contr)
+                                                     Kin=kinship, glmm.control=glmm.contr,
+                                                     intercept.type=int.type)
                                              }, BPPARAM=BPPARAM,
                                          Xmodel=Xmodel, Zmodel=Zmodel, Y=Y, off.sets=off.sets,
                                          randlevels=randlevels, disper=disper, genonly=genonly,
-                                         kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml)
+                                         kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml,
+                                       int.type=intercept.type)
                                 }) # need to handle this output which is a bplist_error object
 
             # parse the bplist_error object
@@ -512,26 +541,14 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
             } else{
                 message("Running genetic model with ", nrow(z.model), " observations")
             }
-
-            if(geno.only){
-                fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
-                                   off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
-                                   genonly = geno.only, kin.ship=kinship,
-                                   BPPARAM=BPPARAM, error.fail=fail.on.error)
-            } else{
-                fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
-                                   off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
-                                   genonly = geno.only, kin.ship=kinship,
-                                   BPPARAM=BPPARAM, error.fail=fail.on.error)
-            }
-
-        } else{
-            fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
-                               off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
-                               genonly = geno.only, kin.ship=kinship,
-                               BPPARAM=BPPARAM, error.fail=fail.on.error)
         }
 
+        fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
+                           off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
+                           genonly = geno.only, kin.ship=kinship,
+                           BPPARAM=BPPARAM, error.fail=fail.on.error,
+                           int.type=intercept.type)
+
         # give warning about how many neighborhoods didn't converge and error if > 50% nhoods failed
         n.nhoods <- length(fit)
         half.n <- floor(n.nhoods * 0.5)

diff --git a/R/utils.R b/R/utils.R
@@ -66,7 +66,7 @@
 
 # parse design formula
 #' @export
-.parse_formula <- function(in.form, design.df, vtype=c("re", "fe")){
+.parse_formula <- function(in.form, design.df, vtype=c("re", "fe"), add.int=FALSE){
     ## parse the formula and return the X and Z matrices
     # need to decide on how to handle intercept terms - i.e. FE or RE
     sp.form <- unlist(strsplit(as.character(in.form),
@@ -83,18 +83,29 @@
                                           function(x) as.integer(factor(x)))), ncol = length(v.terms))
         }
 
-        # add the residual variance term
-        d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
-        colnames(d.mat) <- c(trimws(v.terms), "residual")
+        # add the residual variance term if appropriate
+        if(isTRUE(add.int)){
+            d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
+            colnames(d.mat) <- c(trimws(v.terms), "residual")
+        } else{
+            colnames(d.mat) <- trimws(v.terms)
+        }
 
     } else if(vtype %in% c("fe")){
         v.terms <- trimws(unlist(sp.form[!grepl(trimws(sp.form), pattern="~|\\|")]))
         if(length(v.terms) > 1){
             v.terms <- paste(v.terms, collapse=" + ")
         }
 
+        # the intercept is a fixed effect in this model
         d.mat <- model.matrix(as.formula(paste("~ 1 +", v.terms)), data = design.df)
-        d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the intercept term
+        d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the random terms
+
+        if(isFALSE(add.int)){
+            # drop the intercept term if required.
+            d.mat[, !grepl("Intercept", colnames(d.mat)), drop=FALSE]
+        }
+
     } else{
         stop("vtype ", vtype, " not recognised")
     }

diff --git a/vignettes/milo_glmm.Rmd b/vignettes/milo_glmm.Rmd
@@ -167,7 +167,7 @@ set.seed(42)
 rownames(pbmc.design) <- pbmc.design$ObsID
 
 da_results <- testNhoods(pbmc.milo, design = ~ adjmfc.time + (1|tenx_lane), design.df = pbmc.design, fdr.weighting="graph-overlap",
-                         glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE)
+                         glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE, intercept.type="fixed")
 table(da_results$SpatialFDR < 0.1)
 ```
 
@@ -219,7 +219,7 @@ the results table when using the GLM.
 We can inspect the distribution of the variance parameter estimates, and compare between the converged vs. not converged nhoods.
 
 ```{r}
-ggplot(da_results, aes(x=Converged, y=`tenx_lane variance`)) +
+ggplot(da_results, aes(x=Converged, y=`tenx_lane_variance`)) +
     geom_boxplot() + 
     scale_y_log10() +
     NULL