Skip to content

Commit

Permalink
allow user to set intercept type
Browse files Browse the repository at this point in the history
  • Loading branch information
MikeDMorgan committed Sep 11, 2024
1 parent c9024f7 commit 58d5661
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 38 deletions.
21 changes: 17 additions & 4 deletions R/glmm.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
#' @param geno.only A logical value that flags the model to use either just the \code{matrix} `Kin` or the supplied random effects.
#' @param solver a character value that determines which optimisation algorithm is used for the variance components. Must be either
#' HE (Haseman-Elston regression) or Fisher (Fisher scoring).
#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
#' set automatically.
#'
#' @details
#' This function runs a negative binomial generalised linear mixed effects model. If mixed effects are detected in testNhoods,
Expand Down Expand Up @@ -81,6 +86,7 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
init.sigma=NULL, init.beta=NULL,
init.u=NULL, solver=NULL),
dispersion = 1, geno.only=FALSE,
intercept.type="fixed",
solver=NULL){

if(!is.null(solver)){
Expand Down Expand Up @@ -273,8 +279,12 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
diag(eyeN) <- 1
errMat <- (e0 %*% t(e0))/sig0 - eyeN

exp.levels <- random.levels
exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
if(intercept.type == "random"){
exp.levels <- random.levels
exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
} else{
exp.levels <- random.levels
}

curr_sigma.vec <- sapply(exp.levels, FUN=function(lvls, bigZ, errVec){
ijZ <- bigZ[, lvls]
Expand All @@ -291,8 +301,11 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
}, bigZ=full.Z, errVec=errMat)

# add the residual variance
res.var <- abs(sig0 - sum(curr_sigma.vec))
curr_sigma.vec <- c(curr_sigma.vec, res.var)
if(intercept.type == "random"){
res.var <- abs(sig0 - sum(curr_sigma.vec))
curr_sigma.vec <- c(curr_sigma.vec, res.var)
}

curr_sigma <- Matrix(abs(curr_sigma.vec), ncol=1, sparse=TRUE)
# curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), ncol(Z)), ncol=1, sparse=TRUE) # split evenly
# curr_sigma <- Matrix(runif(ncol(Z), 0, 1), ncol=1, sparse = TRUE)
Expand Down
71 changes: 44 additions & 27 deletions R/testNhoods.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@
#' these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
#' to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
#' an error will be produced.
#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
#' set automatically.
#' @param fail.on.error A logical scalar the determines the behaviour of the error reporting. Used for debugging only.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default
#' this will evaluate using \code{SerialParam()}. See \code{details}on how to use parallelisation in \code{testNhoods}.
Expand Down Expand Up @@ -163,10 +168,15 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA", REML=TRUE,
norm.method=c("TMM", "RLE", "logMS"), cell.sizes=NULL,
max.iters = 50, max.tol = 1e-5, glmm.solver=NULL,
subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam(),
force=FALSE){
subset.nhoods=NULL, intercept.type=c("fixed", "random"),
fail.on.error=FALSE, BPPARAM=SerialParam(), force=FALSE){
is.lmm <- FALSE
geno.only <- FALSE

if(!any(intercept.type %in% c("fixed", "random"))){
stop("intercept.type: ", intercept.type, " not recognised, must be either 'fixed' or 'random'")
}

if(is(design, "formula")){
# parse to find random and fixed effects - need to double check the formula is valid
parse <- unlist(strsplit(gsub(design, pattern="~", replacement=""), split= "+", fixed=TRUE))
Expand All @@ -185,14 +195,25 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
is.lmm <- TRUE
if(find_re | is.null(kinship)){
# make model matrices for fixed and random effects
z.model <- .parse_formula(design, design.df, vtype="re")
if(length(intercept.type) > 1){
intercept.type <- intercept.type[1]
}

if(intercept.type == "random"){
rand.int <- TRUE
} else{
rand.int <- FALSE
}

z.model <- .parse_formula(design, design.df, vtype="re", add.int=rand.int)
rownames(z.model) <- rownames(design.df)
} else if(find_re & !is.null(kinship)){
if(!all(rownames(kinship) == rownames(design.df))){
stop("Genotype rownames do not match design.df rownames")
}

z.model <- .parse_formula(design, design.df, vtype="re")
# genetic model MUST have fixed intercept
z.model <- .parse_formula(design, design.df, vtype="re", add.int=FALSE)
rownames(z.model) <- rownames(design.df)
} else if(!find_re & !is.null(kinship)){
z.model <- diag(nrow(kinship))
Expand All @@ -203,7 +224,13 @@ testNhoods <- function(x, design, design.df, kinship=NULL,

# this will always implicitly include a fixed intercept term - perhaps
# this shouldn't be the case?
x.model <- .parse_formula(design, design.df, vtype="fe")
if(intercept.type == "fixed"){
fixed.int <- TRUE
} else{
fixed.int <- FALSE
}

x.model <- .parse_formula(design, design.df, vtype="fe", add.int=fixed.int)
rownames(x.model) <- rownames(design.df)
max.iters <- max.iters

Expand Down Expand Up @@ -459,22 +486,24 @@ testNhoods <- function(x, design, design.df, kinship=NULL,

#wrapper function is the same for all analyses
glmmWrapper <- function(Y, disper, Xmodel, Zmodel, off.sets, randlevels,
reml, glmm.contr, genonly=FALSE, kin.ship=NULL,
reml, glmm.contr, int.type, genonly=FALSE, kin.ship=NULL,
BPPARAM=BPPARAM, error.fail=FALSE){
#bp.list <- NULL
# this needs to be able to run with BiocParallel
bp.list <- bptry({bplapply(seq_len(nrow(Y)), BPOPTIONS=bpoptions(stop.on.error = error.fail),
FUN=function(i, Xmodel, Zmodel, Y, off.sets,
randlevels, disper, genonly,
kin.ship, glmm.contr, reml){
kin.ship, glmm.contr, reml, int.type){
fitGLMM(X=Xmodel, Z=Zmodel, y=Y[i, ], offsets=off.sets,
random.levels=randlevels, REML = reml,
dispersion=disper[i], geno.only=genonly,
Kin=kinship, glmm.control=glmm.contr)
Kin=kinship, glmm.control=glmm.contr,
intercept.type=int.type)
}, BPPARAM=BPPARAM,
Xmodel=Xmodel, Zmodel=Zmodel, Y=Y, off.sets=off.sets,
randlevels=randlevels, disper=disper, genonly=genonly,
kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml)
kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml,
int.type=intercept.type)
}) # need to handle this output which is a bplist_error object

# parse the bplist_error object
Expand Down Expand Up @@ -512,26 +541,14 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
} else{
message("Running genetic model with ", nrow(z.model), " observations")
}

if(geno.only){
fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
genonly = geno.only, kin.ship=kinship,
BPPARAM=BPPARAM, error.fail=fail.on.error)
} else{
fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
genonly = geno.only, kin.ship=kinship,
BPPARAM=BPPARAM, error.fail=fail.on.error)
}

} else{
fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
genonly = geno.only, kin.ship=kinship,
BPPARAM=BPPARAM, error.fail=fail.on.error)
}

fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
genonly = geno.only, kin.ship=kinship,
BPPARAM=BPPARAM, error.fail=fail.on.error,
int.type=intercept.type)

# give warning about how many neighborhoods didn't converge and error if > 50% nhoods failed
n.nhoods <- length(fit)
half.n <- floor(n.nhoods * 0.5)
Expand Down
21 changes: 16 additions & 5 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@

# parse design formula
#' @export
.parse_formula <- function(in.form, design.df, vtype=c("re", "fe")){
.parse_formula <- function(in.form, design.df, vtype=c("re", "fe"), add.int=FALSE){
## parse the formula and return the X and Z matrices
# need to decide on how to handle intercept terms - i.e. FE or RE
sp.form <- unlist(strsplit(as.character(in.form),
Expand All @@ -83,18 +83,29 @@
function(x) as.integer(factor(x)))), ncol = length(v.terms))
}

# add the residual variance term
d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
colnames(d.mat) <- c(trimws(v.terms), "residual")
# add the residual variance term if appropriate
if(isTRUE(add.int)){
d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
colnames(d.mat) <- c(trimws(v.terms), "residual")
} else{
colnames(d.mat) <- trimws(v.terms)
}

} else if(vtype %in% c("fe")){
v.terms <- trimws(unlist(sp.form[!grepl(trimws(sp.form), pattern="~|\\|")]))
if(length(v.terms) > 1){
v.terms <- paste(v.terms, collapse=" + ")
}

# the intercept is a fixed effect in this model
d.mat <- model.matrix(as.formula(paste("~ 1 +", v.terms)), data = design.df)
d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the intercept term
d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the random terms

if(isFALSE(add.int)){
# drop the intercept term if required.
d.mat[, !grepl("Intercept", colnames(d.mat)), drop=FALSE]
}

} else{
stop("vtype ", vtype, " not recognised")
}
Expand Down
4 changes: 2 additions & 2 deletions vignettes/milo_glmm.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ set.seed(42)
rownames(pbmc.design) <- pbmc.design$ObsID
da_results <- testNhoods(pbmc.milo, design = ~ adjmfc.time + (1|tenx_lane), design.df = pbmc.design, fdr.weighting="graph-overlap",
glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE)
glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE, intercept.type="fixed")
table(da_results$SpatialFDR < 0.1)
```

Expand Down Expand Up @@ -219,7 +219,7 @@ the results table when using the GLM.
We can inspect the distribution of the variance parameter estimates, and compare between the converged vs. not converged nhoods.

```{r}
ggplot(da_results, aes(x=Converged, y=`tenx_lane variance`)) +
ggplot(da_results, aes(x=Converged, y=`tenx_lane_variance`)) +
geom_boxplot() +
scale_y_log10() +
NULL
Expand Down

0 comments on commit 58d5661

Please sign in to comment.