diff --git a/.circleci/config.yml b/.circleci/config.yml index fbc8d7b6..6da56726 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: rocker/verse:4.2 + - image: rocker/verse:4.3 environment: _R_CHECK_FORCE_SUGGESTS_: false steps: diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 8d242c1b..8c7b2c23 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -4,6 +4,7 @@ name: Publish Docker image on: release: types: [published] + workflow_dispatch: # push: # branches: [main, master] @@ -16,22 +17,22 @@ jobs: uses: actions/checkout@v3 - name: Log in to Docker Hub - uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + uses: docker/metadata-action@v5 with: images: pkharchenkolab/numbat-rbase - name: Build and push Docker image - uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc + uses: docker/build-push-action@v5 with: context: . file: docker/Dockerfile push: true tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file + labels: ${{ steps.meta.outputs.labels }} diff --git a/DESCRIPTION b/DESCRIPTION index 317e621c..93103446 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: numbat Title: Haplotype-Aware CNV Analysis from scRNA-Seq URL: https://github.com/kharchenkolab/numbat/, https://kharchenkolab.github.io/numbat/ -Version: 1.3.4 +Version: 1.4.0 Authors@R: c(person("Teng","Gao", email="tgaoteng@gmail.com", role=c("cre", "aut")), person("Ruslan", "Soldatov", email="soldatr@mskcc.org", role="aut"), person("Hirak", "Sarkar", email="hirak_sarkar@hms.harvard.edu", role="aut"), person("Evan", "Biederstedt", email="evan.biederstedt@gmail.com", role="aut"), person("Peter", "Kharchenko", email = "peter_kharchenko@hms.harvard.edu", role = "aut")) Description: A computational method that infers copy number variations (CNVs) in cancer scRNA-seq data and reconstructs the tumor phylogeny. 'numbat' integrates signals from gene expression, allelic ratio, and population haplotype structures to accurately infer allele-specific CNVs in single cells and reconstruct their lineage relationship. 'numbat' can be used to: 1. detect allele-specific copy number variations from single-cells; 2. differentiate tumor versus normal cells in the tumor microenvironment; 3. infer the clonal architecture and evolutionary history of profiled tumors. 'numbat' does not require tumor/normal-paired DNA or genotype data, but operates solely on the donor scRNA-data data (for example, 10x Cell Ranger output). Additional examples and documentations are available at . For details on the method please see Gao et al. Nature Biotechnology (2022) . License: MIT + file LICENSE @@ -21,6 +21,7 @@ Imports: ggraph, ggtree, glue, + hahmmr, igraph, IRanges, logger, diff --git a/NAMESPACE b/NAMESPACE index e968827c..d075cc39 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,16 @@ importFrom(data.table,fread) importFrom(data.table,fwrite) importFrom(ggtree,"%<+%") importFrom(grDevices,colorRampPalette) +importFrom(hahmmr,dbbinom) +importFrom(hahmmr,dpoilog) +importFrom(hahmmr,fit_lnpois_cpp) +importFrom(hahmmr,forward_back_allele) +importFrom(hahmmr,l_bbinom) +importFrom(hahmmr,l_lnpois) +importFrom(hahmmr,likelihood_allele) +importFrom(hahmmr,logSumExp) +importFrom(hahmmr,run_allele_hmm_s5) +importFrom(hahmmr,run_joint_hmm_s15) importFrom(igraph,"E<-") importFrom(igraph,"V<-") importFrom(igraph,E) @@ -74,4 +84,4 @@ importFrom(stats,setNames) importFrom(stats,start) importFrom(stats,t.test) importFrom(utils,combn) -useDynLib(numbat) +useDynLib(numbat, .registration=TRUE) diff --git a/NEWS.md b/NEWS.md index 16a7411a..541f956a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,13 @@ +# numbat 1.4.0 - 02/23/2023 + +* Integration with hahmmr + +* Better input checking for pileup_and_phase + +* Fix compatibility with igraph v2.0+ and tidygraph v1.3+ (#150) + +* Fix multiallelic CNV state probability reporting (#146) + # numbat 1.3.3 - 08/15/2023 * Fix plotting issue #135 diff --git a/R/RcppExports.R b/R/RcppExports.R index de4ce30d..354af170 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,43 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -cppdbbinom <- function(x, size, alpha, beta, log_prob = FALSE) { - .Call('_numbat_cppdbbinom', PACKAGE = 'numbat', x, size, alpha, beta, log_prob) -} - -cpp_dgpois <- function(x, alpha, beta, log_prob = FALSE) { - .Call('_numbat_cpp_dgpois', PACKAGE = 'numbat', x, alpha, beta, log_prob) -} - -logSumExp <- function(x) { - .Call('_numbat_logSumExp', PACKAGE = 'numbat', x) -} - -likelihood_compute <- function(logphi, logprob, logPi, n, m) { - .Call('_numbat_likelihood_compute', PACKAGE = 'numbat', logphi, logprob, logPi, n, m) -} - -forward_backward_compute <- function(logphi, logprob, logPi, n, m) { - .Call('_numbat_forward_backward_compute', PACKAGE = 'numbat', logphi, logprob, logPi, n, m) -} - -viterbi_compute <- function(log_delta, logprob, logPi, n, m, nu, z) { - .Call('_numbat_viterbi_compute', PACKAGE = 'numbat', log_delta, logprob, logPi, n, m, nu, z) -} - roman2int_internal <- function(letters, nchar) { - .Call('_numbat_roman2int_internal', PACKAGE = 'numbat', letters, nchar) -} - -fit_lnpois_cpp <- function(Y_obs, lambda_ref, d) { - .Call('_numbat_fit_lnpois_cpp', PACKAGE = 'numbat', Y_obs, lambda_ref, d) -} - -poilog1 <- function(x, my, sig) { - .Call('_numbat_poilog1', PACKAGE = 'numbat', x, my, sig) -} - -l_lnpois_cpp <- function(Y_obs, lambda_ref, d, mu, sig, phi = 1.0) { - .Call('_numbat_l_lnpois_cpp', PACKAGE = 'numbat', Y_obs, lambda_ref, d, mu, sig, phi) + .Call(`_numbat_roman2int_internal`, letters, nchar) } diff --git a/R/diagnostics.R b/R/diagnostics.R index 0e7cd58a..ea127ba2 100644 --- a/R/diagnostics.R +++ b/R/diagnostics.R @@ -149,6 +149,12 @@ check_exp_ref = function(lambdas_ref) { if (!is.matrix(lambdas_ref)) { lambdas_ref = as.matrix(lambdas_ref) %>% magrittr::set_colnames('ref') } + + if (any(is.na(lambdas_ref))) { + msg = "The reference expression matrix 'lambdas_ref' should not contain any NA values." + log_error(msg) + stop(msg) + } # check if all entries in the reference profile are integers if (all(lambdas_ref == as.integer(lambdas_ref))) { diff --git a/R/genotyping.R b/R/genotyping.R index 1f773b89..9827ab2b 100644 --- a/R/genotyping.R +++ b/R/genotyping.R @@ -211,9 +211,6 @@ preprocess_allele = function( # phased VCF vcf_phased = vcf_phased %>% - mutate(INFO = str_remove_all(INFO, '[:alpha:]|=')) %>% - tidyr::separate(col = 'INFO', into = c('AD', 'DP', 'OTH'), sep = ';') %>% - mutate_at(c('AD', 'DP', 'OTH'), as.integer) %>% mutate(snp_id = paste(CHROM, POS, REF, ALT, sep = '_')) %>% mutate(GT = get(sample)) diff --git a/R/hmm.R b/R/hmm.R index 596c1f16..6cdd5d22 100644 --- a/R/hmm.R +++ b/R/hmm.R @@ -1,62 +1,8 @@ -############ Probability distributions ############ - -#'@useDynLib numbat #'@import Rcpp NULL -## refer to https://github.com/evanbiederstedt/poilogcpp - -#' Returns the density for the Poisson lognormal distribution with parameters mu and sig -#' -#' @param x vector of integers, the observations -#' @param mu mean of lognormal distribution -#' @param sig standard deviation of lognormal distribution -#' @param log boolean Return the log density if TRUE (default=FALSE) -#' @return NULL -#' @keywords internal -dpoilog <- function(x, mu, sig, log=FALSE){ - if (!(length(x) == length(mu) & length(x) == length(sig))) stop('dpoilog: All parameters must be same length') - if (any((x[x!=0]/trunc(x[x!=0]))!=1)) stop('dpoilog: all x must be integers') - if (any(x<0)) stop('dpoilog: one or several values of x are negative') - if (!all(is.finite(c(mu,sig)))) { - stop('dpoilog: all parameters should be finite') - } - if (any(is.na(c(x,mu,sig)))) stop('dpoilog: Parameters cannot be NA') - if (any(sig<=0)) { - stop(c('dpoilog: sig is not larger than 0', unique(sig))) - } - - p = poilog1(x=as.integer(x), my=as.double(mu), sig=as.double(sig^2)) - - p[p == 0] = 1e-15 - - if (log) { - return(log(p)) - } else { - return(p) - } -} - -#' Beta-binomial distribution density function -#' A distribution is beta-binomial if p, the probability of success, -#' in a binomial distribution has a beta distribution with shape -#' parameters alpha > 0 and beta > 0 -#' For more details, see extraDistr::dbbinom -#' -#' @param x vector of quantiles -#' @param size number of trials (zero or more) -#' @param alpha numeric (default=1) -#' @param beta numeric (default=1) -#' @param log boolean (default=FALSE) -#' @return density values returned as numeric vector -#' @keywords internal -dbbinom <- function(x, size, alpha = 1, beta = 1, log = FALSE) { - cppdbbinom(x, size, alpha, beta, log[1L]) -} - ############ Allele HMMs ############ - #' Get an allele HMM #' @param pAD integer vector Paternal allele counts #' @param DP integer vector Total alelle counts @@ -100,162 +46,6 @@ get_allele_hmm = function(pAD, DP, p_s, theta, gamma = 20) { return(hmm) } -#' Viterbi algorithm for allele HMM -#' @param hmm HMM object; expect variables x (allele depth), d (total depth), -#' logPi (log transition prob matrix), delta (prior for each state), -#' alpha (alpha for each state), beta (beta for each state), -#' states (states), p_s (phase switch probs) -#' @keywords internal -viterbi_allele <- function(hmm) { - - N <- hmm$N - M <- hmm$M - nu <- matrix(NA, nrow = N, ncol = M) - z <- rep(NA, N) - - logprob = sapply(1:M, function(m) { - - l_x = dbbinom(x = hmm$x, size = hmm$d, alpha = hmm$alpha[,m], beta = hmm$beta[,m], log = TRUE) - - l_x[is.na(l_x)] = 0 - - return(l_x) - - }) - - z = viterbi_compute(log(hmm$delta), logprob, hmm$logPi, N, M, nu, z) - - return(z) -} - -#' Forward-backward algorithm for allele HMM -#' @param hmm HMM object; expect variables x (allele depth), d (total depth), -#' logPi (log transition prob matrix), delta (prior for each state), -#' alpha (alpha for each state), beta (beta for each state), -#' states (states), p_s (phase switch probs) -#' @keywords internal -forward_back_allele = function(hmm) { - - # case of one-data point - if (hmm$N == 1) { - return(NA) - } - - N <- hmm$N - M <- hmm$M - - logprob = sapply(1:M, function(m) { - - l_x = dbbinom(x = hmm$x, size = hmm$d, alpha = hmm$alpha[,m], beta = hmm$beta[,m], log = TRUE) - - l_x[is.na(l_x)] = 0 - - return(l_x) - - }) - - logphi <- log(hmm$delta) - - marginals = forward_backward_compute(logphi, logprob, hmm$logPi, N, M) - - colnames(marginals) = hmm$states - - return(marginals) -} - -#' Only compute total log likelihood from an allele HMM -#' @param hmm HMM object; expect variables x (allele depth), d (total depth), -#' logPi (log transition prob matrix), delta (prior for each state), -#' alpha (alpha for each state), beta (beta for each state), -#' states (states), p_s (phase switch probs) -#' @keywords internal -likelihood_allele = function(hmm) { - - N <- hmm$N - M <- hmm$M - - logprob = sapply(1:M, function(m) { - - l_x = dbbinom(x = hmm$x, size = hmm$d, alpha = hmm$alpha[,m], beta = hmm$beta[,m], log = TRUE) - - l_x[is.na(l_x)] = 0 - - return(l_x) - - }) - - logphi <- log(hmm$delta) - - LL <- likelihood_compute(logphi, logprob, hmm$logPi, N, M) - - return(LL) -} - -#' allele-only HMM -#' @param pAD integer vector Paternal allele counts -#' @param DP integer vector Total alelle counts -#' @param p_s numeric vector Phase switch probabilities -#' @param t numeric Transition probability between copy number states -#' @param theta_min numeric Minimum haplotype frequency deviation threshold -#' @param gamma numeric Overdispersion in the allele-specific expression -#' @return character vector Decoded states -#' @keywords internal -run_allele_hmm = function(pAD, DP, p_s, t = 1e-5, theta_min = 0.08, gamma = 20, prior = NULL) { - - gamma = unique(gamma) - - if (length(gamma) > 1) { - stop('More than one gamma parameter') - } - - # states - states = c("neu", "theta_1_up", "theta_1_down", "theta_2_up", "theta_2_down") - - N = length(pAD) - M = 5 - - # transition matrices - calc_trans_mat = function(p_s, t) { - matrix( - c(1-t, t/4, t/4, t/4, t/4, - t/2, (1-t)*(1-p_s), (1-t)*p_s, t/4, t/4, - t/2, (1-t)*p_s, (1-t)*(1-p_s), t/4, t/4, - t/2, t/4, t/4, (1-t)*(1-p_s), (1-t)*p_s, - t/2, t/4, t/4, (1-t)*p_s, (1-t)*(1-p_s)), - ncol = 5, - byrow = TRUE - ) - } - - Pi = sapply(p_s, function(p_s) {calc_trans_mat(p_s, t)}) %>% - array(dim = c(M, M, N)) - - # intitial probabilities - if (is.null(prior)) { - prior = rep(1/5, 5) - } - - theta_1 = theta_min - theta_2 = 0.4 - alphas = gamma * c(0.5, 0.5 + theta_1, 0.5 - theta_1, 0.5 + theta_2, 0.5 - theta_2) - betas = gamma * c(0.5, 0.5 - theta_1, 0.5 + theta_1, 0.5 - theta_2, 0.5 + theta_2) - - hmm = list( - x = pAD, - logPi = log(Pi), - delta = prior, - alpha = matrix(rep(alphas, N), ncol = M, byrow = TRUE), - beta = matrix(rep(betas, N), ncol = M, byrow = TRUE), - d = DP, - N = N, - M = M, - states = states - ) - - mpc = states[viterbi_allele(hmm)] - - return(mpc) -} #' Calculate allele likelihoods #' @param pAD integer vector Paternal allele counts @@ -270,260 +60,6 @@ calc_allele_lik = function (pAD, DP, p_s, theta, gamma = 20) { return(LL) } -############ Joint HMM ############ - - -#' Run joint HMM on a pseudobulk profile -#' @param pAD integer vector Paternal allele counts -#' @param DP integer vector Total alelle counts -#' @param p_s numeric vector Phase switch probabilities -#' @param theta_min numeric Minimum haplotype imbalance threshold -#' @param gamma numeric Overdispersion in the allele-specific expression -#' @param Y_obs numeric vector Observed gene counts -#' @param lambda_ref numeric vector Reference expression rates -#' @param d_total integer Total library size for expression counts -#' @param phi_del numeric Expected fold change for deletion -#' @param phi_amp numeric Expected fold change for amplification -#' @param phi_bamp numeric Expected fold change for balanced amplification -#' @param phi_bdel numeric Expected fold change for balanced deletion -#' @param mu numeric Global expression bias -#' @param sig numeric Global expression variance -#' @param t numeric Transition probability between copy number states -#' @param exp_only logical Whether to only use expression data -#' @param allele_only logical Whether to only use allele data -#' @keywords internal -run_joint_hmm = function( - pAD, DP, p_s, Y_obs = 0, lambda_ref = 0, d_total = 0, theta_min = 0.08, theta_neu = 0, - bal_cnv = TRUE, phi_del = 2^(-0.25), phi_amp = 2^(0.25), phi_bamp = phi_amp, phi_bdel = phi_del, - alpha = 1, beta = 1, - mu = 0, sig = 1, - t = 1e-5, gamma = 18, - prior = NULL, exp_only = FALSE, allele_only = FALSE, - classify_allele = FALSE, phasing = TRUE, debug = FALSE -) { - - # states - states = c( - "1" = "neu", "2" = "del_1_up", "3" = "del_1_down", "4" = "del_2_up", "5" = "del_2_down", - "6" = "loh_1_up", "7" = "loh_1_down", "8" = "loh_2_up", "9" = "loh_2_down", - "10" = "amp_1_up", "11" = "amp_1_down", "12" = "amp_2_up", "13" = "amp_2_down", - "14" = "bamp", "15" = "bdel" - ) - - states_cn = str_remove(states, '_up|_down') - states_phase = str_extract(states, 'up|down') - - # relative abundance of states - w = c('neu' = 1, 'del_1' = 1, 'del_2' = 1e-10, 'loh_1' = 1, 'loh_2' = 1e-10, 'amp_1' = 1, 'amp_2' = 1e-10, 'bamp' = 1e-4, 'bdel' = 1e-10) - - # intitial probabilities - if (is.null(prior)) { - # encourage CNV from telomeres - prior = sapply(1:length(states), function(to){ - get_trans_probs( - t = min(t * 100, 1), p_s = 0, w, - cn_from = 'neu', phase_from = NA, - cn_to = states_cn[to], phase_to = states_phase[to]) - }) - } - - # to do: renormalize the probabilities after deleting states - states_index = 1:length(states) - - if (!bal_cnv) { - states_index = 1:13 - } - - if (exp_only) { - pAD = rep(NA, length(pAD)) - p_s = rep(0, length(p_s)) - } - - if (allele_only) { - states_index = c(1, 6:9) - - Y_obs = rep(NA, length(Y_obs)) - } - - if (!phasing) { - states_index = c(1, 6) - - p_s = ifelse(is.na(pAD), p_s, 0) - pAD = ifelse(pAD > (DP - pAD), pAD, DP - pAD) - theta_neu = 0.1 - theta_min = 0.45 - } - - if (classify_allele) { - states_index = c(6,7) - } - - # transition matrices - As = calc_trans_mat(t, p_s, w, states_cn, states_phase) - - theta_u_1 = 0.5 + theta_min - theta_d_1 = 0.5 - theta_min - - theta_u_2 = 0.9 - theta_d_2 = 0.1 - - theta_u_neu = 0.5 + theta_neu - theta_d_neu = 0.5 - theta_neu - - # parameters for each state - alpha_states = gamma * c(theta_u_neu, rep(c(theta_u_1, theta_d_1, theta_u_2, theta_d_2), 3), theta_u_neu, theta_u_neu) - beta_states = gamma * c(theta_d_neu, rep(c(theta_d_1, theta_u_1, theta_d_2, theta_u_2), 3), theta_d_neu, theta_d_neu) - phi_states = c(1, rep(phi_del, 2), rep(0.5, 2), rep(1, 4), rep(phi_amp, 2), rep(2.5, 2), phi_bamp, phi_bdel) - - # subset for relevant states - prior = prior[states_index] - As = As[states_index, states_index,] - alpha_states = alpha_states[states_index] - beta_states = beta_states[states_index] - phi_states = phi_states[states_index] - states = states[states_index] %>% setNames(1:length(.)) - - N = length(Y_obs) - - if (length(mu) == 1) { - mu = rep(mu, N) - sig = rep(sig, N) - } - - if (length(d_total) == 1) { - d_total = rep(d_total, N) - } - - hmm = list( - x = matrix(pAD), - d = matrix(DP), - y = matrix(Y_obs), - l = matrix(d_total), - lambda = matrix(lambda_ref), - mu = matrix(mu), - sig = matrix(sig), - logPi = log(As), - phi = matrix(phi_states), - delta = matrix(prior), - alpha = matrix(alpha_states), - beta = matrix(beta_states), - states = matrix(states), - p_s = p_s - ) - - MPC = states[viterbi_joint(hmm)] - - return(MPC) -} - - -#' Calculate the transition matrix for joint HMM -#' @param t numeric; CNV state transition probability -#' @param p_s numeric; phase switch probability -#' @param w numeric; relative abundance of states -#' @param states_cn character; CNV states -#' @param states_phase character; haplotype phase states -#' @return array; transition matrix -#' @keywords internal -calc_trans_mat = function(t, p_s, w, states_cn, states_phase) { - - sapply(1:length(states_cn), function(from) { - sapply(1:length(states_cn), function(to) { - get_trans_probs(t, p_s, w, states_cn[from], states_phase[from], states_cn[to], states_phase[to]) - }) %>% t - }) %>% t %>% - array(dim = c(length(states_cn), length(states_cn), length(p_s))) - -} - - -#' Helper function to calculate transition porbabilities -#' cn/phase are sclars, only p_s is vectorized -#' @param t numeric; CNV state transition probability -#' @param p_s numeric; phase switch probability -#' @param w numeric; relative abundance of states -#' @param cn_from character; originating CNV state -#' @param phase_from character; originating haplotype phase state -#' @param cn_to character; destination CNV state -#' @param phase_to character; destination haplotype phase state -#' @return numeric; transition probability -#' @keywords internal -get_trans_probs = function(t, p_s, w, cn_from, phase_from, cn_to, phase_to) { - - if (cn_from == 'neu' & cn_to == 'neu') { - p_s = rep(0.5, length(p_s)) - } - - if (cn_from == cn_to) { - if (is.na(phase_from) & is.na(phase_to)) { - p = 1-t - p = rep(p, length(p_s)) - } else if (phase_from == phase_to) { - p = (1-t) * (1-p_s) - } else { - p = (1-t) * p_s - } - } else { - p = t * w[[cn_to]]/sum(w[names(w)!=cn_from]) - if (!is.na(phase_to)) { - p = p/2 - } - p = rep(p, length(p_s)) - } - - return(p) -} - -#' Generalized viterbi algorithm for joint HMM -#' @param hmm HMM object; expect variables x (allele depth), d (total depth), y (expression count), l (cell total library size), -#' lambda (reference expression rate), mu (global expression mean), sig (global expression standard deviation), -#' logPi (log transition prob matrix), phi (expression fold change for each state), delta (prior for each state), -#' alpha (alpha for each state), beta (beta for each state), states (states), p_s (phase switch probs) -#' @keywords internal -viterbi_joint <- function(hmm) { - - N <- nrow(hmm$x) - M <- nrow(hmm$logPi[,,1]) - K <- ncol(hmm$x) - nu <- matrix(NA, nrow = N, ncol = M) - z <- rep(NA, N) - - logprob = sapply(1:M, function(m) { - - sapply( - 1:K, - function(k) { - - l_x = dbbinom(x = hmm$x[,k], size = hmm$d[,k], alpha = hmm$alpha[m,k], beta = hmm$beta[m,k], log = TRUE) - - l_x[is.na(l_x)] = 0 - - if (!is.null(hmm$y)) { - l_y = rep(0,N) - valid = !is.na(hmm$y[,k]) - - l_y[valid] = dpoilog( - x = hmm$y[valid,k], - sig = hmm$sig[valid,k], - mu = hmm$mu[valid,k] + log(hmm$phi[m,k] * hmm$l[valid,k] * hmm$lambda[valid,k]), - log = TRUE - ) - } else { - l_y = 0 - } - - return(l_x + l_y) - - } - ) %>% rowSums() - - }) - - z = viterbi_compute(log(hmm$delta), logprob, hmm$logPi, N, M, nu, z) - - return(z) -} - ############ Clonal deletion HMM ############ #' Viterbi for clonal LOH detection diff --git a/R/main.R b/R/main.R index a5a8148c..38f78ffc 100644 --- a/R/main.R +++ b/R/main.R @@ -15,9 +15,10 @@ #' @import patchwork #' @importFrom grDevices colorRampPalette #' @importFrom stats as.dendrogram as.dist cor cutree dbinom dnbinom dnorm dpois end hclust integrate model.matrix na.omit optim p.adjust pnorm reorder rnorm setNames start t.test as.ts complete.cases is.leaf na.contiguous +#' @importFrom hahmmr logSumExp dpoilog dbbinom l_lnpois l_bbinom fit_lnpois_cpp likelihood_allele forward_back_allele run_joint_hmm_s15 run_allele_hmm_s5 #' @import tibble #' @importFrom utils combn -#' @useDynLib numbat +#' @useDynLib numbat, .registration=TRUE NULL #' Run workflow to decompose tumor subclones @@ -133,8 +134,9 @@ run_numbat = function( ######### Log parameters ######### log_message(paste('\n', - glue('Numbat version: ', as.character(utils::packageVersion("numbat"))), - glue('Scistreer version: ', as.character(utils::packageVersion("scistreer"))), + glue('numbat version: ', as.character(utils::packageVersion("numbat"))), + glue('scistreer version: ', as.character(utils::packageVersion("scistreer"))), + glue('hahmmr version: ', as.character(utils::packageVersion("hahmmr"))), 'Running under parameters:', glue('t = {t}'), glue('alpha = {alpha}'), diff --git a/R/utils.R b/R/utils.R index 0b792af5..88fe112b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -558,7 +558,7 @@ analyze_bulk = function( bulk = bulk %>% group_by(CHROM) %>% mutate(state = - run_joint_hmm( + run_joint_hmm_s15( pAD = pAD, DP = DP, p_s = p_s, @@ -1128,7 +1128,7 @@ find_common_diploid = function( bulk %>% group_by(CHROM) %>% mutate(state = - run_allele_hmm( + run_allele_hmm_s5( pAD = pAD, DP = DP, p_s = p_s, @@ -1355,17 +1355,6 @@ get_segs_neu = function(bulks) { return(segs_neu) } -#' calculate joint likelihood of allele data -#' @param AD numeric vector Variant allele depth -#' @param DP numeric vector Total allele depth -#' @param alpha numeric Alpha parameter of Beta-Binomial distribution -#' @param beta numeric Beta parameter of Beta-Binomial distribution -#' @return numeric Joint log likelihood -#' @keywords internal -l_bbinom = function(AD, DP, alpha, beta) { - sum(dbbinom(AD, DP, alpha, beta, log = TRUE)) -} - #' fit a Beta-Binomial model by maximum likelihood #' @param AD numeric vector Variant allele depth #' @param DP numeric vector Total allele depth @@ -1407,63 +1396,6 @@ fit_gamma = function(AD, DP, start = 20) { return(gamma) } -#' Density function for a gamma-poisson distribution -#' @keywords internal -dgpois <- function(x, shape, rate, scale = 1/rate, log = FALSE) { - cpp_dgpois(x, shape, scale, log[1L]) -} - - -#' calculate joint likelihood of a gamma-poisson model -#' @param Y_obs numeric vector Gene expression counts -#' @param lambda_ref numeric vector Reference expression levels -#' @param d numeric Total library size -#' @param alpha numeric Alpha parameter of Gamma-Poisson distribution -#' @param beta numeric Beta parameter of Gamma-Poisson distribution -#' @return numeric Joint log likelihood -#' @keywords internal -l_gpois = function(Y_obs, lambda_ref, d, alpha, beta, phi = 1) { - sum(dgpois(Y_obs, shape = alpha, rate = beta/(phi * d * lambda_ref), log = TRUE)) -} - -#' fit a Gamma-Poisson model by maximum likelihood -#' @param Y_obs numeric vector Gene expression counts -#' @param lambda_ref numeric vector Reference expression levels -#' @param d numeric Total library size -#' @return numeric MLE of alpha and beta -#' @keywords internal -fit_gpois = function(Y_obs, lambda_ref, d) { - - Y_obs = Y_obs[lambda_ref > 0] - lambda_ref = lambda_ref[lambda_ref > 0] - - fit = stats4::mle( - minuslogl = function(alpha, beta) { - -l_gpois(Y_obs, lambda_ref, d, alpha, beta) - }, - start = c(1, 1), - lower = c(0.01, 0.01), - control = list('trace' = FALSE) - ) - - alpha = fit@coef[1] - beta = fit@coef[2] - - return(fit) -} - -#' calculate joint likelihood of a PLN model -#' @param Y_obs numeric vector Gene expression counts -#' @param lambda_ref numeric vector Reference expression levels -#' @param d numeric Total library size -#' @return numeric Joint log likelihood -#' @keywords internal -l_lnpois = function(Y_obs, lambda_ref, d, mu, sig, phi = 1) { - if (any(sig <= 0)) {stop(glue('negative sigma. value: {sig}'))} - if (length(sig) == 1) {sig = rep(sig, length(Y_obs))} - sum(log(dpoilog(Y_obs, mu + log(phi * d * lambda_ref), sig))) -} - #' fit a PLN model by maximum likelihood #' @param Y_obs numeric vector Gene expression counts #' @param lambda_ref numeric vector Reference expression levels diff --git a/docker/Dockerfile b/docker/Dockerfile index e54293e4..c620741f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,10 +1,9 @@ -FROM r-base:4.2.1 +FROM r-base:4.3.1 -LABEL authors="Evan Biederstedt , Teng Gao " \ +LABEL authors="Evan Biederstedt , Teng Gao " \ version.image="0.1.3" \ version.pagoda2="0.1.3" \ - description="r-base image R 4.2.1 to run numbat with R" - + description="r-base image to run numbat with R" RUN apt-get update --yes && apt-get install --yes build-essential \ libcurl4-gnutls-dev libxml2-dev libssl-dev libbz2-dev zlib1g-dev \ @@ -14,16 +13,33 @@ RUN apt-get update --yes && apt-get install --yes build-essential \ libglpk-dev git autoconf gettext libtool automake \ samtools sudo - RUN cd /usr/bin && \ wget https://github.com/samtools/htslib/releases/download/1.15.1/htslib-1.15.1.tar.bz2 && \ tar -vxjf htslib-1.15.1.tar.bz2 && cd htslib-1.15.1 && make && sudo make install +RUN R -e 'chooseCRANmirror(ind=42); install.packages("BiocManager")' + +RUN R -e 'chooseCRANmirror(ind=42); install.packages("ragg")' + +RUN R -e 'chooseCRANmirror(ind=42); install.packages("pkgdown")' + +RUN R -e 'chooseCRANmirror(ind=42); install.packages("devtools")' + +RUN R -e 'devtools::install_github("YuLab-SMU/ggtree", dependencies=TRUE)' + +RUN R -e 'devtools::install_github("kharchenkolab/numbat", dependencies=TRUE)' + +RUN git clone https://github.com/kharchenkolab/numbat.git + +RUN mkdir -p /tmp && chmod 777 /tmp + +RUN chmod 777 /numbat/inst/bin/pileup_and_phase.R + RUN mkdir data # hg 38 -RUN cd /data && wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz && gzip -d genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz +RUN cd /data && wget -q https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz && gzip -d genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz ## Note: not run; the user can follow these commands if they want hg19 @@ -32,7 +48,7 @@ RUN cd /data && wget https://sourceforge.net/projects/cellsnp/files/SNPlist/geno # hg38 -RUN cd /data && wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip && unzip 1000G_hg38.zip +RUN cd /data && wget -q http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip && unzip 1000G_hg38.zip # hg19 ##RUN cd /data && wget http://pklab.med.harvard.edu/teng/data/1000G_hg19.zip && unzip 1000G_hg19.zip @@ -42,26 +58,7 @@ RUN cd /data && wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip && un RUN git clone https://github.com/single-cell-genetics/cellsnp-lite.git && cd cellsnp-lite && \ autoreconf -iv && ./configure && make && sudo make install -RUN wget https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz && cd .. && tar -xvzf Eagle_v2.4.1.tar.gz && cd /Eagle_v2.4.1 && cp eagle /usr/bin - - - -RUN R -e 'chooseCRANmirror(ind=42); install.packages("BiocManager")' - -RUN R -e 'chooseCRANmirror(ind=42); install.packages("ragg")' - -RUN R -e 'chooseCRANmirror(ind=42); install.packages("pkgdown")' - -RUN R -e 'chooseCRANmirror(ind=42); install.packages("devtools")' - - -RUN R -e 'devtools::install_github("kharchenkolab/numbat", dependencies=TRUE)' - -RUN git clone https://github.com/kharchenkolab/numbat.git - -RUN mkdir -p /tmp && chmod 777 /tmp - -RUN chmod 777 /numbat/inst/bin/pileup_and_phase.R +RUN wget -q https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/Eagle_v2.4.1.tar.gz && cd .. && tar -xvzf Eagle_v2.4.1.tar.gz && cd /Eagle_v2.4.1 && cp eagle /usr/bin WORKDIR /numbat diff --git a/inst/bin/pileup_and_phase.R b/inst/bin/pileup_and_phase.R index cbf16d7d..8bcceb0c 100644 --- a/inst/bin/pileup_and_phase.R +++ b/inst/bin/pileup_and_phase.R @@ -11,12 +11,12 @@ parser = add_option(parser, '--gmap', default = NULL, type = "character", help = parser = add_option(parser, '--eagle', type = "character", default = 'eagle', help = "Path to Eagle2 binary file") parser = add_option(parser, '--snpvcf', default = NULL, type = "character", help = "SNP VCF for pileup") parser = add_option(parser, '--paneldir', default = NULL, type = "character", help = "Directory to phasing reference panel (BCF files)") -parser = add_option(parser, '--outdir', default = './pileup_and_phase', type = "character", help = "Output directory") +parser = add_option(parser, '--outdir', default = NULL, type = "character", help = "Output directory") parser = add_option(parser, '--ncores', default = 1, type = "integer", help = "Number of cores") parser = add_option(parser, '--UMItag', default = "Auto", type = "character", help = "UMI tag in bam; should be Auto for 10x and XM for Slide-seq") parser = add_option(parser, '--cellTAG', default = "CB", type = "character", help = "Cell tag in bam; should be CB for 10x and XC for Slide-seq") parser = add_option(parser, '--smartseq', default = FALSE, action = 'store_true', help = "Running with SMART-seq mode; Supply a txt file containing directories of BAM files to --bams and a txt file containing cell names to --barcodes (each entry on its own line for both; ordering must match).") -parser = add_option(parser, '--bulk', default = FALSE, action = 'store_true', help = "Running with bulk RNA-seq mode") +parser = add_option(parser, '--bulk', default = FALSE, action = 'store_true', help = "Running with bulk RNA-seq mode; supply --bams and --samples but not --barcodes.") args <- parse_args(parser) suppressPackageStartupMessages({ @@ -29,9 +29,25 @@ suppressPackageStartupMessages({ library(numbat) }) -# required args -if (any(is.null(c(args$bams, args$barcodes, args$gmap, args$snpvcf, args$paneldir)))) { - stop('Missing one or more required arguments: bams, barcodes, gmap, snpvcf, paneldir') +if (any(sapply(list(args$bams, args$snpvcf, args$outdir, args$paneldir, args$gmap), is.null))) { + stop('Missing one or more always required arguments: --bams, --snpvcf, --outdir, --paneldir, --gmap') +} + +if (args$smartseq) { + if (any(sapply(list(args$barcodes), is.null))) { + stop('Missing one or more required arguments for smartseq mode: --barcodes') + } + mode = 'SMART-Seq' +} else if (args$bulk) { + if (any(sapply(list(args$samples), is.null))) { + stop('Missing one or more required arguments for bulk mode: --samples') + } + mode = 'Bulk' +} else { + if (any(sapply(list(args$samples, args$barcodes), is.null))) { + stop('Missing one or more required arguments for 10x mode: --samples, --barcodes') + } + mode = '10X' } label = args$label @@ -52,8 +68,39 @@ cellTAG = args$cellTAG smartseq = args$smartseq bulk = args$bulk genome = ifelse(str_detect(args$gmap, 'hg19'), 'hg19', 'hg38') + +message(paste0('Running in ', mode, ' mode')) message(paste0('Using genome version: ', genome)) +## check if files exist +for (bam in bams) { + if (!file.exists(bam)) { + stop('BAM file not found') + } +} + +if (!file.exists(snpvcf)) { + stop('SNP VCF not found') +} + + +if (!is.null(barcodes)) { + for (barcode in barcodes) { + if (!file.exists(barcode)) { + stop('Barcode file not found') + } + } +} + +if (!file.exists(gmap)) { + stop('Genetic map not found') +} + +if (!file.exists(paneldir)) { + stop('Phasing reference panel not found') +} + + dir.create(outdir, showWarnings = FALSE, recursive = TRUE) dir.create(glue('{outdir}/pileup'), showWarnings = FALSE) dir.create(glue('{outdir}/phasing'), showWarnings = FALSE) diff --git a/man/calc_trans_mat.Rd b/man/calc_trans_mat.Rd deleted file mode 100644 index 5e322034..00000000 --- a/man/calc_trans_mat.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{calc_trans_mat} -\alias{calc_trans_mat} -\title{Calculate the transition matrix for joint HMM} -\usage{ -calc_trans_mat(t, p_s, w, states_cn, states_phase) -} -\arguments{ -\item{t}{numeric; CNV state transition probability} - -\item{p_s}{numeric; phase switch probability} - -\item{w}{numeric; relative abundance of states} - -\item{states_cn}{character; CNV states} - -\item{states_phase}{character; haplotype phase states} -} -\value{ -array; transition matrix -} -\description{ -Calculate the transition matrix for joint HMM -} -\keyword{internal} diff --git a/man/dbbinom.Rd b/man/dbbinom.Rd deleted file mode 100644 index f8266053..00000000 --- a/man/dbbinom.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{dbbinom} -\alias{dbbinom} -\title{Beta-binomial distribution density function -A distribution is beta-binomial if p, the probability of success, -in a binomial distribution has a beta distribution with shape -parameters alpha > 0 and beta > 0 -For more details, see extraDistr::dbbinom} -\usage{ -dbbinom(x, size, alpha = 1, beta = 1, log = FALSE) -} -\arguments{ -\item{x}{vector of quantiles} - -\item{size}{number of trials (zero or more)} - -\item{alpha}{numeric (default=1)} - -\item{beta}{numeric (default=1)} - -\item{log}{boolean (default=FALSE)} -} -\value{ -density values returned as numeric vector -} -\description{ -Beta-binomial distribution density function -A distribution is beta-binomial if p, the probability of success, -in a binomial distribution has a beta distribution with shape -parameters alpha > 0 and beta > 0 -For more details, see extraDistr::dbbinom -} -\keyword{internal} diff --git a/man/dgpois.Rd b/man/dgpois.Rd deleted file mode 100644 index 5c6a14f5..00000000 --- a/man/dgpois.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{dgpois} -\alias{dgpois} -\title{Density function for a gamma-poisson distribution} -\usage{ -dgpois(x, shape, rate, scale = 1/rate, log = FALSE) -} -\description{ -Density function for a gamma-poisson distribution -} -\keyword{internal} diff --git a/man/dpoilog.Rd b/man/dpoilog.Rd deleted file mode 100644 index 816b5ec8..00000000 --- a/man/dpoilog.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{dpoilog} -\alias{dpoilog} -\title{Returns the density for the Poisson lognormal distribution with parameters mu and sig} -\usage{ -dpoilog(x, mu, sig, log = FALSE) -} -\arguments{ -\item{x}{vector of integers, the observations} - -\item{mu}{mean of lognormal distribution} - -\item{sig}{standard deviation of lognormal distribution} - -\item{log}{boolean Return the log density if TRUE (default=FALSE)} -} -\description{ -Returns the density for the Poisson lognormal distribution with parameters mu and sig -} -\keyword{internal} diff --git a/man/fit_gpois.Rd b/man/fit_gpois.Rd deleted file mode 100644 index f7688725..00000000 --- a/man/fit_gpois.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{fit_gpois} -\alias{fit_gpois} -\title{fit a Gamma-Poisson model by maximum likelihood} -\usage{ -fit_gpois(Y_obs, lambda_ref, d) -} -\arguments{ -\item{Y_obs}{numeric vector Gene expression counts} - -\item{lambda_ref}{numeric vector Reference expression levels} - -\item{d}{numeric Total library size} -} -\value{ -numeric MLE of alpha and beta -} -\description{ -fit a Gamma-Poisson model by maximum likelihood -} -\keyword{internal} diff --git a/man/forward_back_allele.Rd b/man/forward_back_allele.Rd deleted file mode 100644 index 2f03aa80..00000000 --- a/man/forward_back_allele.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{forward_back_allele} -\alias{forward_back_allele} -\title{Forward-backward algorithm for allele HMM} -\usage{ -forward_back_allele(hmm) -} -\arguments{ -\item{hmm}{HMM object; expect variables x (allele depth), d (total depth), -logPi (log transition prob matrix), delta (prior for each state), -alpha (alpha for each state), beta (beta for each state), -states (states), p_s (phase switch probs)} -} -\description{ -Forward-backward algorithm for allele HMM -} -\keyword{internal} diff --git a/man/get_trans_probs.Rd b/man/get_trans_probs.Rd deleted file mode 100644 index 9e71ef52..00000000 --- a/man/get_trans_probs.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{get_trans_probs} -\alias{get_trans_probs} -\title{Helper function to calculate transition porbabilities -cn/phase are sclars, only p_s is vectorized} -\usage{ -get_trans_probs(t, p_s, w, cn_from, phase_from, cn_to, phase_to) -} -\arguments{ -\item{t}{numeric; CNV state transition probability} - -\item{p_s}{numeric; phase switch probability} - -\item{w}{numeric; relative abundance of states} - -\item{cn_from}{character; originating CNV state} - -\item{phase_from}{character; originating haplotype phase state} - -\item{cn_to}{character; destination CNV state} - -\item{phase_to}{character; destination haplotype phase state} -} -\value{ -numeric; transition probability -} -\description{ -Helper function to calculate transition porbabilities -cn/phase are sclars, only p_s is vectorized -} -\keyword{internal} diff --git a/man/l_bbinom.Rd b/man/l_bbinom.Rd deleted file mode 100644 index 4325edda..00000000 --- a/man/l_bbinom.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{l_bbinom} -\alias{l_bbinom} -\title{calculate joint likelihood of allele data} -\usage{ -l_bbinom(AD, DP, alpha, beta) -} -\arguments{ -\item{AD}{numeric vector Variant allele depth} - -\item{DP}{numeric vector Total allele depth} - -\item{alpha}{numeric Alpha parameter of Beta-Binomial distribution} - -\item{beta}{numeric Beta parameter of Beta-Binomial distribution} -} -\value{ -numeric Joint log likelihood -} -\description{ -calculate joint likelihood of allele data -} -\keyword{internal} diff --git a/man/l_gpois.Rd b/man/l_gpois.Rd deleted file mode 100644 index 72ce8e17..00000000 --- a/man/l_gpois.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{l_gpois} -\alias{l_gpois} -\title{calculate joint likelihood of a gamma-poisson model} -\usage{ -l_gpois(Y_obs, lambda_ref, d, alpha, beta, phi = 1) -} -\arguments{ -\item{Y_obs}{numeric vector Gene expression counts} - -\item{lambda_ref}{numeric vector Reference expression levels} - -\item{d}{numeric Total library size} - -\item{alpha}{numeric Alpha parameter of Gamma-Poisson distribution} - -\item{beta}{numeric Beta parameter of Gamma-Poisson distribution} -} -\value{ -numeric Joint log likelihood -} -\description{ -calculate joint likelihood of a gamma-poisson model -} -\keyword{internal} diff --git a/man/l_lnpois.Rd b/man/l_lnpois.Rd deleted file mode 100644 index c8018053..00000000 --- a/man/l_lnpois.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{l_lnpois} -\alias{l_lnpois} -\title{calculate joint likelihood of a PLN model} -\usage{ -l_lnpois(Y_obs, lambda_ref, d, mu, sig, phi = 1) -} -\arguments{ -\item{Y_obs}{numeric vector Gene expression counts} - -\item{lambda_ref}{numeric vector Reference expression levels} - -\item{d}{numeric Total library size} -} -\value{ -numeric Joint log likelihood -} -\description{ -calculate joint likelihood of a PLN model -} -\keyword{internal} diff --git a/man/likelihood_allele.Rd b/man/likelihood_allele.Rd deleted file mode 100644 index 93ff3e30..00000000 --- a/man/likelihood_allele.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{likelihood_allele} -\alias{likelihood_allele} -\title{Only compute total log likelihood from an allele HMM} -\usage{ -likelihood_allele(hmm) -} -\arguments{ -\item{hmm}{HMM object; expect variables x (allele depth), d (total depth), -logPi (log transition prob matrix), delta (prior for each state), -alpha (alpha for each state), beta (beta for each state), -states (states), p_s (phase switch probs)} -} -\description{ -Only compute total log likelihood from an allele HMM -} -\keyword{internal} diff --git a/man/run_allele_hmm.Rd b/man/run_allele_hmm.Rd deleted file mode 100644 index 087dd635..00000000 --- a/man/run_allele_hmm.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{run_allele_hmm} -\alias{run_allele_hmm} -\title{allele-only HMM} -\usage{ -run_allele_hmm( - pAD, - DP, - p_s, - t = 1e-05, - theta_min = 0.08, - gamma = 20, - prior = NULL -) -} -\arguments{ -\item{pAD}{integer vector Paternal allele counts} - -\item{DP}{integer vector Total alelle counts} - -\item{p_s}{numeric vector Phase switch probabilities} - -\item{t}{numeric Transition probability between copy number states} - -\item{theta_min}{numeric Minimum haplotype frequency deviation threshold} - -\item{gamma}{numeric Overdispersion in the allele-specific expression} -} -\value{ -character vector Decoded states -} -\description{ -allele-only HMM -} -\keyword{internal} diff --git a/man/run_joint_hmm.Rd b/man/run_joint_hmm.Rd deleted file mode 100644 index 44d1521d..00000000 --- a/man/run_joint_hmm.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{run_joint_hmm} -\alias{run_joint_hmm} -\title{Run joint HMM on a pseudobulk profile} -\usage{ -run_joint_hmm( - pAD, - DP, - p_s, - Y_obs = 0, - lambda_ref = 0, - d_total = 0, - theta_min = 0.08, - theta_neu = 0, - bal_cnv = TRUE, - phi_del = 2^(-0.25), - phi_amp = 2^(0.25), - phi_bamp = phi_amp, - phi_bdel = phi_del, - alpha = 1, - beta = 1, - mu = 0, - sig = 1, - t = 1e-05, - gamma = 18, - prior = NULL, - exp_only = FALSE, - allele_only = FALSE, - classify_allele = FALSE, - phasing = TRUE, - debug = FALSE -) -} -\arguments{ -\item{pAD}{integer vector Paternal allele counts} - -\item{DP}{integer vector Total alelle counts} - -\item{p_s}{numeric vector Phase switch probabilities} - -\item{Y_obs}{numeric vector Observed gene counts} - -\item{lambda_ref}{numeric vector Reference expression rates} - -\item{d_total}{integer Total library size for expression counts} - -\item{theta_min}{numeric Minimum haplotype imbalance threshold} - -\item{phi_del}{numeric Expected fold change for deletion} - -\item{phi_amp}{numeric Expected fold change for amplification} - -\item{phi_bamp}{numeric Expected fold change for balanced amplification} - -\item{phi_bdel}{numeric Expected fold change for balanced deletion} - -\item{mu}{numeric Global expression bias} - -\item{sig}{numeric Global expression variance} - -\item{t}{numeric Transition probability between copy number states} - -\item{gamma}{numeric Overdispersion in the allele-specific expression} - -\item{exp_only}{logical Whether to only use expression data} - -\item{allele_only}{logical Whether to only use allele data} -} -\description{ -Run joint HMM on a pseudobulk profile -} -\keyword{internal} diff --git a/man/viterbi_allele.Rd b/man/viterbi_allele.Rd deleted file mode 100644 index 7ffa6a1d..00000000 --- a/man/viterbi_allele.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{viterbi_allele} -\alias{viterbi_allele} -\title{Viterbi algorithm for allele HMM} -\usage{ -viterbi_allele(hmm) -} -\arguments{ -\item{hmm}{HMM object; expect variables x (allele depth), d (total depth), -logPi (log transition prob matrix), delta (prior for each state), -alpha (alpha for each state), beta (beta for each state), -states (states), p_s (phase switch probs)} -} -\description{ -Viterbi algorithm for allele HMM -} -\keyword{internal} diff --git a/man/viterbi_joint.Rd b/man/viterbi_joint.Rd deleted file mode 100644 index 0ce57e53..00000000 --- a/man/viterbi_joint.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hmm.R -\name{viterbi_joint} -\alias{viterbi_joint} -\title{Generalized viterbi algorithm for joint HMM} -\usage{ -viterbi_joint(hmm) -} -\arguments{ -\item{hmm}{HMM object; expect variables x (allele depth), d (total depth), y (expression count), l (cell total library size), -lambda (reference expression rate), mu (global expression mean), sig (global expression standard deviation), -logPi (log transition prob matrix), phi (expression fold change for each state), delta (prior for each state), -alpha (alpha for each state), beta (beta for each state), states (states), p_s (phase switch probs)} -} -\description{ -Generalized viterbi algorithm for joint HMM -} -\keyword{internal} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2db09143..57445a49 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,93 +11,6 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif -// cppdbbinom -NumericVector cppdbbinom(const NumericVector& x, const NumericVector& size, const NumericVector& alpha, const NumericVector& beta, const bool& log_prob); -RcppExport SEXP _numbat_cppdbbinom(SEXP xSEXP, SEXP sizeSEXP, SEXP alphaSEXP, SEXP betaSEXP, SEXP log_probSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type size(sizeSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type alpha(alphaSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type beta(betaSEXP); - Rcpp::traits::input_parameter< const bool& >::type log_prob(log_probSEXP); - rcpp_result_gen = Rcpp::wrap(cppdbbinom(x, size, alpha, beta, log_prob)); - return rcpp_result_gen; -END_RCPP -} -// cpp_dgpois -NumericVector cpp_dgpois(const NumericVector& x, const NumericVector& alpha, const NumericVector& beta, const bool& log_prob); -RcppExport SEXP _numbat_cpp_dgpois(SEXP xSEXP, SEXP alphaSEXP, SEXP betaSEXP, SEXP log_probSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const NumericVector& >::type x(xSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type alpha(alphaSEXP); - Rcpp::traits::input_parameter< const NumericVector& >::type beta(betaSEXP); - Rcpp::traits::input_parameter< const bool& >::type log_prob(log_probSEXP); - rcpp_result_gen = Rcpp::wrap(cpp_dgpois(x, alpha, beta, log_prob)); - return rcpp_result_gen; -END_RCPP -} -// logSumExp -double logSumExp(const arma::vec& x); -RcppExport SEXP _numbat_logSumExp(SEXP xSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const arma::vec& >::type x(xSEXP); - rcpp_result_gen = Rcpp::wrap(logSumExp(x)); - return rcpp_result_gen; -END_RCPP -} -// likelihood_compute -double likelihood_compute(Rcpp::NumericVector logphi, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m); -RcppExport SEXP _numbat_likelihood_compute(SEXP logphiSEXP, SEXP logprobSEXP, SEXP logPiSEXP, SEXP nSEXP, SEXP mSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::NumericVector >::type logphi(logphiSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type logprob(logprobSEXP); - Rcpp::traits::input_parameter< arma::cube >::type logPi(logPiSEXP); - Rcpp::traits::input_parameter< int >::type n(nSEXP); - Rcpp::traits::input_parameter< int >::type m(mSEXP); - rcpp_result_gen = Rcpp::wrap(likelihood_compute(logphi, logprob, logPi, n, m)); - return rcpp_result_gen; -END_RCPP -} -// forward_backward_compute -Rcpp::NumericMatrix forward_backward_compute(Rcpp::NumericVector logphi, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m); -RcppExport SEXP _numbat_forward_backward_compute(SEXP logphiSEXP, SEXP logprobSEXP, SEXP logPiSEXP, SEXP nSEXP, SEXP mSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::NumericVector >::type logphi(logphiSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type logprob(logprobSEXP); - Rcpp::traits::input_parameter< arma::cube >::type logPi(logPiSEXP); - Rcpp::traits::input_parameter< int >::type n(nSEXP); - Rcpp::traits::input_parameter< int >::type m(mSEXP); - rcpp_result_gen = Rcpp::wrap(forward_backward_compute(logphi, logprob, logPi, n, m)); - return rcpp_result_gen; -END_RCPP -} -// viterbi_compute -Rcpp::NumericVector viterbi_compute(Rcpp::NumericVector log_delta, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m, Rcpp::NumericMatrix nu, Rcpp::NumericVector z); -RcppExport SEXP _numbat_viterbi_compute(SEXP log_deltaSEXP, SEXP logprobSEXP, SEXP logPiSEXP, SEXP nSEXP, SEXP mSEXP, SEXP nuSEXP, SEXP zSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::NumericVector >::type log_delta(log_deltaSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type logprob(logprobSEXP); - Rcpp::traits::input_parameter< arma::cube >::type logPi(logPiSEXP); - Rcpp::traits::input_parameter< int >::type n(nSEXP); - Rcpp::traits::input_parameter< int >::type m(mSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type nu(nuSEXP); - Rcpp::traits::input_parameter< Rcpp::NumericVector >::type z(zSEXP); - rcpp_result_gen = Rcpp::wrap(viterbi_compute(log_delta, logprob, logPi, n, m, nu, z)); - return rcpp_result_gen; -END_RCPP -} // roman2int_internal int roman2int_internal(Rcpp::StringVector letters, int nchar); RcppExport SEXP _numbat_roman2int_internal(SEXP lettersSEXP, SEXP ncharSEXP) { @@ -110,60 +23,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// fit_lnpois_cpp -arma::rowvec fit_lnpois_cpp(std::vector Y_obs, std::vector lambda_ref, int d); -RcppExport SEXP _numbat_fit_lnpois_cpp(SEXP Y_obsSEXP, SEXP lambda_refSEXP, SEXP dSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type Y_obs(Y_obsSEXP); - Rcpp::traits::input_parameter< std::vector >::type lambda_ref(lambda_refSEXP); - Rcpp::traits::input_parameter< int >::type d(dSEXP); - rcpp_result_gen = Rcpp::wrap(fit_lnpois_cpp(Y_obs, lambda_ref, d)); - return rcpp_result_gen; -END_RCPP -} -// poilog1 -std::vector poilog1(std::vector x, std::vector my, std::vector sig); -RcppExport SEXP _numbat_poilog1(SEXP xSEXP, SEXP mySEXP, SEXP sigSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::vector >::type my(mySEXP); - Rcpp::traits::input_parameter< std::vector >::type sig(sigSEXP); - rcpp_result_gen = Rcpp::wrap(poilog1(x, my, sig)); - return rcpp_result_gen; -END_RCPP -} -// l_lnpois_cpp -double l_lnpois_cpp(std::vector Y_obs, std::vector lambda_ref, int d, double mu, double sig, double phi); -RcppExport SEXP _numbat_l_lnpois_cpp(SEXP Y_obsSEXP, SEXP lambda_refSEXP, SEXP dSEXP, SEXP muSEXP, SEXP sigSEXP, SEXP phiSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type Y_obs(Y_obsSEXP); - Rcpp::traits::input_parameter< std::vector >::type lambda_ref(lambda_refSEXP); - Rcpp::traits::input_parameter< int >::type d(dSEXP); - Rcpp::traits::input_parameter< double >::type mu(muSEXP); - Rcpp::traits::input_parameter< double >::type sig(sigSEXP); - Rcpp::traits::input_parameter< double >::type phi(phiSEXP); - rcpp_result_gen = Rcpp::wrap(l_lnpois_cpp(Y_obs, lambda_ref, d, mu, sig, phi)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { - {"_numbat_cppdbbinom", (DL_FUNC) &_numbat_cppdbbinom, 5}, - {"_numbat_cpp_dgpois", (DL_FUNC) &_numbat_cpp_dgpois, 4}, - {"_numbat_logSumExp", (DL_FUNC) &_numbat_logSumExp, 1}, - {"_numbat_likelihood_compute", (DL_FUNC) &_numbat_likelihood_compute, 5}, - {"_numbat_forward_backward_compute", (DL_FUNC) &_numbat_forward_backward_compute, 5}, - {"_numbat_viterbi_compute", (DL_FUNC) &_numbat_viterbi_compute, 7}, {"_numbat_roman2int_internal", (DL_FUNC) &_numbat_roman2int_internal, 2}, - {"_numbat_fit_lnpois_cpp", (DL_FUNC) &_numbat_fit_lnpois_cpp, 3}, - {"_numbat_poilog1", (DL_FUNC) &_numbat_poilog1, 3}, - {"_numbat_l_lnpois_cpp", (DL_FUNC) &_numbat_l_lnpois_cpp, 6}, {NULL, NULL, 0} }; diff --git a/src/bbinom.cpp b/src/bbinom.cpp deleted file mode 100644 index b4007773..00000000 --- a/src/bbinom.cpp +++ /dev/null @@ -1,161 +0,0 @@ - -#include - -// https://github.com/twolodzko/extraDistr/blob/master/src/beta-binomial-distribution.cpp - -using std::pow; -using std::sqrt; -using std::abs; -using std::exp; -using std::log; -using std::floor; -using std::ceil; -using Rcpp::NumericVector; - - -// MACROS - -#define GETV(x, i) x[i % x.length()] // wrapped indexing of vector -#define GETM(x, i, j) x(i % x.nrow(), j) // wrapped indexing of matrix -#define VALID_PROB(p) ((p >= 0.0) && (p <= 1.0)) - - -bool isInteger(double x, bool warn = true); - - -bool isInteger(double x, bool warn) { - if (ISNAN(x)) - return false; - if (((x < 0.0) ? std::ceil(x) : std::floor(x)) != x) { - if (warn) { - char msg[55]; - std::snprintf(msg, sizeof(msg), "non-integer: %f", x); - Rcpp::warning(msg); - } - return false; - } - return true; -} - - -inline double logpmf_bbinom(double k, double n, double alpha, - double beta, bool& throw_warning) { -#ifdef IEEE_754 - if (ISNAN(k) || ISNAN(n) || ISNAN(alpha) || ISNAN(beta)) - return k+n+alpha+beta; -#endif - if (alpha < 0.0 || beta < 0.0 || n < 0.0 || !isInteger(n, false)) { - throw_warning = true; - return NAN; - } - if (!isInteger(k) || k < 0.0 || k > n) - return R_NegInf; - // R::choose(n, k) * R::beta(k+alpha, n-k+beta) / R::beta(alpha, beta); - return R::lchoose(n, k) + R::lbeta(k+alpha, n-k+beta) - R::lbeta(alpha, beta); -} - - -// https://github.com/twolodzko/extraDistr/blob/master/src/beta-binomial-distribution.cpp - -// [[Rcpp::export]] -NumericVector cppdbbinom( - const NumericVector& x, - const NumericVector& size, - const NumericVector& alpha, - const NumericVector& beta, - const bool& log_prob = false - ) { - - if (std::min({x.length(), size.length(), - alpha.length(), beta.length()}) < 1) { - return NumericVector(0); - } - - int Nmax = std::max({ - x.length(), - size.length(), - alpha.length(), - beta.length() - }); - NumericVector p(Nmax); - - bool throw_warning = false; - - for (int i = 0; i < Nmax; i++){ - p[i] = logpmf_bbinom(GETV(x, i), GETV(size, i), - GETV(alpha, i), GETV(beta, i), - throw_warning); - } - - if (!log_prob){ - p = Rcpp::exp(p); - } - - if (throw_warning){ - Rcpp::warning("NaNs produced"); - } - - return p; -} - - -// https://github.com/cran/extraDistr/blob/19708536de6549404dec52588a36f4b598d0016a/src/gamma-poisson-distribution.cpp - -inline double lfactorial(double x) { - return R::lgammafn(x + 1.0); -} - -inline double logpmf_gpois(double x, double alpha, double beta, - bool& throw_warning) { -#ifdef IEEE_754 - if (ISNAN(x) || ISNAN(alpha) || ISNAN(beta)) - return x+alpha+beta; -#endif - if (alpha <= 0.0 || beta <= 0.0) { - throw_warning = true; - return NAN; - } - if (!isInteger(x) || x < 0.0 || !R_FINITE(x)) - return R_NegInf; - // p = beta/(1.0+beta); - double p = exp( log(beta) - log1p(beta) ); - return R::lgammafn(alpha+x) - lfactorial(x) - R::lgammafn(alpha) + - log(p)*x + log(1.0-p)*alpha; -} - -// [[Rcpp::export]] -NumericVector cpp_dgpois( - const NumericVector& x, - const NumericVector& alpha, - const NumericVector& beta, - const bool& log_prob = false - ) { - - if (std::min({x.length(), alpha.length(), beta.length()}) < 1) { - return NumericVector(0); - } - - int Nmax = std::max({ - x.length(), - alpha.length(), - beta.length() - }); - NumericVector p(Nmax); - - bool throw_warning = false; - - for (int i = 0; i < Nmax; i++) - p[i] = logpmf_gpois(GETV(x, i), GETV(alpha, i), - GETV(beta, i), throw_warning); - - if (!log_prob) - p = Rcpp::exp(p); - - if (throw_warning) - Rcpp::warning("NaNs produced"); - - return p; -} - - - diff --git a/src/hmm.cpp b/src/hmm.cpp deleted file mode 100644 index 7ed5c2cc..00000000 --- a/src/hmm.cpp +++ /dev/null @@ -1,194 +0,0 @@ - -#include - -using namespace Rcpp; - - -#ifdef HAVE_LONG_DOUBLE -# define LDOUBLE long double -# define EXPL expl -#else -# define LDOUBLE double -# define EXPL exp -#endif - -// [[Rcpp::export]] -double logSumExp(const arma::vec& x) { - // https://github.com/helske/seqHMM/blob/master/src/logSumExp.cpp - unsigned int maxi = x.index_max(); - LDOUBLE maxv = x(maxi); - if (!(maxv > -arma::datum::inf)) { - return -arma::datum::inf; - } - LDOUBLE cumsum = 0.0; - for (unsigned int i = 0; i < x.n_elem; i++) { - if ((i != maxi) & (x(i) > -arma::datum::inf)) { - cumsum += EXPL(x(i) - maxv); - } - } - - return maxv + log1p(cumsum); -} - - -// expensive for loops in likelihood_allele() and forward_back_allele)() - -// [[Rcpp::export]] -double likelihood_compute(Rcpp::NumericVector logphi, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m) { - - double LL = 0.0; - - for (int i = 0; i < n; i++) { - - if (i > 0) { - Rcpp::NumericVector logphi_new(m); - for (int j = 0; j < m; j++) { - Rcpp::NumericMatrix subset_logPi = wrap(logPi.slice(i)); - Rcpp::NumericVector logphi_logPi = logphi + subset_logPi(_, j); - logphi_new[j] = logSumExp(logphi_logPi); - } - logphi = logphi_new; - } - - logphi = logphi + logprob(i, _); // Note: logprob(i, _) is Rcpp::NumericVector - - double logSumPhi = logSumExp(logphi); - - logphi = logphi - logSumPhi; - - LL = LL + logSumPhi; - } - - return LL; -} - - -// [[Rcpp::export]] -Rcpp::NumericMatrix forward_backward_compute(Rcpp::NumericVector logphi, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m) { - - const int nrow = n; - const int ncol = m; - Rcpp::NumericMatrix logalpha(nrow, ncol); // logalpha <- matrix(as.double(rep(0, m * n)), nrow = n) - - double lscale = 0.0; - double LL = 0.0; - - for (int t = 0; t < n; t++) { - - if (t > 0) { - Rcpp::NumericVector logphi_new(m); - for (int j = 0; j < m; j++) { - Rcpp::NumericMatrix subset_logPi = wrap(logPi.slice(t)); - Rcpp::NumericVector logphi_logPi = logphi + subset_logPi(_, j); - logphi_new[j] = logSumExp(logphi_logPi); - } - logphi = logphi_new; - } - - logphi = logphi + logprob(t, _); // Note: logprob(i, _) is Rcpp::NumericVector - - double logSumPhi = logSumExp(logphi); - - logphi = logphi - logSumPhi; - - lscale = lscale + logSumPhi; - - logalpha(t, _) = logphi + lscale; - - } - - LL = lscale; - - Rcpp::NumericMatrix logbeta(nrow, ncol); // logalpha <- matrix(as.double(rep(0, m * n)), nrow = n) - Rcpp::NumericVector logphi_(m); // logphi <- log(as.double(rep(1/m, m))) - for (int i = 0; i < m; i++){ - double mval = static_cast(m); - logphi_[i] = std::log(1/mval); - } - - double lscale_ = std::log(m); - - for (int t = n-2; t>=0; t--) { - - // logphi = sapply(1:m, function(i) matrixStats::logSumExp(logphi + logprob[t+1,] + logPi[[t+1]][i,])) - - Rcpp::NumericVector logphinew(m); - for (int j = 0; j < m; j++) { - Rcpp::NumericMatrix subset_logPi = wrap(logPi.slice(t+1)); - Rcpp::NumericVector logphi_logprob_logPi = logphi_ + logprob(t+1, _) + subset_logPi(j, _); - logphinew[j] = logSumExp(logphi_logprob_logPi); - } - logphi_ = logphinew; - - logbeta(t, _) = logphi_ + lscale_; - - double logSumPhi = logSumExp(logphi_); - - logphi_ = logphi_ - logSumPhi; - - lscale_ = lscale_ + logSumPhi; - - } - - // p_up = exp(logalpha + logbeta - LL)[,1] - - // matrix addition - Rcpp::NumericMatrix expoutput(nrow, ncol); - for (int i = 0; i < nrow; i++) { - for (int j = 0; j < ncol; j++) { - double sum = logalpha(i, j) + logbeta(i, j) - LL; - expoutput(i, j) = std::exp(sum); // Rcpp::exp() works with vectors - } - } - - return expoutput; -} - -// [[Rcpp::export]] -Rcpp::NumericVector viterbi_compute(Rcpp::NumericVector log_delta, Rcpp::NumericMatrix logprob, arma::cube logPi, int n, int m, Rcpp::NumericMatrix nu, Rcpp::NumericVector z) { - - const int nrow = n; - const int ncol = m; - - nu(0, _) = log_delta + logprob(0, _); // nu[1, ] <- log(hmm$delta) + logprob[1,] - - Rcpp::NumericMatrix matrixnu(ncol, ncol); - for (int i = 1; i < nrow; i++) { - Rcpp::NumericVector nu_vec = nu(i - 1, _); - for (int j = 0; j < ncol; j++) { - matrixnu(j, _) = rep(nu_vec[j], ncol); - } - // nu[i, ] = apply(matrixnu + hmm$logPi[,,i], 2, max) - // Step 1) - // Add the two matrices matrixnu + hmm$logPi[,,i] - Rcpp::NumericMatrix subset_logPi = wrap(logPi.slice(i)); - Rcpp::NumericMatrix sum_matrixnu_logPi(ncol, ncol); - for (int ii = 0; ii < ncol; ii++) { - for (int jj = 0; jj < ncol; jj++) { - sum_matrixnu_logPi(ii, jj) = matrixnu(ii, jj) + subset_logPi(ii, jj); - } - } - // Step 2) - // Calculate the 'max' for each column, return a NumericVector with 'max' for each column - Rcpp::NumericVector newnu; - for (int k = 0; k < ncol; k++) { - newnu.push_back(max(sum_matrixnu_logPi(_, k))); - } - nu(i, _) = newnu; // nu[i, ] = apply(matrixnu + hmm$logPi[,,i], 2, max) - - nu(i, _) = nu(i, _) + logprob(i, _); // nu[i, ] = nu[i, ] + logprob[i,] - - } - - z[n - 1] = which_max(nu(n-1, _)) + 1; // which_max() uses 0-indexing, so add 1 - - // for (i in seq(N - 1, 1, -1)) z[i] <- which.max(hmm$logPi[,,i+1][, z[i+1]] + nu[i, ]) - for (int t = n-2; t>=0; t--) { - Rcpp::NumericMatrix subset_logPi = wrap(logPi.slice(t+1)); - // note: 'z[t+1] - 1' as we convert R 1-indexing to C++ 0-indexing - Rcpp::NumericVector logPi_nu = subset_logPi(_, z[t+1] - 1) + nu(t, _); - z[t] = which_max(logPi_nu) + 1; // which_max() uses 0-indexing, so add 1 - } - - return z; -} diff --git a/src/mle.cpp b/src/mle.cpp deleted file mode 100644 index 9c6d0e47..00000000 --- a/src/mle.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include // std::pow -#include -// [[Rcpp::depends(RcppArmadillo)]] -#include -// [[Rcpp::depends(roptim)]] -using namespace roptim; - -double l_lnpois_cpp(std::vector Y_obs, std::vector lambda_ref, int d, double mu, double sig, double phi = 1.0); - -class fit_lnpois : public Functor { - - public: - - const std::vector Y_obs; - - const std::vector lambda_ref; - - const int d; - - // initialize with source and destination - fit_lnpois(std::vector Y_obs, const std::vector lambda_ref, const int d): - Y_obs(Y_obs), lambda_ref(lambda_ref), d(d) {} - - double operator()(const arma::vec &x) override { - return -l_lnpois_cpp(Y_obs, lambda_ref, d, x[0], x[1]); - }; -}; - -// [[Rcpp::export]] -arma::rowvec fit_lnpois_cpp(std::vector Y_obs, std::vector lambda_ref, int d) { - - fit_lnpois model(Y_obs, lambda_ref, d); - - Roptim opt("L-BFGS-B"); - opt.control.trace = 0; - opt.set_hessian(false); - arma::vec lower = {-arma::datum::inf, 0.01}; - opt.set_lower(lower); - - arma::vec x = {0, 1}; - opt.minimize(model, x); - - return opt.par().t(); -} - diff --git a/src/poilog.cpp b/src/poilog.cpp deleted file mode 100644 index 4974f6c6..00000000 --- a/src/poilog.cpp +++ /dev/null @@ -1,151 +0,0 @@ - -#include -#include - -using namespace Rcpp; - -// refer to: -// https://github.com/evanbiederstedt/poilogcpp - -// Function declarations -double poilog(int x, double my, double sig); -double bipoilog(int x, int y, double my1, double my2, double sig1, double sig2, double ro); -double maxf(int x, double my, double sig); -double upper(int x, double m, double my, double sig); -double lower(int x, double m, double my, double sig); -double my_f(double z, int x, double my, double sig, double fac); -void my_f_vec(double *z, int n, void *p); -std::vector poilog1(std::vector x, std::vector my, std::vector sig); - -struct My_fparams { - int x; - double sig; - double my; - double fac; -}; - -// [[Rcpp::export]] -std::vector poilog1(std::vector x, std::vector my, std::vector sig){ - int nrN = x.size(); - std::vector vect(nrN); - for (int i = 0; i < nrN; i++) { - vect[i] = poilog(x[i], my[i], sig[i]); - } - return vect; -} - -double maxf(int x, double my, double sig){ - double d,z; - z=0; - d=100; - while (d>0.00001) { - if (x-1-exp(z)-1/sig*(z-my)>0) { - z=z+d; - } else { - z=z-d; - } - d=d/2; - } - return(z); -} - - -double upper(int x, double m, double my, double sig){ - double d,z,mf; - mf = (x-1)*m-exp(m)-0.5/sig*((m-my)*(m-my)); - z = m+20; - d = 10; - while (d>0.000001) { - if ((x-1)*z-exp(z)-0.5/sig*((z-my)*(z-my))-mf+log(1000000)>0) { - z=z+d; - } else { - z=z-d; - } - d=d/2; - } - return(z); -} - - -double lower(int x, double m, double my, double sig) { - double d,z,mf; - mf = (x-1)*m-exp(m)-0.5/sig*((m-my)*(m-my)); - z = m-20; - d = 10; - while (d>0.000001) { - if ((x-1)*z-exp(z)-0.5/sig*((z-my)*(z-my))-mf+log(1000000)>0) { - z=z-d; - } else { - z=z+d; - } - d=d/2; - } - return(z); -} - - -double my_f(double z, int x, double my, double sig, double fac) { - return exp(z*x-exp(z)-0.5/sig*((z-my)*(z-my))-fac); -} - -void my_f_vec(double *z, int n, void *p){ - struct My_fparams *params = (struct My_fparams *)p; - int x = (params->x); - double sig = (params->sig); - double my = (params->my); - double fac = (params->fac); - for (int i=0;i Y_obs, std::vector lambda_ref, int d, double mu, double sig, double phi = 1.0) { - - int n = Y_obs.size(); - - double l = 0; - double p = 0; - - for (int i = 0; i < n; i++) { - p = poilog(Y_obs[i], mu + log(phi * d * lambda_ref[i]), std::pow(sig, 2)); - if (p == 0) {p = 1e-15;} - l += log(p); - } - - return l; -} \ No newline at end of file diff --git a/tests/testthat/test_functions.R b/tests/testthat/test_functions.R index a6486371..bc050a8e 100644 --- a/tests/testthat/test_functions.R +++ b/tests/testthat/test_functions.R @@ -1,73 +1,6 @@ library(numbat) -test_that("dpoilog works", { - expect_equal(numbat:::dpoilog(c(1,11),c(1,1),c(1,1)), c(0.175733342664327, 0.0150105250670325)) -}) - -# test_that("HMM works", { - -# message('getting pseudobulk ..') - -# bulk = get_bulk( -# count_mat = count_mat_ATC2, -# df_allele = df_allele_ATC2, -# lambdas_ref = ref_hca, -# gtf = gtf_hg38 -# ) - -# expect_true(length(bulk) > 0) - -# message('running HMM..') - -# bulk = analyze_bulk(bulk, t = 1e-5) - -# expect_true(length(bulk) > 0) - -# }) - -test_that("logSumExp() works", { - - b = numbat:::logSumExp(c(1.2, 3.4)) - expect_equal(b, 3.5050833) - d = numbat:::logSumExp(c(1.2, 3.4, -5.6, -7.8)) - expect_equal(d, 3.5052067) - -}) - -test_that("Check that likelihood_allele() works as expected", { - - LL = numbat:::likelihood_allele(pre_likelihood_hmm) - expect_equal(as.integer(LL), -736) - -}) - -test_that("Check that forward_backward() works as expected", { - - p_major = numbat:::forward_back_allele(pre_likelihood_hmm)[,1] - expect_equal(is.vector(p_major), TRUE) - expect_equal(length(p_major), 1042) - expect_equal(round(p_major[1], 3), 0.963) - expect_equal(round(p_major[2], 3), 0) - expect_equal(round(p_major[3], 3), 0) - expect_equal(round(p_major[10], 3), 0.745) - -}) - -test_that("Check that viterbi_allele() works as expected", { - - states = numbat:::viterbi_allele(pre_likelihood_hmm) - expect_equal(sum(states == 1), 440) - expect_equal(sum(states == 2), 602) - expect_equal(states[1], 1) - expect_equal(states[2], 2) - expect_equal(states[3], 2) - expect_equal(states[1024], 1) - -}) - - - test_that("Check that roman2int works as expected", { val.3899 <- 'MMMDCCCXCIX' val.3900 <- 'MMMCM' diff --git a/vignettes/numbat.Rmd b/vignettes/numbat.Rmd index a768435e..d6eec7a6 100644 --- a/vignettes/numbat.Rmd +++ b/vignettes/numbat.Rmd @@ -53,7 +53,7 @@ devtools::install_github("https://github.com/kharchenkolab/numbat") ## Docker We provide a ready-to-run Docker container that includes the Numbat R package and all of its dependencies. You can launch it as follows: ``` -docker run -v /work:/mnt/mydata -it pkharchenkolab/numbat-rbase:main /bin/bash +docker run -v /work:/mnt/mydata -it pkharchenkolab/numbat-rbase:latest /bin/bash ``` where `/work` is a local data folder you would like to access and write to. Within the container, `cellsnp-lite`, `eagle2` and `samtools` are available in path and the 1000 Genome SNP VCF and phasing panel files (hg38) are stored under `/data`. You can also launch R and run interactive analysis using Numbat (or run an R script).