Skip to content

Commit

Permalink
updating summary functions, cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
m-murphy committed Oct 12, 2023
1 parent d033cf8 commit df5716c
Show file tree
Hide file tree
Showing 13 changed files with 139 additions and 66 deletions.
2 changes: 1 addition & 1 deletion .ccls
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
clang
-std=c++14
-std=c++17
%h -x c++-header
-I/usr/local/opt/r/lib/R/include
-I/Library/Frameworks/R.framework/Resources/library/Rcpp/include
Expand Down
16 changes: 10 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
Package: moire
Title: Multiplicity of Infection and Allele Frequency Recovery from Noisy Polyallelic Genetics Data
Version: 3.1.0
Authors@R:
Authors@R: c(
person(given = "Maxwell",
family = "Murphy",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-0332-4388"))
comment = c(ORCID = "0000-0003-0332-4388")),
person(given = "Bryan",
family = "Greenhouse",
role = c("aut", "ths"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-0287-9111")))
Description: A Markov Chain Monte Carlo (MCMC) based approach to Bayesian estimation of individual level multiplicity of infection, within host relatedness, and
population allele frequencies from polyallelic genetic data.
License: GPL (>= 3)
Expand All @@ -26,16 +31,15 @@ Imports:
stats,
purrr,
rlang,
Gmedian,
URL: https://github.com/EPPIcenter/moire
ggplot2,
URL: https://github.com/EPPIcenter/moire, https://eppicenter.github.io/moire/, https://eppicenter.ucsf.edu/resources
BugReports: https://github.com/EPPIcenter/moire/issues
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
Suggests:
knitr,
rmarkdown,
markdown,
ggplot2,
forcats,
testthat (>= 3.0.0)
VignetteBuilder: knitr
Expand Down
10 changes: 8 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(calculate_he)
export(calculate_median_allele_frequencies)
export(calculate_med_allele_freqs)
export(calculate_naive_allele_frequencies)
export(calculate_naive_coi)
export(calculate_naive_coi_offset)
export(load_delimited_data)
export(load_long_form_data)
export(plot_chain_swaps)
export(rdirichlet)
export(run_mcmc)
export(simulate_allele_frequencies)
Expand All @@ -23,11 +24,16 @@ export(summarize_epsilon_neg)
export(summarize_epsilon_pos)
export(summarize_he)
export(summarize_relatedness)
import(Gmedian)
import(RcppProgress)
import(purrr)
importFrom(Rcpp,sourceCpp)
importFrom(ggplot2,aes)
importFrom(ggplot2,coord_cartesian)
importFrom(ggplot2,geom_point)
importFrom(ggplot2,geom_vline)
importFrom(ggplot2,ggplot)
importFrom(rlang,.data)
importFrom(stats,dist)
importFrom(stats,dpois)
importFrom(stats,qpois)
importFrom(stats,quantile)
Expand Down
7 changes: 6 additions & 1 deletion R/mcmc.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ run_mcmc <-
pt_num_threads = 1,
adapt_temp = FALSE,
pre_adapt_steps = 50,
temp_adapt_steps = 100) {
temp_adapt_steps = 10) {
start_time <- Sys.time()
args <- as.list(environment())
mcmc_args <- as.list(environment())
mcmc_args$data <- data$data
Expand Down Expand Up @@ -143,7 +144,11 @@ run_mcmc <-
chains[[1]] <- chain
}

end_time <- Sys.time()

res$runtime <- end_time - start_time
res$chains <- chains
res$args <- args

res
}
92 changes: 43 additions & 49 deletions R/summary.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,45 @@
#' Calculate the geometric median of the posterior distribution of allele
#' frequencies
#'
#' @details Returns the geometric median of the posterior distribution, defined
#' as the point minimizing the L2 distance from each sampled point.
#'
#' @import purrr
#' @importFrom stats dist
#'
#' @export
#'
#' @param mcmc_results Result of calling run_mcmc()
#'
#' @param merge_chains boolean indicating that all chain results should be merged
calculate_med_allele_freqs <- function(mcmc_results, merge_chains = TRUE) {
if (merge_chains) {
chains <- mcmc_results$chains
post_af <- purrr::transpose(purrr::map(chains, ~ .x$allele_freqs))
medians <- purrr::map(post_af, function(loc) {
samples <- purrr::flatten(loc)
mat <- matrix(unlist(samples), ncol = length(samples[[1]]))
d <- dist(t(mat))
samples[[which.min(rowSums(as.matrix(d)))]]
})
names(medians) <- mcmc_results$args$data$loci
} else {
chains <- mcmc_results$chains
medians <- lapply(chains, function(chain) {
post_af <- chain$allele_freqs
res <- purrr::map(post_af, function(samples) {
mat <- matrix(unlist(samples), ncol = length(samples[[1]]))
d <- dist(t(mat))
samples[[which.min(rowSums(as.matrix(d)))]]
})
names(res) <- mcmc_results$args$data$loci
return(res)
})
}
return(medians)
}


#' Calculate naive COI offset
#'
#' @details Estimates the complexity of infection using a naive approach
Expand Down Expand Up @@ -89,7 +131,7 @@ calculate_he <- function(allele_freqs) {
#' distribution of COI for each biological sample, as well as naive
#' estimates of COI.
#'
#' @importFrom stats quantile
#' @importFrom stats quantile
#'
#' @export
#'
Expand Down Expand Up @@ -650,51 +692,3 @@ summarize_effective_coi <- function(mcmc_results, lower_quantile = .025, upper_q
return(relatedness_data)
}
}


#' Calculate the geometric median of the posterior distribution of allele
#' frequencies
#'
#' @details Returns the geometric median of the posterior distribution, defined
#' as the point minimizing the L2 distance from each sampled point.
#'
#' @import purrr
#' @import Gmedian
#'
#' @export
#'
#' @param mcmc_results Result of calling run_mcmc()
#'
#' @param merge_chains boolean indicating that all chain results should be merged
calculate_median_allele_frequencies <- function(mcmc_results, merge_chains = TRUE) {
if (merge_chains) {
chains <- mcmc_results$chains
post_af <- purrr::transpose(purrr::map(chains, ~ .x$allele_freqs))
medians <- purrr::map(post_af, function(loc) {
samples <- purrr::flatten(loc)
total_samples <- length(samples)
num_alleles <- length(samples[[1]])
median <- Gmedian::Gmedian(
matrix(unlist(samples), nrow = total_samples, ncol = num_alleles, byrow = TRUE)
)
return(median)
})
names(medians) <- mcmc_results$args$data$loci
} else {
chains <- mcmc_results$chains
medians <- lapply(chains, function(chain) {
post_af <- chain$allele_freqs
res <- purrr::map(post_af, function(samples) {
total_samples <- length(samples)
num_alleles <- length(samples[[1]])
median <- Gmedian::Gmedian(
matrix(unlist(samples), nrow = total_samples, ncol = num_alleles, byrow = TRUE)
)
return(median)
})
names(res) <- mcmc_results$args$data$loci
return(res)
})
}
return(medians)
}
31 changes: 31 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,34 @@ load_delimited_data <- function(data, sep = ";", warn_uninformative = TRUE) {
dplyr::filter(!is.na(.data$allele))
return(load_long_form_data(df))
}


#' Plot chain swap acceptance rates
#'
#' @details Plot the swap acceptance rates for each chain.
#' The x-axis is the temperature, and the y-axis is the swap acceptance rate.
#' The dashed lines indicate the temperatures used for parallel tempering.
#'
#' @export
#'
#' @param mcmc_results list of results from `run_mcmc`
#'
#' @importFrom ggplot2 aes coord_cartesian geom_point geom_vline ggplot
#'
#' @return list of ggplot objects
plot_chain_swaps <- function(mcmc_results) {
plots <- lapply(mcmc_results$chains, function(chain) {
# swaps for a chain happen every 2 samples
swaps_per_chain <- mcmc_results$args$samples_per_chain / 2
swap_dist <- chain$swap_acceptances / swaps_per_chain
temps <- temps <- mcmc_results$chains[[1]]$temp_gradient
swap_idx <- (temps[1:length(temps) - 1] + temps[2:length(temps)]) / 2 # nolint: seq_linter.
dat <- data.frame(swap_rate = swap_dist, temp = swap_idx)
g <- ggplot(dat, aes(x = temp, y = swap_rate)) +
geom_point() +
geom_vline(data = data.frame(x = temps), aes(xintercept = x), linetype = "dashed", alpha = 0.25) +
coord_cartesian(ylim = c(0, 1))
g
})
return(plots)
}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ reference:
- title: Utilities
- contents:
- starts_with("load_")
- starts_with("plot_")
- title: Data
- contents:
- simulated_data
Expand Down
10 changes: 10 additions & 0 deletions inst/CITATION
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
bibentry(
bibtype = "Article",
title = "MOIRE: A software package for the estimation of allele frequencies and effective multiplicity of infection from polyallelic data",
author = "Maxwell Murphy and Bryan Greenhouse",
journal = "bioRxiv",
year = "2023",
doi = "10.1101/2023.10.03.560769",
abstract = "Malaria parasite genetic data can provide insight into parasite phenotypes, evolution, and transmission. However, estimating key parameters such as allele frequencies, multiplicity of infection (MOI), and within-host relatedness from genetic data has been challenging, particularly in the presence of multiple related coinfecting strains. Existing methods often rely on single nucleotide polymorphism (SNP) data and do not account for within-host relatedness. In this study, we introduce a Bayesian approach called MOIRE (Multiplicity Of Infection and allele frequency REcovery), designed to estimate allele frequencies, MOI, and within-host relatedness from genetic data subject to experimental error. Importantly, MOIRE is flexible in accommodating both polyallelic and SNP data, making it adaptable to diverse genotyping panels. We also introduce a novel metric, the effective MOI (eMOI), which integrates MOI and within-host relatedness, providing a robust and interpretable measure of genetic diversity. Using extensive simulations and real-world data from a malaria study in Namibia, we demonstrate the superior performance of MOIRE over naive estimation methods, accurately estimating MOI up to 7 with moderate sized panels of diverse loci (e.g. microhaplotypes). MOIRE also revealed substantial heterogeneity in population mean MOI and mean relatedness across health districts in Namibia, suggesting detectable differences in transmission dynamics. Notably, eMOI emerges as a portable metric of within-host diversity, facilitating meaningful comparisons across settings, even when allele frequencies or genotyping panels are different. MOIRE represents an important addition to the analysis toolkit for malaria population dynamics. Compared to existing software, MOIRE enhances the accuracy of parameter estimation and enables more comprehensive insights into within-host diversity and population structure. Additionally, MOIRE{\textquoteright}s adaptability to diverse data sources and potential for future improvements make it a valuable asset for research on malaria and other organisms, such as other eukaryotic pathogens. MOIRE is available as an R package at https://eppicenter.github.io/moire/.Competing Interest StatementThe authors have declared no competing interest.",
publisher = "Cold Spring Harbor Laboratory"
)

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/plot_chain_swaps.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/run_mcmc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions man/summarize_coi.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion src/mcmc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ void MCMC::swap_chains(int step, bool burnin)
swap_barriers[i] += 1.0 - acceptanceRate;
}

if ((acceptanceRatio > 0 || log(R::runif(0, 1)) < acceptanceRatio) and
double u = log(R::runif(0, 1));

if ((acceptanceRatio > 0 || u < acceptanceRatio) and
!std::isnan(acceptanceRatio))
{
std::swap(swap_indices[i], swap_indices[i + 1]);
Expand Down

0 comments on commit df5716c

Please sign in to comment.