From e459bc86132b463e1dd266a8f6f4921658fe6596 Mon Sep 17 00:00:00 2001 From: Zena Lapp Date: Tue, 5 Sep 2023 11:29:00 -0600 Subject: [PATCH] Update documentation --- R/bistro.R | 6 +- R/calc_log10_lrs.R | 20 +++--- R/checks.R | 40 +++++++---- R/identify_matches.R | 42 +++++------ R/utils.R | 4 +- README.Rmd | 2 +- README.md | 4 +- man/bistro.Rd | 4 +- man/calc_one_log10_lr.Rd | 4 +- man/identify_one_match_set.Rd | 4 +- tests/testthat/_snaps/bistro.md | 35 ++++----- tests/testthat/_snaps/calc_log10_lrs.md | 30 ++++---- tests/testthat/_snaps/identify_matches.md | 43 +++++------ tests/testthat/test-bistro.R | 4 +- tests/testthat/test-checks.R | 6 +- vignettes/bistro.Rmd | 87 ++++++++++++----------- 16 files changed, 170 insertions(+), 165 deletions(-) diff --git a/R/bistro.R b/R/bistro.R index afde3c3..6795aa6 100644 --- a/R/bistro.R +++ b/R/bistro.R @@ -64,12 +64,12 @@ #' these are included as separate rows. #' #' * `bloodmeal_id`: bloodmeal id -#' * `bloodmeal_locus_count`: number of loci successfully typed in the bloodmeal +#' * `locus_count`: number of loci successfully typed in the bloodmeal #' * `est_noc`: estimated number of contributors to the bloodmeal #' * `match`: whether a match was identified for a given bloodmeal (yes or no) #' * `human_id`: If match, human id (NA otherwise) #' * `log10_lr`: If match, log10 likelihood ratio (NA otherwise) -#' * `note`: Why the bloodmeal does or doesn't have a match +#' * `notes`: Why the bloodmeal does or doesn't have a match #' #' #' @export @@ -155,7 +155,7 @@ bistro <- dplyr::filter(SampleName %in% human_ids) |> rm_dups() - message("Calculating log10_lrs") + message("Calculating log10LRs") log10_lrs <- calc_log10_lrs( diff --git a/R/calc_log10_lrs.R b/R/calc_log10_lrs.R index d86644a..87c48c2 100644 --- a/R/calc_log10_lrs.R +++ b/R/calc_log10_lrs.R @@ -6,9 +6,9 @@ #' which to compute the log10_lr #' #' @return tibble with log10_lr for bloodmeal-human pair including bloodmeal_id, -#' human_id, bloodmeal_locus_count (number of STR loci used for matching), +#' human_id, locus_count (number of STR loci used for matching), #' est_noc (estimated number of contributors), efm_noc (number of contributors -#' used in euroformix), log10_lr (log10 likelihood ratio), note +#' used in euroformix), log10_lr (log10 likelihood ratio), notes #' @inheritParams bistro #' #' @keywords internal @@ -33,10 +33,10 @@ calc_one_log10_lr <- human_profile <- human_profiles |> dplyr::filter(SampleName == human_id) - bloodmeal_locus_count <- + locus_count <- dplyr::n_distinct(bloodmeal_profile$Marker, na.rm = TRUE) est_noc <- - ifelse(bloodmeal_locus_count == 0, 0, ceiling(max(table( + ifelse(locus_count == 0, 0, ceiling(max(table( bloodmeal_profile$Marker ) / 2))) efm_noc <- min(est_noc, 3) @@ -44,15 +44,15 @@ calc_one_log10_lr <- output_df <- tibble::tibble( bloodmeal_id = bloodmeal_id, human_id = human_id, - bloodmeal_locus_count = bloodmeal_locus_count, + locus_count = locus_count, est_noc = est_noc, efm_noc = efm_noc, log10_lr = NA, - note = NA + notes = NA ) if (nrow(bloodmeal_profile) == 0) { - output_df$note <- "no peaks above threshold" + output_df$notes <- "no peaks above threshold" } else if (nrow( dplyr::inner_join( bloodmeal_profile |> dplyr::select(-SampleName), @@ -60,7 +60,7 @@ calc_one_log10_lr <- by = dplyr::join_by(Marker, Allele) ) ) == 0) { - output_df$note <- "no shared alleles" + output_df$notes <- "no shared alleles" } else { bloodmeal_profile_list <- format_bloodmeal_profiles(bloodmeal_profile) @@ -105,9 +105,9 @@ calc_one_log10_lr <- if (is.numeric(efm_out)) { output_df$log10_lr <- efm_out } else if (class(efm_out)[1] == "simpleError") { - output_df$note <- "euroformix error" + output_df$notes <- "euroformix error" } else if (class(efm_out)[1] == "TimeoutException") { - output_df$note <- "timed out" + output_df$notes <- "timed out" } } diff --git a/R/checks.R b/R/checks.R index e8f0ac6..08c790b 100644 --- a/R/checks.R +++ b/R/checks.R @@ -31,15 +31,21 @@ check_bistro_inputs <- kit_df <- check_kit(kit) + bm_prof_markers <- toupper(unique(bloodmeal_profiles$Marker)) + hu_prof_markers <- toupper(unique(human_profiles$Marker)) + kit_markers <- toupper(unique(kit_df$Marker)) + check_setdiff_markers( - toupper(unique(human_profiles$Marker)), - toupper(unique(kit_df$Marker)), + bm_prof_markers, + kit_markers, "bloodmeal_profiles", "kit" ) check_setdiff_markers( - toupper(unique(human_profiles$Marker)), - toupper(unique(kit_df$Marker)), "human_profiles", "kit" + hu_prof_markers, + kit_markers, + "human_profiles", + "kit" ) check_calc_allele_freqs(calc_allele_freqs) @@ -143,11 +149,16 @@ check_if_allele_freqs <- "If `calc_allele_freqs = FALSE`, ", "then `pop_allele_freqs` is required." ) - } else if (!is.null(pop_allele_freqs) && calc_allele_freqs == FALSE) { + } else if (!is.null(pop_allele_freqs) && + calc_allele_freqs == FALSE) { check_colnames(pop_allele_freqs, c("Allele")) + + pop_freq_markers <- + toupper(names(pop_allele_freqs)[!names(pop_allele_freqs) == "Allele"]) + kit_markers <- toupper(unique(kit_df$Marker)) check_setdiff_markers( - toupper(names(pop_allele_freqs)[!names(pop_allele_freqs) == "Allele"]), - toupper(unique(kit_df$Marker)), + pop_freq_markers, + kit_markers, "pop_allele_freqs", "kit" ) @@ -169,11 +180,14 @@ check_setdiff_markers <- markers1_name, markers2_name) { in_markers1_only <- setdiff(toupper(markers1), toupper(markers2)) + in_markers1_only <- in_markers1_only[!is.na(in_markers1_only)] n_in_markers1_only <- length(in_markers1_only) - in_markers2_only <- setdiff(toupper(markers2), toupper(markers1)) + in_markers2_only <- + setdiff(toupper(markers2), toupper(markers1)) + in_markers2_only <- in_markers2_only[!is.na(in_markers2_only)] n_in_markers2_only <- length(in_markers2_only) if (n_in_markers1_only > 0) { - warning( + message( n_in_markers1_only, "/", length(markers1), @@ -182,12 +196,11 @@ check_setdiff_markers <- " but not in ", markers2_name, ": ", - paste0(in_markers1_only, collapse = ","), - "\n" + paste0(in_markers1_only, collapse = ",") ) } if (n_in_markers2_only > 0) { - warning( + message( n_in_markers2_only, "/", length(markers2), @@ -196,8 +209,7 @@ check_setdiff_markers <- " but not in ", markers1_name, ": ", - paste0(in_markers2_only, collapse = ","), - "\n" + paste0(in_markers2_only, collapse = ",") ) } } diff --git a/R/identify_matches.R b/R/identify_matches.R index 643ef25..dd4bc81 100644 --- a/R/identify_matches.R +++ b/R/identify_matches.R @@ -3,9 +3,9 @@ #' @param log10_lrs output from [calc_log10_lrs()] #' #' @return tibble with matches for bloodmeal-human pairs including bloodmeal_id, -#' bloodmeal_locus_count (number of STR loci used for matching), est_noc +#' locus_count (number of STR loci used for matching), est_noc #' (estimated number of contributors), match, human_id (if match), log10_lr -#' (log10 likelihood ratio), note +#' (log10 likelihood ratio), notes #' @inheritParams calc_one_log10_lr #' @keywords internal identify_one_match_set <- function(log10_lrs, bloodmeal_id) { @@ -14,26 +14,26 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { dplyr::filter(bloodmeal_id == bm_id) est_noc <- unique(log10_lrs$est_noc) - bloodmeal_locus_count <- unique(log10_lrs$bloodmeal_locus_count) - notes <- stringr::str_c(unique(log10_lrs$note), collapse = ";") + locus_count <- unique(log10_lrs$locus_count) + notes <- stringr::str_c(unique(log10_lrs$notes), collapse = ";") matches <- tibble::tibble( bloodmeal_id = bloodmeal_id, + locus_count = locus_count, est_noc = est_noc, - bloodmeal_locus_count = bloodmeal_locus_count, match = "no", human_id = NA, log10_lr = NA, - note = notes, + notes = notes, thresh_low = NA ) if (all(is.na(log10_lrs$log10_lr))) { matches_thresh <- matches } else if (max(log10_lrs$log10_lr[!is.infinite(log10_lrs$log10_lr)]) < 1.5 && - is.na(notes)) { + is.na(notes)) { matches_thresh <- matches |> - dplyr::mutate(note = "all log10_lr < 1.5") + dplyr::mutate(notes = "all log10LRs < 1.5") } else { # matches can only have log10_lrs > 1 log10_lrs <- log10_lrs |> @@ -48,24 +48,24 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { matches_thresh <- log10_lrs |> dplyr::filter(log10_lr >= thresh) |> dplyr::mutate( - note = ifelse( + notes = ifelse( dplyr::n() > est_noc, "> min NOC matches", "passed all filters" ), - match = ifelse(note == "passed all filters", "yes", "no"), - human_id = ifelse(note == "> min NOC matches", NA, human_id), - log10_lr = ifelse(note == "> min NOC matches", NA, log10_lr), + match = ifelse(notes == "passed all filters", "yes", "no"), + human_id = ifelse(notes == "> min NOC matches", NA, human_id), + log10_lr = ifelse(notes == "> min NOC matches", NA, log10_lr), thresh_low = thresh ) |> dplyr::select( bloodmeal_id, + locus_count, est_noc, - bloodmeal_locus_count, match, human_id, log10_lr, - note, + notes, thresh_low ) |> dplyr::distinct() # do we need this? @@ -80,9 +80,9 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { dplyr::pull(human_id) if ((identical(mht, mlt) && - nrow(matches_thresh) == est_noc) || - matches_thresh$note[1] == "> min NOC matches" || - thresh == 0.5) { + nrow(matches_thresh) == est_noc) || + matches_thresh$notes[1] == "> min NOC matches" || + thresh == 0.5) { break } @@ -90,11 +90,11 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { } if (nrow(matches_thresh) < est_noc || - matches_thresh$note[1] == "> min NOC matches" || - matches_thresh$thresh_low[1] == 1) { + matches_thresh$notes[1] == "> min NOC matches" || + matches_thresh$thresh_low[1] == 1) { # are 1st and last both required above? temp <- matches |> - dplyr::filter(note != "> min NOC matches") |> + dplyr::filter(notes != "> min NOC matches") |> dplyr::group_by(thresh_low) |> dplyr::mutate( n_samps = dplyr::n_distinct(human_id), @@ -105,7 +105,7 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { dplyr::distinct() |> dplyr::arrange(thresh_low) |> dplyr::mutate(next_same = human_id == dplyr::lead(human_id) & - note == dplyr::lead(note)) |> + notes == dplyr::lead(notes)) |> dplyr::filter(next_same) |> dplyr::filter(n_samps == suppressWarnings(max(n_samps))) |> dplyr::slice_max(thresh_low) |> diff --git a/R/utils.R b/R/utils.R index fc6938d..0bfc87c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -15,7 +15,7 @@ utils::globalVariables( "all_alleles", "n", "log10_lr", - "note", + "notes", "human_id", "thresh_low", "next_same", @@ -26,7 +26,7 @@ utils::globalVariables( # import euroformix (required to get kit) #' @import euroformix -# no note for codetools +# no notes for codetools ignore_unused_imports <- function() { codetools::checkUsage } diff --git a/README.Rmd b/README.Rmd index 8c78032..846c3aa 100644 --- a/README.Rmd +++ b/README.Rmd @@ -40,4 +40,4 @@ detools::install_github("duke-malaria-collaboratory/bistro") ## Usage -Check out `vignette('bistro')` for more information. +Check out the [vignette](https://duke-malaria-collaboratory.github.io/bistro/articles/bistro.html) for more information. diff --git a/README.md b/README.md index f09a3c3..aa5f4cd 100644 --- a/README.md +++ b/README.md @@ -33,4 +33,6 @@ detools::install_github("duke-malaria-collaboratory/bistro") ## Usage -Check out `vignette('bistro')` for more information. +Check out the +[vignette](https://duke-malaria-collaboratory.github.io/bistro/articles/bistro.html) +for more information. diff --git a/man/bistro.Rd b/man/bistro.Rd index 3049c80..2117e8d 100644 --- a/man/bistro.Rd +++ b/man/bistro.Rd @@ -98,12 +98,12 @@ listed below. Note that if multiple matches are found for a bloodmeal, these are included as separate rows. \itemize{ \item \code{bloodmeal_id}: bloodmeal id -\item \code{bloodmeal_locus_count}: number of loci successfully typed in the bloodmeal +\item \code{locus_count}: number of loci successfully typed in the bloodmeal \item \code{est_noc}: estimated number of contributors to the bloodmeal \item \code{match}: whether a match was identified for a given bloodmeal (yes or no) \item \code{human_id}: If match, human id (NA otherwise) \item \code{log10_lr}: If match, log10 likelihood ratio (NA otherwise) -\item \code{note}: Why the bloodmeal does or doesn't have a match +\item \code{notes}: Why the bloodmeal does or doesn't have a match } } \description{ diff --git a/man/calc_one_log10_lr.Rd b/man/calc_one_log10_lr.Rd index 8de7d44..bc3bcbc 100644 --- a/man/calc_one_log10_lr.Rd +++ b/man/calc_one_log10_lr.Rd @@ -81,9 +81,9 @@ argument. Default: 1} } \value{ tibble with log10_lr for bloodmeal-human pair including bloodmeal_id, -human_id, bloodmeal_locus_count (number of STR loci used for matching), +human_id, locus_count (number of STR loci used for matching), est_noc (estimated number of contributors), efm_noc (number of contributors -used in euroformix), log10_lr (log10 likelihood ratio), note +used in euroformix), log10_lr (log10 likelihood ratio), notes } \description{ Calculate log10_lr for one bloodmeal-human pair diff --git a/man/identify_one_match_set.Rd b/man/identify_one_match_set.Rd index 46d0689..de60774 100644 --- a/man/identify_one_match_set.Rd +++ b/man/identify_one_match_set.Rd @@ -14,9 +14,9 @@ identify_one_match_set(log10_lrs, bloodmeal_id) } \value{ tibble with matches for bloodmeal-human pairs including bloodmeal_id, -bloodmeal_locus_count (number of STR loci used for matching), est_noc +locus_count (number of STR loci used for matching), est_noc (estimated number of contributors), match, human_id (if match), log10_lr -(log10 likelihood ratio), note +(log10 likelihood ratio), notes } \description{ Identify matches between a bloodmeal and human references diff --git a/tests/testthat/_snaps/bistro.md b/tests/testthat/_snaps/bistro.md index 890ce31..4106411 100644 --- a/tests/testthat/_snaps/bistro.md +++ b/tests/testthat/_snaps/bistro.md @@ -3,14 +3,12 @@ Code bistro(bm_evid1, hu_p1, pop_allele_freqs = pop_allele_freqs, kit = "ESX17", peak_threshold = 200) - Warning - 1/17 markers in kit but not in pop_allele_freqs: AMEL - Message + 1/17 markers in kit but not in pop_allele_freqs: AMEL Formatting bloodmeal profiles Removing 4 peaks under the threshold of 200 RFU. Formatting human profiles - Calculating log10_lrs + Calculating log10LRs # bloodmeal ids: 1 # human ids: 1 Bloodmeal id 1/1 @@ -18,24 +16,21 @@ Identifying matches Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 21.8 passed all~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 21.8 passed al~ 21 --- Code bistro(bm_evid1, hu_p1, pop_allele_freqs = pop_allele_freqs, kit = "ESX17", peak_threshold = 200, bloodmeal_ids = "evid1", human_ids = "P1") - Warning - 1/17 markers in kit but not in pop_allele_freqs: AMEL - Message + 1/17 markers in kit but not in pop_allele_freqs: AMEL Formatting bloodmeal profiles Removing 4 peaks under the threshold of 200 RFU. Formatting human profiles - Calculating log10_lrs + Calculating log10LRs # bloodmeal ids: 1 # human ids: 1 Bloodmeal id 1/1 @@ -43,10 +38,9 @@ Identifying matches Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 21.8 passed all~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 21.8 passed al~ 21 --- @@ -57,7 +51,7 @@ Formatting bloodmeal profiles Removing 4 peaks under the threshold of 200 RFU. Formatting human profiles - Calculating log10_lrs + Calculating log10LRs # bloodmeal ids: 1 # human ids: 1 Bloodmeal id 1/1 @@ -65,8 +59,7 @@ Identifying matches Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 7.05 passed all~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 7.05 passed al~ 6.5 diff --git a/tests/testthat/_snaps/calc_log10_lrs.md b/tests/testthat/_snaps/calc_log10_lrs.md index 1d194ec..1f00467 100644 --- a/tests/testthat/_snaps/calc_log10_lrs.md +++ b/tests/testthat/_snaps/calc_log10_lrs.md @@ -6,9 +6,9 @@ peak_threshold = 200) Output # A tibble: 1 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes - 1 evid1 00-JP0001-1~ 17 2 2 -9.62 NA + 1 evid1 00-JP0001-14_20142342~ 17 2 2 -9.62 NA --- @@ -18,9 +18,9 @@ peak_threshold = 200) Output # A tibble: 1 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes - 1 evid2 00-JP0001-1~ 1 2 2 NA no s~ + 1 evid2 00-JP0001-14_20142342~ 1 2 2 NA no s~ --- @@ -30,9 +30,9 @@ peak_threshold = 200) Output # A tibble: 1 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes - 1 evid4 00-JP0001-1~ 0 0 0 NA no p~ + 1 evid4 00-JP0001-14_20142342~ 0 0 0 NA no p~ --- @@ -41,9 +41,9 @@ pop_allele_freqs = pop_allele_freqs, kit = "ESX17", peak_threshold = 200) Output # A tibble: 1 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note - - 1 evid2 P1 1 2 2 NA euroform~ + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes + + 1 evid2 P1 1 2 2 NA euroformix error --- @@ -53,9 +53,9 @@ peak_threshold = 200, time_limit = 1e-08) Output # A tibble: 1 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes - 1 evid1 00-JP0001-1~ 17 2 2 NA time~ + 1 evid1 00-JP0001-14_20142342~ 17 2 2 NA time~ --- @@ -71,8 +71,8 @@ Human id 1/1 Output # A tibble: 2 x 7 - bloodmeal_id human_id bloodmeal_locus_count est_noc efm_noc log10_lr note - - 1 evid2 P1 1 2 2 NA euroform~ - 2 evid3 P1 8 1 1 -4.69 + bloodmeal_id human_id locus_count est_noc efm_noc log10_lr notes + + 1 evid2 P1 1 2 2 NA euroformix error + 2 evid3 P1 8 1 1 -4.69 diff --git a/tests/testthat/_snaps/identify_matches.md b/tests/testthat/_snaps/identify_matches.md index eecad03..cfc5118 100644 --- a/tests/testthat/_snaps/identify_matches.md +++ b/tests/testthat/_snaps/identify_matches.md @@ -4,11 +4,10 @@ identify_one_match_set(lrs, "evid1") Output # A tibble: 2 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 21.8 passed all~ - 2 evid1 2 17 yes P2 10.3 passed all~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 21.8 passed al~ 9.5 + 2 evid1 17 2 yes P2 10.3 passed al~ 9.5 --- @@ -16,10 +15,9 @@ identify_one_match_set(lrs, "evid2") Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid2 2 1 no NA NA no shared ~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid2 1 2 no NA NA no shared~ NA --- @@ -27,10 +25,9 @@ identify_one_match_set(dplyr::filter(lrs, human_id == "P1"), "evid1") Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 21.8 passed all~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 21.8 passed al~ 21 --- @@ -39,10 +36,9 @@ log10_lr = c(1.55, 1.1, 1.2)), "evid1") Output # A tibble: 1 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 no NA NA > min NOC ~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 no NA NA > min NOC~ 1 --- @@ -50,11 +46,10 @@ identify_matches(lrs) Output # A tibble: 4 x 8 - bloodmeal_id est_noc bloodmeal_locus_count match human_id log10_lr note - - 1 evid1 2 17 yes P1 21.8 passed all~ - 2 evid1 2 17 yes P2 10.3 passed all~ - 3 evid2 2 1 no NA no shared ~ - 4 evid3 1 8 no NA all log10_~ - # i 1 more variable: thresh_low + bloodmeal_id locus_count est_noc match human_id log10_lr notes thresh_low + + 1 evid1 17 2 yes P1 21.8 passed al~ 9.5 + 2 evid1 17 2 yes P2 10.3 passed al~ 9.5 + 3 evid2 1 2 no NA no shared~ NA + 4 evid3 8 1 no NA all log10~ NA diff --git a/tests/testthat/test-bistro.R b/tests/testthat/test-bistro.R index 9040c0b..4ec56c5 100644 --- a/tests/testthat/test-bistro.R +++ b/tests/testthat/test-bistro.R @@ -44,7 +44,7 @@ test_that("bistro works", { "If `calc_allele_freqs = FALSE`, then `pop_allele_freqs` is required." ) - expect_message(expect_warning( + expect_message(expect_message(expect_message( expect_error( bistro( bloodmeal_profiles %>% dplyr::filter(Height < 200), @@ -55,5 +55,5 @@ test_that("bistro works", { ), "All bloodmeal peak heights below threshold." ) - )) + ))) }) diff --git a/tests/testthat/test-checks.R b/tests/testthat/test-checks.R index b7d54a6..fadde2b 100644 --- a/tests/testthat/test-checks.R +++ b/tests/testthat/test-checks.R @@ -65,7 +65,7 @@ test_that("check_calc_allele_freqs works", { test_that("check_if_allele_freqs works", { expect_no_warning(check_if_allele_freqs(pop_allele_freqs, TRUE)) - expect_warning( + expect_message( check_if_allele_freqs(pop_allele_freqs, FALSE, euroformix::getKit("ESX17")), "1/17 markers in kit but not in pop_allele_freqs: AMEL" ) @@ -110,8 +110,8 @@ test_that("check_peak_threshold works", { }) test_that("check_setdiff_markers works", { - expect_warning( - expect_warning( + expect_message( + expect_message( check_setdiff_markers(1:3, c(1, 4), "m1", "m2"), "2/3 markers in m1 but not in m2: 2,3" ), diff --git a/vignettes/bistro.Rmd b/vignettes/bistro.Rmd index 9a3420c..4a16037 100644 --- a/vignettes/bistro.Rmd +++ b/vignettes/bistro.Rmd @@ -16,20 +16,20 @@ knitr::opts_chunk$set( ## The bistro algorithm -bistro is an algorithm, an R package, and a function within said R package. It employs methods from forensics to identify matches between bloodmeals and the people they bit using short tandem repeat (STR) profiles of human blood from freshly fed bloodmeals and from people. Note that you can use it for matching any STR profiles for research purposes, but we very much _do not_ recommend using it for forensic purposes. For more details about the algorithm, please refer to the bistro manuscript (note: will add link once we have it). +bistro is an algorithm, an R package, and a function within said R package. It employs methods from forensics to identify matches between bloodmeals and the people they bit using short tandem repeat (STR) profiles of human blood from freshly fed bloodmeals and from people. It can be used to match multi-source bloodmeals and bloodmeals with incomplete STR profiles. Note that while you can use it for matching any STR profiles for research purposes, we very much _do not_ recommend using it for forensic purposes. For more details about the algorithm, please refer to the bistro manuscript (will add link once it's posted). -## Installing the bistro R package +## Installing the `bistro` R package -First, you have to install `bistro`: +To install `bistro`, run the following commands: ```{r, eval = FALSE} install.packages("devtools") # install devtools if needed devtools::install_github("duke-malaria-collaboratory/bistro") ``` -## Getting started with bistro +## Getting started with `bistro` -The most important function in `bistro` is `bistro()`. This document will take you through all of the required and optional `bistro()` inputs, as well as the output. +The most important function in `bistro` is `bistro()`. This document will take you through all of the required and optional inputs, as well as the output. In summary, you must provide: @@ -38,7 +38,7 @@ In summary, you must provide: - The STR genotyping kit name - An allele peak height threshold (in relative fluorescence units, RFUs) for the bloodmeal STR profiles -And the function outputs matches for each bloodmeal-human pair. +The function outputs matches for each bloodmeal-human pair. To follow along with the vignette, first load `bistro`: @@ -46,13 +46,13 @@ To follow along with the vignette, first load `bistro`: library(bistro) ``` -Note that when we load `bistro`, we also load `euroformix` because `bistro` depends on the `euroformix` package. +Note that when we load `bistro`, we also load [`euroformix`](https://github.com/oyvble/euroformix/) because `bistro` depends on the `euroformix` package. ## Required inputs -Two required, and one optional but often recommended, datasets are required to run `bistro()`: bloodmeal and human STR profiles are required, and human population allele frequencies are optional. The other two required inputs to bistro are the STR genotyping kit name and the bloodmeal STR allele peak height threshold. +Before we run `bistro()` using example data, let's learn more about the required inputs. -Before we run `bistro()`, let's learn more about the required inputs. +Two required, and one optional but often recommended, datasets are required to run `bistro()`: bloodmeal and human STR profiles are required, and human population allele frequencies are optional. The other two required inputs are the STR genotyping kit name and the bloodmeal STR allele peak height threshold. ### Bloodmeal STR profiles (`bloodmeal_profiles`) @@ -77,7 +77,7 @@ The human STR profiles dataset takes a similar format to the bloodmeal STR profi head(human_profiles) ``` -In this example dataset, we have 3 different bloodmeals: +In this example dataset, we have 3 different human profiles: ```{r} unique(human_profiles$SampleName) @@ -95,11 +95,11 @@ If the kit you used to genotype samples is not available in the defaults, you wi ### Bloodmeal allele peak threshold (`peak_threshold`) -`bistro()` also requires a bloodmeal allele peak threshold (in RFUs) to be provided. All peaks with heights below this threshold will be removed prior to matching. If prior filtering was performed on the bloodmeal profiles based on peak height, then this number should be equal to or greater than the threshold used. +`bistro()` also requires a bloodmeal allele peak threshold (in RFUs) to be provided. All peaks with heights below this threshold will be removed before matching. If prior filtering was performed on the bloodmeal profiles based on peak height, then this number should be equal to or greater than that threshold. ### Human population allele frequencies (`pop_allele_freqs`) -Human population frequencies for each allele at each locus may be supplied. If you would prefer, population allele frequencies can be computed from the human STR profiles (see below for more details). Note that the latter option is not suggested when a small human reference database is being used. +Human population frequencies for each allele at each locus may be supplied. If you would prefer, population allele frequencies can be computed from the human STR profiles (see below for more details). Note that the latter option is not suggested when a small set of human reference profiles is being used. If you would like to input population allele frequencies, the dataset should contain one column for each STR marker and one row for each allele. The alleles should be listed in the first column of the dataset and the column should be named "Allele". The entries for each marker-allele combination are the population allele frequency or NA if that allele does not exist for that marker. Here is an example with the loci from the PowerPlex ESX 17 System (ESX17) kit. @@ -121,11 +121,12 @@ bistro_output <- bistro( ) ``` -You'll notice a bunch of things print out to the screen. -First, a warning that one of the markers (AMEL) is in the kit but not in `pop_allele_freqs`. -Then, a message saying that 6 peaks under the 200 RFU threshold were removed, and that after filtering one of the bloodmeals did not have any peaks remaining above the threshold. -Next, there is a message notifying you that log10_lrs (log10 likelihood ratios) are being calculated for each bloodmeal-human pair (for 4 bloodmeals and 3 humans). This can take a while, so the progress is also printed out. -Finally, the function notifies you that it is identifying matches. +You'll notice a bunch of messages print out to the screen: + +- One of the markers (AMEL) is in the kit but not in `pop_allele_freqs`. +- 6 peaks under the 200 RFU threshold were removed, and after filtering one of the bloodmeals did not have any peaks remaining above the threshold. +- Notification that log10LRs (log10 likelihood ratios) are being calculated for each bloodmeal-human pair (for 4 bloodmeals and 3 humans). This can take a while, so the progress is also printed out. +- Notification that matches are being identified. Let's take a look at the output: @@ -135,30 +136,30 @@ bistro_output The output contains 8 columns: -- `bloodmeal_id`: bloodmeal sample +- `bloodmeal_id`: bloodmeal sample name. - `est_noc`: estimated number of contributors (NOCs) to the bloodmeal. When this equals one, it's predicted to be a single-source bloodmeal; when it is greater than one, it's predicted to be a multi-source bloodmeal. Note that sources here mean humans only. -- `bloodmeal_locus_count`: number of loci STR-typed in the bloodmeal -- `match`: whether the bloodmeal STR profile matched to a human in the database (yes or no) -- `human_id`: human match for the bloodmeal (NA if no match) -- `log10_lr`: log10 likelihood ratio of the bloodmeal-human match (NA if no match) -- `note`: why the bloodmeal does or doesn't have a match -- `thresh_low`: log10_lr threshold at which the match was made +- `locus_count`: number of loci STR-typed in the bloodmeal. +- `match`: whether the bloodmeal STR profile matched to a human in the database (yes or no). +- `human_id`: human match for the bloodmeal (NA if no match). +- `log10_lr`: log10 likelihood ratio of the bloodmeal-human match (NA if no match). +- `notes`: why the bloodmeal does or doesn't have a match (see below for more details). +- `thresh_low`: log10LR threshold at which the match was made. -An important thing to notice is that, even though we have only 4 bloodmeals, there are 5 rows in the dataset. This is because, while each bloodmeal will appear at least once in the dataset, multiple rows are present for bloodmeals that match to multiple people (one row per match). For example, here, evid1 matches to 2 different people (P1 and P2) Please keep this in mind when performing data analysis. +An important thing to notice is that, even though we have only 4 bloodmeals, there are 5 rows in the output dataset. This is because, while each bloodmeal will appear at least once in the dataset, multiple rows are present for bloodmeals that match to multiple people (one row per match). For example, here, evid1 matches to 2 different people (P1 and P2) Please keep this in mind when performing data analysis. -The note column provides more information on why there was or was not a match: +The notes column provides more information on why there was or was not a match: - passed all filters: there was a match. -- \> min NOC matches: there were more matches than expected that could not be resolved. This can happen when there are closely related people in the human database. -- no shared alleles: no people in the human database had alleles that overlapped with the alleles in the bloodmeal. -- all log10_lrs < 1.5: none of the log10_lrs were high enough to be considered a match. +- \> min NOC matches: there were more matches than expected and they could not be resolved. This can happen when there are closely related people in the human reference dataset. +- no shared alleles: no people in the human reference dataset had alleles that overlapped with the alleles in the bloodmeal. +- all log10LRs < 1.5: none of the log10LRs were high enough to be considered a match. - euroformix error: the `euroformix::contLikSearch()` function did not run successfully. This often happens with very incomplete bloodmeal profiles (i.e. where only a few markers were succesffuly STR-typed). - no peaks above threshold: all bloodmeal STR peaks were below the peak height threshold. - timed out: log-likelihood took too long to calculate. If you want to give it more time, you can modify the `time_limit` argument (see below). ## Using computed population allele frequencies -Let's say you want to have the population allele frequences computed from the human reference database. (Note that we just show this as an example because we would not recommend doing this for a database with only 3 people like we have here.) To do so, you must set `calc_allele_freqs = TRUE`: +If you want to have the population allele frequencies computed from the inputted human reference profiles, you must set `calc_allele_freqs = TRUE`: ```{r, eval = FALSE} bistro( @@ -170,12 +171,14 @@ bistro( ) ``` +Note that, while we use the builtin dataset as an example here, we do not recommend using this method for a human dataset with only 3 people. An example of when it may make sense is if you have a relatively comprehensive set of human reference profiles for your study site. + ## Running bistro on a subset of samples -`bistro()` can take a long time to run, especially if you have many samples or a closely related reference population. Therefore, before running `bistro()` with all of your samples, we highly recommend testing it out on a small subset of samples to make sure everything is working as you expect. To do this, you can use the `bloodmeal_ids` and `human_ids` arguments to tell `bistro()` what ids you want to compare: +`bistro()` can take a long time to run, especially if you have many samples or a closely related reference population. Therefore, before running it with all of your samples, we highly recommend testing it out on a small subset of samples to make sure everything is working as you expect. To do this, you can use the `bloodmeal_ids` and `human_ids` arguments to tell `bistro()` what ids you want to compare: -```{r} +```{r, eval = FALSE} bistro( bloodmeal_profiles = bloodmeal_profiles, human_profiles = human_profiles, @@ -187,22 +190,22 @@ bistro( ) ``` -This can also be useful if you would like to parallelize bistro, for example by using a workflow manager such as Snakemake or NextFlow. We highly recommend doing this if you are planning to run `bistro()` on many samples. +This can also be useful if you would like to parallelize `bistro()`, for example by using a workflow manager such as Snakemake or NextFlow. We highly recommend doing this if you are planning to run it on many samples. ## Other arguments Other arguments you can modify when running `bistro()` include: -- `rm_twins`: whether or not to remove identical STR profiles, likely twins, prior to computing population allele frequencies and identifying matches (default: TRUE) -- `model_degrad`: whether or not to model peak degradation when calculating log-likelihoods using `euroformix::contLikSearch()` (default: TRUE) -- `model_bw_stutt` and `model_fw_stutt`: whether or not to model peak backward and forward stutter when calculating log-likelihoods using `euroformix::contLikSearch()` (default: FALSE) -- `difftol`: difference tolerance in log-likelihood across 2 iterations when using `euroformix::contLikSearch()` (default: 1) -- `threads`: nubmer of threads to use when computing log-likelihoods when using `euroformix::contLikSearch()` (default: 4) -- `seed`: seed for `euroformix::contLikSearch()` (default: 1) -- `time_limit`: longest amount of time allowed to calculate the log-likelihood for one bloodmeal-human pair, in minutes (default: 3) - - +Used by `bistro`: +- `rm_twins`: whether or not to remove identical STR profiles that are likely twins prior to computing population allele frequencies and identifying matches (default: TRUE) +- `time_limit`: longest amount of time allowed to calculate the log-likelihood for one bloodmeal-human pair, in minutes (default: 3) +Used by `euroformix::contLikSearch()` when calculating log-likelihoods: +- `model_degrad`: whether or not to model peak degradation (default: TRUE) +- `model_bw_stutt` and `model_fw_stutt`: whether or not to model peak backward and forward stutter (default: FALSE) +- `difftol`: difference tolerance in log-likelihood across 2 iterations (default: 1) +- `threads`: number of threads to use when computing log-likelihoods (default: 4) +- `seed`: seed for reproducible results (default: 1)