From 9a2d61dee2148912d0c7d8bfd2dd6319fd7dc0b2 Mon Sep 17 00:00:00 2001
From: Zena Lapp <zenalapp@gmail.com>
Date: Thu, 7 Sep 2023 08:05:49 -0600
Subject: [PATCH] Modularize str profile preprocessing

---
 NAMESPACE                                 |   2 +
 R/bistro.R                                |  40 ++----
 R/calc_log10_lrs.R                        |   5 +
 R/identify_matches.R                      |   6 +-
 R/preprocess_data.R                       | 145 ++++++++++++++++++++++
 R/utils.R                                 |  86 -------------
 man/calc_log10_lrs.Rd                     |   5 +-
 man/prep_bloodmeal_profiles.Rd            |  29 +++++
 man/prep_human_profiles.Rd                |  27 ++++
 man/rm_dups.Rd                            |   2 +-
 man/rm_twins.Rd                           |   2 +-
 man/subset_ids.Rd                         |   2 +-
 tests/testthat/_snaps/bistro.md           |  22 ----
 tests/testthat/_snaps/identify_matches.md |   4 +-
 tests/testthat/_snaps/preprocess_data.md  |  86 +++++++++++++
 tests/testthat/_snaps/utils.md            |  23 ----
 tests/testthat/test-bistro.R              |  25 ----
 tests/testthat/test-identify_matches.R    |  14 ++-
 tests/testthat/test-preprocess_data.R     |  52 ++++++++
 tests/testthat/test-utils.R               |  32 -----
 20 files changed, 381 insertions(+), 228 deletions(-)
 create mode 100644 R/preprocess_data.R
 create mode 100644 man/prep_bloodmeal_profiles.Rd
 create mode 100644 man/prep_human_profiles.Rd
 create mode 100644 tests/testthat/_snaps/preprocess_data.md
 delete mode 100644 tests/testthat/_snaps/utils.md
 create mode 100644 tests/testthat/test-preprocess_data.R

diff --git a/NAMESPACE b/NAMESPACE
index e9f2359..57b908f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,6 +5,8 @@ export(calc_allele_freqs)
 export(calc_log10_lrs)
 export(filter_peaks)
 export(identify_matches)
+export(prep_bloodmeal_profiles)
+export(prep_human_profiles)
 export(rm_dups)
 export(rm_twins)
 import(euroformix)
diff --git a/R/bistro.R b/R/bistro.R
index b126869..4f07c3f 100644
--- a/R/bistro.R
+++ b/R/bistro.R
@@ -114,39 +114,19 @@ bistro <-
       pop_allele_freqs <- calc_allele_freqs(human_profiles)
     }
 
-    if (rm_twins) {
-      human_profiles <- rm_twins(human_profiles)
-    }
-
     message("Formatting bloodmeal profiles")
-
-    if (is.null(bloodmeal_ids)) {
-      bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
-    } else {
-      bloodmeal_ids <-
-        subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
-    }
-
-    bloodmeal_profiles <- bloodmeal_profiles |>
-      dplyr::filter(SampleName %in% bloodmeal_ids)
-
-    check_heights(bloodmeal_profiles$Height, peak_thresh)
-
-    bloodmeal_profiles <- bloodmeal_profiles |>
-      rm_dups() |>
-      filter_peaks(peak_thresh)
+    bloodmeal_profiles <- prep_bloodmeal_profiles(
+      bloodmeal_profiles,
+      peak_thresh,
+      bloodmeal_ids
+    )
 
     message("Formatting human profiles")
-
-    if (is.null(human_ids)) {
-      human_ids <- unique(human_profiles$SampleName)
-    } else {
-      human_ids <- subset_ids(human_ids, human_profiles$SampleName)
-    }
-
-    human_profiles <- human_profiles |>
-      dplyr::filter(SampleName %in% human_ids) |>
-      rm_dups()
+    human_profiles <- prep_human_profiles(
+      human_profiles,
+      human_ids,
+      rm_twins
+    )
 
     message("Calculating log10LRs")
 
diff --git a/R/calc_log10_lrs.R b/R/calc_log10_lrs.R
index bf2572b..a3e8bb9 100644
--- a/R/calc_log10_lrs.R
+++ b/R/calc_log10_lrs.R
@@ -116,6 +116,11 @@ calc_one_log10_lr <-
 
 #' Calculate log10_lrs for multiple bloodmeal-human pairs
 #'
+#' Note that this function doesn't preprocess the bloodmeal and human profile
+#' data. If you would like to preprocess it in the same way as is performed
+#' internally in the `bistro()` function, you must run
+#' `prep_bloodmeal_profiles()` and `prep_human_profiles()` first.
+#'
 #' @inheritParams bistro
 #'
 #' @return A tibble with the same output as for [bistro()], except there is no
diff --git a/R/identify_matches.R b/R/identify_matches.R
index 8c9d763..1294a75 100644
--- a/R/identify_matches.R
+++ b/R/identify_matches.R
@@ -29,7 +29,11 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) {
   )
 
   if (all(is.na(log10_lrs$log10_lr) | is.infinite(log10_lrs$log10_lr))) {
-    matches_thresh <- matches
+    matches_thresh <- matches |>
+      dplyr::mutate(notes = ifelse(is.na(notes),
+        "all log10LRs NA or Inf",
+        paste0("all log10LRs NA or Inf", ";", notes)
+      ))
   } else if (max(log10_lrs$log10_lr[!is.infinite(log10_lrs$log10_lr)]) < 1.5 &&
     is.na(notes)) {
     matches_thresh <- matches |>
diff --git a/R/preprocess_data.R b/R/preprocess_data.R
new file mode 100644
index 0000000..8533e16
--- /dev/null
+++ b/R/preprocess_data.R
@@ -0,0 +1,145 @@
+#' Preprocess bloodmeal profiles
+#'
+#' Removes duplicates and peaks below threshold, subsets ids
+#'
+#' @inheritParams bistro
+#'
+#' @return Dataframe with preprocessed bloodmeal profiles
+#' @export
+#' @keywords internal
+prep_bloodmeal_profiles <- function(bloodmeal_profiles,
+                                    peak_thresh,
+                                    bloodmeal_ids = NULL) {
+  if (is.null(bloodmeal_ids)) {
+    bloodmeal_ids <- unique(bloodmeal_profiles$SampleName)
+  } else {
+    bloodmeal_ids <-
+      subset_ids(bloodmeal_ids, bloodmeal_profiles$SampleName)
+  }
+
+  bloodmeal_profiles <- bloodmeal_profiles |>
+    dplyr::filter(SampleName %in% bloodmeal_ids)
+
+  check_heights(bloodmeal_profiles$Height, peak_thresh)
+
+  bloodmeal_profiles <- bloodmeal_profiles |>
+    rm_dups() |>
+    filter_peaks(peak_thresh)
+
+  return(bloodmeal_profiles)
+}
+
+#' Preprocess human profiles
+#'
+#' Removes duplicates and optionally twins, subsets ids
+#'
+#' @inheritParams bistro
+#'
+#' @return Dataframe with preprocessed human profiles
+#' @export
+#' @keywords internal
+prep_human_profiles <- function(human_profiles,
+                                human_ids = NULL,
+                                rm_twins = TRUE) {
+  if (rm_twins) {
+    human_profiles <- rm_twins(human_profiles)
+  }
+
+  if (is.null(human_ids)) {
+    human_ids <- unique(human_profiles$SampleName)
+  } else {
+    human_ids <- subset_ids(human_ids, human_profiles$SampleName)
+  }
+
+  human_profiles <- human_profiles |>
+    dplyr::filter(SampleName %in% human_ids) |>
+    rm_dups()
+
+  return(human_profiles)
+}
+
+#' Remove duplicate rows with warning
+#'
+#' @param df Dataframe from which to remove duplicate rows
+#'
+#' @return Un-duplicated dataframe
+#' @export
+#' @keywords internal
+rm_dups <- function(df) {
+  n_orig <- nrow(df)
+  if (n_orig != dplyr::n_distinct(df)) {
+    df <- df |>
+      dplyr::distinct()
+    warning(paste0(
+      "Detected and removed ",
+      n_orig - nrow(df),
+      " duplicate rows."
+    ))
+  }
+  return(df)
+}
+
+#' Remove identical STR profiles
+#'
+#' @inheritParams bistro
+#'
+#' @return Data frame with twins removed
+#' @export
+#' @keywords internal
+rm_twins <- function(human_profiles) {
+  not_twins <- human_profiles |>
+    dplyr::arrange(Marker, Allele) |>
+    dplyr::group_by(SampleName) |>
+    dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
+      collapse = ";"
+    )) |>
+    dplyr::group_by(all_alleles) |>
+    dplyr::mutate(n = dplyr::n()) |>
+    dplyr::filter(n == 1) |>
+    dplyr::pull(SampleName)
+
+  n_ident_profs <-
+    dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)
+
+  if (n_ident_profs > 0) {
+    message(
+      paste0(
+        "Identified ",
+        n_ident_profs,
+        " people whose profiles appear more than once",
+        " (likely identical twins). These are being removed."
+      )
+    )
+    human_profiles <- human_profiles |>
+      dplyr::filter(SampleName %in% not_twins)
+  }
+
+  return(human_profiles)
+}
+
+#' Subset to ids present in the dataset
+#'
+#' @param ids ids to check if in vec
+#' @param vec vector of ids
+#' @param vec_name name of vector
+#'
+#' @return list of IDs that are present
+#' @keywords internal
+subset_ids <- function(ids, vec, vec_name) {
+  id_absent <- setdiff(ids, vec)
+  if (length(id_absent) > 0) {
+    warning(
+      "Removing ",
+      length(id_absent),
+      " ",
+      vec_name,
+      " not found in the dataset: ",
+      id_absent
+    )
+  }
+  ids <- intersect(ids, vec)
+  if (length(ids) == 0) {
+    stop("None of the provided ", vec_name, " are present in the dataset.")
+  }
+  return(ids)
+}
diff --git a/R/utils.R b/R/utils.R
index 309cf0e..79c5997 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -30,89 +30,3 @@ utils::globalVariables(
 ignore_unused_imports <- function() {
   codetools::checkUsage
 }
-
-#' Remove duplicate rows with warning
-#'
-#' @param df Dataframe from which to remove duplicate rows
-#'
-#' @return Un-duplicated dataframe
-#' @export
-#' @keywords internal
-rm_dups <- function(df) {
-  n_orig <- nrow(df)
-  if (n_orig != dplyr::n_distinct(df)) {
-    df <- df |>
-      dplyr::distinct()
-    warning(paste0(
-      "Detected and removed ",
-      n_orig - nrow(df),
-      " duplicate rows."
-    ))
-  }
-  return(df)
-}
-
-#' Remove identical STR profiles
-#'
-#' @inheritParams bistro
-#'
-#' @return Data frame with twins removed
-#' @export
-#' @keywords internal
-rm_twins <- function(human_profiles) {
-  not_twins <- human_profiles |>
-    dplyr::arrange(Marker, Allele) |>
-    dplyr::group_by(SampleName) |>
-    dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, Allele),
-      collapse = ";"
-    )) |>
-    dplyr::group_by(all_alleles) |>
-    dplyr::mutate(n = dplyr::n()) |>
-    dplyr::filter(n == 1) |>
-    dplyr::pull(SampleName)
-
-  n_ident_profs <-
-    dplyr::n_distinct(human_profiles$SampleName) - length(not_twins)
-
-  if (n_ident_profs > 0) {
-    message(
-      paste0(
-        "Identified ",
-        n_ident_profs,
-        " people whose profiles appear more than once",
-        " (likely identical twins). These are being removed."
-      )
-    )
-    human_profiles <- human_profiles |>
-      dplyr::filter(SampleName %in% not_twins)
-  }
-
-  return(human_profiles)
-}
-
-#' Subset to ids present in the dataset
-#'
-#' @param ids ids to check if in vec
-#' @param vec vector of ids
-#' @param vec_name name of vector
-#'
-#' @return list of IDs that are present
-#' @keywords internal
-subset_ids <- function(ids, vec, vec_name) {
-  id_absent <- setdiff(ids, vec)
-  if (length(id_absent) > 0) {
-    warning(
-      "Removing ",
-      length(id_absent),
-      " ",
-      vec_name,
-      " not found in the dataset: ",
-      id_absent
-    )
-  }
-  ids <- intersect(ids, vec)
-  if (length(ids) == 0) {
-    stop("None of the provided ", vec_name, " are present in the dataset.")
-  }
-  return(ids)
-}
diff --git a/man/calc_log10_lrs.Rd b/man/calc_log10_lrs.Rd
index b724bc6..0c38d5d 100644
--- a/man/calc_log10_lrs.Rd
+++ b/man/calc_log10_lrs.Rd
@@ -85,6 +85,9 @@ contributors used for \code{\link[euroformix:contLikSearch]{euroformix::contLikS
 value of 3.
 }
 \description{
-Calculate log10_lrs for multiple bloodmeal-human pairs
+Note that this function doesn't preprocess the bloodmeal and human profile
+data. If you would like to preprocess it in the same way as is performed
+internally in the \code{bistro()} function, you must run
+\code{prep_bloodmeal_profiles()} and \code{prep_human_profiles()} first.
 }
 \keyword{internal}
diff --git a/man/prep_bloodmeal_profiles.Rd b/man/prep_bloodmeal_profiles.Rd
new file mode 100644
index 0000000..aebdaa8
--- /dev/null
+++ b/man/prep_bloodmeal_profiles.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/preprocess_data.R
+\name{prep_bloodmeal_profiles}
+\alias{prep_bloodmeal_profiles}
+\title{Preprocess bloodmeal profiles}
+\usage{
+prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh, bloodmeal_ids = NULL)
+}
+\arguments{
+\item{bloodmeal_profiles}{Tibble with alleles for all bloodmeals in reference
+database including 4 columns: SampleName, Marker, Allele, Height. Height
+must be numeric or coercible to numeric.}
+
+\item{peak_thresh}{Allele peak height threshold in RFUs. All peaks under this
+threshold will be filtered out. If prior filtering was performed, this
+number should be equal to or greater than that number. Also used for
+\code{threshT} argument in \code{\link[euroformix:contLikSearch]{euroformix::contLikSearch()}}.}
+
+\item{bloodmeal_ids}{Vector of bloodmeal ids from the SampleName column in
+\code{bloodmeal_profiles} for which to compute log10_lrs. If NULL, all ids in
+the input dataframe will be used. Default: NULL}
+}
+\value{
+Dataframe with preprocessed bloodmeal profiles
+}
+\description{
+Removes duplicates and peaks below threshold, subsets ids
+}
+\keyword{internal}
diff --git a/man/prep_human_profiles.Rd b/man/prep_human_profiles.Rd
new file mode 100644
index 0000000..91c4531
--- /dev/null
+++ b/man/prep_human_profiles.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/preprocess_data.R
+\name{prep_human_profiles}
+\alias{prep_human_profiles}
+\title{Preprocess human profiles}
+\usage{
+prep_human_profiles(human_profiles, human_ids = NULL, rm_twins = TRUE)
+}
+\arguments{
+\item{human_profiles}{Tibble with alleles for all humans in reference
+database including three columns: SampleName, Marker, Allele.}
+
+\item{human_ids}{Vector of human ids from the SampleName column in
+\code{human_profiles} for which to compute log10_lrs. If NULL, all ids in the
+input dataframe will be used. Default: NULL}
+
+\item{rm_twins}{A boolean indicating whether or not to remove likely twins
+(identical STR profiles) from the human database prior to identifying
+matches. Default: TRUE}
+}
+\value{
+Dataframe with preprocessed human profiles
+}
+\description{
+Removes duplicates and optionally twins, subsets ids
+}
+\keyword{internal}
diff --git a/man/rm_dups.Rd b/man/rm_dups.Rd
index 21eff1a..0130d8a 100644
--- a/man/rm_dups.Rd
+++ b/man/rm_dups.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/preprocess_data.R
 \name{rm_dups}
 \alias{rm_dups}
 \title{Remove duplicate rows with warning}
diff --git a/man/rm_twins.Rd b/man/rm_twins.Rd
index 2da1dcf..6c34eac 100644
--- a/man/rm_twins.Rd
+++ b/man/rm_twins.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/preprocess_data.R
 \name{rm_twins}
 \alias{rm_twins}
 \title{Remove identical STR profiles}
diff --git a/man/subset_ids.Rd b/man/subset_ids.Rd
index fa97257..d1fb105 100644
--- a/man/subset_ids.Rd
+++ b/man/subset_ids.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/preprocess_data.R
 \name{subset_ids}
 \alias{subset_ids}
 \title{Subset to ids present in the dataset}
diff --git a/tests/testthat/_snaps/bistro.md b/tests/testthat/_snaps/bistro.md
index 317924f..83fd9f7 100644
--- a/tests/testthat/_snaps/bistro.md
+++ b/tests/testthat/_snaps/bistro.md
@@ -20,28 +20,6 @@
         <chr>              <int>   <dbl> <chr> <chr>       <dbl> <chr>           <dbl>
       1 evid1                 17       2 yes   P1           21.8 passed al~         21
 
----
-
-    Code
-      bistro(bm_evid1, hu_p1, pop_allele_freqs = pop_allele_freqs, kit = "ESX17",
-        peak_thresh = 200, bloodmeal_ids = "evid1", human_ids = "P1")
-    Message <simpleMessage>
-      1/17 markers in kit but not in pop_allele_freqs: AMEL
-      Formatting bloodmeal profiles
-      Removing 4 peaks under the threshold of 200 RFU.
-      Formatting human profiles
-      Calculating log10LRs
-      # bloodmeal ids: 1
-      # human ids: 1
-      Bloodmeal id 1/1
-      Human id 1/1
-      Identifying matches
-    Output
-      # A tibble: 1 x 8
-        bloodmeal_id locus_count est_noc match human_id log10_lr notes      thresh_low
-        <chr>              <int>   <dbl> <chr> <chr>       <dbl> <chr>           <dbl>
-      1 evid1                 17       2 yes   P1           21.8 passed al~         21
-
 ---
 
     Code
diff --git a/tests/testthat/_snaps/identify_matches.md b/tests/testthat/_snaps/identify_matches.md
index cfc5118..4967a5c 100644
--- a/tests/testthat/_snaps/identify_matches.md
+++ b/tests/testthat/_snaps/identify_matches.md
@@ -17,7 +17,7 @@
       # A tibble: 1 x 8
         bloodmeal_id locus_count est_noc match human_id log10_lr notes      thresh_low
         <chr>              <int>   <dbl> <chr> <lgl>    <lgl>    <chr>      <lgl>     
-      1 evid2                  1       2 no    NA       NA       no shared~ NA        
+      1 evid2                  1       2 no    NA       NA       all log10~ NA        
 
 ---
 
@@ -50,6 +50,6 @@
         <chr>              <int>   <dbl> <chr> <chr>       <dbl> <chr>           <dbl>
       1 evid1                 17       2 yes   P1           21.8 passed al~        9.5
       2 evid1                 17       2 yes   P2           10.3 passed al~        9.5
-      3 evid2                  1       2 no    <NA>         NA   no shared~       NA  
+      3 evid2                  1       2 no    <NA>         NA   all log10~       NA  
       4 evid3                  8       1 no    <NA>         NA   all log10~       NA  
 
diff --git a/tests/testthat/_snaps/preprocess_data.md b/tests/testthat/_snaps/preprocess_data.md
new file mode 100644
index 0000000..6e4dcfb
--- /dev/null
+++ b/tests/testthat/_snaps/preprocess_data.md
@@ -0,0 +1,86 @@
+# prep_bloodmeal_profiles works
+
+    Code
+      prep_bloodmeal_profiles(bloodmeal_profiles, bloodmeal_ids = "evid1",
+        peak_thresh = 200)
+    Message <simpleMessage>
+      Removing 4 peaks under the threshold of 200 RFU.
+    Output
+      # A tibble: 50 x 4
+         SampleName Marker   Allele Height
+         <chr>      <chr>    <chr>   <dbl>
+       1 evid1      AMEL     X        2136
+       2 evid1      AMEL     Y        1015
+       3 evid1      D10S1248 13       1856
+       4 evid1      D10S1248 15       1045
+       5 evid1      D12S391  18        297
+       6 evid1      D12S391  18.3     1446
+       7 evid1      D12S391  19        751
+       8 evid1      D12S391  22       1370
+       9 evid1      D16S539  10        312
+      10 evid1      D16S539  11        743
+      # i 40 more rows
+
+# prep_human_profiles works
+
+    Code
+      prep_human_profiles(human_profiles)
+    Output
+      # A tibble: 96 x 3
+         SampleName                       Marker   Allele
+         <chr>                            <chr>    <chr> 
+       1 00-JP0001-14_20142342311_NO-3241 AMEL     X     
+       2 00-JP0001-14_20142342311_NO-3241 AMEL     Y     
+       3 00-JP0001-14_20142342311_NO-3241 D10S1248 12    
+       4 00-JP0001-14_20142342311_NO-3241 D10S1248 13    
+       5 00-JP0001-14_20142342311_NO-3241 D12S391  17    
+       6 00-JP0001-14_20142342311_NO-3241 D12S391  18    
+       7 00-JP0001-14_20142342311_NO-3241 D16S539  10    
+       8 00-JP0001-14_20142342311_NO-3241 D16S539  11    
+       9 00-JP0001-14_20142342311_NO-3241 D18S51   13    
+      10 00-JP0001-14_20142342311_NO-3241 D18S51   17    
+      # i 86 more rows
+
+---
+
+    Code
+      prep_human_profiles(human_profiles, human_ids = "P1")
+    Output
+      # A tibble: 33 x 3
+         SampleName Marker   Allele
+         <chr>      <chr>    <chr> 
+       1 P1         AMEL     X     
+       2 P1         AMEL     Y     
+       3 P1         D10S1248 13    
+       4 P1         D10S1248 15    
+       5 P1         D12S391  18.3  
+       6 P1         D12S391  22    
+       7 P1         D16S539  11    
+       8 P1         D16S539  12    
+       9 P1         D18S51   15    
+      10 P1         D18S51   17    
+      # i 23 more rows
+
+# rm_twins works
+
+    Code
+      rm_twins(dplyr::bind_rows(human_profiles, dplyr::mutate(dplyr::filter(
+        human_profiles, SampleName == "P1"), SampleName = "Pdup")))
+    Message <simpleMessage>
+      Identified 2 people whose profiles appear more than once (likely identical twins). These are being removed.
+    Output
+      # A tibble: 63 x 3
+         SampleName                       Marker   Allele
+         <chr>                            <chr>    <chr> 
+       1 00-JP0001-14_20142342311_NO-3241 AMEL     X     
+       2 00-JP0001-14_20142342311_NO-3241 AMEL     Y     
+       3 00-JP0001-14_20142342311_NO-3241 D10S1248 12    
+       4 00-JP0001-14_20142342311_NO-3241 D10S1248 13    
+       5 00-JP0001-14_20142342311_NO-3241 D12S391  17    
+       6 00-JP0001-14_20142342311_NO-3241 D12S391  18    
+       7 00-JP0001-14_20142342311_NO-3241 D16S539  10    
+       8 00-JP0001-14_20142342311_NO-3241 D16S539  11    
+       9 00-JP0001-14_20142342311_NO-3241 D18S51   13    
+      10 00-JP0001-14_20142342311_NO-3241 D18S51   17    
+      # i 53 more rows
+
diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md
deleted file mode 100644
index aa94c88..0000000
--- a/tests/testthat/_snaps/utils.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# rm_twins works
-
-    Code
-      rm_twins(dplyr::bind_rows(human_profiles, dplyr::mutate(dplyr::filter(
-        human_profiles, SampleName == "P1"), SampleName = "Pdup")))
-    Message <simpleMessage>
-      Identified 2 people whose profiles appear more than once (likely identical twins). These are being removed.
-    Output
-      # A tibble: 63 x 3
-         SampleName                       Marker   Allele
-         <chr>                            <chr>    <chr> 
-       1 00-JP0001-14_20142342311_NO-3241 AMEL     X     
-       2 00-JP0001-14_20142342311_NO-3241 AMEL     Y     
-       3 00-JP0001-14_20142342311_NO-3241 D10S1248 12    
-       4 00-JP0001-14_20142342311_NO-3241 D10S1248 13    
-       5 00-JP0001-14_20142342311_NO-3241 D12S391  17    
-       6 00-JP0001-14_20142342311_NO-3241 D12S391  18    
-       7 00-JP0001-14_20142342311_NO-3241 D16S539  10    
-       8 00-JP0001-14_20142342311_NO-3241 D16S539  11    
-       9 00-JP0001-14_20142342311_NO-3241 D18S51   13    
-      10 00-JP0001-14_20142342311_NO-3241 D18S51   17    
-      # i 53 more rows
-
diff --git a/tests/testthat/test-bistro.R b/tests/testthat/test-bistro.R
index 25f833a..cebde79 100644
--- a/tests/testthat/test-bistro.R
+++ b/tests/testthat/test-bistro.R
@@ -14,18 +14,6 @@ test_that("bistro works", {
     )
   )
 
-  expect_snapshot(
-    bistro(
-      bm_evid1,
-      hu_p1,
-      pop_allele_freqs = pop_allele_freqs,
-      kit = "ESX17",
-      peak_thresh = 200,
-      bloodmeal_ids = "evid1",
-      human_ids = "P1"
-    )
-  )
-
   expect_snapshot(bistro(
     bm_evid1,
     hu_p1,
@@ -43,17 +31,4 @@ test_that("bistro works", {
     ),
     "If `calc_allele_freqs = FALSE`, then `pop_allele_freqs` is required."
   )
-
-  expect_message(expect_message(expect_message(
-    expect_error(
-      bistro(
-        bloodmeal_profiles %>% dplyr::filter(Height < 200),
-        human_profiles,
-        pop_allele_freqs = pop_allele_freqs,
-        kit = "ESX17",
-        peak_thresh = 200
-      ),
-      "All bloodmeal peak heights below threshold of 200."
-    )
-  )))
 })
diff --git a/tests/testthat/test-identify_matches.R b/tests/testthat/test-identify_matches.R
index 5f2c734..eee47f3 100644
--- a/tests/testthat/test-identify_matches.R
+++ b/tests/testthat/test-identify_matches.R
@@ -33,7 +33,15 @@ test_that("identify_matches works", {
 
   expect_snapshot(identify_matches(lrs))
 
-  expect_no_error(lrs |>
-    dplyr::mutate(log10_lr = Inf) |>
-    identify_matches())
+  expect_equal(
+    lrs |>
+      dplyr::mutate(log10_lr = Inf) |>
+      identify_matches() |>
+      dplyr::pull(notes),
+    c(
+      "all log10LRs NA or Inf",
+      "all log10LRs NA or Inf;no shared alleles;euroformix error",
+      "all log10LRs NA or Inf"
+    )
+  )
 })
diff --git a/tests/testthat/test-preprocess_data.R b/tests/testthat/test-preprocess_data.R
new file mode 100644
index 0000000..02acfc1
--- /dev/null
+++ b/tests/testthat/test-preprocess_data.R
@@ -0,0 +1,52 @@
+test_that("prep_bloodmeal_profiles works", {
+  expect_error(
+    prep_bloodmeal_profiles(
+      bloodmeal_profiles %>% dplyr::filter(Height < 200),
+      peak_thresh = 200
+    ),
+    "All bloodmeal peak heights below threshold of 200."
+  )
+
+  expect_snapshot(prep_bloodmeal_profiles(
+    bloodmeal_profiles,
+    bloodmeal_ids = "evid1",
+    peak_thresh = 200
+  ))
+})
+
+test_that("prep_human_profiles works", {
+  expect_snapshot(prep_human_profiles(human_profiles))
+
+  expect_snapshot(prep_human_profiles(human_profiles, human_ids = "P1"))
+})
+
+test_that("rm_dups works", {
+  expect_no_message(rm_dups(tibble::tibble(test = 1:2)))
+
+  expect_warning(
+    rm_dups(tibble::tibble(test = rep(1, 2))),
+    "Detected and removed 1 duplicate rows."
+  )
+})
+
+test_that("rm_twins works", {
+  expect_snapshot(rm_twins(
+    dplyr::bind_rows(
+      human_profiles,
+      human_profiles |>
+        dplyr::filter(SampleName == "P1") |>
+        dplyr::mutate(SampleName = "Pdup")
+    )
+  ))
+})
+
+test_that("subset_ids works", {
+  expect_equal(subset_ids("a", c("a", "b"), "ids"), "a")
+  expect_error(
+    expect_warning(
+      subset_ids("c", c("a", "b"), "ids"),
+      "Removing 1 ids not found in the dataset: c"
+    ),
+    "None of the provided ids are present in the dataset."
+  )
+})
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index b535e2d..da50b77 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -1,35 +1,3 @@
 test_that("ignore_unused_imports works", {
   expect_no_error(ignore_unused_imports())
 })
-
-
-test_that("rm_dups works", {
-  expect_no_message(rm_dups(tibble::tibble(test = 1:2)))
-
-  expect_warning(
-    rm_dups(tibble::tibble(test = rep(1, 2))),
-    "Detected and removed 1 duplicate rows."
-  )
-})
-
-test_that("rm_twins works", {
-  expect_snapshot(rm_twins(
-    dplyr::bind_rows(
-      human_profiles,
-      human_profiles |>
-        dplyr::filter(SampleName == "P1") |>
-        dplyr::mutate(SampleName = "Pdup")
-    )
-  ))
-})
-
-test_that("subset_ids works", {
-  expect_equal(subset_ids("a", c("a", "b"), "ids"), "a")
-  expect_error(
-    expect_warning(
-      subset_ids("c", c("a", "b"), "ids"),
-      "Removing 1 ids not found in the dataset: c"
-    ),
-    "None of the provided ids are present in the dataset."
-  )
-})