From bd8ac3135df4c02fffbf77dba26738f72dd7690b Mon Sep 17 00:00:00 2001
From: Jeff Kimbrel <jakpot@gmail.com>
Date: Sun, 17 Sep 2023 15:05:33 -0700
Subject: [PATCH] add gradient_pos_rel_amt function

---
 DESCRIPTION                     |  2 +-
 NAMESPACE                       |  1 +
 R/add_gradient_pos_rel_amt.R    | 31 +++++++++++++++++++++++++++++
 man/add_gradient_pos_rel_amt.Rd | 27 +++++++++++++++++++++++++
 vignettes/sample_data.Rmd       | 35 ++++++++++++++++++++++++++-------
 5 files changed, 88 insertions(+), 8 deletions(-)
 create mode 100644 R/add_gradient_pos_rel_amt.R
 create mode 100644 man/add_gradient_pos_rel_amt.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index e5a4b92b..133db645 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: qSIP2
 Title: qSIP Analysis
-Version: 0.4.0.9007
+Version: 0.4.0.9008
 Authors@R: 
     person("Jeff", "Kimbrel", , "kimbrel1@llnl.gov", role = c("aut", "cre"),
            comment = c(ORCID = "YOUR-ORCID-ID"))
diff --git a/NAMESPACE b/NAMESPACE
index 127eabd8..d9d7038d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(add_gradient_pos_rel_amt)
 export(add_isotopolog_label)
 export(get_sample_counts)
 export(gradient_pos_density_validation)
diff --git a/R/add_gradient_pos_rel_amt.R b/R/add_gradient_pos_rel_amt.R
new file mode 100644
index 00000000..0809d367
--- /dev/null
+++ b/R/add_gradient_pos_rel_amt.R
@@ -0,0 +1,31 @@
+#' Add gradient_pos_rel_amt to data
+#'
+#' This function will calculate the relative amount of a fraction compared to the
+#' whole replicate using either qPCR copies or DNA concentrations.
+#' @param data A dataframe or tibble
+#' @param source_mat_id Grouping variable for a replicate
+#' @param amt Column name that has the qPCR or DNA amounts per fraction
+#' @param overwrite Determines whether or not to overwrite an existing gradient_pos_rel_amt column
+#' @return `data` with the `gradient_pos_rel_amt` column added (or replaced)
+#' @export
+#' @keywords sample_data
+add_gradient_pos_rel_amt <- function(data,
+                                     source_mat_id = "source_mat_id",
+                                     amt,
+                                     overwrite = F) {
+  # Fail early with a readable message instead of a tidy-eval lookup error
+  missing_cols <- setdiff(c(source_mat_id, amt), colnames(data))
+  if (length(missing_cols) > 0) {
+    stop("column(s) not found in data: ", paste(missing_cols, collapse = ", "))
+  }
+  if ("gradient_pos_rel_amt" %in% colnames(data)) {
+    # isTRUE() treats NA or non-logical values as "do not overwrite"
+    if (!isTRUE(overwrite)) {
+      stop("gradient_pos_rel_amt already exists! Set overwrite = TRUE if you want to overwrite")
+    }
+    message("gradient_pos_rel_amt already exists and will be overwritten")
+  }
+  data |>
+    dplyr::mutate(gradient_pos_rel_amt = !!as.name(amt) / sum(!!as.name(amt)),
+                  .by = !!as.name(source_mat_id))
+}
diff --git a/man/add_gradient_pos_rel_amt.Rd b/man/add_gradient_pos_rel_amt.Rd
new file mode 100644
index 00000000..7626b31a
--- /dev/null
+++ b/man/add_gradient_pos_rel_amt.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/add_gradient_pos_rel_amt.R
+\name{add_gradient_pos_rel_amt}
+\alias{add_gradient_pos_rel_amt}
+\title{Add gradient_pos_rel_amt to data}
+\usage{
+add_gradient_pos_rel_amt(
+  data,
+  source_mat_id = "source_mat_id",
+  amt,
+  overwrite = F
+)
+}
+\arguments{
+\item{data}{A dataframe or tibble}
+
+\item{source_mat_id}{Grouping variable for a replicate}
+
+\item{amt}{Column name that has the qPCR or DNA amounts per fraction}
+
+\item{overwrite}{Determines whether or not to overwrite an existing gradient_pos_rel_amt column}
+}
+\description{
+This function will calculate the relative amt of a fraction compared to the
+whole replicate using either qPCR copies or DNA concentrations.
+}
+\keyword{sample_data}
diff --git a/vignettes/sample_data.Rmd b/vignettes/sample_data.Rmd
index bfc3e1ea..04fe7fc8 100644
--- a/vignettes/sample_data.Rmd
+++ b/vignettes/sample_data.Rmd
@@ -16,15 +16,15 @@ knitr::opts_chunk$set(
 )
 ```
 
-```{r setup}
+# Background
+
+In `qSIP2`, "sample data" refers to any high level metadata associated with either an experiment or the individual fractions. This vignette will show available tools to format and validate your sample data for the `qsip_sample_object` class in the `qSIP2` package.
+
+```{r setup, message=FALSE}
 library(dplyr)
 library(qSIP2)
 ```
 
-# Background
-
-In `qSIP2`, "sample data" refers to any high level metadata associated with either an experiment or the individual fractions.
-
 ## What is a sample?
 
 The word **sample** typically refers to the biological or environmental entity the DNA was isolated from as well as the single sequencing run tied to that **sample**. In qSIP, however, because there are multiple sequencing runs per biological subject, the term **sample** has historically been reserved for sequencing of each fraction. In practice, this means you will have many **samples** for each single biological subject.
@@ -43,7 +43,7 @@ To standardize the qSIP workflow, column names should adhere as closely to MISIP
 
 In traditional qSIP the `isotope` field has been populated with either the light (e.g. 16O) or heavy (e.g. 18O) isotope depending on the substrate used in that rep or `source_mat_id`. In MISIP standards, only the heavy isotope is listed under the `isotope` field, and then a secondary field `isotopolog_label` is used to designate whether the replicate used a substrate with "natural abundance" (i.e. "light") or "isotopically labeled" (i.e. "heavy") isotopes.
 
-In the `qSIP2` package, either method can be used. If the `isotopolog_label` is missing from your dataset then it will assume both the light and heavy isotopes are present in the `isotope` field. But, if you do have an `isotopolog_label` field, then only the heavy isotope designation is allowed in the `isotope` field and the dataframe will not pass validation checks. 
+In the `qSIP2` package, either method can be used. If the `isotopolog_label` is missing from your dataset then it will assume both the light and heavy isotopes are present in the `isotope` field. But, if you do have an `isotopolog_label` field, then only the heavy isotope designation is allowed in the `isotope` field and the dataframe will not pass validation checks if there are light isotopes listed there. 
 
 Conversion between these two objects can be done with the `add_isotopolog_label()` or `remove_isotopolog_label()` functions.
 
@@ -55,7 +55,8 @@ sample_data_nonMISIP %>%
   count(isotope)
 
 # new data has only one isotope and a mixture of isotopolog_label 
-df_with_labels = add_isotopolog_label(sample_data_nonMISIP, isotope = "isotope") 
+df_with_labels = add_isotopolog_label(sample_data_nonMISIP, 
+                                      isotope = "isotope") 
 
 df_with_labels %>%
   count(isotope, isotopolog_label)
@@ -65,6 +66,26 @@ remove_isotopolog_label(df_with_labels) %>%
   count(isotope)
 ```
 
+## Fraction relative amounts
+
+A requirement for qSIP is the `gradient_pos_rel_amt` field, which gives the proportion of the whole replicate that is found in each fraction. The preferred method is to calculate it from qPCR copy numbers, but DNA concentrations can be used as well.
+
+For example, if there are 100,000 total 16S copies in a replicate as determined by qPCR, and 15,000 copies in fraction 7, then the `gradient_pos_rel_amt` value for fraction 7 would be 0.15 (15,000 / 100,000). Similarly, if you had 25 ng total DNA used for density separation, and fraction 7 had 3.75 ng DNA recovered, then `gradient_pos_rel_amt` would also be 0.15 (3.75 / 25).
+
+Ideally, all of the `gradient_pos_rel_amt` values for a given `source_mat_id` should add up to 1, but there are situations where the sum might be less than 1. For example, you may have removed some fractions because they didn't sequence well or for some other reason. If you have 16S or DNA concentrations for these removed fractions, their amounts still count toward the replicate total, so their share is simply missing from the sum. So, for our example above, if fraction 7 needed to be removed, then the total for all remaining fractions of that `source_mat_id` would only be 0.85. The total of all fractions within a `source_mat_id` should never be greater than 1.
+
+If your sample data tibble does not have the `gradient_pos_rel_amt` column, there is a function that can add it for you.
+
+The `add_gradient_pos_rel_amt()` function can create this column from either a column of qPCR totals or a column of DNA concentrations. Because this function cannot know if there are missing fractions, the totals per `source_mat_id` will always be equal to 1. If you do have fractions that you want removed, keep them in the dataframe when calling `add_gradient_pos_rel_amt()` and then remove them afterwards.
+
+
+
+
+
+
+
+
+
 
 
 # Make a qSIP sample data object