put back standardized curations and convenience files to the pipeline

floratos-lab · Apr 24, 2023 · a8e6c6a · a8e6c6a
1 parent 7cbf6eb
commit a8e6c6a
Show file tree

Hide file tree

Showing 20 changed files with 2,970 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -36,6 +36,10 @@ The repository contents are as follows:
       - **./data/source_curations**: source curation spreadsheets, as
         provided by curators.
 
+      - **./data/standardized_curations**: curation spreadsheets in denormalized form, after
+        undergoing processing for quality control and standardization. This is the state from
+        which all other files (i.e., the convenience files and release files) are created.
+
       - **./data/reference_files**: resources used to support the
         standardization process, including controlled vocabularies from
         external sources and mapping files maintained by the project.
@@ -45,6 +49,9 @@ The repository contents are as follows:
         curations. They are formatted as needed for uploading to the
         Dashboard.
 
+      - **./data/convenience_files**: the standardized curations conveniently reformatted to support 
+        human inspection and downstream computational analysis.
+
   - **./docs**: curation templates and column specification.
 
 The overall workflow of the project is depicted below.
@@ -151,17 +158,23 @@ and are updated with the latest versions prior to a Dashboard release.
 ### Release Preparation
 
 The quality control and standardization process produces new, denormalized versions of
-the curation data.
-These data then undergo further processing to generate the
+the source spreadsheets, which are placed under **./data/standardized_curations**.
+These files then undergo further processing to generate the
 final data release files which will be uploaded to the Dashboard. This
 processing is essentially a straightforward repackaging of the
 spreadsheets into a format appropriate for the upload scripts. It
 involves splitting immune signatures into individual spreadsheet files, one
 signature per file. The final release files are stored under **./data/release_files** and
-have the same columns as the standardized curation data, with some additions
+have the same columns as the standardized curation files, with some additions
 to support the requirements of the Dashboard itself and to preserve
 original curated values for fields updated by the pipeline.
 
+The release preparation process also generates a number of convenience files, i.e.,
+partially re-normalized versions of the standardized curations. These files are available
+in spreadsheet format, to facilitate human inspection, and in the Broad GMT tab-delimited file format,
+to support downstream computational processing. The files are stored under
+**./data/convenience_files**.
+
 Additional details about the processing pipeline can be found under the
 **./data** directory.
 

diff --git a/code/main.sh b/code/main.sh
@@ -0,0 +1,5 @@
+#!/bin/bash -x
+time Rscript main_inf_ctf.R
+time Rscript main_inf_gene.R
+time Rscript main_vac_ctf.R
+time Rscript main_vac_gene.R
diff --git a/code/main_inf_ctf.R b/code/main_inf_ctf.R
@@ -309,6 +309,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
 df2 <- as.data.frame(df2)
 df2$comparison <- trimws(df2$comparison)
 
+source("standardized_and_convenience.R")
+save_standardized_curations(df2, base_filename)
+
 #############################################################
 #### Recreate original spreadsheet with all corrections #####
 #############################################################
@@ -355,3 +358,5 @@ if(!is.null(s)) {
 write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_inf_ctf", titles_and_dates_df,
                             resp_components_full_sig, unmatched_symbols_map = NULL,
                             "CELLTYPE_FREQUENCY", "INFECTION", "Immune cell-type frequency response to infection")
+
+save_convenience_files(df2, header_rows, base_filename, "INFECTION", "CELLTYPE_FREQUENCY")
diff --git a/code/main_inf_gene.R b/code/main_inf_gene.R
@@ -322,6 +322,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
 df2 <- as.data.frame(df2)
 df2$comparison <- trimws(df2$comparison)
 
+source("standardized_and_convenience.R")
+save_standardized_curations(df2, base_filename)
+
 #############################################################
 #### Recreate original spreadsheet with all corrections #####
 #############################################################
@@ -380,3 +383,5 @@ df2$exposure_material <- NULL
 write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_inf_gene", titles_and_dates_df,
                             resp_components_collected, unmatched_symbols_map,
                             "GENE", "INFECTION", "Gene expression response to infection")
+
+save_convenience_files(df2, header_rows, base_filename, "INFECTION", "GENE")
diff --git a/code/main_vac_ctf.R b/code/main_vac_ctf.R
@@ -338,6 +338,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
 df2 <- as.data.frame(df2)
 df2$comparison <- trimws(df2$comparison)
 
+source("standardized_and_convenience.R")
+save_standardized_curations(df2, base_filename)
+
 #############################################################
 #### Recreate original spreadsheet with all corrections #####
 #############################################################
@@ -390,3 +393,5 @@ header_rows <- header_rows[!colnames(header_rows) %in% del_cols]
 write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_vac_ctf", titles_and_dates_df,
                             resp_components_full_sig, unmatched_symbols_map = NULL,
                             "CELLTYPE_FREQUENCY", "VACCINE", "Immune cell-type frequency response to vaccine exposure")
+
+save_convenience_files(df2, header_rows, base_filename, "VACCINE", "CELLTYPE_FREQUENCY")
diff --git a/code/main_vac_gene.R b/code/main_vac_gene.R
@@ -341,6 +341,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
 df2 <- as.data.frame(df2)
 df2$comparison <- trimws(df2$comparison)
 
+source("standardized_and_convenience.R")
+save_standardized_curations(df2, base_filename)
+
 #############################################################
 #### Recreate original spreadsheet with all corrections #####
 #############################################################
@@ -403,3 +406,5 @@ header_rows <- header_rows[!colnames(header_rows) %in% del_cols]
 write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_vac_gene", titles_and_dates_df,
                             resp_components_collected, unmatched_symbols_map,
                             "GENE", "VACCINE", "Gene expression response to vaccine exposure")
+
+save_convenience_files(df2, header_rows, base_filename, "VACCINE", "GENE")
diff --git a/code/standardized_and_convenience.R b/code/standardized_and_convenience.R
@@ -0,0 +1,181 @@
+# This file is adapted from the original generate_HIPC_submissions.R
+
+library(R.utils) # for gzip
+
+#' Save the standardized (denormalized) curations as a gzipped TSV file.
+#'
+#' The submission/template bookkeeping columns are removed, then the first
+#' of the remaining columns is dropped as well (presumably the unnamed
+#' template label column -- confirm against the callers). The table is
+#' written to
+#' ../data/standardized_curations/<base_filename>-standardized_denormalized.tsv
+#' and compressed to a ".gz" file next to it.
+#'
+#' @param df2 Data frame holding the denormalized curation rows.
+#' @param base_filename Stem used to build the output file name.
+#' @return The result of the gzip() call.
+save_standardized_curations <- function(df2, base_filename) {
+    bookkeeping_cols <- c(
+        "submission_name", "submission_date",
+        "template_name", "short_comment", "process_note"
+    )
+    is_kept <- !(colnames(df2) %in% bookkeeping_cols)
+    standardized <- df2[is_kept][-1]
+
+    tsv_path <- paste0(
+        "../data/standardized_curations/", base_filename,
+        "-standardized_denormalized.tsv"
+    )
+    gz_path <- paste0(tsv_path, ".gz")
+
+    write.table(standardized,
+        file = tsv_path, sep = "\t",
+        row.names = FALSE, col.names = TRUE
+    )
+    # gzip() comes from R.utils; remove = TRUE asks it to delete the
+    # uncompressed .tsv once the .gz has been written.
+    gzip(tsv_path, destname = gz_path, overwrite = TRUE, remove = TRUE)
+}
+
+#' Write the convenience files for one set of standardized curations.
+#'
+#' The denormalized rows of `df2` are collapsed back to one row per
+#' signature (one per distinct `sig_row_id`); multi-valued fields are
+#' re-joined with "; ". Two files are written under
+#' ../data/convenience_files/:
+#'   * <base_filename>-standardized_curation_template.tsv -- the header
+#'     rows followed by the re-collapsed signature rows;
+#'   * <base_filename>-response_components.gmt.txt -- one tab-separated
+#'     line per signature: row name, description, response components.
+#'
+#' @param df2 Data frame of denormalized curation rows. Must contain the
+#'   columns read below (sig_row_id, sig_subm_id,
+#'   publication_reference_id, response_component, ...).
+#' @param header_rows Data frame of template header rows. Its column names
+#'   must match those of the re-collapsed signature rows, in order.
+#' @param base_filename Stem used to build both output file names.
+#' @param exposure_type Either "VACCINE" or "INFECTION".
+#' @param response_type Either "GENE" or "CELLTYPE_FREQUENCY".
+#' @return Called for its side effects; returns the (invisible NULL) value
+#'   of the final message() call.
+save_convenience_files <- function(
+    df2, header_rows, base_filename,
+    exposure_type, response_type) {
+    # isTRUE() also rejects NA and multi-element arguments cleanly.
+    if (!isTRUE(exposure_type %in% c("VACCINE", "INFECTION"))) {
+        stop("Incorrect exposure type encountered")
+    }
+    if (!isTRUE(response_type %in% c("GENE", "CELLTYPE_FREQUENCY"))) {
+        stop("Incorrect response type encountered")
+    }
+
+    if (response_type == "GENE") {
+        response_behavior_type_var <- "gene expression"
+    } else {
+        response_behavior_type_var <- "cell-type frequency"
+    }
+
+    convenience_files <- "../data/convenience_files/"
+
+    # Join the distinct values of a column into one "; "-separated string.
+    collapse_unique <- function(values) {
+        paste(unique(values), collapse = "; ")
+    }
+
+    uniq_sig_row_ids <- unique(df2$sig_row_id)
+    resp_components_annotated <- vector("list", length(uniq_sig_row_ids))
+    recreated_template <- vector("list", length(uniq_sig_row_ids))
+
+    for (i in seq_along(uniq_sig_row_ids)) {
+        df2tmp <- df2[df2$sig_row_id == uniq_sig_row_ids[i], ]
+        # Recreate a full signature in one row, starting from the first
+        # row of this signature.
+        base_row <- df2tmp[1, ]
+
+        response_rowname <- paste(base_row$publication_reference_id,
+            base_row$sig_subm_id, uniq_sig_row_ids[i],
+            sep = "_"
+        )
+        response_description <- paste("PMID", base_row$publication_reference_id,
+            response_behavior_type_var, base_row$sig_subm_id,
+            sep = " "
+        )
+
+        # Use the full original set of response components
+        # rather than just those for which a valid symbol was found.
+        base_row$response_component_original <-
+            collapse_unique(df2tmp$response_component_original)
+
+        base_row$exposure_material_id <-
+            collapse_unique(df2tmp$exposure_material_id)
+        base_row$tissue_type_term_id <-
+            collapse_unique(df2tmp$tissue_type_term_id)
+
+        if (response_type == "GENE") {
+            components <- unique(df2tmp$response_component)
+            base_row$response_component <- paste(components, collapse = "; ")
+            resp_components_annotated[[i]] <- c(
+                response_rowname, response_description, components
+            )
+        } else {
+            full_sig <- unique(df2tmp$fully_qualified_response_component)
+            full_sig_joined <- paste(full_sig, collapse = "; ")
+            # FIXME - only response_component is getting put back together?
+            base_row$response_component <- full_sig_joined
+            base_row$response_component_id <-
+                collapse_unique(df2tmp$response_component_id)
+            base_row$proterm_and_extra <-
+                collapse_unique(df2tmp$proterm_and_extra)
+            base_row$fully_qualified_response_component <- full_sig_joined
+            # The pro_ontology_id values are already separated by
+            # semicolons, so change those to commas before potentially
+            # joining two lists of pro-terms with "; ".
+            base_row$pro_ontology_id <- collapse_unique(
+                gsub(";", ",", df2tmp$pro_ontology_id, fixed = TRUE)
+            )
+
+            resp_components_annotated[[i]] <- c(
+                response_rowname, response_description, full_sig
+            )
+        }
+
+        # Reconstitute the target pathogen taxon ids (vaccine curations only)
+        if (exposure_type == "VACCINE") {
+            base_row$target_pathogen_taxonid <-
+                collapse_unique(df2tmp$target_pathogen_taxonid)
+        }
+
+        recreated_template[[i]] <- base_row
+    }
+
+    # Consolidate to a single data.frame.
+    # NOTE(review): rbindlist() is not loaded by this file; it is presumably
+    # data.table's, attached by the calling script -- confirm.
+    recreated_template_df <- as.data.frame(rbindlist(recreated_template))
+
+    # Compare lengths first: `!=` alone recycles the shorter vector and can
+    # hide a mismatch. With no signatures at all there is nothing to compare,
+    # and only the header rows are written.
+    header_cols <- colnames(header_rows)
+    template_cols <- colnames(recreated_template_df)
+    if (length(recreated_template) > 0 &&
+        (length(header_cols) != length(template_cols) ||
+            any(header_cols != template_cols))) {
+        stop("mismatch between header rows and recreated_template_df rows")
+    }
+
+    recreated_template_df <- rbind(header_rows, recreated_template_df)
+
+    # Drop the submission bookkeeping columns
+    del_cols <- c("submission_name", "submission_date", "template_name")
+    recreated_template_df <- recreated_template_df[
+        !colnames(recreated_template_df) %in% del_cols
+    ]
+
+    # Set that first column name back to blank. This has to happen before
+    # the id columns are dropped below, as the original step order did.
+    colnames(recreated_template_df)[1] <- ""
+
+    del_cols <- c("sig_subm_id", "sig_row_id")
+
+    recreated_template_df <- recreated_template_df[
+        !colnames(recreated_template_df) %in% del_cols
+    ]
+    write.table(recreated_template_df,
+        file = paste0(
+            convenience_files,
+            base_filename, "-standardized_curation_template.tsv"
+        ),
+        sep = "\t", row.names = FALSE
+    )
+
+    gmt_file <- paste0(
+        convenience_files,
+        base_filename, "-response_components.gmt.txt"
+    )
+    # One tab-separated line per signature, written in a single call rather
+    # than appending to the file once per signature.
+    gmt_lines <- vapply(
+        resp_components_annotated, paste, character(1),
+        collapse = "\t", USE.NAMES = FALSE
+    )
+    # Remove any stale file so that an empty run leaves no GMT file behind.
+    if (file.exists(gmt_file)) file.remove(gmt_file)
+    if (length(gmt_lines) > 0) {
+        write.table(gmt_lines,
+            file = gmt_file, row.names = FALSE, col.names = FALSE,
+            quote = FALSE
+        )
+    }
+    message("Finished creating convenience files")
+}