ccmbioinfo · Madelinehazel · Jun 20, 2024 · Jun 20, 2024
diff --git a/cre.gemini2txt.vcf2db.sh b/cre.gemini2txt.vcf2db.sh
@@ -88,7 +88,7 @@ sQuery="select \
         revel_score as Revel_score,\
         gerp_score as Gerp_score,\
         $noncoding_scores, \
-        AlphaMissense as AlphaMissense,\
+        AlphaMissense as AlphaMissense_score,\
         aa_change as AA_change,\
         hgvsc as Codon_change,\
         "$callers" as Callers,\

diff --git a/cre.vcf2db.R b/cre.vcf2db.R
@@ -52,6 +52,27 @@ genotype2zygocity <- function (genotype_str, ref, alt_depth, type){
     return(result)
 }
 
+# count how many noncoding scores predict an impact and return a fraction
+# e.g. 3/4 indicates that 3/4 scores predict an impact
+noncoding_pred <- function(cadd, ncer, remm, linsight){
+  cadd <- ifelse(cadd == "None", NA, cadd)
+  ncer <- ifelse(ncer == "None", NA, ncer)
+  remm <- ifelse(remm == "None", NA, remm)
+  linsight <- ifelse(linsight == "None", NA, linsight)
+  # https://doi.org/10.1016/j.gpb.2022.02.002 table 3 best threshold
+  cadd_pred <- cadd > 14.19 # paper used CADD v1.3, we use 1.6, so score here is from personal work
+  ncer_pred <- ncer > 95.95
+  remm_pred <- remm > 0.9585 
+  linsight_pred <- linsight > 0.9828
+  preds <- c(cadd_pred, ncer_pred, remm_pred, linsight_pred)
+  pred_impact <- sum(preds, na.rm = TRUE)
+  denominator <- sum(!is.na(preds))
+  fraction = paste(as.character(pred_impact), as.character(denominator), sep="/")
+
+  return(fraction)
+}
+
+
 # output : family.ensemble.txt
 create_report <- function(family, samples, type){
     file <- paste0(family, ".variants.txt")
@@ -400,6 +421,13 @@ create_report <- function(family, samples, type){
         }
     }else print("VEP MaxEntScan annotation is missing")
 
+    # count number of noncoding scores that predict an impact
+    if (type == 'wgs' || type == 'denovo'){
+        noncoding_agg <- mapply(noncoding_pred, variants[, "Cadd_score"], variants[,"ncER_score"], variants[,"ReMM_score"], variants[,"LINSIGHT_score"])
+        variants[, "Noncoding_path_pred"] <- unlist(noncoding_agg)
+    }
+
+
     # Column 51: SpliceAI
 
 
@@ -435,7 +463,7 @@ select_and_write2 <- function(variants, samples, prefix, type)
     print(colnames(variants))
     if (type == 'wgs' || type == 'denovo'){
         noncoding_cols <- c("DNaseI_hypersensitive_site", "CTCF_binding_site", "ENH_cellline_tissue", "TF_binding_sites")
-        noncoding_scores <- c("ncER_score", "ReMM_score", "LINSIGHT_score")
+        noncoding_scores <- c("ncER_score", "ReMM_score", "LINSIGHT_score", "Noncoding_path_pred")
         wgs_counts <- c("C4R_WGS_counts", "C4R_WGS_samples")
         variants$C4R_WGS_counts[variants$C4R_WGS_counts == "None"] <- 0 
         variants$C4R_WGS_counts <- as.integer(variants$C4R_WGS_counts)
@@ -460,7 +488,7 @@ select_and_write2 <- function(variants, samples, prefix, type)
                             "Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
                             "Ensembl_transcript_id", "AA_position", "Exon", "Protein_domains", "rsIDs",
                             "Gnomad_oe_lof_score", "Gnomad_oe_mis_score", "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score",
-                            "Conserved_in_20_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest3_score", "Revel_score", "Gerp_score", "AlphaMissense"),
+                            "Conserved_in_20_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest3_score", "Revel_score", "Gerp_score", "AlphaMissense_score"),
                            noncoding_scores,
                            c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal", "Gnomad_male_ac",
                             "Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"),