Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Aggregate non-coding score #55

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cre.gemini2txt.vcf2db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ sQuery="select \
revel_score as Revel_score,\
gerp_score as Gerp_score,\
$noncoding_scores, \
AlphaMissense as AlphaMissense,\
AlphaMissense as AlphaMissense_score,\
aa_change as AA_change,\
hgvsc as Codon_change,\
"$callers" as Callers,\
Expand Down
32 changes: 30 additions & 2 deletions cre.vcf2db.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,27 @@ genotype2zygocity <- function (genotype_str, ref, alt_depth, type){
return(result)
}

# count how many noncoding scores predict an impact and return a fraction
# e.g. 3/4 indicates that 3/4 scores predict an impact
noncoding_pred <- function(cadd, ncer, remm, linsight){
cadd <- ifelse(cadd == "None", NA, cadd)
ncer <- ifelse(ncer == "None", NA, ncer)
remm <- ifelse(remm == "None", NA, remm)
linsight <- ifelse(linsight == "None", NA, linsight)
# https://doi.org/10.1016/j.gpb.2022.02.002 table 3 best threshold
cadd_pred <- cadd > 14.19 # paper used CADD v1.3, we use 1.6, so score here is from personal work
ncer_pred <- ncer > 95.95
remm_pred <- remm > 0.9585
linsight_pred <- linsight > 0.9828
preds <- c(cadd_pred, ncer_pred, remm_pred, linsight_pred)
pred_impact <- sum(preds, na.rm = TRUE)
denominator <- sum(!is.na(preds))
fraction = paste(as.character(pred_impact), as.character(denominator), sep="/")

return(fraction)
}


# output : family.ensemble.txt
create_report <- function(family, samples, type){
file <- paste0(family, ".variants.txt")
Expand Down Expand Up @@ -400,6 +421,13 @@ create_report <- function(family, samples, type){
}
}else print("VEP MaxEntScan annotation is missing")

# count number of noncoding scores that predict an impact
if (type == 'wgs' || type == 'denovo'){
noncoding_agg <- mapply(noncoding_pred, variants[, "Cadd_score"], variants[,"ncER_score"], variants[,"ReMM_score"], variants[,"LINSIGHT_score"])
variants[, "Noncoding_path_pred"] <- unlist(noncoding_agg)
}


# Column 51: SpliceAI


Expand Down Expand Up @@ -435,7 +463,7 @@ select_and_write2 <- function(variants, samples, prefix, type)
print(colnames(variants))
if (type == 'wgs' || type == 'denovo'){
noncoding_cols <- c("DNaseI_hypersensitive_site", "CTCF_binding_site", "ENH_cellline_tissue", "TF_binding_sites")
noncoding_scores <- c("ncER_score", "ReMM_score", "LINSIGHT_score")
noncoding_scores <- c("ncER_score", "ReMM_score", "LINSIGHT_score", "Noncoding_path_pred")
wgs_counts <- c("C4R_WGS_counts", "C4R_WGS_samples")
variants$C4R_WGS_counts[variants$C4R_WGS_counts == "None"] <- 0
variants$C4R_WGS_counts <- as.integer(variants$C4R_WGS_counts)
Expand All @@ -460,7 +488,7 @@ select_and_write2 <- function(variants, samples, prefix, type)
"Gnomad_af_popmax", "Gnomad_af", "Gnomad_ac", "Gnomad_hom",
"Ensembl_transcript_id", "AA_position", "Exon", "Protein_domains", "rsIDs",
"Gnomad_oe_lof_score", "Gnomad_oe_mis_score", "Exac_pli_score", "Exac_prec_score", "Exac_pnull_score",
"Conserved_in_20_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest3_score", "Revel_score", "Gerp_score", "AlphaMissense"),
"Conserved_in_20_mammals", "SpliceAI_impact", "SpliceAI_score", "Sift_score", "Polyphen_score", "Cadd_score", "Vest3_score", "Revel_score", "Gerp_score", "AlphaMissense_score"),
noncoding_scores,
c("Imprinting_status", "Imprinting_expressed_allele", "Pseudoautosomal", "Gnomad_male_ac",
"Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"),
Expand Down