Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mosaic pipeline #42

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cre.gemini.variant_impacts.vcf2db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ else
fi

#if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus
#else pipeline is mosaic/crg (uses one caller), do not filter by caller, i.e. no "callers" in the gemini db
callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"`
if [ ! -z "$callers" ]
if [ ! -z "$callers" ] #variable $callers is not an empty string, i.e. it exists in the gemini db
then
callers="v.callers"
caller_filter="and v.callers not in ('freebayes', 'samtools', 'platypus')"
Expand Down
3 changes: 2 additions & 1 deletion cre.gemini2txt.vcf2db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ alt_depth=3
gemini query -q "select name from samples order by name" $file > samples.txt

#if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus
#else pipeline is mosaic/crg (uses one caller), do not filter by caller, i.e. no "callers" in the gemini db
callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"`
if [ ! -z "$callers" ]
if [ ! -z "$callers" ] #variable $callers is not an empty string, i.e. it exists in the gemini db
then
callers="callers"
caller_filter="and Callers not in ('freebayes', 'samtools', 'platypus')"
Expand Down
2 changes: 1 addition & 1 deletion cre.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# family = [family_id] (=project_id=case_id=folder_name, main result file should be family/family-ensemble.db)
# cleanup= [0|1] default = 0
# make_report=[0|1] default = 1, don't make report for WGS analysis first
# type = [ wes.regular (default) | wes.synonymous | wes.fast | rnaseq | wgs | annotate (only for cleaning) |
# type = [ wes.regular (default) | wes.synonymous | wes.mosaic | wes.fast | rnaseq | wgs | annotate (only for cleaning) |
# denovo (all rare variants in wgs, proband should have phenotype=2, parents=phenotype1 also sex for parents in gemini.db) ]
# max_af = af filter, default = 0.01
# database = path to folder where c4r count files and hgmd.csv are found.
Expand Down
36 changes: 29 additions & 7 deletions cre.vcf2db.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ add_placeholder <- function(variants, column_name, placeholder){
}

get_variants_from_file <- function (filename){
variants <- read.delim(filename, stringsAsFactors = F)
variants <- read.delim(filename, stringsAsFactors=FALSE, colClasses = c("character"))
return(variants)
}

Expand Down Expand Up @@ -47,6 +47,26 @@ genotype2zygocity <- function (genotype_str, ref, alt_depth){
return(result)
}

# when no variants are made into report when running WES mosaic-panel pipeline
check_empty <- function(family, type){
file <- paste0(family, ".variants.txt")
variants <- get_variants_from_file(file)

if (nrow(variants) == 0){
variants = data.frame()
variants[1,1] = paste0("No variants are reported for ", family)

write.csv(variants, paste0(family,".wes.regular.", datetime, ".csv"), row.names = F)
write.csv(variants, paste0(family,".wes.mosaic.", datetime, ".csv"), row.names = F)
write.csv(variants, paste0(family,".clinical.wes.regular.", datetime, ".csv"), row.names = F)
write.csv(variants, paste0(family,".clinical.wes.mosaic.", datetime, ".csv"), row.names = F)

} else{
message("Go ahead to create report with at least 1 variant")
}

}

# output : family.ensemble.txt
create_report <- function(family, samples, type){
file <- paste0(family, ".variants.txt")
Expand Down Expand Up @@ -290,7 +310,7 @@ create_report <- function(family, samples, type){
variants <- add_placeholder(variants, "SpliceAI_impact", "")
for (i in 1:nrow(variants)){
print(i)
if (variants[i,"SpliceAI_score"] == ""){
if (is.na(variants[i,"SpliceAI_score"])){
variants[i, "SpliceAI_impact"] <- "NA|NA|NA"
variants[i, "SpliceAI_score"] <- 0
} else {
Expand Down Expand Up @@ -346,7 +366,7 @@ create_report <- function(family, samples, type){
# Column44 = cadd
# Column45 = vest3
for (i in 1:nrow(variants)){
v_vest <- strsplit(variants[i,"Vest3_score"], ",", fixed = T)[[1]]
v_vest <- strsplit(as.character(variants[i,"Vest3_score"]), ",", fixed = T)[[1]]
variants[i, "Vest3_score"] <- max(v_vest)
}

Expand Down Expand Up @@ -455,6 +475,7 @@ select_and_write2 <- function(variants, samples, prefix, type)
"Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)]

variants <- variants[order(variants$Position),]
#print(paste0("In select_and_write2, Alt is ", variants$Alt))

if (type == 'denovo'){
variants <- variants[variants$C4R_WGS_counts < 10,]
Expand All @@ -473,7 +494,7 @@ fix_column_name <- function(column_name){
# merges ensembl, gatk-haplotype reports
merge_reports <- function(family, samples, type){
ensemble_file <- paste0(family, ".create_report.csv")
ensemble <- read.csv(ensemble_file, stringsAsFactors = F)
ensemble <- read.csv(ensemble_file, stringsAsFactors = F, colClasses = c("character"))
ensemble$superindex <- with(ensemble, paste(Position, Ref, Alt, sep = '-'))

for (i in 1:nrow(ensemble)){
Expand All @@ -489,7 +510,7 @@ merge_reports <- function(family, samples, type){

ensemble_table_file <- paste0(family, ".table")
if (file.exists(ensemble_table_file)){
ensemble_table <- read.delim(ensemble_table_file, stringsAsFactors = F)
ensemble_table <- read.delim(ensemble_table_file, stringsAsFactors = F, colClasses = c("character"))
ensemble_table$superindex <- with(ensemble_table, paste(paste0(CHROM,":",POS), REF, ALT, sep = '-'))
ensemble_table[c("CHROM", "POS", "REF", "ALT")] <- NULL
for (i in 1:nrow(ensemble_table)){
Expand Down Expand Up @@ -724,8 +745,7 @@ parse_ad <- function(ad_cell) {
}

annotate_w_care4rare <- function(family,samples,type){
variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F)

variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F, colClasses = c("character"))
variants$superindex <- with(variants, paste(Position, Ref, Alt, sep='-'))

if(exists("seen_in_c4r_counts")){
Expand Down Expand Up @@ -891,6 +911,8 @@ samples <- gsub("-", ".", samples)

print("Loading tables")
load_tables(debug)
print("Check empty tables")
check_empty(family)
print("Creating report")
create_report(family,samples,type)
print("Merging reports")
Expand Down