-
Notifications
You must be signed in to change notification settings - Fork 97
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
qc: Add variant_based_inferred_sex.py to run variant based inferred s…
…ex (work in progress) #TASK-6773 #TASK-6766
- Loading branch information
Showing
1 changed file
with
107 additions
and
0 deletions.
There are no files selected for viewing
107 changes: 107 additions & 0 deletions
107
opencga-app/app/analysis/qc/individual_qc/variant_based_inferred_sex.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import os | ||
import logging | ||
import gzip | ||
import json | ||
|
||
from utils import create_output_dir, execute_bash_command, generate_results_json | ||
# something similar to Ximena's family_qc.relatedness_results? | ||
|
||
|
||
LOGGER = logging.getLogger('variant_qc_logger') | ||
|
||
|
||
class VariantBasedInferredSexAnalysis: | ||
def __init__(self, individual_qc_executor_info): | ||
""" | ||
:param individual_qc_executor_info: | ||
""" | ||
|
||
self.output_variant_based_inferred_sex_dir = None | ||
self.chrx_vars = None | ||
self.chrx_var_frq = None | ||
self.individual_qc_executor_info = individual_qc_executor_info | ||
# Results thingy similar to Ximena's? | ||
|
||
def variant_based_inferred_sex_setup(self): | ||
if self.individual_qc_executor_info != None: | ||
self.set_variant_based_inferred_sex_files() | ||
self.set_variant_based_inferred_sex_dir() | ||
else: | ||
msg = "No instance of IndividualQCExecutor was found. Therefore variant based inferred sex analysis cannot be executed." | ||
LOGGER.error(msg) | ||
raise TypeError(msg) | ||
|
||
def set_variant_based_inferred_sex_files(self): | ||
LOGGER.info("Checking and setting up variant based inferred sex files.") | ||
if os.path.exists(self.individual_qc_executor_info["resource_dir"]): | ||
check_sex_files = { | ||
"chrx_vars": os.path.join(self.individual_qc_executor_info["resource_dir"],"chrX_1000G_QC.prune.in"), | ||
# current prune.in input: 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chrX.recalibrated_variants_filtered_annotated_chrX.prune.in | ||
"chrx_var_frq": os.path.join(self.individual_qc_executor_info["resource_dir"],"chrX_1000G_QC_prune_in.frq") | ||
# current frq input: 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chrX.recalibrated_variants_filtered_annotated_chrX.frq | ||
} | ||
for key,file in check_sex_files.items(): | ||
if os.path.isfile(file): | ||
if key == "chrx_vars": | ||
self.chrx_vars = file | ||
else: | ||
self.chr_var_frq = file | ||
LOGGER.info("File '{}' set up successfully".format(file)) | ||
else: | ||
msg = "File '{}' does notxist".format(file) | ||
LOGGER.error(msg) | ||
raise FileNoFoundError(msg) | ||
else: | ||
msg = "Directory '{}' does not exist".format(resources_path) | ||
LOGGER.error(msg) | ||
raise FileNotFoundError(msg) | ||
|
||
def set_variant_based_inferred_sex_dir(self): | ||
check_sex_dir = create_output_dir(path_elements=[self.individual_qc_executor_info["output_parent_dir"],"variant_based_inferred_sex"]) | ||
self.check_sex_dir = check_sex_dir | ||
|
||
def filter_variants(self): | ||
""" | ||
Annotates input VCF with chromosome coordinate IDs and filters for pruned PASS chr X variants using BCFTools. | ||
:return: | ||
""" | ||
# Read in VCF: | ||
vcf_file = gzip.open(self.individual_qc_executor_info["vcf_file"], "r") | ||
# Read in list of variants to filter for: | ||
good_vars = open(self.chrx_vars, "r") | ||
|
||
# Create output file: | ||
individual_id = self.individual_qc_executor_info["id_"] | ||
filtered_vcf_name = individual_id + self.sample_id + "_filtered_variants.vcf.gz" | ||
filtered_vcf_path = os.path.join(self.check_sex_dir, filtered_vcf_name) | ||
LOGGER.debug("Generating VCF filtered for good quality pruned chr X variants: '{}'".format(filtered_vcf_path)) | ||
|
||
# Check if input VCF FILTER field contains PASS variants: | ||
pass_check_cmd = "bcftools view -i 'FILTER="PASS"' " + vcf_file + " zgrep -vw '#/|CHROM' | wc -l" | ||
pass_vars = execute_bash_command(cmd=pass_check_cmd) | ||
LOGGER.info("Checking for PASS variants in the FILTER field of '{}'".format(vcf_file)) | ||
|
||
# Annotate input VCF with chr coordinate IDs, filter for chr X pruned variants and, if annotated, for PASS variants: | ||
annotate_and_filter_pass_cmd = "bcftools annotate --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' " + vcf_file + "-Oz | bcftools view --include ID==@" + self.chrx_vars + " -Oz | bcftools view -i 'FILTER="PASS"' -Oz -o " + filtered_vcf_path | ||
annotate_and_filter_no_pass_cmd = "bcftools annotate --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' " + vcf_file + "-Oz | bcftools view --include ID==@" + self.chrx_vars + " -Oz -o " + filtered_vcf_path | ||
#### NEED TO FIGURE OUT A WAY TO COUNT THE NUMBER OF VARIANTS REMAINING AFTER FILTERING SO I CAN HAVE AN ELIF FOR LOW NO. VARS | ||
if pass_vars == 0: | ||
execute_bash_command(cmd=annotate_and_filter_no_pass_cmd) | ||
LOGGER.debug("Annotating and filtering '{}' for all pruned chr X variants".format(vcf_file)) | ||
LOGGER.info("WARNING: no FILTER information available, input data will not be filtered for PASS variants, results may be unreliable. There are '{}'") | ||
|
||
execute_bash_command(cmd=annotate_and_filter_pass_cmd) | ||
LOGGER.debug("Annotating and filtering '{}' for chr X pruned PASS variants".format(vcf_file)) | ||
LOGGER.info("There are '{}' good quality chr X pruned variants after filtering".format(pass_vars)) | ||
else: | ||
|
||
|
||
|
||
for sample in self.individual_qc_executor_info["sample_ids"]: | ||
sample_id = sample | ||
|
||
|
||
|