From ffca906b9283226b3a324ac2787f3deeca8edd11 Mon Sep 17 00:00:00 2001 From: Ian Hoskins Date: Mon, 11 Sep 2023 16:31:23 -0500 Subject: [PATCH 1/3] IH 11SEP2023 version to v1.1.0. --- CHANGES.txt | 3 + README.md | 10 +- docs/satmut_utils_manual.md | 3 + setup.cfg | 3 +- src/analysis/coordinate_mapper.py | 1082 +++++++++++++++++- src/analysis/variant_caller.py | 226 +++- src/core_utils/vcf_utils.py | 3 + src/satmut_utils/satmut_utils.py | 2 +- src/tests/analysis/test_coordinate_mapper.py | 569 +++++++-- src/tests/analysis/test_variant_caller.py | 184 ++- 10 files changed, 1897 insertions(+), 188 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 017592f..0c13f1b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,3 +17,6 @@ Release Summary: 1.0.4, June 27, 2023 Fixed bug where relative file paths for the output directory caused the alignment workflow to fail. + +1.1.0, September 11, 2023 + Added support for simple insertion and deletion calling. Added MAVE-HGVS annotations. diff --git a/README.md b/README.md index 85c9640..86bd52b 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,9 @@ Common arguments to both 'sim' and 'call' subcommands should be provided first, It is recommended that a new output directory is created for each job. Default is to output to the current directory. ```OUTPUT_DIR="/tmp/satmut_utils_test"``` +Additionally, the user can set a directory for temporary files by setting an environment variable $SCRATCH. If this variable is not set, temporary files are written to /tmp. +```export SCRATCH="$(mktemp -d -t satmut_temp)"``` + ### Run 'sim' Run 'sim' on *in silico* alignments to generate SNPs, MNPs, and InDels. Structural variants and gene fusions are not currently supported. @@ -71,7 +74,7 @@ The 'sim' workflow outputs paired FASTQs, a realigned BAM file, and a truth VCF ### Run 'call' -Currently, only SNP and MNP calling is supported. InDels and long-range haplotypes are not called. +Support exists for calling SNPs/SNVs, MNPs/MNVs, insertions, and deletions. Multi-codon changes and complex InDels are not supported (e.g. delete a codon, insert a nt). Run 'call' on the simulated data by specifying an Ensembl transcript/gene ID and the directory containing curated reference files. ``` @@ -109,3 +112,8 @@ To run unit tests, execute the following from the satmut\_utils repository: ```nose2 -q``` +## Citation + +If you use satmut\_utils, please cite the following paper: + +Hoskins I, Sun S, Cote A, Roth FP, Cenik C. satmut_utils: a simulation and variant calling package for multiplexed assays of variant effect. Genome Biol. BioMed Central; 2023 Apr 20;24(1):1–2 diff --git a/docs/satmut_utils_manual.md b/docs/satmut_utils_manual.md index 6a23bc5..e7c7dac 100644 --- a/docs/satmut_utils_manual.md +++ b/docs/satmut_utils_manual.md @@ -200,6 +200,9 @@ It is recommended that a new output directory is created for each job. Default i ```OUTPUT_DIR="/tmp/satmut_utils_test"``` +Additionally, the user can set a directory for temporary files by setting an environment variable $SCRATCH. If this variable is not set, temporary files are written to /tmp. +```export SCRATCH="$(mktemp -d -t satmut_temp)"``` + ### 'sim' code examples Run the 'sim' workflow by providing a BAM containing paired-end, single-contig alignments, and a VCF file specifying variants and their desired frequencies. diff --git a/setup.cfg b/setup.cfg index 6dbd9e7..a2f5a97 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = satmut_utils -version = 1.0.4 +version = 1.1.0 author = Ian Hoskins author_email = ianjameshoskins@utexas.edu description = Tools for variant simulation and variant calling in paired end, targeted sequencing saturation mutagenesis experiments @@ -16,7 +16,6 @@ classifiers = Operating System :: POSIX :: Linux Programming Language :: Python Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 Topic :: Scientific/Engineering :: Bio-Informatics [options] diff --git a/src/analysis/coordinate_mapper.py b/src/analysis/coordinate_mapper.py index 38de5d3..885eb90 100644 --- a/src/analysis/coordinate_mapper.py +++ b/src/analysis/coordinate_mapper.py @@ -23,6 +23,8 @@ __email__ = "ianjameshoskins@utexas.edu" __status__ = "Development" +STOP_AA = "*" + CODONS = ("GCC", "GCT", "GCA", "GCG", "TGC", "TGT", "GAC", "GAT", @@ -47,16 +49,25 @@ CODON_AAS = ["A"] * 4 + ["C"] * 2 + ["D"] * 2 + ["E"] * 2 + ["F"] * 2 + ["G"] * 4 + ["H"] * 2 + ["I"] * 3 + ["K"] * 2 + \ ["L"] * 6 + ["M"] + ["N"] * 2 + ["P"] * 4 + ["Q"] * 2 + ["R"] * 6 + ["S"] * 6 + ["T"] * 4 + ["V"] * 4 + ["W"] + \ - ["Y"] * 2 + ["*"] * 3 + ["Y"] * 2 + [STOP_AA] * 3 CODON_AA_DICT = dict(zip(CODONS, CODON_AAS)) +STOP_CODONS = {"TGA", "TAA", "TAG"} + HGVS_AA_FORMAT = "p.{}{}{}" MUT_SIG_UNEXPECTED_WOBBLE_BPS = {"NNN": set(), "NNK": {"A", "C"}, "NNS": {"A", "T"}} +AA_MAP = {"A": "Ala", "V": "Val", "I": "Ile", "L": "Leu", "M": "Met", "F": "Phe", "Y": "Tyr", "W": "Trp", + "S": "Ser", "T": "Thr", "N": "Asn", "Q": "Gln", "C": "Cys", "U": "Sec", "G": "Gly", "P": "Pro", + "R": "Arg", "H": "His", "K": "Lys", "D": "Asp", "E": "Glu", "*": "Ter"} + EXON_COORDS_TUPLE = collections.namedtuple("EXON_COORDS_TUPLE", "exon_id, contig, start, stop, exon_len, strand") MUT_INFO_TUPLE = collections.namedtuple( - "MUT_INFO_TUPLE", "location, wt_codons, mut_codons, wt_aas, mut_aas, aa_changes, aa_positions, matches_mut_sig") + "MUT_INFO_TUPLE", "location, wt_codons, mut_codons, wt_aas, mut_aas, aa_changes, aa_positions, matches_mut_sig, " + "mave_hgvs_nt, mave_hgvs_tx, mave_hgvs_pro") + +CODON_TUPLE = collections.namedtuple("CODON_TUPLE", "codon_pos, codon, base_index") tempfile.tempdir = DEFAULT_TEMPDIR logger = logging.getLogger(__name__) @@ -136,7 +147,7 @@ class MapperBase(object): class AminoAcidMapper(MapperBase): - """Class for determining the codon and AA change(s) given a transcriptomic variant.""" + """Class for determining nucleotide and amino acids annotations given a transcriptomic variant.""" FEATURES_TO_OMIT = re.compile("|".join(MapperBase.FEATURES_TO_OMIT + [MapperBase.GENE_ID, MapperBase.TRX_ID, MapperBase.MRNA_ID, @@ -162,6 +173,7 @@ class AminoAcidMapper(MapperBase): CDS_INFO_CDS_SEQ_INDEX = 4 DEFAULT_ARGS = MUT_INFO_TUPLE._fields[1:] DEFAULT_KWARGS = dict(zip(DEFAULT_ARGS, [su.R_COMPAT_NA] * len(DEFAULT_ARGS))) + NONSTOP_CHAR = "X" def __init__(self, gff, ref, outdir=DEFAULT_OUTDIR, use_pickle=True, make_pickle=True, overwrite_pickle=False, mut_sig=MUT_SIG_ANY, filter_unexpected=False): @@ -305,7 +317,7 @@ def _get_cds_info(self): def _concat_multi_mut_info(self, mut_info_tuple): """Concatenates multi-position mutation changes into comma-delimited strings. - :param collections.namedtuple mut_info_tuple: MUT_INFO_TUPLE with multi-position mutations in list + :param collections.namedtuple mut_info_tuple: MUT_INFO_TUPLE with multi-position mutations in iterable :return collections.namedtuple MUT_INFO_TUPLE: MUT_INFO_TUPLE with multi-position mutations in str """ @@ -314,77 +326,1006 @@ def _concat_multi_mut_info(self, mut_info_tuple): wt_aas = self.MUT_INFO_DELIM.join(mut_info_tuple.wt_aas) mut_aas = self.MUT_INFO_DELIM.join(mut_info_tuple.mut_aas) aa_changes = self.MUT_INFO_DELIM.join(mut_info_tuple.aa_changes) - aa_pos_list = [re.sub("[p.A-Z*]", "", aa) for aa in mut_info_tuple.aa_changes] + aa_pos_list = [re.sub("[,p.A-Z*]", "", aa) for aa in mut_info_tuple.aa_changes] aa_positions = self.MUT_INFO_DELIM.join(aa_pos_list) matches_mut_sig = self.MUT_INFO_DELIM.join(list(map(str, mut_info_tuple.matches_mut_sig))) new_mut_info_tuple = MUT_INFO_TUPLE( location=mut_info_tuple.location, wt_codons=wt_codons, mut_codons=mut_codons, wt_aas=wt_aas, mut_aas=mut_aas, - aa_changes=aa_changes, aa_positions=aa_positions, matches_mut_sig=matches_mut_sig) + aa_changes=aa_changes, aa_positions=aa_positions, matches_mut_sig=matches_mut_sig, + mave_hgvs_nt=mut_info_tuple.mave_hgvs_nt, mave_hgvs_tx=mut_info_tuple.mave_hgvs_tx, + mave_hgvs_pro=mut_info_tuple.mave_hgvs_pro) return new_mut_info_tuple - def _get_mut_info(self, wt_cds_seq, mut_cds_seq): - """Finds codon and amino acid changes given a WT and mutant CDS. + @staticmethod + def _get_pos_codon_dict(cds_seq): + """Gets a mapping between index in the coding sequence and codon. + + :param str cds_seq: coding sequence + :return dict: index in the CDS: CODON_TUPLE (codon_pos, codon, base_index) + """ + + cds_dict = {} + wobble_positions = set(range(2, len(cds_seq), 3)) + codon_pos = 0 + + for i in range(0, len(cds_seq)): + if (i % 3) == 0: + codon_pos += 1 + curr_codon = cds_seq[i:i + 3] + cds_dict[i] = CODON_TUPLE(codon_pos, curr_codon, 0) + last_codon = curr_codon + else: + if i in wobble_positions: + cds_dict[i] = CODON_TUPLE(codon_pos, last_codon, 2) + else: + cds_dict[i] = CODON_TUPLE(codon_pos, last_codon, 1) + + return cds_dict + + def _get_indel_downstream_remainder(self, trx_seq, pos, ref_len, alt, curr_codon, base_index, var_type): + """Gets the remaining codons and amino acids downstream of a frameshift insertion or deletion. + + :param str trx_seq: transcript sequence + :param int pos: 1-based position of the variant within the transcript + :param int ref_len: length of REF + :param str alt: alternate bases + :param str curr_codon: current reference codon + :param int base_index: index of variant position in codon + :param aenum.MultiValueEnum var_type: variant type, either ins or del + :return tuple: (remaining codons, remaining amino acids) + :raises NotImplementedError: if the var_type is not an insertion or deletion + """ + + if var_type == vu.VariantType.INS: + alt_plus_downstream = curr_codon[:base_index] + alt + trx_seq[pos:] + elif var_type == vu.VariantType.DEL: + alt_plus_downstream = curr_codon[:base_index + 1] + trx_seq[pos + ref_len - 1:] + else: + raise NotImplementedError + + alt_plus_downstream_codons = [alt_plus_downstream[i:i + 3] for i in range(0, len(alt_plus_downstream), 3)] + + remaining_codons = [] + remaining_aas = [] + + for codon in alt_plus_downstream_codons: + + if len(codon) != 3: + # Very unlikely this would happen before we break (if nonsense codon is not seen before the end of + # the transcript), but need to handle + remaining_codons.append(codon) + remaining_aas.append(self.NONSTOP_CHAR) + break + + remaining_codons.append(codon) + remaining_aas.append(translate(codon)) + + if codon in STOP_CODONS: + break + + return tuple(remaining_codons), tuple(remaining_aas) + + def _annotate_snp(self, ref_codon_dict, alt_codon_dict, start_index): + """Gets annotations for a single-codon change. + + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :param int start_index: 0-based position of the variant in the CDS + :return tuple: (ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs) + """ + + ref_codons = (ref_codon_dict[start_index].codon,) + alt_codons = (alt_codon_dict[start_index].codon,) + ref_aas = (translate(ref_codons[0]),) + alt_aas = (translate(alt_codons[0]),) + aa_changes = (HGVS_AA_FORMAT.format(ref_aas[0], ref_codon_dict[start_index].codon_pos, alt_aas[0]),) + wt_wobble = ref_codons[0][2] + mut_wobble = alt_codons[0][2] + matches_mut_sigs = (False if self.mut_sig != self.MUT_SIG_ANY and wt_wobble != mut_wobble and + mut_wobble in MUT_SIG_UNEXPECTED_WOBBLE_BPS[self.mut_sig] else True,) + + return ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs + + def _annotate_mnp(self, ref_codon_dict, alt_codon_dict, start_index, end_index): + """Gets annotations for a multi-codon change. + + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :param int start_index: 0-based position of the first mismatch in the CDS + :param int end_index: 0-based position of the last mismatch in the CDS + :return tuple: (ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs) + """ + + ref_codons = [] + alt_codons = [] + ref_aas = [] + alt_aas = [] + aa_changes = [] + matches_mut_sigs = [] + + # Require end_index + 3 for MNPs spanning codons + for i in range(start_index, end_index + 3, 3): + + ref_codon = ref_codon_dict[i].codon + alt_codon = alt_codon_dict[i].codon + + # Skip intervening codons that match + if ref_codon == alt_codon: + continue + + ref_aa = translate(ref_codon) + alt_aa = translate(alt_codon) + aa_change = HGVS_AA_FORMAT.format(ref_aa, ref_codon_dict[i].codon_pos, alt_aa) + wt_wobble = ref_codon[2] + mut_wobble = alt_codon[2] + matches_mut_sig = False if self.mut_sig != self.MUT_SIG_ANY and wt_wobble != mut_wobble and \ + mut_wobble in MUT_SIG_UNEXPECTED_WOBBLE_BPS[self.mut_sig] else True + + ref_codons.append(ref_codon) + alt_codons.append(alt_codon) + ref_aas.append(ref_aa) + alt_aas.append(alt_aa) + aa_changes.append(aa_change) + matches_mut_sigs.append(matches_mut_sig) + + return tuple(ref_codons), tuple(alt_codons), tuple(ref_aas), tuple(alt_aas), tuple(aa_changes), \ + tuple(matches_mut_sigs) + + def _annotate_ins(self, trx_seq, ref_codon_dict, start_index, pos, ref, alt): + """Gets annotations for an insertion. + + :param str trx_seq: transcript sequence + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param int start_index: 0-based position of the first mismatch in the CDS + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :return tuple: (ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs) + """ + + alt_len = len(alt) + + if ref_codon_dict[start_index].base_index == 2 and ((alt_len - 1) % 3) == 0: + # Insertion is in-frame and does not change current/downstream codons + alt_ins = alt[1:] + alt_list = [alt_ins[i:i + 3] for i in range(0, alt_len, 3)] + ref_codons = (ref_codon_dict[start_index].codon,) + alt_codons = (ref_codons[0], *alt_list,) + ref_aas = (translate(ref_codons[0]),) + alt_aas = (ref_aas[0], *[translate(e) for e in alt_list],) + aa_changes = (HGVS_AA_FORMAT.format(ref_aas[0], ref_codon_dict[start_index].codon_pos, + self.MUT_INFO_DELIM.join(alt_aas)),) + matches_mut_sigs = (True,) + else: + # Out-of-frame insertion or shuffling of downstream codons + codon_remainder, aa_remainder = self._get_indel_downstream_remainder( + trx_seq, pos, len(ref), alt, ref_codon_dict[start_index].codon, ref_codon_dict[start_index].base_index, + var_type=vu.VariantType.INS) + + ref_codons = (ref_codon_dict[start_index].codon,) + alt_codons = codon_remainder + ref_aas = (translate(ref_codons[0]),) + alt_aas = aa_remainder + aa_changes = (HGVS_AA_FORMAT.format(ref_aas[0], ref_codon_dict[start_index].codon_pos, + self.MUT_INFO_DELIM.join(alt_aas)),) + matches_mut_sigs = (False,) + + return ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs + + def _annotate_del(self, trx_seq, ref_codon_dict, start_index, pos, ref, alt): + """Gets annotations for an insertion. + + :param str trx_seq: transcript sequence + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param int start_index: 0-based position of the first mismatch in the CDS + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :return tuple: (ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs) + """ + + ref_len = len(ref) + + if ref_codon_dict[start_index].base_index == 2 and ((ref_len - 1) % 3) == 0: + # Deletion is in-frame and does not change current/downstream codons + ref_del = ref[1:] + ref_list = [ref_del[i:i + 3] for i in range(0, len(ref_del), 3)] + ref_codons = (ref_codon_dict[start_index].codon, *ref_list,) + alt_codons = (ref_codons[0],) + ref_aas = (translate(ref_codons[0]), *[translate(e) for e in ref_list],) + alt_aas = (ref_aas[0],) + aa_changes = (HGVS_AA_FORMAT.format( + self.MUT_INFO_DELIM.join(ref_aas), ref_codon_dict[start_index].codon_pos, alt_aas[0]),) + matches_mut_sigs = (True,) + else: + # Out-of-frame deletion or shuffling of downstream codons + codon_remainder, aa_remainder = self._get_indel_downstream_remainder( + trx_seq, pos, ref_len, alt, ref_codon_dict[start_index].codon, ref_codon_dict[start_index].base_index, + var_type=vu.VariantType.DEL) + + ref_codons = (ref_codon_dict[start_index].codon,) + alt_codons = codon_remainder + ref_aas = (translate(ref_codons[0]),) + alt_aas = aa_remainder + aa_changes = (HGVS_AA_FORMAT.format( + ref_aas[0], ref_codon_dict[start_index].codon_pos, self.MUT_INFO_DELIM.join(alt_aas)),) + matches_mut_sigs = (False,) + + return ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs + def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, ref_codon_dict, alt_codon_dict): + """Annotates a variant in the stop codon. + + :param str trx_seq: transcript sequence + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :param int cds_start_offset: transcript position of the CDS start nucleotide + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :return collections.namedtuple: MUT_INFO_TUPLE + """ + + ref_len = len(ref) + alt_len = len(alt) + start_index = pos - cds_start_offset - 1 + end_index = start_index if (ref_len == alt_len and ref_len == 1) else start_index + ref_len + + if ref_len == alt_len: + + # Here we have a SNP or MNP so we can simply index into the codon lists + if start_index == end_index: + # Here we have a SNP + mut_info_tuple = self._annotate_snp(ref_codon_dict, alt_codon_dict, start_index) + + else: + + # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) + # NOTE: this is not accurate if a MNP spans the CDS/3' UTR junction, because alt_codon_dict + # derives from a sliced mut_cds_seq in get_codon_and_aa_changes() + mut_info_tuple = self._annotate_mnp(ref_codon_dict, alt_codon_dict, start_index, end_index) + + elif ref_len < alt_len: + + # Here we have an insertion or duplication + mut_info_tuple = self._annotate_ins(trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + else: + + # Here we have a deletion + mut_info_tuple = self._annotate_del(trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + return mut_info_tuple + + # Note: MAVE-HGVS formatting assumes the transcript has a coding region (not noncoding) + def _annotate_snp_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, start_index, + ref_aa, aa_pos, alt_aa): + """Annotates MAVE-HGVS for a substitution. + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int | None start_index: 0-based position of the first mismatch in the CDS + :param str | None ref_aa: reference amino acid + :param int | None aa_pos: amino acid position + :param str | None alt_aa: alternate amino acid + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + if location == self.FIVEPRIME_UTR: + utr_pos = pos - cds_start_offset - 1 + hgvs_nt = "%s:c.%i%s>%s" % (trx_id, utr_pos, ref.lower(), alt.lower()) + hgvs_tx = "%s:r.%i%s>%s" % (trx_id, utr_pos, su.dna_to_rna(ref.lower()), su.dna_to_rna(alt.lower())) + hgvs_pro = "p.(=)" + + elif location == self.THREEPRIME_UTR: + utr_pos = pos - cds_stop_offset + hgvs_nt = "%s:c.*%i%s>%s" % (trx_id, utr_pos, ref.lower(), alt.lower()) + hgvs_tx = "%s:r.*%i%s>%s" % (trx_id, utr_pos, su.dna_to_rna(ref.lower()), su.dna_to_rna(alt.lower())) + hgvs_pro = "p.(=)" + + elif location == self.CDS_ID: + hgvs_nt = "%s:c.%i%s>%s" % (trx_id, start_index + 1, ref.lower(), alt.lower()) + hgvs_tx = "%s:r.%i%s>%s" % (trx_id, start_index + 1, su.dna_to_rna(ref.lower()), su.dna_to_rna(alt.lower())) + if ref_aa == alt_aa: + hgvs_pro = "p.%s%i=" % (AA_MAP[ref_aa], aa_pos) + else: + hgvs_pro = "p.%s%i%s" % (AA_MAP[ref_aa], aa_pos, AA_MAP[alt_aa]) + else: + raise NotImplementedError + + return hgvs_nt, hgvs_tx, hgvs_pro + + def _annotate_mnp_hgvs(self, trx_id, location, pos, alt, cds_start_offset, cds_stop_offset, start_index, + ref_codon_dict, alt_codon_dict): + """Annotates MAVE-HGVS for a MNP (single or dicodon change). + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param int pos: 1-based position of the variant within the transcript + :param str alt: alternate bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int | None start_index: 0-based position of the first mismatch in the CDS + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + alt_len = len(alt) + + if location == self.FIVEPRIME_UTR: + utr_start = pos - cds_start_offset - 1 + utr_stop = utr_start + alt_len - 1 + + # Test for MNP spanning 5' UTR-CDS junction + if utr_stop >= 0: + # We can use alt_codon_dict because we generated the mut_cds_seq in get_codon_and_aa_changes + # for this special case of MNP spanning 5' UTR-CDS junction + hgvs_nt = "%s:c.%i_%idelins%s" % (trx_id, utr_start + 1, utr_stop + 1, alt.lower()) + hgvs_tx = "%s:r.%i_%idelins%s" % (trx_id, utr_start + 1, utr_stop + 1, su.dna_to_rna(alt.lower())) + + ref_aa = translate(ref_codon_dict[0].codon) + alt_aa = translate(alt_codon_dict[0].codon) + if ref_aa == alt_aa: + hgvs_pro = "p.%s%i=" % (AA_MAP[ref_aa], 1) + else: + # Unclear if this should be a substitution or delins if only one mismatch is in the start codon + # For now leave as delins to indicate variant is a MNP + hgvs_pro = "p.(=),p.%s%idelins%s" % (AA_MAP[ref_aa], 1, AA_MAP[alt_aa]) + else: + # MNP is isolated in the 5' UTR + hgvs_nt = "%s:c.%i_%idelins%s" % (trx_id, utr_start, utr_stop, alt.lower()) + hgvs_tx = "%s:r.%i_%idelins%s" % (trx_id, utr_start, utr_stop, su.dna_to_rna(alt.lower())) + hgvs_pro = "p.(=)" + + elif location == self.THREEPRIME_UTR: + utr_start = pos - cds_stop_offset + utr_stop = utr_start + alt_len - 1 + hgvs_nt = "%s:c.*%i_*%idelins%s" % (trx_id, utr_start, utr_stop, alt.lower()) + hgvs_tx = "%s:r.*%i_*%idelins%s" % (trx_id, utr_start, utr_stop, su.dna_to_rna(alt.lower())) + hgvs_pro = "p.(=)" + + elif location == self.CDS_ID: + hgvs_nt = "%s:c.%i_%idelins%s" % (trx_id, start_index + 1, start_index + alt_len, alt.lower()) + hgvs_tx = "%s:r.%i_%idelins%s" % ( + trx_id, start_index + 1, start_index + alt_len, su.dna_to_rna(alt.lower())) + + ref_aa = translate(ref_codon_dict[start_index].codon) + alt_aa = translate(alt_codon_dict[start_index].codon) + + # Determine if the MNP spans codons + if (ref_codon_dict[start_index].base_index == 1 and alt_len == 3) or \ + ref_codon_dict[start_index].base_index == 2: + + # MNP spans codons + second_ref_aa = translate(ref_codon_dict[start_index + 3].codon) + ref_aas = "".join((ref_aa, second_ref_aa,)) + alt_aas = "".join((AA_MAP[alt_aa], AA_MAP[translate(alt_codon_dict[start_index + 3].codon)],)) + + if ref_aas == alt_aas: + hgvs_pro = "p.%s%i_%s%i=" % ( + AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[second_ref_aa], + ref_codon_dict[start_index + 3].codon_pos) + else: + hgvs_pro = "p.%s%i_%s%idelins%s" % ( + AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[second_ref_aa], + ref_codon_dict[start_index + 3].codon_pos, alt_aas) + else: + # MNP affects one codon + if ref_aa == alt_aa: + hgvs_pro = "p.%s%i=" % (AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos) + else: + hgvs_pro = "p.%s%idelins%s" % (AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[alt_aa]) + else: + raise NotImplementedError + + return hgvs_nt, hgvs_tx, hgvs_pro + + @staticmethod + def _group_mismatches(mm_positions, mm_indices): + """Groups mismatches to enable determination of substitution or delins type. + + :param tuple mm_positions: mismatched positions + :param tuple mm_indices: mismatched indices of REF and ALT fields + :return tuple: group tuple containing lists of indices and positions + """ + + mm_groups = [[]] + for i, (mm_idx, mm_pos) in enumerate(zip(mm_indices, mm_positions)): + if i == 0: + mm_groups[0].append((mm_idx, mm_pos,)) + continue + + for j, mm_group in enumerate(mm_groups): + + mm_group_positions = [i[1] for i in mm_group] + min_pos = min(mm_group_positions) + + if mm_pos - min_pos < 3: + mm_groups[j].append((mm_idx, mm_pos,)) + break + else: + # If the mismatch could not be merged with an existing group create a new one + mm_groups.append([(mm_idx, mm_pos,)]) + + return tuple(mm_groups) + + def _annotate_multivariant_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, + ref_codon_dict, alt_codon_dict): + """Concatenates HGVS for multivariants (alleles). + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + if location not in {self.FIVEPRIME_UTR, self.THREEPRIME_UTR, self.CDS_ID}: + return NotImplementedError + + hgvs_nt_list = [] + hgvs_tx_list = [] + hgvs_pro_list = [] + + # Group mismatches to determine substitution versus delins types + mm_indices = tuple([i for i, (e1, e2) in enumerate(zip(ref, alt)) if e1 != e2]) + mm_positions = tuple([i + pos for i in mm_indices]) + mm_groups = self._group_mismatches(mm_positions, mm_indices) + + # Each group is a list of tuples (mm_index, mm_pos) + for mm_group in mm_groups: + + if len(mm_group) == 1: + + # Call a substitution + if location == self.FIVEPRIME_UTR or location == self.THREEPRIME_UTR: + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_snp_hgvs( + trx_id, location, mm_group[0][1], ref[mm_group[0][0]], alt[mm_group[0][0]], + cds_start_offset, cds_stop_offset, None, None, None, None) + else: + start_index = mm_group[0][1] - cds_start_offset - 1 + ref_aa = translate(ref_codon_dict[start_index].codon) + aa_pos = ref_codon_dict[start_index].codon_pos + alt_aa = translate(alt_codon_dict[start_index].codon) + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_snp_hgvs( + trx_id, location, mm_group[0][1], ref[mm_group[0][0]], alt[mm_group[0][0]], + cds_start_offset, cds_stop_offset, start_index, ref_aa, aa_pos, alt_aa) + else: + + # Call an delins + if location == self.FIVEPRIME_UTR or location == self.THREEPRIME_UTR: + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_mnp_hgvs( + trx_id, location, mm_group[0][1], alt[mm_group[0][0]:mm_group[len(mm_group) - 1][0] + 1], + cds_start_offset, cds_stop_offset, None, ref_codon_dict, alt_codon_dict) + else: + start_index = mm_group[0][1] - cds_start_offset - 1 + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_mnp_hgvs( + trx_id, location, mm_group[0][1], alt[mm_group[0][0]:mm_group[len(mm_group) - 1][0] + 1], + cds_start_offset, cds_stop_offset, start_index, ref_codon_dict, alt_codon_dict) + + hgvs_nt_list.append(hgvs_nt) + hgvs_tx_list.append(hgvs_tx) + hgvs_pro_list.append(hgvs_pro) + + # MAVEdb wants these as semicolon delimited changes, but because write to VCF INFO field, use comma delim + hgvs_nt = self.MUT_INFO_DELIM.join(hgvs_nt_list) + hgvs_tx = self.MUT_INFO_DELIM.join(hgvs_tx_list) + hgvs_pro = self.MUT_INFO_DELIM.join(hgvs_pro_list) + + return hgvs_nt, hgvs_tx, hgvs_pro + + @staticmethod + def _is_duplication(trx_seq, pos, alt): + """Determines if a variant is a duplication. + + :param str trx_seq: transcript sequence + :param int pos: 1-based position of the variant within the transcript + :param str alt: alternate bases + :return bool: whether or not the variant is a duplication + """ + + # Get the sequence just upstream the alternate + alt_len = len(alt) + + # Need to check for cases where the 5' UTR is non-existent or very short and/or the insertion is large + upstream_start = pos - alt_len + if upstream_start < 0: + # This may actually be True, but difficult to index into intergenic space to verify + # For now assume False + return False + + upstream_bases = trx_seq[upstream_start + 1:pos] + + if upstream_bases == alt[1:]: + return True + + return False + + @staticmethod + def _get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict): + """Gets the first altered amino acid from a frameshift. + + :param tuple alt_aas: alternate amino acids + :param int cds_stop_offset: transcript position of the CDS last nucleotide + :param int start_index: 0-based position of the first mismatch in the CDS + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (first reference amino acid changed, position) + """ + + counter = 0 + + for i in range(start_index, cds_stop_offset, 3): + codon_pos, codon, base_index = ref_codon_dict[i] + + if base_index == 2: + ref_aa = translate(ref_codon_dict[i + 1].codon) + alt_aa = alt_aas[counter + 1] + aa_pos = ref_codon_dict[i + 1].codon_pos + else: + ref_aa = translate(ref_codon_dict[i].codon) + alt_aa = alt_aas[counter] + aa_pos = ref_codon_dict[i].codon_pos + + counter += 1 + + if ref_aa != alt_aa: + return ref_aa, aa_pos + + def _annotate_dup_hgvs(self, trx_id, location, pos, alt_len, cds_start_offset, cds_stop_offset, start_index, + alt_aas, ref_codon_dict): + """Annotates MAVE-HGVS for a duplication. + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param int pos: 1-based position of the variant within the transcript + :param int alt_len: alternate length + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int | None start_index: 0-based position of the first mismatch in the CDS + :param tuple | None alt_aas: alternate amino acids + :param dict | None ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + if location == self.FIVEPRIME_UTR: + utr_stop = pos - cds_start_offset - 1 + if alt_len == 2: + # Dup is 1 base + hgvs_nt = "%s:c.%idup" % (trx_id, utr_stop) + hgvs_tx = "%s:r.%idup" % (trx_id, utr_stop) + else: + utr_start = utr_stop - alt_len + 2 + hgvs_nt = "%s:c.%i_%idup" % (trx_id, utr_start, utr_stop) + hgvs_tx = "%s:r.%i_%idup" % (trx_id, utr_start, utr_stop) + + hgvs_pro = "p.(=)" + + elif location == self.THREEPRIME_UTR: + utr_start = pos - cds_stop_offset + if alt_len == 2: + # Dup is 1 base + hgvs_nt = "%s:c.*%idup" % (trx_id, utr_start) + hgvs_tx = "%s:r.*%idup" % (trx_id, utr_start) + elif utr_start != 1: + # multi-nt dup is at +2 or more + utr_start = utr_start - alt_len + 2 + utr_stop = utr_start + alt_len - 2 + hgvs_nt = "%s:c.*%i_*%idup" % (trx_id, utr_start, utr_stop) + hgvs_tx = "%s:r.*%i_*%idup" % (trx_id, utr_start, utr_stop) + else: + # Special case where multi-nt dup ends at +1 + utr_start = cds_stop_offset - alt_len + 3 + hgvs_nt = "%s:c.%i_*%idup" % (trx_id, utr_start, 1) + hgvs_tx = "%s:r.%i_*%idup" % (trx_id, utr_start, 1) + + hgvs_pro = "p.(=)" + + elif location == self.CDS_ID: + # Determine if ins is 1 base (simpler annotation), if ins is in-frame, or is out-of-frame + if alt_len == 2: + # Dup is 1 base and we have a frameshift at the level of protein + hgvs_nt = "%s:c.%idup" % (trx_id, start_index + 1) + hgvs_tx = "%s:r.%idup" % (trx_id, start_index + 1) + + # Get the first alt amino acid + ref_aa, aa_pos = self._get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict) + hgvs_pro = "p.%s%ifs" % (AA_MAP[ref_aa], aa_pos) + + else: + hgvs_nt = "%s:c.%i_%idup" % (trx_id, start_index - (alt_len - 2) + 1, start_index + 1) + hgvs_tx = "%s:r.%i_%idup" % (trx_id, start_index - (alt_len - 2) + 1, start_index + 1) + if ((alt_len - 1) % 3) == 0: + # In-frame dup + if alt_len == 4: + # Single amino acid dup + hgvs_pro = "p.%s%idup" % ( + AA_MAP[translate(ref_codon_dict[start_index].codon)], ref_codon_dict[start_index].codon_pos) + else: + # Multi amino acid dup + hgvs_pro = "p.%s%i_%s%idup" % ( + AA_MAP[translate(ref_codon_dict[start_index - (alt_len - 2)].codon)], + ref_codon_dict[start_index - (alt_len - 2)].codon_pos, + AA_MAP[translate(ref_codon_dict[start_index].codon)], ref_codon_dict[start_index].codon_pos) + else: + # Frameshift dup + ref_aa, aa_pos = self._get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict) + hgvs_pro = "p.%s%ifs" % (AA_MAP[translate(ref_aa)], aa_pos) + else: + raise NotImplementedError + + return hgvs_nt, hgvs_tx, hgvs_pro + + def _annotate_ins_hgvs(self, trx_id, trx_seq, location, pos, alt, cds_start_offset, cds_stop_offset, start_index, + alt_aas, ref_codon_dict): + """Annotates MAVE-HGVS for an insertion of duplication. + + :param str trx_id: transcript ID + :param str trx_seq: transcript sequence + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param int pos: 1-based position of the variant within the transcript + :param str alt: alternate bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int | None start_index: 0-based position of the first mismatch in the CDS + :param tuple | None alt_aas: alternate amino acids + :param dict | None ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + alt_len = len(alt) + + # For each location, determine if we have an insertion or duplication + if location == self.FIVEPRIME_UTR: + if self._is_duplication(trx_seq, pos, alt): + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_dup_hgvs( + trx_id, location, pos, alt_len, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict) + else: + utr_start = pos - cds_start_offset - 1 + utr_stop = utr_start + 1 + hgvs_nt = "%s:c.%i_%iins%s" % (trx_id, utr_start, utr_stop, alt[1:].lower()) + hgvs_tx = "%s:r.%i_%iins%s" % (trx_id, utr_start, utr_stop, su.dna_to_rna(alt[1:].lower())) + hgvs_pro = "p.(=)" + + elif location == self.THREEPRIME_UTR: + if self._is_duplication(trx_seq, pos, alt): + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_dup_hgvs( + trx_id, location, pos, alt_len, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict) + else: + utr_start = pos - cds_stop_offset + utr_stop = utr_start + 1 + hgvs_nt = "%s:c.*%i_*%iins%s" % (trx_id, utr_start, utr_stop, alt[1:].lower()) + hgvs_tx = "%s:r.*%i_*%iins%s" % (trx_id, utr_start, utr_stop, su.dna_to_rna(alt[1:].lower())) + hgvs_pro = "p.(=)" + + elif location == self.CDS_ID: + + if self._is_duplication(trx_seq, pos, alt): + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_dup_hgvs( + trx_id, location, pos, alt_len, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict) + else: + # We have a non-dup insertion + hgvs_nt = "%s:c.%i_%iins%s" % (trx_id, start_index + 1, start_index + 2, alt[1:].lower()) + hgvs_tx = "%s:r.%i_%iins%s" % ( + trx_id, start_index + 1, start_index + 2, su.dna_to_rna(alt[1:].lower())) + + if ref_codon_dict[start_index].base_index == 2 and ((alt_len - 1) % 3) == 0: + # In-frame insertion + alt_aa_str = "".join([AA_MAP[e] for e in alt_aas[1:]]) + hgvs_pro = "p.%s%i_%s%iins%s" % ( + AA_MAP[translate(ref_codon_dict[start_index].codon)], ref_codon_dict[start_index].codon_pos, + AA_MAP[translate(ref_codon_dict[start_index + 3].codon)], + ref_codon_dict[start_index + 3].codon_pos, alt_aa_str) + else: + # Out-of-frame insertion + ref_aa, aa_pos = self._get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict) + hgvs_pro = "p.%s%ifs" % (AA_MAP[ref_aa], aa_pos) + else: + raise NotImplementedError + + return hgvs_nt, hgvs_tx, hgvs_pro + + def _annotate_del_hgvs(self, trx_id, location, pos, ref, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict): + """Annotates MAVE-HGVS for a deletion. + + :param str trx_id: transcript ID + :param str location: + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int | None start_index: 0-based position of the first mismatch in the CDS + :param tuple | None alt_aas: alternate amino acids + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + ref_len = len(ref) + + if location == self.FIVEPRIME_UTR: + utr_start = pos - cds_start_offset + utr_stop = utr_start + ref_len - 2 + if ref_len == 2: + # Single nt deleted + hgvs_nt = "%s:c.%idel" % (trx_id, utr_start) + hgvs_tx = "%s:r.%idel" % (trx_id, utr_start) + else: + # Multiple nts deleted + hgvs_nt = "%s:c.%i_%idel" % (trx_id, utr_start, utr_stop) + hgvs_tx = "%s:r.%i_%idel" % (trx_id, utr_start, utr_stop) + + if utr_stop >= 0: + # The deletion removes the start codon; determine which amino acids were affected + n_aa_del = len(list(range(0, utr_stop, 3))) + if n_aa_del == 1: + # Deletion affects only the start codon + hgvs_pro = "p.%s%idel" % (AA_MAP[translate(ref_codon_dict[0].codon)], 1) + elif (utr_stop + 1) % 3 == 0: + # Deletion affects start codon and downstream codons + # Multiple nts deleted + hgvs_nt = "%s:c.%i_%idel" % (trx_id, utr_start, utr_stop + 1) + hgvs_tx = "%s:r.%i_%idel" % (trx_id, utr_start, utr_stop + 1) + hgvs_pro = "p.%s%i_%s%idel" % ( + AA_MAP[translate(ref_codon_dict[0].codon)], 1, + AA_MAP[translate(ref_codon_dict[utr_stop - 1].codon)], ref_codon_dict[utr_stop - 1].codon_pos) + else: + # Deletion affects start codon and is a frameshift + hgvs_pro = "p.%s%ifs" % (AA_MAP[translate(ref_codon_dict[0].codon)], 1) + else: + hgvs_pro = "p.(=)" + + elif location == self.THREEPRIME_UTR: + utr_start = pos - cds_stop_offset + 1 + if ref_len == 2: + # Single nt deleted + hgvs_nt = "%s:c.*%idel" % (trx_id, utr_start) + hgvs_tx = "%s:r.*%idel" % (trx_id, utr_start) + else: + # Multiple nts deleted + utr_stop = utr_start + ref_len - 2 + hgvs_nt = "%s:c.*%i_*%idel" % (trx_id, utr_start, utr_stop) + hgvs_tx = "%s:r.*%i_*%idel" % (trx_id, utr_start, utr_stop) + + hgvs_pro = "p.(=)" + + elif location == self.CDS_ID: + + if ref_len == 2: + # Single nt deleted + hgvs_nt = "%s:c.%idel" % (trx_id, start_index + 2) + hgvs_tx = "%s:r.%idel" % (trx_id, start_index + 2) + else: + # Multiple nts deleted + hgvs_nt = "%s:c.%i_%idel" % (trx_id, start_index + 2, start_index + ref_len) + hgvs_tx = "%s:r.%i_%idel" % (trx_id, start_index + 2, start_index + ref_len) + + if ref_codon_dict[start_index].base_index == 2 and ((ref_len - 1) % 3) == 0: + + # In-frame deletion + if ref_len == 4: + # Single-codon del + hgvs_pro = "p.%s%idel" % ( + AA_MAP[translate(ref_codon_dict[start_index + 1].codon)], + ref_codon_dict[start_index + 1].codon_pos) + else: + # Multi-codon del + hgvs_pro = "p.%s%i_%s%idel" % ( + AA_MAP[translate(ref_codon_dict[start_index + 1].codon)], + ref_codon_dict[start_index + 1].codon_pos, + AA_MAP[translate(ref_codon_dict[start_index + ref_len - 1].codon)], + ref_codon_dict[start_index + ref_len - 1].codon_pos) + else: + # Out-of-frame deletion with frameshift + # The amino acid should be the first AA that is changed + ref_aa, aa_pos = self._get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict) + hgvs_pro = "p.%s%ifs" % (AA_MAP[ref_aa], aa_pos) + + else: + raise NotImplementedError + + return hgvs_nt, hgvs_tx, hgvs_pro + + def _annotate_stopvar_hgvs(self, trx_id, location, trx_seq, pos, ref, alt, ref_len, alt_len, cds_start_offset, + cds_stop_offset, start_index, end_index, ref_codon_dict, alt_codon_dict): + """Annotates MAVE-HGVS for a variant at the stop codon (including nonstop/extension variant). + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param str trx_seq: transcript sequence + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :param int ref_len: reference length + :param int alt_len: alternate length + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int start_index: 0-based position of the first mismatch in the CDS + :param int end_index: 0-based position of the last mismatch in the CDS + :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) + :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) + :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) + :raises NotImplementedError: if the variant has an intergenic or untranslated (intronic) location + """ + + if ref_len == alt_len: + # Here we have a SNP or MNP so we can simply index into the codon lists + if start_index == end_index: + # Here we have a SNP + _, _, ref_aas, alt_aas, _, _ = self._annotate_snp(ref_codon_dict, alt_codon_dict, start_index) + + hgvs_nt, hgvs_tx, _ = self._annotate_snp_hgvs( + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, start_index, ref_aas[0], + ref_codon_dict[start_index].codon_pos, alt_aas[0]) + else: + # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) + hgvs_nt, hgvs_tx, _ = self._annotate_multivariant_hgvs( + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, ref_codon_dict, alt_codon_dict) + + elif ref_len < alt_len: + # Here we have an insertion or duplication + _, _, _, alt_aas, _, _ = self._annotate_ins(trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + hgvs_nt, hgvs_tx, _ = self._annotate_ins_hgvs( + trx_id, trx_seq, location, pos, alt, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict) + + else: + # Here we have a deletion + _, _, _, alt_aas, _, _ = self._annotate_del(trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + hgvs_nt, hgvs_tx, _ = self._annotate_del_hgvs( + trx_id, location, pos, ref, cds_start_offset, cds_stop_offset, start_index, alt_aas, ref_codon_dict) + + # Either the variant changes the stop codon or it doesn't + if translate(ref_codon_dict[start_index].codon) == STOP_AA and \ + translate(alt_codon_dict[start_index].codon) == STOP_AA: + # If there is a variant in which the stop codon was not altered, no protein change + hgvs_pro = "p.%s%i=" % (AA_MAP["*"], ref_codon_dict[start_index].codon_pos) + return hgvs_nt, hgvs_tx, hgvs_pro + + # Otherwise we have a frameshift + hgvs_pro = "p.%s%ifs" % (AA_MAP["*"], ref_codon_dict[start_index].codon_pos) + + return hgvs_nt, hgvs_tx, hgvs_pro + + def _get_mut_info(self, trx_id, location, trx_seq, wt_cds_seq, mut_cds_seq, pos, ref, alt, cds_start_offset, + cds_stop_offset): + """Finds codon and amino acid changes given a variant POS, REF, and ALT fields. + + :param str trx_id: transcript ID + :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} + :param str trx_seq: transcript sequence :param str wt_cds_seq: WT CDS sequence :param str mut_cds_seq: Mutant CDS sequence + :param int pos: 1-based position of the variant within the transcript + :param str ref: reference bases + :param str alt: alternate bases + :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide + :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position :return collections.namedtuple: MUT_INFO_TUPLE """ - # If we weren't interested in reporting silent mutations, we might just translate the whole sequences then - # compare AAs; but multi-base MNPs (haplotypes) make optimization logic a little tricky, as the MNP may - # span multiple codons. For now just iterate over all the codons/AAs - codon_comparitors = zip( - [wt_cds_seq[i:i + 3] for i in range(0, len(wt_cds_seq), 3)], - [mut_cds_seq[i:i + 3] for i in range(0, len(mut_cds_seq), 3)] - ) - - # In the future this may need to be modified to handle InDels - # Checking codons is needed to call synonymous AA changes - wt_codons = [] - mut_codons = [] - wt_aas = [] - mut_aas = [] - mut_pos = set() - aa_changes = [] - matches_mut_sig = [] + # First set defaults for cases where the variant is in a UTR + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = tuple([su.R_COMPAT_NA] * 6) + + ref_len = len(ref) + alt_len = len(alt) + start_index = pos - cds_start_offset - 1 - for i, (wt_codon, mut_codon) in enumerate(codon_comparitors): + # Handle UTR variants as start_index will be an invalid key + # Make start_index and stop_index valid, but ensure annotation functions do not make use of them + if location == self.FIVEPRIME_UTR or location == self.THREEPRIME_UTR: + start_index = 0 - if wt_codon != mut_codon: + end_index = start_index if (ref_len == alt_len and ref_len == 1) else start_index + ref_len - wt_codons.append(wt_codon) - mut_codons.append(mut_codon) + # Note: If end_index is greater than the stop codon for a long-range haplotype in a short CDS, annotation + # will be incorrect; however, this would be extremely rare so don't check - # Either mark or filter changes that are unexpected by looking at the wobble bp - wt_wobble = wt_codon[2] - mut_wobble = mut_codon[2] + ref_codon_dict = self._get_pos_codon_dict(wt_cds_seq) - # Determine if the mutation matches the mutagenesis signature - matches_sig = True - if self.mut_sig != self.MUT_SIG_ANY and wt_wobble != mut_wobble and \ - mut_wobble in MUT_SIG_UNEXPECTED_WOBBLE_BPS[self.mut_sig]: + # We annotate each variant with a verbose representation (for CDS variants only), + # as well as a standardized MAVE-HGVS format (for CDS or UTR variants) - matches_sig = False + if translate(ref_codon_dict[start_index].codon) == STOP_AA: - if self.filter_unexpected: - continue + # Annotate variants at the stop codon first as they are a special case + # Any nonstop variant will be assumed a frameshift variant + alt_codon_dict = self._get_pos_codon_dict(mut_cds_seq) - matches_mut_sig.append(matches_sig) + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_stopvar( + trx_seq, pos, ref, alt, cds_start_offset, ref_codon_dict, alt_codon_dict) - wt_aa = translate(wt_codon) - mut_aa = translate(mut_codon) - wt_aas.append(wt_aa) - mut_aas.append(mut_aa) - mut_pos.add(i + 1) - aa_changes.append(HGVS_AA_FORMAT.format(wt_aa, i + 1, mut_aa)) + # Special case requires investigation of all variant types to determine if a non-stop variant exists + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_stopvar_hgvs( + trx_id, location, trx_seq, pos, ref, alt, ref_len, alt_len, cds_start_offset, cds_stop_offset, + start_index, end_index, ref_codon_dict, alt_codon_dict) + + elif ref_len == alt_len: + + # Here we have a SNP or MNP so we can simply index into the codon lists + alt_codon_dict = self._get_pos_codon_dict(mut_cds_seq) + + if start_index == end_index: + + # We have a SNP + if location == self.CDS_ID: + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_snp( + ref_codon_dict, alt_codon_dict, start_index) + + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_snp_hgvs( + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, start_index, ref_aas[0], + ref_codon_dict[start_index].codon_pos, alt_aas[0]) + else: + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_snp_hgvs( + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, None, None, None, None) + + else: + + # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) + if location == self.CDS_ID: + # For MNPs that span the start codon, we will not have custom annotations, but the MAVE-HGVS + # annotation will be correct + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_mnp( + ref_codon_dict, alt_codon_dict, start_index, end_index) + + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_multivariant_hgvs( + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, ref_codon_dict, alt_codon_dict) + + elif ref_len < alt_len: + + # Here we have an insertion or duplication + if location == self.CDS_ID: + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_ins( + trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_ins_hgvs( + trx_id, trx_seq, location, pos, alt, cds_start_offset, cds_stop_offset, start_index, alt_aas, + ref_codon_dict) + else: + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_ins_hgvs( + trx_id, trx_seq, location, pos, alt, cds_start_offset, cds_stop_offset, None, None, None) + + else: + + # Here we have a deletion + if location == self.CDS_ID: + ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_del( + trx_seq, ref_codon_dict, start_index, pos, ref, alt) + + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_del_hgvs( + trx_id, location, pos, ref, cds_start_offset, cds_stop_offset, start_index, alt_aas, ref_codon_dict) + else: + hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_del_hgvs( + trx_id, location, pos, ref, cds_start_offset, cds_stop_offset, None, None, ref_codon_dict) # Collect the data into a mutation info tuple mut_info_tuple = MUT_INFO_TUPLE( - location=self.CDS_ID, wt_codons=wt_codons, mut_codons=mut_codons, - wt_aas=wt_aas, mut_aas=mut_aas, aa_changes=aa_changes, aa_positions=[], - matches_mut_sig=matches_mut_sig) + location=self.CDS_ID, wt_codons=ref_codons, mut_codons=alt_codons, + wt_aas=ref_aas, mut_aas=alt_aas, aa_changes=aa_changes, aa_positions=tuple(), + matches_mut_sig=matches_mut_sigs, mave_hgvs_nt=hgvs_nt, mave_hgvs_tx=hgvs_tx, mave_hgvs_pro=hgvs_pro) concat_mut_info_tuple = self._concat_multi_mut_info(mut_info_tuple) @@ -412,6 +1353,7 @@ def get_codon_and_aa_changes(self, trx_id, pos, ref, alt): trx_len, cds_start_offset, cds_stop_offset, trx_seq, cds_seq = self.cds_info[trx_id] if pos > trx_len or pos < 1: + # Support for intergenic variants is not implemented var_key = vu.VARIANT_FORMAT.format(trx_id, pos, ref, alt) msg_str = "Variant %s position is not within transcript bounds." % var_key warnings.warn(msg_str, TranscriptomicCoordNotFound) @@ -428,27 +1370,43 @@ def get_codon_and_aa_changes(self, trx_id, pos, ref, alt): logger.exception(msg_str) raise RuntimeError(msg_str) + # We have a valid variant in the coding region; annotate AA changes + cds_len = len(cds_seq) + ref_len = len(ref) + + # Create the ALT within the CDS sequence for SNP and MNP calling + # Set default mutant CDS sequence for isolated UTR variants; update for CDS variants + mut_cds_seq = cds_seq + cds_pos = zbased_pos - cds_start_offset + + # For SNPs and MNPs, ensure the WT and mutant CDS sequences are the same length + # mut_cds_seq will only be used for SNP and MNP, not InDel, calling + if cds_stop_offset > cds_pos >= 0: + # Variant is within the CDS + mut_cds_seq = cds_seq[:cds_pos] + alt + cds_seq[cds_pos + ref_len:] + mut_cds_seq = mut_cds_seq[:cds_len] + # NOTE: as a result of slice we will not call MNPs that span the CDS/3' UTR junction + elif cds_pos < 0 < (cds_pos + ref_len): + # Variant starts in 5' UTR and spans the 5' UTR/CDS junction + mut_cds_seq = (alt + cds_seq)[-cds_len:] + # Now that we have ensured our variant is valid, we can check for its placement if cds_start_offset is None and cds_stop_offset is None: + # Support for intronic variants is not implemented return MUT_INFO_TUPLE(location=self.UNTRANSLATED, **self.DEFAULT_KWARGS) + elif zbased_pos < cds_start_offset: - return MUT_INFO_TUPLE(location=self.FIVEPRIME_UTR, **self.DEFAULT_KWARGS) + location = self.FIVEPRIME_UTR elif zbased_pos >= cds_stop_offset: - return MUT_INFO_TUPLE(location=self.THREEPRIME_UTR, **self.DEFAULT_KWARGS) + location = self.THREEPRIME_UTR else: - # To determine the AA change, translate the mutant CDS sequence and find the differences - cds_len = len(cds_seq) - ref_len = len(ref) - - # Create the ALT within the CDS sequence - cds_pos = zbased_pos - cds_start_offset - mut_cds_seq = cds_seq[:cds_pos] + alt + cds_seq[cds_pos + ref_len:] - mut_cds_seq = mut_cds_seq[:cds_len] + location = self.CDS_ID - # Extract the information about the consequences of the variant change - mut_info_tuple = self._get_mut_info(wt_cds_seq=cds_seq, mut_cds_seq=mut_cds_seq) + # Extract the information about the consequences of the variant change + mut_info_tuple = self._get_mut_info( + trx_id, location, trx_seq, cds_seq, mut_cds_seq, pos, ref, alt, cds_start_offset, cds_stop_offset) - return mut_info_tuple + return mut_info_tuple def generate_pickle(self): """Makes a pickle of the object.""" diff --git a/src/analysis/variant_caller.py b/src/analysis/variant_caller.py index c599bf9..0c186ca 100644 --- a/src/analysis/variant_caller.py +++ b/src/analysis/variant_caller.py @@ -174,6 +174,30 @@ def _is_indel(aligned_pair): return False + @staticmethod + def _get_clipped_indices(cigartuples): + """Determines the query (read) positions that are clipped. + + :param pysam.AlignedSegment align_seg: read object + :return set: indices of clipped bases + """ + + base_index = 0 + clip_indices = set() + + for op, length in cigartuples: + + # Skip deletions + if op == su.PYSAM_CIGARTUPLES_DEL: + continue + + if op == su.PYSAM_CIGARTUPLES_SOFTCLIP: + clip_indices |= set(range(base_index, base_index + length)) + + base_index += length + + return clip_indices + def _get_haplotype_ref(self, contig, first_pos, last_pos, mm_pos_set): r"""Determines the variant REF and local mismatch positions given the first and last MM_TUPLE positions. @@ -315,6 +339,22 @@ def _call_snps(filt_r_mms, position_blacklist, haplotypes=None): return snp_dict + @staticmethod + def _call_indels(filt_r_mms): + r"""Calls InDels if the mismatches are not included in existing MNPs/haplotypes. + + :param list filt_r_mms: list of filtered R1 or R2 MM_TUPLEs + :return dict: {collections.namedtuple: set} keyed by CALL_TUPLE with values a set of starting coordinate + position for the InDel (just one pos, but consistent with haplotypes dict format) + """ + + indel_dict = collections.defaultdict(set) + + for mmt in filt_r_mms: + indel_dict[CALL_TUPLE(mmt.contig, mmt.pos, mmt.ref, mmt.alt, None, None, None)] |= {mmt.pos} + + return indel_dict + def _enumerate_mismatches(self, align_seg, min_bq=VARIANT_CALL_MIN_BQ): """Enumerates the mismatches in a read and excludes detection of InDels. @@ -367,17 +407,116 @@ def _enumerate_mismatches(self, align_seg, min_bq=VARIANT_CALL_MIN_BQ): return mms + def _enumerate_indels(self, align_seg, min_bq=VARIANT_CALL_MIN_BQ): + """Enumerates the InDels in a read. + + :param pysam.AlignedSegment align_seg: read object + :param int min_bq: min base quality for an insertion to be considered as supporting a variant call + :return list: list of MM_TUPLEs + + We require a MD tag for pysam.AlignedSegment.get_aligned_pairs(), which forms the basis of read-level calling. + The output of get_aligned_pairs is a 3-tuple for each base. + + if query_pos is None, we are in a del operation + if reference_pos is None, we are in an ins operation + if reference_base is lowercase that indicates a mismatch to the reference + """ + + read_strand = su.Strand(align_seg.is_reverse) + mms = [] + clip_indices = self._get_clipped_indices(align_seg.cigartuples) + + for apt in align_seg.get_aligned_pairs(with_seq=True): + + # Determine attrs of the current base + query_pos, reference_pos, reference_base = apt + + # Skip soft-clipped bases as the reference_pos and reference_base are None and interfere with ins detection + if query_pos in clip_indices: + continue + + # Here we have a simple deletion + if query_pos is None: + # Determine if we need to keep extending the deletion + if last_query_pos is not None: + # Start new deletion; since reference_pos is 0-based and we want 1-based, and because the first + # match before del should be the REF field, just use current reference_pos + # Fields are POS, REF, ALT, and last query position so position in read can be determined + new_del = ( + reference_pos, [last_reference_base.upper(), reference_base], + last_reference_base.upper(), last_query_pos,) + else: + # Otherwise extend the deletion + new_del[1].append(reference_base) + + elif "new_del" in locals(): + # Once the deletion is finished enumerate it + # We want the deletion position 1-based with respect to the 5' end + var_pos = new_del[3] + 1 if read_strand == su.Strand.PLUS else align_seg.query_length - new_del[3] + var_qual = align_seg.query_qualities[new_del[3]] + + # Append the candidate InDel using the mismatch tuple + mms.append(MM_TUPLE( + contig=align_seg.reference_name, pos=new_del[0], ref="".join(new_del[1]), alt=new_del[2], + bq=var_qual, read_pos=var_pos)) + + del new_del + + # Here we have an simple insertion + if reference_pos is None: + + # Filter bases of insertions using BQs + ins_qual = align_seg.query_qualities[query_pos] + if ins_qual < min_bq: + last_query_pos = query_pos + # Note we do not reset last_reference_pos and last_reference_base because they need to point + # to a match/mismatch and not a ins + continue + + if last_reference_base is not None: + # Start new ins + # Fields are POS, REF, ALT, and last query position so position in read can be determined + new_ins = ( + last_reference_pos + 1, last_reference_base.upper(), + [last_reference_base.upper(), align_seg.query_sequence[query_pos].upper()], last_query_pos,) + else: + # Otherwise extend the insertion + new_ins[2].append(align_seg.query_sequence[query_pos].upper()) + + elif "new_ins" in locals(): + # Once the insertion is finished enumerate it + # We want the insertion position 1-based with respect to the 5' end + var_pos = new_ins[3] + 1 if read_strand == su.Strand.PLUS else align_seg.query_length - new_ins[3] + ins_bases_query_start = new_ins[3] + 1 + ins_nbases = len(new_ins[2]) + # Use average quality of the inserted bases for simplicity + var_qual = sum( + align_seg.query_qualities[ins_bases_query_start:ins_bases_query_start+ins_nbases]) / ins_nbases + + # Append the candidate InDel using the mismatch tuple + mms.append(MM_TUPLE( + contig=align_seg.reference_name, pos=new_ins[0], ref=new_ins[1], alt="".join(new_ins[2]), + bq=var_qual, read_pos=var_pos)) + + del new_ins + + last_query_pos = query_pos + last_reference_pos = reference_pos + last_reference_base = reference_base + + return mms + @staticmethod - def _intersect_mismatches(r1_mms, r2_mms): - """Intersects the mismatches between read pairs. + def _intersect_edits(r1_mms, r2_mms): + """Intersects the mismatches and InDels between read pairs. :param list r1_mms: list of MM_TUPLEs for R1 :param list r2_mms: list of MM_TUPLEs for R2 :return tuple: filtered list of R1 and R2 MM_TUPLES """ - r1_mm_bases = {(r1_mm.contig, r1_mm.pos, r1_mm.alt,) for r1_mm in r1_mms} - r2_mm_bases = {(r2_mm.contig, r2_mm.pos, r2_mm.alt,) for r2_mm in r2_mms} + r1_mm_bases = {(r1_mm.contig, r1_mm.pos, r1_mm.ref, r1_mm.alt,) for r1_mm in r1_mms} + r2_mm_bases = {(r2_mm.contig, r2_mm.pos, r2_mm.ref, r2_mm.alt,) for r2_mm in r2_mms} r_intersect = r1_mm_bases & r2_mm_bases # Now we can iterate back through the list and filter it @@ -388,10 +527,10 @@ def _intersect_mismatches(r1_mms, r2_mms): # Must use itertools.zip_longest in case differing number of mismatches are found in the pair for r1_mm, r2_mm in itertools.zip_longest(r1_mms, r2_mms): - if r1_mm is not None and (r1_mm.contig, r1_mm.pos, r1_mm.alt,) in r_intersect: + if r1_mm is not None and (r1_mm.contig, r1_mm.pos, r1_mm.ref, r1_mm.alt,) in r_intersect: filtered_r1_mms.append(r1_mm) - if r2_mm is not None and (r2_mm.contig, r2_mm.pos, r2_mm.alt,) in r_intersect: + if r2_mm is not None and (r2_mm.contig, r2_mm.pos, r2_mm.ref, r2_mm.alt,) in r_intersect: filtered_r2_mms.append(r2_mm) return filtered_r1_mms, filtered_r2_mms @@ -619,14 +758,17 @@ def _iterate_over_reads(self, af1, af2, min_bq=VARIANT_CALL_MIN_BQ, max_nm=VARIA raise RuntimeError( "Improper pairing of reads. R1 was %s and R2 was %s." % (r1.query_name, r2.query_name)) + if not self._reads_overlap(r1, r2): + continue + # Only call variants for pairs that pass filters r1_nm = su.get_edit_distance(r1) r2_nm = su.get_edit_distance(r2) if r1_nm > max_nm or r2_nm > max_nm: continue - if not self._reads_overlap(r1, r2): - continue + r1_strand = su.Strand(r1.is_reverse) + r2_strand = su.Strand(r2.is_reverse) # Compute fragment coverage/depth; note only filtered read pairs contribute to depth self._update_pos_dp(r1, r2) @@ -636,40 +778,44 @@ def _iterate_over_reads(self, af1, af2, min_bq=VARIANT_CALL_MIN_BQ, max_nm=VARIA r2_mms = self._enumerate_mismatches(r2, min_bq) # Now find intersections between the mismatches - filt_r1_mms, filt_r2_mms = self._intersect_mismatches(r1_mms, r2_mms) + filt_r1_mms, filt_r2_mms = self._intersect_edits(r1_mms, r2_mms) - # If we found no intersected mismatches, continue to the next read pair - if len(filt_r1_mms) == 0: - continue + # Enumerate simple InDels with another pass over the aligned_pairs + r1_indel_mms = self._enumerate_indels(r1, min_bq) + r2_indel_mms = self._enumerate_indels(r2, min_bq) - # Call MNPs and haplotypes - haplotype_res = self._call_haplotypes(filt_r1_mms, max_mnp_window) + # Now find intersections between the mismatches + filt_r1_indel_mms, filt_r2_indel_mms = self._intersect_edits(r1_indel_mms, r2_indel_mms) - if isinstance(haplotype_res, tuple): - haplotypes, position_blacklist = haplotype_res - if len(position_blacklist) == 0: - # In this case we had >= 3 mismatches but they were not within the window and position_blacklist - # is an empty set + # If we found no intersected mismatches, continue to the next read pair + if len(filt_r1_mms) != 0: + # Call SNPs, MNPs, and haplotypes + haplotype_res = self._call_haplotypes(filt_r1_mms, max_mnp_window) + + if isinstance(haplotype_res, tuple): + haplotypes, position_blacklist = haplotype_res + if len(position_blacklist) == 0: + # In this case we had >= 3 mismatches but they were not within the window and position_blacklist + # is an empty set + haplotypes = None + else: + # In this case we had either 1 mismatch or 2 mismatches that were not within the window haplotypes = None - else: - # In this case we had either 1 mismatch or 2 mismatches that were not within the window - haplotypes = None - position_blacklist = set() - - # Call SNPs not captured in a MNP or haplotype - snps = self._call_snps(filt_r1_mms, position_blacklist, haplotypes) + position_blacklist = set() - collective_variants = snps - if haplotypes is not None: - collective_variants = {**haplotypes, **snps} + # Call SNPs not captured in a MNP or haplotype + snps = self._call_snps(filt_r1_mms, position_blacklist, haplotypes) - r1_strand = su.Strand(r1.is_reverse) - r2_strand = su.Strand(r2.is_reverse) + collective_variants = snps + if haplotypes is not None: + collective_variants = {**haplotypes, **snps} - # Now that we have called haplotypes and SNPs for the pair, update the dict of counts and stats - self._update_counts(collective_variants, filt_r1_mms, filt_r2_mms, r1_nm, r2_nm, r1_strand, r2_strand) + # Now that we have called haplotypes and SNPs for the pair, update the dict of counts and stats + self._update_counts(collective_variants, filt_r1_mms, filt_r2_mms, r1_nm, r2_nm, r1_strand, r2_strand) - # InDel enumeration can exist as a separate call here in the future + if len(filt_r1_indel_mms) != 0: + indels = self._call_indels(filt_r1_indel_mms) + self._update_counts(indels, filt_r1_indel_mms, filt_r2_indel_mms, r1_nm, r2_nm, r1_strand, r2_strand) def _summarize_stats(self, var_list, read_index, stat_index, pos_index): """Summarizes per-bp stats across reads for a particular contributing base to a variant call. @@ -856,7 +1002,7 @@ def _write_concordant_variants(self, vckt, vcst, reference_candidates_fh): # Determine the codon, AA change(s) and whether or not the variant matches the mutagenesis signature var_location, var_wt_codons, var_mut_codons, var_wt_aas, var_mut_aas, var_aa_changes, var_aa_positions, \ - var_matches_mut_sig = self.amino_acid_mapper.get_codon_and_aa_changes( + var_matches_mut_sig, hgvs_nt, hgvs_tx, hgvs_pro = self.amino_acid_mapper.get_codon_and_aa_changes( trx_id=trx_id, pos=vckt.pos, ref=vckt.ref, alt=vckt.alt) # Create an INFO field for the VCF @@ -893,7 +1039,10 @@ def _write_concordant_variants(self, vckt, vcst, reference_candidates_fh): vu.VCF_AAM_AA_ALT_ID: var_mut_aas, vu.VCF_AAM_AA_CHANGE_ID: var_aa_changes, vu.VCF_AAM_AA_POS_ID: var_aa_positions, - vu.VCF_MUT_SIG_MATCH: str(var_matches_mut_sig) + vu.VCF_MUT_SIG_MATCH: var_matches_mut_sig, + vu.VCF_MAVE_HGVS_NT_ID: hgvs_nt, + vu.VCF_MAVE_HGVS_TX_ID: hgvs_tx, + vu.VCF_MAVE_HGVS_PRO_ID: hgvs_pro, } # Generate a VCF record from scracth @@ -965,7 +1114,10 @@ def _create_vcf_header(self): "%s comma-delimited amino acid change(s). NA if the variant is out of CDS bounds." % aa_mapper_module_path), (vu.VCF_AAM_AA_POS_ID, ".", "String", "%s comma-delimited amino acid position(s). NA if the variant is out of CDS bounds." % aa_mapper_module_path), - (vu.VCF_MUT_SIG_MATCH, ".", "String", "Whether or not the variant matches the mutagenesis signature.") + (vu.VCF_MUT_SIG_MATCH, ".", "String", "Whether or not the variant matches the mutagenesis signature."), + (vu.VCF_MAVE_HGVS_NT_ID, ".", "String", "MAVE-HGVS nucleotide change."), + (vu.VCF_MAVE_HGVS_TX_ID, ".", "String", "MAVE-HGVS transcript change."), + (vu.VCF_MAVE_HGVS_PRO_ID, ".", "String", "MAVE-HGVS protein change.") ] # Add the INFO fields to be populated in the variant records diff --git a/src/core_utils/vcf_utils.py b/src/core_utils/vcf_utils.py index 7619434..66ce18b 100644 --- a/src/core_utils/vcf_utils.py +++ b/src/core_utils/vcf_utils.py @@ -131,6 +131,9 @@ VCF_AAM_AA_ALT_ID = "ALT_AA" VCF_AAM_AA_CHANGE_ID = "AA_CHANGE" VCF_AAM_AA_POS_ID = "AA_POS" +VCF_MAVE_HGVS_NT_ID = "HGVS_NT" +VCF_MAVE_HGVS_TX_ID = "HGVS_TX" +VCF_MAVE_HGVS_PRO_ID = "HGVS_PRO" # VG=VariantGenerator constants VCF_VG_VAR_ALTS_ID = "CODON_ALTS" diff --git a/src/satmut_utils/satmut_utils.py b/src/satmut_utils/satmut_utils.py index 49a614a..b6678dc 100644 --- a/src/satmut_utils/satmut_utils.py +++ b/src/satmut_utils/satmut_utils.py @@ -23,7 +23,7 @@ __author__ = "Ian Hoskins" __credits__ = ["Ian Hoskins"] __license__ = "GPLv3" -__version__ = "1.0.4" +__version__ = "1.1.0" __maintainer__ = "Ian Hoskins" __email__ = "ianjameshoskins@utexas.edu" __status__ = "Development" diff --git a/src/tests/analysis/test_coordinate_mapper.py b/src/tests/analysis/test_coordinate_mapper.py index 759667f..1736d0d 100644 --- a/src/tests/analysis/test_coordinate_mapper.py +++ b/src/tests/analysis/test_coordinate_mapper.py @@ -8,6 +8,7 @@ import analysis.coordinate_mapper as cm from analysis.references import faidx_ref import core_utils.file_utils as fu +from core_utils.vcf_utils import VariantType from satmut_utils.definitions import * tempfile.tempdir = DEFAULT_TEMPDIR @@ -88,6 +89,8 @@ def setUpClass(cls): cls.appris_pknox1_cds_seq = cls.appris_pknox1_trx_seq[211:1522] + cls.ref_codon_dict = cls.appris_aa_mapper._get_pos_codon_dict("ATGCCTTCTGAG") + @classmethod def tearDownClass(cls): """Tear down for TestAminoAcidMapper.""" @@ -140,133 +143,551 @@ def test_concat_multi_mut_info(self): """Tests that MUT_INFO tuples with multiple codon position changes are concatenated into strings.""" input_mut_info = cm.MUT_INFO_TUPLE( - location="CDS", wt_codons=["ATG", "AAA"], mut_codons=["AGG", "AGA"], - wt_aas=["M", "K"], mut_aas=["R", "R"], aa_changes=["p.M1R", "p.K2R"], - aa_positions=[], matches_mut_sig=[True, False]) + location="CDS", wt_codons=("ATG", "AAA",), mut_codons=("AGG", "AGA",), + wt_aas=("M", "K",), mut_aas=("R", "R",), aa_changes=("p.M1R", "p.K2R",), + aa_positions=(), matches_mut_sig=(True, False,), + mave_hgvs_nt="ENST00000398165.7:c.2t>g,ENST00000398165.7:c.5a>g", + mave_hgvs_tx="ENST00000398165.7:r.2u>g,ENST00000398165.7:r.5a>g", + mave_hgvs_pro="p.Met1Arg,p.Lys2Arg") expected = cm.MUT_INFO_TUPLE( location="CDS", wt_codons="ATG,AAA", mut_codons="AGG,AGA", wt_aas="M,K", mut_aas="R,R", aa_changes="p.M1R,p.K2R", - aa_positions="1,2", matches_mut_sig="True,False") + aa_positions="1,2", matches_mut_sig="True,False", + mave_hgvs_nt="ENST00000398165.7:c.2t>g,ENST00000398165.7:c.5a>g", + mave_hgvs_tx="ENST00000398165.7:r.2u>g,ENST00000398165.7:r.5a>g", + mave_hgvs_pro="p.Met1Arg,p.Lys2Arg") observed = self.appris_aa_mapper._concat_multi_mut_info(input_mut_info) self.assertEqual(expected, observed) - def test_get_mut_info(self): - """Tests that mutation info is properly generate for a WT and mutant CDS.""" + def test_get_pos_codon_dict(self): + """Tests that a CDS sequence can be indexed.""" + + observed = self.appris_aa_mapper._get_pos_codon_dict("ATGCCT") + + expected = dict( + zip(range(0, 6), + (cm.CODON_TUPLE(1, "ATG", 0), cm.CODON_TUPLE(1, "ATG", 1), cm.CODON_TUPLE(1, "ATG", 2), + cm.CODON_TUPLE(2, "CCT", 0), cm.CODON_TUPLE(2, "CCT", 1), cm.CODON_TUPLE(2, "CCT", 2)))) + + self.assertDictEqual(expected, observed) - # NNK means A and C are not expected in the wobble position, but only if it does not match the REF - expected = cm.MUT_INFO_TUPLE(location="CDS", wt_codons="ATG,AAA", mut_codons="AGG,AGA", - wt_aas="M,K", mut_aas="R,R", aa_changes="p.M1R,p.K2R", - aa_positions="1,2", matches_mut_sig="True,True") + def test_get_indel_downstream_remainder_ins(self): + """Tests for correct downstream codons and amino acids following an insertion frameshift.""" - wt_cds_seq = "ATGAAA" - mut_cds_seq = "AGGAGA" + observed = self.appris_aa_mapper._get_indel_downstream_remainder( + trx_seq=self.appris_cbs_trx_seq, pos=265, ref_len=1, alt="CT", curr_codon="CCT", base_index=1, + var_type=VariantType.INS) - observed = self.appris_aa_mapper._get_mut_info(wt_cds_seq, mut_cds_seq) + # Remaining codons and remaining amino acids + expected = (("CCT", "TTC", "TGA",), ("P", "F", "*")) self.assertEqual(expected, observed) - def test_get_mut_info_mut_sig(self): - """Tests that mutation info is properly generate for a WT and mutant CDS with mut_sig mismatch.""" + def test_get_indel_downstream_remainder_del(self): + """Tests for correct downstream codons and amino acids following an deletion frameshift.""" - # NNK means A and C are not expected in the wobble position, but only if it does not match the REF - expected = cm.MUT_INFO_TUPLE(location="CDS", wt_codons="ATG,AAA", mut_codons="AGG,AGC", - wt_aas="M,K", mut_aas="R,S", aa_changes="p.M1R,p.K2S", - aa_positions="1,2", matches_mut_sig="True,False") + observed = self.appris_aa_mapper._get_indel_downstream_remainder( + trx_seq=self.appris_cbs_trx_seq, pos=263, ref_len=3, alt="G", curr_codon="ATG", base_index=2, + var_type=VariantType.DEL) + + # Remaining codons and remaining amino acids + expected = (("ATG", "TTC", "TGA",), ("M", "F", "*")) + + self.assertEqual(expected, observed) - wt_cds_seq = "ATGAAA" - mut_cds_seq = "AGGAGC" + def test_annotate_snp(self): + """Tests for correct custom annotation of a SNP.""" - observed = self.appris_aa_mapper._get_mut_info(wt_cds_seq, mut_cds_seq) + observed = self.appris_aa_mapper._annotate_snp( + ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.appris_aa_mapper._get_pos_codon_dict("ATGCCCTCTGAG"), start_index=5) + + expected = (("CCT",), ("CCC",), ("P",), ("P",), ("p.P2P",), (False,)) self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_trx_not_found(self): - """Tests that a TranscriptNotFound Exception is raised if requested transcript is not in the input annotations.""" + def test_annotate_mnp(self): + """Tests for correct custom annotation of a MNP.""" - with self.assertRaises(cm.TranscriptNotFound): - self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENSTnonexistent", pos=2, ref="T", alt="G") + observed = self.appris_aa_mapper._annotate_mnp( + ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.appris_aa_mapper._get_pos_codon_dict("ATGCCCACTGAG"), start_index=5, end_index=6) - def test_get_codon_and_aa_changes_integenic_left(self): - """Tests that an empty MUT_INFO tuple is returned with intergenic location if pos < 1 passed.""" + expected = (("CCT", "TCT",), ("CCC", "ACT",), ("P", "S",), ("P", "T",), ("p.P2P", "p.S3T",), (False, True,)) - expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.INTERGENIC, **cm.AminoAcidMapper.DEFAULT_KWARGS) - observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=0, ref="T", alt="G") self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_integenic_right(self): - """Tests that an empty MUT_INFO tuple is returned with intergenic location if pos > trx len is passed.""" + def test_annotate_ins(self): + """Tests for correct custom annotation of an insertion.""" + + observed = self.appris_aa_mapper._annotate_ins( + trx_seq=self.appris_cbs_trx_seq, ref_codon_dict=self.ref_codon_dict, start_index=4, + pos=265, ref="C", alt="CT") + + expected = (("CCT",), ("CCT", "TTC", "TGA",), ("P",), ("P", "F", "*",), ("p.P2P,F,*",), (False,)) - expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.INTERGENIC, **cm.AminoAcidMapper.DEFAULT_KWARGS) - observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=22756, ref="T", alt="G") self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_ref_mismatches(self): - """Tests that an RuntimeError is raised when the passed REF does not match the transcript reference.""" + def test_annotate_del(self): + """Tests for correct custom annotation of a deletion.""" - with self.assertRaises(RuntimeError): - self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=260, ref="T", alt="G") + observed = self.appris_aa_mapper._annotate_del( + trx_seq=self.appris_cbs_trx_seq, ref_codon_dict=self.ref_codon_dict, start_index=2, + pos=263, ref="GCC", alt="G") + + expected = (("ATG",), ("ATG", "TTC", "TGA",), ("M",), ("M", "F", "*",), ("p.M1M,F,*",), (False,)) - def test_get_codon_and_aa_changes_fiveprime_untranslated(self): - """Tests no protein annotations are returned for a variant in the 5' UTR.""" + self.assertEqual(expected, observed) + + def test_annotate_stopvar(self): + """Tests for correct custom annotation of a nonstop variant.""" + + ref_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq) + + # TGA>TGT SNP + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq[:-1] + "T") + + observed = self.appris_aa_mapper._annotate_stopvar( + trx_seq=self.appris_cbs_trx_seq, pos=1916, ref="A", alt="T", cds_start_offset=260, + ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) + + expected = (("TGA",), ("TGT",), ("*",), ("C",), ("p.*552C",), (True,)) - expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.FIVEPRIME_UTR, **cm.AminoAcidMapper.DEFAULT_KWARGS) - observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=260, ref="C", alt="G") self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_threeprime_untranslated(self): - """Tests no protein annotations are returned for a variant in the 3' UTR.""" + def test_annotate_snp_hgvs_cds(self): + """Tests for correct MAVE-HGVS annotation of a SNP (substitution) in the CDS.""" + + observed = self.appris_aa_mapper._annotate_snp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=268, ref="C", alt="T", + cds_start_offset=260, cds_stop_offset=1916, start_index=7, ref_aa="S", aa_pos=3, alt_aa="F") + + expected = ("ENST00000398165.7:c.8c>t", "ENST00000398165.7:r.8c>u", "p.Ser3Phe",) - expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.THREEPRIME_UTR, **cm.AminoAcidMapper.DEFAULT_KWARGS) - observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=1917, ref="A", alt="G") self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_custom_negative_single(self): - """Tests correct protein annotations are returned for a single-codon variant in CBS.""" + def test_annotate_snp_hgvs_utr5(self): + """Tests for correct MAVE-HGVS annotation of a SNP (substitution) in the 5' UTR.""" - expected = cm.MUT_INFO_TUPLE( - location=cm.AminoAcidMapper.CDS_ID, wt_codons="ATG", mut_codons="GGG", wt_aas="M", mut_aas="G", - aa_changes="p.M1G", aa_positions="1", matches_mut_sig="True") + observed = self.appris_aa_mapper._annotate_snp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=259, ref="G", alt="T", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_aa=None, aa_pos=None, alt_aa=None) - observed = self.cbs_pezy3_aa_mapper.get_codon_and_aa_changes( - trx_id="CBS_pEZY3", pos=974, ref="ATG", alt="GGG") + expected = ("ENST00000398165.7:c.-2g>t", "ENST00000398165.7:r.-2g>u", "p.(=)",) self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_appris_negative_single(self): - """Tests correct protein annotations are returned for a single-codon variant in CBS.""" + def test_annotate_snp_hgvs_utr3(self): + """Tests for correct MAVE-HGVS annotation of a SNP (substitution) in the 3' UTR.""" - expected = cm.MUT_INFO_TUPLE( - location=cm.AminoAcidMapper.CDS_ID, wt_codons="ATG", mut_codons="GGG", wt_aas="M", mut_aas="G", - aa_changes="p.M1G", aa_positions="1", matches_mut_sig="True") + observed = self.appris_aa_mapper._annotate_snp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1920, ref="C", alt="T", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_aa=None, aa_pos=None, alt_aa=None) - observed = self.appris_aa_mapper.get_codon_and_aa_changes( - trx_id="ENST00000398165.7", pos=261, ref="ATG", alt="GGG") + expected = ("ENST00000398165.7:c.*4c>t", "ENST00000398165.7:r.*4c>u", "p.(=)",) self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_appris_negative_multi(self): - """Tests correct protein annotations are returned for a multi-codon variant in CBS.""" + def test_annotate_mnp_hgvs_cds_single_codon(self): + """Tests for correct MAVE-HGVS annotation of a MNP (delins) affecting a single codon.""" - expected = cm.MUT_INFO_TUPLE( - location=cm.AminoAcidMapper.CDS_ID, wt_codons="CCT,TCT", mut_codons="CCA,GGT", wt_aas="P,S", mut_aas="P,G", - aa_changes="p.P2P,p.S3G", aa_positions="2,3", matches_mut_sig="False,True") + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict("ATGCCTAAAGAG") + + observed = self.appris_aa_mapper._annotate_mnp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=267, alt="AAA", cds_start_offset=260, + cds_stop_offset=1916, start_index=6, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) - observed = self.appris_aa_mapper.get_codon_and_aa_changes( - trx_id="ENST00000398165.7", pos=266, ref="TTC", alt="AGG") + expected = ("ENST00000398165.7:c.7_9delinsaaa", "ENST00000398165.7:r.7_9delinsaaa", "p.Ser3delinsLys",) self.assertEqual(expected, observed) - def test_get_codon_and_aa_changes_appris_positive_single(self): - """Tests correct protein annotations are returned for a single-codon variant in PKNOX1.""" + def test_annotate_mnp_hgvs_cds_multi_codon(self): + """Tests for correct MAVE-HGVS annotation of a MNP (delins) affecting two codons.""" - expected = cm.MUT_INFO_TUPLE( - location=cm.AminoAcidMapper.CDS_ID, wt_codons="GCT", mut_codons="ACA", wt_aas="A", mut_aas="T", - aa_changes="p.A3T", aa_positions="3", matches_mut_sig="False") + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict("ATGCCTTCACAG") + + observed = self.appris_aa_mapper._annotate_mnp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=269, alt="AC", cds_start_offset=260, + cds_stop_offset=1916, start_index=8, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) + + expected = ("ENST00000398165.7:c.9_10delinsac", "ENST00000398165.7:r.9_10delinsac", "p.Ser3_Glu4delinsSerGln",) + + self.assertEqual(expected, observed) + + def test_annotate_mnp_hgvs_utr5_multi(self): + """Tests for correct MAVE-HGVS annotation of a MNP (delins) in the 5' UTR.""" + + observed = self.appris_aa_mapper._annotate_mnp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, alt="GCG", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.-6_-4delinsgcg", "ENST00000398165.7:r.-6_-4delinsgcg", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_mnp_hgvs_utr3_multi(self): + """Tests for correct MAVE-HGVS annotation of a MNP (delins) in the 3' UTR.""" + + observed = self.appris_aa_mapper._annotate_mnp_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1918, alt="TC", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.*2_*3delinstc", "ENST00000398165.7:r.*2_*3delinsuc", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_group_mismatches(self): + """Tests that mismatches in multivariants are appropriately grouped to determine substitution versus delins.""" + + mm_positions = (500, 503, 504, 510, 511, 512) + mm_indices = (0, 3, 4, 10, 11, 12) + observed = self.appris_aa_mapper._group_mismatches(mm_positions, mm_indices) + + expected = ([(0, 500)], [(3, 503), (4, 504)], [(10, 510), (11, 511), (12, 512)],) + + self.assertEqual(expected, observed) + + def test_annotate_multivariant_hgvs_cds(self): + """Tests that a multivariant is correctly annotated in the CDS.""" + + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict("ATGTTTTGTGAG") + + observed = self.appris_aa_mapper._annotate_multivariant_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=264, ref="CCTTCT", + alt="TTTTGT", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=alt_codon_dict) + + expected = ("ENST00000398165.7:c.4_5delinstt,ENST00000398165.7:c.8c>g", + "ENST00000398165.7:r.4_5delinsuu,ENST00000398165.7:r.8c>g", + "p.Pro2delinsPhe,p.Ser3Cys",) + + self.assertEqual(expected, observed) + + def test_annotate_multivariant_hgvs_utr5_isolated(self): + """Tests that a multivariant in the 5' UTR and not overlapping the CDS is correctly annotated.""" + + # Need to pass codon dictionaries as the might be needed for 5' UTR variants, although here they are not req'd + observed = self.appris_aa_mapper._annotate_multivariant_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, ref="CCCAGC", + alt="TCCGAC", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.-6c>t,ENST00000398165.7:c.-3_-2delinsga", + "ENST00000398165.7:r.-6c>u,ENST00000398165.7:r.-3_-2delinsga", "p.(=),p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_multivariant_hgvs_utr5_overlapping(self): + """Tests that a multivariant in the 5' UTR and overlapping the CDS is correctly annotated.""" + + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict("TGGCCTTCTGAG") + + observed = self.appris_aa_mapper._annotate_multivariant_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=258, ref="AGCAT", + alt="TGCTG", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=alt_codon_dict) + + expected = ("ENST00000398165.7:c.-3a>t,ENST00000398165.7:c.1_2delinstg", + "ENST00000398165.7:r.-3a>u,ENST00000398165.7:r.1_2delinsug", + "p.(=),p.(=),p.Met1delinsTrp",) + + # For protein annotation, the first p.(=) is for the subst, and the second is associated with the MNP + # that spans the UTR/CDS junction + + self.assertEqual(expected, observed) + + def test_annotate_multivariant_hgvs_utr3_isolated(self): + """Tests that a multivariant in the 3' UTR and not overlapping the CDS is correctly annotated.""" + + observed = self.appris_aa_mapper._annotate_multivariant_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1920, ref="CCGGAG", + alt="TCGCCG", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.*4c>t,ENST00000398165.7:c.*7_*8delinscc", + "ENST00000398165.7:r.*4c>u,ENST00000398165.7:r.*7_*8delinscc", "p.(=),p.(=)",) + + self.assertEqual(expected, observed) + + def test_is_duplication_true_single_nt(self): + """Tests that a single-nt duplication is detected.""" + + observed = self.appris_aa_mapper._is_duplication(trx_seq=self.appris_cbs_trx_seq, pos=265, alt="CC") + self.assertTrue(observed) + + def test_is_duplication_false_single_nt(self): + """Tests that a normal insertion is not determined to be a duplication.""" + + observed = self.appris_aa_mapper._is_duplication(trx_seq="ENST00000398165.7", pos=265, alt="CT") + self.assertFalse(observed) + + def test_is_duplication_true_three_nt(self): + """Tests that a three-nt duplication is detected.""" + + observed = self.appris_aa_mapper._is_duplication(trx_seq=self.appris_cbs_trx_seq, pos=265, alt="CGCC") + self.assertTrue(observed) + + def test_is_duplication_true_four_nt(self): + """Tests that a four-nt duplication is detected.""" + + observed = self.appris_aa_mapper._is_duplication(trx_seq=self.appris_cbs_trx_seq, pos=265, alt="CTGCC") + self.assertTrue(observed) + + def test_get_fs_first_alt_aa_ins(self): + """Tests that the first amino acid not matching the reference amino acid for a frameshift ins is determined.""" + + observed = self.appris_aa_mapper._get_fs_first_alt_aa( + alt_aas=("P", "F", "*"), cds_stop_offset=1916, start_index=4, ref_codon_dict=self.ref_codon_dict) + + expected = ("S", 3) + + self.assertEqual(expected, observed) + + def test_get_fs_first_alt_aa_del(self): + """Tests that the first amino acid not matching the reference amino acid for a frameshift del is determined.""" + + observed = self.appris_aa_mapper._get_fs_first_alt_aa( + alt_aas=("F", "*"), cds_stop_offset=1916, start_index=2, ref_codon_dict=self.ref_codon_dict) + + expected = ("P", 2) + + self.assertEqual(expected, observed) + + def test_annotate_dup_hgvs_cds_fs(self): + """Tests for correct MAVE-HGVS annotation of a single-nt duplication in the CDS.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=265, alt_len=2, cds_start_offset=260, + cds_stop_offset=1916, start_index=4, alt_aas=("P", "F", "*"), ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.5dup", "ENST00000398165.7:r.5dup", "p.Ser3fs",) + + self.assertEqual(expected, observed) - observed = self.appris_aa_mapper.get_codon_and_aa_changes( - trx_id="ENST00000291547.9", pos=218, ref="GCT", alt="ACA") + def test_annotate_dup_hgvs_cds_inframe_single(self): + """Tests for correct MAVE-HGVS annotation of an in-frame duplication of a single amino acid in the CDS.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=266, alt_len=4, cds_start_offset=260, + cds_stop_offset=1916, start_index=5, alt_aas=("P", "P"), ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.4_6dup", "ENST00000398165.7:r.4_6dup", "p.Pro2dup",) + + self.assertEqual(expected, observed) + + def test_annotate_dup_hgvs_cds_inframe_multi(self): + """Tests for correct MAVE-HGVS annotation of an in-frame duplication of a single amino acid in the CDS.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", alt_len=7, location=self.appris_aa_mapper.CDS_ID, pos=266, cds_start_offset=260, + cds_stop_offset=1916, start_index=5, alt_aas=("P", "M", "P"), ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.1_6dup", "ENST00000398165.7:r.1_6dup", "p.Met1_Pro2dup",) + + self.assertEqual(expected, observed) + + def test_annotate_dup_hgvs_utr5(self): + """Tests for correct MAVE-HGVS annotation of a duplication in the 5' UTR.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, alt_len=4, + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, ref_codon_dict=None) + + # 255:C:CGTC + expected = ("ENST00000398165.7:c.-8_-6dup", "ENST00000398165.7:r.-8_-6dup", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_dup_hgvs_utr3(self): + """Tests for correct MAVE-HGVS annotation of a duplication in the 3' UTR.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1920, alt_len=4, + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, ref_codon_dict=None) + + # 1920:C:CGTC + expected = ("ENST00000398165.7:c.*2_*4dup", "ENST00000398165.7:r.*2_*4dup", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_dup_hgvs_cds_utr3(self): + """Tests for correct MAVE-HGVS annotation of a duplication spanning the CDS and 3' UTR.""" + + observed = self.appris_aa_mapper._annotate_dup_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1917, alt_len=4, + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, ref_codon_dict=None) + + # 1917:A:AGAA + expected = ("ENST00000398165.7:c.1915_*1dup", "ENST00000398165.7:r.1915_*1dup", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_ins_hgvs_cds_inframe_single(self): + """Tests for correct MAVE-HGVS annotation of an in-frame, single amino acid insertion in the CDS.""" + + observed = self.appris_aa_mapper._annotate_ins_hgvs( + trx_id="ENST00000398165.7", trx_seq=self.appris_cbs_trx_seq, location=self.appris_aa_mapper.CDS_ID, + pos=266, alt="TATG", cds_start_offset=260, cds_stop_offset=1916, start_index=5, alt_aas=("P", "M",), + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.6_7insatg", "ENST00000398165.7:r.6_7insaug", "p.Pro2_Ser3insMet") + + self.assertEqual(expected, observed) + + def test_annotate_ins_hgvs_cds_inframe_multi(self): + """Tests for correct MAVE-HGVS annotation of an in-frame, multi amino acid insertion in the CDS.""" + + observed = self.appris_aa_mapper._annotate_ins_hgvs( + trx_id="ENST00000398165.7", trx_seq=self.appris_cbs_trx_seq, location=self.appris_aa_mapper.CDS_ID, + pos=266, alt="TATGAAA", cds_start_offset=260, cds_stop_offset=1916, start_index=5, alt_aas=("P", "M", "K"), + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.6_7insatgaaa", "ENST00000398165.7:r.6_7insaugaaa", "p.Pro2_Ser3insMetLys",) + + self.assertEqual(expected, observed) + + def test_annotate_ins_hgvs_utr5(self): + """Tests for correct MAVE-HGVS annotation of an insertion in the 5' UTR.""" + + observed = self.appris_aa_mapper._annotate_ins_hgvs( + trx_id="ENST00000398165.7", trx_seq=self.appris_cbs_trx_seq, location=self.appris_aa_mapper.FIVEPRIME_UTR, + pos=255, alt="CT", cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.-6_-5inst", "ENST00000398165.7:r.-6_-5insu", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_ins_hgvs_utr3(self): + """Tests for correct MAVE-HGVS annotation of an insertion in the 3' UTR.""" + + observed = self.appris_aa_mapper._annotate_ins_hgvs( + trx_id="ENST00000398165.7", trx_seq=self.appris_cbs_trx_seq, location=self.appris_aa_mapper.THREEPRIME_UTR, + pos=1920, alt="CT", cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.*4_*5inst", "ENST00000398165.7:r.*4_*5insu", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_del_hgvs_cds_inframe_single(self): + """Tests for correct MAVE-HGVS annotation of an in-frame, single amino acid deletion in the CDS.""" + + observed = self.appris_aa_mapper._annotate_del_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=266, ref="TTCT", + cds_start_offset=260, cds_stop_offset=1916, start_index=5, alt_aas=("P",), + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.7_9del", "ENST00000398165.7:r.7_9del", "p.Ser3del",) + + self.assertEqual(expected, observed) + + def test_annotate_del_hgvs_cds_inframe_multi(self): + """Tests for correct MAVE-HGVS annotation of an in-frame, multi amino acid deletion in the CDS.""" + + observed = self.appris_aa_mapper._annotate_del_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=266, ref="TTCTGAG", + cds_start_offset=260, cds_stop_offset=1916, start_index=5, alt_aas=("P",), + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.7_12del", "ENST00000398165.7:r.7_12del", "p.Ser3_Glu4del",) self.assertEqual(expected, observed) + + def test_annotate_del_hgvs_utr5(self): + """Tests for correct MAVE-HGVS annotation of an isolated deletion in the 5' UTR.""" + + observed = self.appris_aa_mapper._annotate_del_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, ref="CCC", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.-5_-4del", "ENST00000398165.7:r.-5_-4del", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_del_hgvs_utr5_cds(self): + """Tests for correct MAVE-HGVS annotation of a deletion spanning the 5' UTR - CDS junction.""" + + observed = self.appris_aa_mapper._annotate_del_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=259, ref="GCATGCCT", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.-1_6del", "ENST00000398165.7:r.-1_6del", "p.Met1_Pro2del",) + + self.assertEqual(expected, observed) + + def test_annotate_del_hgvs_utr3(self): + """Tests for correct MAVE-HGVS annotation of an isolated deletion in the 3' UTR.""" + + observed = self.appris_aa_mapper._annotate_del_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1920, ref="CCG", + cds_start_offset=260, cds_stop_offset=1916, start_index=None, alt_aas=None, + ref_codon_dict=self.ref_codon_dict) + + expected = ("ENST00000398165.7:c.*5_*6del", "ENST00000398165.7:r.*5_*6del", "p.(=)",) + + self.assertEqual(expected, observed) + + def test_annotate_stopvar_hgvs_synon(self): + """Tests for correct MAVE-HGVS annotation of a synonymous SNP in the stop codon.""" + + ref_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq) + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq[:-2] + "AA") + + # 1915:G:A synonymous SNP + observed = self.appris_aa_mapper._annotate_stopvar_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, trx_seq=self.appris_cbs_trx_seq, + pos=1915, ref="G", alt="A", ref_len=1, alt_len=1, cds_start_offset=260, cds_stop_offset=1916, + start_index=1654, end_index=1654, ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) + + expected = ("ENST00000398165.7:c.1655g>a", "ENST00000398165.7:r.1655g>a", "p.Ter552=",) + + self.assertEqual(expected, observed) + + def test_annotate_stopvar_hgvs_fs(self): + """Tests for correct MAVE-HGVS annotation of a frameshift at the stop codon.""" + + ref_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq) + alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq[:-1] + "T") + + observed = self.appris_aa_mapper._annotate_stopvar_hgvs( + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, trx_seq=self.appris_cbs_trx_seq, + pos=1915, ref="G", alt="GT", ref_len=1, alt_len=2, cds_start_offset=260, cds_stop_offset=1916, + start_index=1654, end_index=1654, ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) + + expected = ("ENST00000398165.7:c.1655_1656inst", "ENST00000398165.7:r.1655_1656insu", "p.Ter552fs",) + + self.assertEqual(expected, observed) + + def test_get_codon_and_aa_changes_trx_not_found(self): + """Tests that a TranscriptNotFound Exception is raised if transcript is not in the input annotations.""" + + with self.assertRaises(cm.TranscriptNotFound): + self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENSTnonexistent", pos=2, ref="T", alt="G") + + def test_get_codon_and_aa_changes_integenic_left(self): + """Tests that an empty MUT_INFO tuple is returned with intergenic location if pos < 1 passed.""" + + expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.INTERGENIC, **cm.AminoAcidMapper.DEFAULT_KWARGS) + observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=0, ref="T", alt="G") + self.assertEqual(expected, observed) + + def test_get_codon_and_aa_changes_integenic_right(self): + """Tests that an empty MUT_INFO tuple is returned with intergenic location if pos > trx len is passed.""" + + expected = cm.MUT_INFO_TUPLE(location=cm.AminoAcidMapper.INTERGENIC, **cm.AminoAcidMapper.DEFAULT_KWARGS) + observed = self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=22756, ref="T", alt="G") + self.assertEqual(expected, observed) + + def test_get_codon_and_aa_changes_ref_mismatches(self): + """Tests that an RuntimeError is raised when the passed REF does not match the transcript reference.""" + + with self.assertRaises(RuntimeError): + self.appris_aa_mapper.get_codon_and_aa_changes(trx_id="ENST00000398165.7", pos=260, ref="T", alt="G") diff --git a/src/tests/analysis/test_variant_caller.py b/src/tests/analysis/test_variant_caller.py index c2150d7..3dcba2c 100644 --- a/src/tests/analysis/test_variant_caller.py +++ b/src/tests/analysis/test_variant_caller.py @@ -35,6 +35,32 @@ MG01HS02:1483:HG7MTBCX3:2:1213:4430:45261_ACTTTTGA 163 CBS_pEZY3 2290 44 105M = 2290 105 GGAGAAGGGCTTACAAGAGGCGCCCGAGGAGGATGCGGCGGGGGTAATCCTGGGAATGGTGAAGCTTGGGAACATGCTATCGTCCCTGCTTGCCGGGAAAGTGCA 0<00011<0111<1111<1D0/ Date: Fri, 15 Sep 2023 17:15:23 -0500 Subject: [PATCH 2/3] IH 15SEP2023 version to v1.1.1. --- CHANGES.txt | 3 + README.md | 15 ++- docs/satmut_utils_manual.md | 8 ++ setup.cfg | 2 +- src/analysis/coordinate_mapper.py | 125 ++++++++++++------- src/satmut_utils/satmut_utils.py | 2 +- src/tests/analysis/test_coordinate_mapper.py | 52 +++++--- 7 files changed, 140 insertions(+), 67 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 0c13f1b..9311339 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -20,3 +20,6 @@ Release Summary: 1.1.0, September 11, 2023 Added support for simple insertion and deletion calling. Added MAVE-HGVS annotations. + +1.1.1, September 15, 2023 + Fixed bugs in HGVS annotations for single in-frame insertions and MNPs spanning the CDS-3' UTR junction. Ensure deletion-insertion annotations follow HGVS proposal SVD-WG010. diff --git a/README.md b/README.md index 86bd52b..112aa40 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Additionally, the user can set a directory for temporary files by setting an env ### Run 'sim' -Run 'sim' on *in silico* alignments to generate SNPs, MNPs, and InDels. Structural variants and gene fusions are not currently supported. +Run 'sim' on *in silico* alignments to generate SNPs, MNPs, insertions, and deletions. Structural variants and gene fusions are not currently supported. ``` TEST_DIR="$SATMUT_ROOT/src/tests/test_data" OUTPUT_DIR="/tmp/satmut_utils_test" @@ -112,8 +112,19 @@ To run unit tests, execute the following from the satmut\_utils repository: ```nose2 -q``` +## Accessory scripts + +Two command-line interfaces are provided to enable pre-processing of reads prior to satmut_utils 'sim': + +1. satmut\_trim +2. satmut\_align + +satmut\_trim is a wrapper around cutadapt, and satmut\_align a wrapper around bowtie2. satmut\_align should be used to generate the BAM file accepted by 'sim'. If reads have been aligned with some other method, there is no guarantee 'sim' will complete without error, as alignment tags output by bowtie2 are required for 'sim' (MD, NM). + +Additionally, a number of helpful scripts are available on the development [repository](https://github.com/ijhoskins/satmut_utils) of satmut\_utils in the src/scripts directory. For example, if subdomains of a gene were targeted, MAVE-HGVS annotations can be updated with run\_mave\_hgvs\_annot.py to report positions relative to target regions (as required by MAVEdb). + ## Citation If you use satmut\_utils, please cite the following paper: -Hoskins I, Sun S, Cote A, Roth FP, Cenik C. satmut_utils: a simulation and variant calling package for multiplexed assays of variant effect. Genome Biol. BioMed Central; 2023 Apr 20;24(1):1–2 +Hoskins I, Sun S, Cote A, Roth FP, Cenik C. satmut_utils: a simulation and variant calling package for multiplexed assays of variant effect. Genome Biol. BioMed Central; 2023 Apr 20;24(1):1–2 \ No newline at end of file diff --git a/docs/satmut_utils_manual.md b/docs/satmut_utils_manual.md index e7c7dac..6bf9a3d 100644 --- a/docs/satmut_utils_manual.md +++ b/docs/satmut_utils_manual.md @@ -336,6 +336,12 @@ AA\_POS: Comma-delimited amino acid position(s). NA if the variant is out of CDS MATCHES\_MUT\_SIG: Whether or not the variant matches the mutagenesis signature. +HGVS\_NT: MAVE-HGVS annotation for hgvs\_nt field in MAVEdb score or count table. + +HGVS\_TX: MAVE-HGVS annotation for hgvs\_tx field in MAVEdb score or count table. + +HGVS\_PRO: MAVE-HGVS annotation for hgvs\_pro field in MAVEdb score or count table. + ## satmut\_utils command line interface satmut\_utils provides the 'sim' and 'call' workflow as subcommands, which have common and unique options. @@ -532,3 +538,5 @@ Two command-line interfaces are provided to enable pre-processing of reads prior 2. satmut\_align satmut\_trim is a wrapper around cutadapt, and satmut\_align a wrapper around bowtie2. satmut\_align should be used to generate the BAM file accepted by 'sim'. If reads have been aligned with some other method, there is no guarantee 'sim' will complete without error, as alignment tags output by bowtie2 are required for 'sim' (MD, NM). + +Additionally, a number of helpful scripts are available on the development [repository](https://github.com/ijhoskins/satmut_utils) of satmut\_utils in the src/scripts directory. For example, if subdomains of a gene were targeted, MAVE-HGVS annotations can be updated with run\_mave\_hgvs\_annot.py to report positions relative to target regions (as required by MAVEdb). diff --git a/setup.cfg b/setup.cfg index a2f5a97..1c0d134 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = satmut_utils -version = 1.1.0 +version = 1.1.1 author = Ian Hoskins author_email = ianjameshoskins@utexas.edu description = Tools for variant simulation and variant calling in paired end, targeted sequencing saturation mutagenesis experiments diff --git a/src/analysis/coordinate_mapper.py b/src/analysis/coordinate_mapper.py index 885eb90..02501dc 100644 --- a/src/analysis/coordinate_mapper.py +++ b/src/analysis/coordinate_mapper.py @@ -428,13 +428,14 @@ def _annotate_snp(self, ref_codon_dict, alt_codon_dict, start_index): return ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs - def _annotate_mnp(self, ref_codon_dict, alt_codon_dict, start_index, end_index): + def _annotate_mnp(self, ref_codon_dict, alt_codon_dict, start_index, end_index, cds_len): """Gets annotations for a multi-codon change. :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) :param int start_index: 0-based position of the first mismatch in the CDS :param int end_index: 0-based position of the last mismatch in the CDS + :param int cds_len: length of CDS :return tuple: (ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs) """ @@ -445,9 +446,13 @@ def _annotate_mnp(self, ref_codon_dict, alt_codon_dict, start_index, end_index): aa_changes = [] matches_mut_sigs = [] - # Require end_index + 3 for MNPs spanning codons + # Require end_index + 3 for MNPs spanning codons; however this creates bug for MNP starting in the stop codon + # so filter out indices that are not within the CDS for i in range(start_index, end_index + 3, 3): + if i - cds_len >= 0: + break + ref_codon = ref_codon_dict[i].codon alt_codon = alt_codon_dict[i].codon @@ -490,7 +495,7 @@ def _annotate_ins(self, trx_seq, ref_codon_dict, start_index, pos, ref, alt): if ref_codon_dict[start_index].base_index == 2 and ((alt_len - 1) % 3) == 0: # Insertion is in-frame and does not change current/downstream codons alt_ins = alt[1:] - alt_list = [alt_ins[i:i + 3] for i in range(0, alt_len, 3)] + alt_list = [alt_ins[i:i + 3] for i in range(0, alt_len - 1, 3)] ref_codons = (ref_codon_dict[start_index].codon,) alt_codons = (ref_codons[0], *alt_list,) ref_aas = (translate(ref_codons[0]),) @@ -555,7 +560,8 @@ def _annotate_del(self, trx_seq, ref_codon_dict, start_index, pos, ref, alt): return ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs - def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, ref_codon_dict, alt_codon_dict): + def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, cds_len, + ref_codon_dict, alt_codon_dict): """Annotates a variant in the stop codon. :param str trx_seq: transcript sequence @@ -563,6 +569,7 @@ def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, ref_codon_ :param str ref: reference bases :param str alt: alternate bases :param int cds_start_offset: transcript position of the CDS start nucleotide + :param int cds_len: CDS length :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) :return collections.namedtuple: MUT_INFO_TUPLE @@ -585,7 +592,7 @@ def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, ref_codon_ # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) # NOTE: this is not accurate if a MNP spans the CDS/3' UTR junction, because alt_codon_dict # derives from a sliced mut_cds_seq in get_codon_and_aa_changes() - mut_info_tuple = self._annotate_mnp(ref_codon_dict, alt_codon_dict, start_index, end_index) + mut_info_tuple = self._annotate_mnp(ref_codon_dict, alt_codon_dict, start_index, end_index, cds_len) elif ref_len < alt_len: @@ -599,7 +606,6 @@ def _annotate_stopvar(self, trx_seq, pos, ref, alt, cds_start_offset, ref_codon_ return mut_info_tuple - # Note: MAVE-HGVS formatting assumes the transcript has a coding region (not noncoding) def _annotate_snp_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, start_index, ref_aa, aa_pos, alt_aa): """Annotates MAVE-HGVS for a substitution. @@ -643,9 +649,9 @@ def _annotate_snp_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, return hgvs_nt, hgvs_tx, hgvs_pro - def _annotate_mnp_hgvs(self, trx_id, location, pos, alt, cds_start_offset, cds_stop_offset, start_index, + def _annotate_mnp_hgvs(self, trx_id, location, pos, alt, cds_start_offset, cds_stop_offset, cds_len, start_index, ref_codon_dict, alt_codon_dict): - """Annotates MAVE-HGVS for a MNP (single or dicodon change). + """Annotates MAVE-HGVS for a MNP (single multicodon change). :param str trx_id: transcript ID :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} @@ -653,6 +659,7 @@ def _annotate_mnp_hgvs(self, trx_id, location, pos, alt, cds_start_offset, cds_s :param str alt: alternate bases :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int cds_len: CDS length :param int | None start_index: 0-based position of the first mismatch in the CDS :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) @@ -695,36 +702,57 @@ def _annotate_mnp_hgvs(self, trx_id, location, pos, alt, cds_start_offset, cds_s hgvs_pro = "p.(=)" elif location == self.CDS_ID: - hgvs_nt = "%s:c.%i_%idelins%s" % (trx_id, start_index + 1, start_index + alt_len, alt.lower()) - hgvs_tx = "%s:r.%i_%idelins%s" % ( - trx_id, start_index + 1, start_index + alt_len, su.dna_to_rna(alt.lower())) - ref_aa = translate(ref_codon_dict[start_index].codon) - alt_aa = translate(alt_codon_dict[start_index].codon) + end_index = start_index + alt_len + if end_index > cds_len: + # MNP spans the CDS/3' UTR junction + hgvs_nt = "%s:c.%i_*%idelins%s" % (trx_id, start_index + 1, end_index - cds_len, alt.lower()) + hgvs_tx = "%s:r.%i_*%idelins%s" % ( + trx_id, start_index + 1, end_index - cds_len, su.dna_to_rna(alt.lower())) + else: + hgvs_nt = "%s:c.%i_%idelins%s" % (trx_id, start_index + 1, end_index, alt.lower()) + hgvs_tx = "%s:r.%i_%idelins%s" % (trx_id, start_index + 1, end_index, su.dna_to_rna(alt.lower())) # Determine if the MNP spans codons if (ref_codon_dict[start_index].base_index == 1 and alt_len == 3) or \ - ref_codon_dict[start_index].base_index == 2: + ref_codon_dict[start_index].base_index == 2 or alt_len > 3: # MNP spans codons - second_ref_aa = translate(ref_codon_dict[start_index + 3].codon) - ref_aas = "".join((ref_aa, second_ref_aa,)) - alt_aas = "".join((AA_MAP[alt_aa], AA_MAP[translate(alt_codon_dict[start_index + 3].codon)],)) - - if ref_aas == alt_aas: + alt_len_offset = len(alt) + alt_len_offset = 4 if alt_len_offset <= 3 else alt_len_offset + ref_aa_list = [] + alt_aa_list = [] + + for i in range(start_index, start_index + alt_len_offset, 3): + # Need to ensure we don't index into the 3' UTR + if i < cds_len: + ref_aa_list.append(AA_MAP[translate(ref_codon_dict[i].codon)]) + alt_aa_list.append(AA_MAP[translate(alt_codon_dict[i].codon)]) + # last_index will be assigned because at least one index is in the CDS + last_index = i + + if ref_aa_list == alt_aa_list: hgvs_pro = "p.%s%i_%s%i=" % ( - AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[second_ref_aa], - ref_codon_dict[start_index + 3].codon_pos) + ref_aa_list[0], ref_codon_dict[start_index].codon_pos, ref_aa_list[len(ref_aa_list) - 1], + ref_codon_dict[last_index].codon_pos) else: hgvs_pro = "p.%s%i_%s%idelins%s" % ( - AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[second_ref_aa], - ref_codon_dict[start_index + 3].codon_pos, alt_aas) + ref_aa_list[0], ref_codon_dict[start_index].codon_pos, ref_aa_list[len(ref_aa_list) - 1], + ref_codon_dict[last_index].codon_pos, "".join(alt_aa_list)) + else: # MNP affects one codon + ref_aa = AA_MAP[translate(ref_codon_dict[start_index].codon)] + alt_aa = AA_MAP[translate(alt_codon_dict[start_index].codon)] + if ref_aa == alt_aa: - hgvs_pro = "p.%s%i=" % (AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos) + hgvs_pro = "p.%s%i=" % (ref_aa, ref_codon_dict[start_index].codon_pos) else: - hgvs_pro = "p.%s%idelins%s" % (AA_MAP[ref_aa], ref_codon_dict[start_index].codon_pos, AA_MAP[alt_aa]) + hgvs_pro = "p.%s%idelins%s" % (ref_aa, ref_codon_dict[start_index].codon_pos, alt_aa) + + if end_index > cds_len: + # MNP spans the CDS/3' UTR junction; so we know some mismatches affect the 3' UTR + hgvs_pro += ",p.(=)" else: raise NotImplementedError @@ -746,11 +774,13 @@ def _group_mismatches(mm_positions, mm_indices): continue for j, mm_group in enumerate(mm_groups): - - mm_group_positions = [i[1] for i in mm_group] - min_pos = min(mm_group_positions) - - if mm_pos - min_pos < 3: + """As of 9/14/2023 HGVS recommends the following annotation changes, pending a decision + # https://varnomen.hgvs.org/bg-material/consultation/svd-wg010/ + pairs of variants should be considered in order of increasing sequencing position. + If variants A, B, and C occur in that order on a sequence, and A and B might be merged, and B and C + might be merged, A, B and C should be merged and described as a single “delins” variant""" + mm_group_max_pos = max([mm_group[e][1] for e in range(len(mm_group))]) + if (mm_pos - mm_group_max_pos) < 3: mm_groups[j].append((mm_idx, mm_pos,)) break else: @@ -759,9 +789,9 @@ def _group_mismatches(mm_positions, mm_indices): return tuple(mm_groups) - def _annotate_multivariant_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, + def _annotate_multivariant_hgvs(self, trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, cds_len, ref_codon_dict, alt_codon_dict): - """Concatenates HGVS for multivariants (alleles). + """Concatenates HGVS for multivariants (alleles) or simple MNPs. :param str trx_id: transcript ID :param str location: transcript location, one of {5_UTR, CDS, 3_UTR} @@ -770,6 +800,7 @@ def _annotate_multivariant_hgvs(self, trx_id, location, pos, ref, alt, cds_start :param str alt: alternate bases :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int cds_len: CDS length :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) :param dict alt_codon_dict: index in the ALT CDS: CODON_TUPLE (codon, index of base in the codon) :return tuple: (hgvs_nt, hgvs_tx, hgvs_pro) @@ -808,16 +839,16 @@ def _annotate_multivariant_hgvs(self, trx_id, location, pos, ref, alt, cds_start cds_start_offset, cds_stop_offset, start_index, ref_aa, aa_pos, alt_aa) else: - # Call an delins + # Call a delins if location == self.FIVEPRIME_UTR or location == self.THREEPRIME_UTR: hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_mnp_hgvs( trx_id, location, mm_group[0][1], alt[mm_group[0][0]:mm_group[len(mm_group) - 1][0] + 1], - cds_start_offset, cds_stop_offset, None, ref_codon_dict, alt_codon_dict) + cds_start_offset, cds_stop_offset, cds_len, None, ref_codon_dict, alt_codon_dict) else: start_index = mm_group[0][1] - cds_start_offset - 1 hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_mnp_hgvs( trx_id, location, mm_group[0][1], alt[mm_group[0][0]:mm_group[len(mm_group) - 1][0] + 1], - cds_start_offset, cds_stop_offset, start_index, ref_codon_dict, alt_codon_dict) + cds_start_offset, cds_stop_offset, cds_len, start_index, ref_codon_dict, alt_codon_dict) hgvs_nt_list.append(hgvs_nt) hgvs_tx_list.append(hgvs_tx) @@ -1148,7 +1179,7 @@ def _annotate_del_hgvs(self, trx_id, location, pos, ref, cds_start_offset, cds_s return hgvs_nt, hgvs_tx, hgvs_pro def _annotate_stopvar_hgvs(self, trx_id, location, trx_seq, pos, ref, alt, ref_len, alt_len, cds_start_offset, - cds_stop_offset, start_index, end_index, ref_codon_dict, alt_codon_dict): + cds_stop_offset, cds_len, start_index, end_index, ref_codon_dict, alt_codon_dict): """Annotates MAVE-HGVS for a variant at the stop codon (including nonstop/extension variant). :param str trx_id: transcript ID @@ -1161,6 +1192,7 @@ def _annotate_stopvar_hgvs(self, trx_id, location, trx_seq, pos, ref, alt, ref_l :param int alt_len: alternate length :param int cds_start_offset: 0-based transcript position of the CDS start nucleotide :param int cds_stop_offset: 0-based transcript position of the nucleotide after the last CDS position + :param int cds_len: CDS length :param int start_index: 0-based position of the first mismatch in the CDS :param int end_index: 0-based position of the last mismatch in the CDS :param dict ref_codon_dict: index in the REF CDS: CODON_TUPLE (codon, index of base in the codon) @@ -1181,7 +1213,8 @@ def _annotate_stopvar_hgvs(self, trx_id, location, trx_seq, pos, ref, alt, ref_l else: # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) hgvs_nt, hgvs_tx, _ = self._annotate_multivariant_hgvs( - trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, ref_codon_dict, alt_codon_dict) + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, cds_len, + ref_codon_dict, alt_codon_dict) elif ref_len < alt_len: # Here we have an insertion or duplication @@ -1252,15 +1285,16 @@ def _get_mut_info(self, trx_id, location, trx_seq, wt_cds_seq, mut_cds_seq, pos, if translate(ref_codon_dict[start_index].codon) == STOP_AA: # Annotate variants at the stop codon first as they are a special case - # Any nonstop variant will be assumed a frameshift variant + # Any nonstop variant that's not silent will be assumed a frameshift variant alt_codon_dict = self._get_pos_codon_dict(mut_cds_seq) + cds_len = len(wt_cds_seq) ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_stopvar( - trx_seq, pos, ref, alt, cds_start_offset, ref_codon_dict, alt_codon_dict) + trx_seq, pos, ref, alt, cds_start_offset, cds_len, ref_codon_dict, alt_codon_dict) # Special case requires investigation of all variant types to determine if a non-stop variant exists hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_stopvar_hgvs( - trx_id, location, trx_seq, pos, ref, alt, ref_len, alt_len, cds_start_offset, cds_stop_offset, + trx_id, location, trx_seq, pos, ref, alt, ref_len, alt_len, cds_start_offset, cds_stop_offset, cds_len, start_index, end_index, ref_codon_dict, alt_codon_dict) elif ref_len == alt_len: @@ -1285,14 +1319,17 @@ def _get_mut_info(self, trx_id, location, trx_seq, wt_cds_seq, mut_cds_seq, pos, else: # Here we have a single-codon MNP or a multi-codon change (haplotype, multivariant) + cds_len = len(wt_cds_seq) + if location == self.CDS_ID: - # For MNPs that span the start codon, we will not have custom annotations, but the MAVE-HGVS - # annotation will be correct ref_codons, alt_codons, ref_aas, alt_aas, aa_changes, matches_mut_sigs = self._annotate_mnp( - ref_codon_dict, alt_codon_dict, start_index, end_index) + ref_codon_dict, alt_codon_dict, start_index, end_index, cds_len) + # For MNPs that span the start codon, we will not have custom annotations, but the MAVE-HGVS + # annotation will be correct hgvs_nt, hgvs_tx, hgvs_pro = self._annotate_multivariant_hgvs( - trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, ref_codon_dict, alt_codon_dict) + trx_id, location, pos, ref, alt, cds_start_offset, cds_stop_offset, cds_len, + ref_codon_dict, alt_codon_dict) elif ref_len < alt_len: diff --git a/src/satmut_utils/satmut_utils.py b/src/satmut_utils/satmut_utils.py index b6678dc..1c7603c 100644 --- a/src/satmut_utils/satmut_utils.py +++ b/src/satmut_utils/satmut_utils.py @@ -23,7 +23,7 @@ __author__ = "Ian Hoskins" __credits__ = ["Ian Hoskins"] __license__ = "GPLv3" -__version__ = "1.1.0" +__version__ = "1.1.1" __maintainer__ = "Ian Hoskins" __email__ = "ianjameshoskins@utexas.edu" __status__ = "Development" diff --git a/src/tests/analysis/test_coordinate_mapper.py b/src/tests/analysis/test_coordinate_mapper.py index 1736d0d..b322cbe 100644 --- a/src/tests/analysis/test_coordinate_mapper.py +++ b/src/tests/analysis/test_coordinate_mapper.py @@ -214,14 +214,15 @@ def test_annotate_mnp(self): observed = self.appris_aa_mapper._annotate_mnp( ref_codon_dict=self.ref_codon_dict, - alt_codon_dict=self.appris_aa_mapper._get_pos_codon_dict("ATGCCCACTGAG"), start_index=5, end_index=6) + alt_codon_dict=self.appris_aa_mapper._get_pos_codon_dict("ATGCCCACTGAG"), + start_index=5, end_index=6, cds_len=3315) expected = (("CCT", "TCT",), ("CCC", "ACT",), ("P", "S",), ("P", "T",), ("p.P2P", "p.S3T",), (False, True,)) self.assertEqual(expected, observed) def test_annotate_ins(self): - """Tests for correct custom annotation of an insertion.""" + """Tests for correct custom annotation of an out-of-frame insertion.""" observed = self.appris_aa_mapper._annotate_ins( trx_seq=self.appris_cbs_trx_seq, ref_codon_dict=self.ref_codon_dict, start_index=4, @@ -231,6 +232,17 @@ def test_annotate_ins(self): self.assertEqual(expected, observed) + def test_annotate_ins_inframe(self): + """Tests for correct custom annotation of an in-frame insertion.""" + + observed = self.appris_aa_mapper._annotate_ins( + trx_seq=self.appris_cbs_trx_seq, ref_codon_dict=self.ref_codon_dict, start_index=5, + pos=266, ref="T", alt="TATG") + + expected = (("CCT",), ("CCT", "ATG",), ("P",), ("P", "M",), ("p.P2P,M",), (True,)) + + self.assertEqual(expected, observed) + def test_annotate_del(self): """Tests for correct custom annotation of a deletion.""" @@ -251,7 +263,7 @@ def test_annotate_stopvar(self): alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict(self.appris_cbs_cds_seq[:-1] + "T") observed = self.appris_aa_mapper._annotate_stopvar( - trx_seq=self.appris_cbs_trx_seq, pos=1916, ref="A", alt="T", cds_start_offset=260, + trx_seq=self.appris_cbs_trx_seq, pos=1916, ref="A", alt="T", cds_start_offset=260, cds_len=3315, ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) expected = (("TGA",), ("TGT",), ("*",), ("C",), ("p.*552C",), (True,)) @@ -298,7 +310,8 @@ def test_annotate_mnp_hgvs_cds_single_codon(self): observed = self.appris_aa_mapper._annotate_mnp_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=267, alt="AAA", cds_start_offset=260, - cds_stop_offset=1916, start_index=6, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) + cds_stop_offset=1916, cds_len=3315, start_index=6, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.7_9delinsaaa", "ENST00000398165.7:r.7_9delinsaaa", "p.Ser3delinsLys",) @@ -311,7 +324,8 @@ def test_annotate_mnp_hgvs_cds_multi_codon(self): observed = self.appris_aa_mapper._annotate_mnp_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=269, alt="AC", cds_start_offset=260, - cds_stop_offset=1916, start_index=8, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) + cds_stop_offset=1916, cds_len=3315, start_index=8, ref_codon_dict=self.ref_codon_dict, + alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.9_10delinsac", "ENST00000398165.7:r.9_10delinsac", "p.Ser3_Glu4delinsSerGln",) @@ -322,8 +336,8 @@ def test_annotate_mnp_hgvs_utr5_multi(self): observed = self.appris_aa_mapper._annotate_mnp_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, alt="GCG", - cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_codon_dict=self.ref_codon_dict, - alt_codon_dict=self.ref_codon_dict) + cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, start_index=None, + ref_codon_dict=self.ref_codon_dict, alt_codon_dict=self.ref_codon_dict) expected = ("ENST00000398165.7:c.-6_-4delinsgcg", "ENST00000398165.7:r.-6_-4delinsgcg", "p.(=)",) @@ -334,8 +348,8 @@ def test_annotate_mnp_hgvs_utr3_multi(self): observed = self.appris_aa_mapper._annotate_mnp_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1918, alt="TC", - cds_start_offset=260, cds_stop_offset=1916, start_index=None, ref_codon_dict=self.ref_codon_dict, - alt_codon_dict=self.ref_codon_dict) + cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, start_index=None, + ref_codon_dict=self.ref_codon_dict, alt_codon_dict=self.ref_codon_dict) expected = ("ENST00000398165.7:c.*2_*3delinstc", "ENST00000398165.7:r.*2_*3delinsuc", "p.(=)",) @@ -344,11 +358,11 @@ def test_annotate_mnp_hgvs_utr3_multi(self): def test_group_mismatches(self): """Tests that mismatches in multivariants are appropriately grouped to determine substitution versus delins.""" - mm_positions = (500, 503, 504, 510, 511, 512) - mm_indices = (0, 3, 4, 10, 11, 12) + mm_positions = (500, 503, 504, 510, 511, 512, 520, 522) + mm_indices = (0, 3, 4, 10, 11, 12, 20, 22) observed = self.appris_aa_mapper._group_mismatches(mm_positions, mm_indices) - expected = ([(0, 500)], [(3, 503), (4, 504)], [(10, 510), (11, 511), (12, 512)],) + expected = ([(0, 500)], [(3, 503), (4, 504)], [(10, 510), (11, 511), (12, 512)], [(20, 520), (22, 522)],) self.assertEqual(expected, observed) @@ -358,8 +372,8 @@ def test_annotate_multivariant_hgvs_cds(self): alt_codon_dict = self.appris_aa_mapper._get_pos_codon_dict("ATGTTTTGTGAG") observed = self.appris_aa_mapper._annotate_multivariant_hgvs( - trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=264, ref="CCTTCT", - alt="TTTTGT", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, pos=264, ref="CCTTC", + alt="TTTTG", cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.4_5delinstt,ENST00000398165.7:c.8c>g", @@ -374,7 +388,7 @@ def test_annotate_multivariant_hgvs_utr5_isolated(self): # Need to pass codon dictionaries as the might be needed for 5' UTR variants, although here they are not req'd observed = self.appris_aa_mapper._annotate_multivariant_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=255, ref="CCCAGC", - alt="TCCGAC", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt="TCCGAC", cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=self.ref_codon_dict) expected = ("ENST00000398165.7:c.-6c>t,ENST00000398165.7:c.-3_-2delinsga", @@ -389,7 +403,7 @@ def test_annotate_multivariant_hgvs_utr5_overlapping(self): observed = self.appris_aa_mapper._annotate_multivariant_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.FIVEPRIME_UTR, pos=258, ref="AGCAT", - alt="TGCTG", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt="TGCTG", cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.-3a>t,ENST00000398165.7:c.1_2delinstg", @@ -406,7 +420,7 @@ def test_annotate_multivariant_hgvs_utr3_isolated(self): observed = self.appris_aa_mapper._annotate_multivariant_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.THREEPRIME_UTR, pos=1920, ref="CCGGAG", - alt="TCGCCG", cds_start_offset=260, cds_stop_offset=1916, ref_codon_dict=self.ref_codon_dict, + alt="TCGCCG", cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, ref_codon_dict=self.ref_codon_dict, alt_codon_dict=self.ref_codon_dict) expected = ("ENST00000398165.7:c.*4c>t,ENST00000398165.7:c.*7_*8delinscc", @@ -644,7 +658,7 @@ def test_annotate_stopvar_hgvs_synon(self): # 1915:G:A synonymous SNP observed = self.appris_aa_mapper._annotate_stopvar_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, trx_seq=self.appris_cbs_trx_seq, - pos=1915, ref="G", alt="A", ref_len=1, alt_len=1, cds_start_offset=260, cds_stop_offset=1916, + pos=1915, ref="G", alt="A", ref_len=1, alt_len=1, cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, start_index=1654, end_index=1654, ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.1655g>a", "ENST00000398165.7:r.1655g>a", "p.Ter552=",) @@ -659,7 +673,7 @@ def test_annotate_stopvar_hgvs_fs(self): observed = self.appris_aa_mapper._annotate_stopvar_hgvs( trx_id="ENST00000398165.7", location=self.appris_aa_mapper.CDS_ID, trx_seq=self.appris_cbs_trx_seq, - pos=1915, ref="G", alt="GT", ref_len=1, alt_len=2, cds_start_offset=260, cds_stop_offset=1916, + pos=1915, ref="G", alt="GT", ref_len=1, alt_len=2, cds_start_offset=260, cds_stop_offset=1916, cds_len=3315, start_index=1654, end_index=1654, ref_codon_dict=ref_codon_dict, alt_codon_dict=alt_codon_dict) expected = ("ENST00000398165.7:c.1655_1656inst", "ENST00000398165.7:r.1655_1656insu", "p.Ter552fs",) From aa7eeb7a19b56733bc2e0c7be5e83bed19201b65 Mon Sep 17 00:00:00 2001 From: Ian Hoskins Date: Mon, 25 Dec 2023 16:04:34 -0600 Subject: [PATCH 3/3] IH 25DEC2023 version to v1.1.2. --- CHANGES.txt | 3 +++ setup.cfg | 2 +- src/analysis/coordinate_mapper.py | 5 +---- src/satmut_utils/satmut_utils.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 9311339..4cd3927 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -23,3 +23,6 @@ Release Summary: 1.1.1, September 15, 2023 Fixed bugs in HGVS annotations for single in-frame insertions and MNPs spanning the CDS-3' UTR junction. Ensure deletion-insertion annotations follow HGVS proposal SVD-WG010. + +1.1.2, December 25, 2023 + Fixed bug in HGVS annotations for out-of-frame duplications > 1 nt. diff --git a/setup.cfg b/setup.cfg index 1c0d134..533ce75 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = satmut_utils -version = 1.1.1 +version = 1.1.2 author = Ian Hoskins author_email = ianjameshoskins@utexas.edu description = Tools for variant simulation and variant calling in paired end, targeted sequencing saturation mutagenesis experiments diff --git a/src/analysis/coordinate_mapper.py b/src/analysis/coordinate_mapper.py index 02501dc..a786e03 100644 --- a/src/analysis/coordinate_mapper.py +++ b/src/analysis/coordinate_mapper.py @@ -137,9 +137,6 @@ class MapperBase(object): J_GENE_SEG_ID = "J_gene_segment" C_GENE_SEG_ID = "C_gene_segment" - # Omit transcripts on non-NC contigs - CONTIGS_TO_OMIT = re.compile("|".join(["NT", "NW"])) - FEATURES_TO_OMIT = [R_RNA_ID, T_RNA_ID, MI_RNA_ID, SN_RNA_ID, SNO_RNA_ID, NC_RNA_ID, LNC_RNA_ID, TELO_RNA_ID, DLOOP_ID, VAULT_RNA_ID, ANTIS_RNA_ID, Y_RNA, RNASE_MRP_RNA_ID, RNASE_P_RNA_ID, SRP_RNA_ID, PROMOTER_ID, ENHANCER_ID, REPEAT_ID, REGION_ID, SEQ_FEAT_ID, MATCH_ID, CDNA_MATCH_ID, @@ -997,7 +994,7 @@ def _annotate_dup_hgvs(self, trx_id, location, pos, alt_len, cds_start_offset, c else: # Frameshift dup ref_aa, aa_pos = self._get_fs_first_alt_aa(alt_aas, cds_stop_offset, start_index, ref_codon_dict) - hgvs_pro = "p.%s%ifs" % (AA_MAP[translate(ref_aa)], aa_pos) + hgvs_pro = "p.%s%ifs" % (AA_MAP[ref_aa], aa_pos) else: raise NotImplementedError diff --git a/src/satmut_utils/satmut_utils.py b/src/satmut_utils/satmut_utils.py index 1c7603c..62f70a1 100644 --- a/src/satmut_utils/satmut_utils.py +++ b/src/satmut_utils/satmut_utils.py @@ -23,7 +23,7 @@ __author__ = "Ian Hoskins" __credits__ = ["Ian Hoskins"] __license__ = "GPLv3" -__version__ = "1.1.1" +__version__ = "1.1.2" __maintainer__ = "Ian Hoskins" __email__ = "ianjameshoskins@utexas.edu" __status__ = "Development"