Skip to content

Commit

Permalink
Merge pull request #54 from Arcadia-Science/ter/aa-with-x
Browse files Browse the repository at this point in the history
Deal with false mismatches in translation around Ns/Xs in nucleotide and amino acid sequences
  • Loading branch information
taylorreiter authored Jun 24, 2024
2 parents 34bd1b9 + 9cd28b8 commit 1488232
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
5 changes: 4 additions & 1 deletion scripts/extract_plmutils_nucleotide_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ def extract_nucleotide_peptide_sequences(
)
protein_peptide_sequence = peptide_record.seq
if not utils.verify_translation(
nucleotide_peptide_sequence, protein_peptide_sequence, to_stop=False
nucleotide_peptide_sequence,
protein_peptide_sequence,
to_stop=False,
allow_wildcard_x=True,
):
raise ValueError(
f"Warning: Translation mismatch for {transcript_id}. "
Expand Down
35 changes: 31 additions & 4 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,37 @@
from Bio.Seq import Seq


def verify_translation(nucleotide_seq, amino_acid_seq, to_stop):
"""Verify that a nucleotide sequence translates correctly to its amino acid sequence."""
translated_seq = Seq(str(nucleotide_seq)).translate(to_stop=to_stop)
return str(translated_seq) == str(amino_acid_seq)
def verify_translation(nucleotide_seq, amino_acid_seq, to_stop, allow_wildcard_x=False):
"""
Verify that a nucleotide sequence correctly translates to its corresponding amino acid sequence.
The `allow_wildcard_x` argument permits any amino acid encoded by an "X" in the input sequence
to be replaced with the corresponding translated nucleotide sequence. This option is needed when
`amino_acid_seq` was translated by a tool like Orfipy that does not translate codons that
contain an "N". This is because Biopython (which is used here to translate `nucleotide_seq` into
an amino acid sequence) is less restrictive and will translate codons containing "N"s if they
can only translate into a single amino acid, regardless of the "N". For example, the leucine
codons all begins with CT. Therefore, even if the codon is CTN, Biopython translates it to
leucine.
Notes:
- This function requires the sequences to be the same length.
- This function only verifies that two translations match; it does not edit the amino acid
sequence.
"""
translated_seq = str(Seq(str(nucleotide_seq)).translate(to_stop=to_stop))
amino_acid_seq = str(amino_acid_seq)

if len(translated_seq) != len(amino_acid_seq):
return False

if allow_wildcard_x:
amino_acid_seq = "".join(
translated_seq[i] if amino_acid == "X" else amino_acid
for i, amino_acid in enumerate(amino_acid_seq)
)

return translated_seq == amino_acid_seq


def read_fasta(fasta_file):
Expand Down

0 comments on commit 1488232

Please sign in to comment.