From 43180b6036e0f3c05bd96e953b88726690e67f22 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 26 May 2024 19:58:02 +0300 Subject: [PATCH] prec: fix wrong assignment of indices for unmodeled residues --- src/pp5/align.py | 10 ++++------ src/pp5/prec.py | 40 ++++++++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index 284a734..d27e649 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -26,7 +26,7 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore", BiopythonExperimentalWarning) - from Bio.Align import substitution_matrices, PairwiseAligner + from Bio.Align import substitution_matrices, PairwiseAligner, Alignment from Bio.AlignIO import MultipleSeqAlignment as MSA from Bio.SeqRecord import SeqRecord @@ -63,7 +63,7 @@ def pairwise_alignment_map( tgt_seq: str, open_gap_score: float = -10, extend_gap_score: float = -0.5, -) -> Tuple[float, Dict[int, int]]: +) -> Tuple[Alignment, Dict[int, int]]: """ Aligns between two AA sequences and produces a map from the indices of the source sequence to the indices of the target sequence. @@ -74,7 +74,7 @@ def pairwise_alignment_map( :param open_gap_score: Penalty for opening a gap in the alignment. :param extend_gap_score: Penalty for extending a gap by one residue. :return: A tuple with two elements: - - The alignment score, higher is better. + - The alignment object produced by the aligner. - A dict mapping from an index in the source sequence to the corresponding index in the target sequence. """ @@ -93,8 +93,6 @@ def pairwise_alignment_map( multi_alignments = aligner.align(src_seq, tgt_seq) alignment = sorted(multi_alignments, key=lambda a: a.score)[-1] - score = alignment.score - # LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}") # Alignment contains two tuples each of length N (for N matching sub-sequences) # ( @@ -117,7 +115,7 @@ def pairwise_alignment_map( continue src_to_tgt.append((src_start + j, tgt_start + j)) - return score, dict(src_to_tgt) + return alignment, dict(src_to_tgt) def multiseq_align( diff --git a/src/pp5/prec.py b/src/pp5/prec.py index f56e410..d7dfe6a 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -883,9 +883,13 @@ def __init__( ] # Add un-modelled residues by aligning to the canonical PDB sequence. - _, meta_to_struct_idx = pairwise_alignment_map( + meta_to_struct_seq_alignment, meta_to_struct_idx = pairwise_alignment_map( pdb_meta_aa_seq, pdb_modelled_aa_seq ) + LOGGER.info( + f"{self}: Canonical to structure seq alignment:\n" + f"{str(meta_to_struct_seq_alignment).strip()}" + ) matching_residues: List[Residue] = [] # residues both in modelled and in meta missing_residues: List[Residue] = [] # residues only in meta for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)): @@ -898,8 +902,33 @@ def __init__( # This residue is not modelled (missing from the structure), need to add missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx] missing_res_name = ACIDS_1TO3[missing_res_name_single] + + # We need to determine the residue sequence index for the missing + # residue. It needs to be consistent with the sequence index of the + # modelled residues. + if len(matching_residues) > 0: + # We have previous matching residues, so we can infer the index + # based on the last matching residue. + missing_idx = matching_residues[-1].get_id()[1] + 1 + else: + # We have no matching residues yet. We need to determine how far + # we are from the first one, take its index and subtract the + # distance. + first_matching_meta_idx = next(iter(meta_to_struct_idx)) + dist_to_first_matching = first_matching_meta_idx - curr_meta_seq_idx + assert dist_to_first_matching >= 0 + first_matching_struct_idx = meta_to_struct_idx[ + first_matching_meta_idx + ] + first_matching_modelled_res = modelled_residues[ + first_matching_struct_idx + ] + missing_idx = ( + first_matching_modelled_res.get_id()[1] - dist_to_first_matching + ) + curr_residue = Residue( - (" ", curr_meta_seq_idx, ICODE_UNMODELED_RES), missing_res_name, 0 + (" ", missing_idx, ICODE_UNMODELED_RES), missing_res_name, 0 ) missing_residues.append(curr_residue) @@ -954,10 +983,10 @@ def __init__( ) # Align PDB sequence to UNP - unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map( + unp_alignment, pdb_to_unp_idx = pairwise_alignment_map( all_aa_seq, self.unp_rec.sequence ) - LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}") + LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment.score}") # Create a ResidueRecord holding all data we need per residue residue_recs = [] @@ -1270,8 +1299,7 @@ def _find_dna_alignment( f"{self}: Translated DNA to PDB alignment " f"(norm_score=" f"{best_alignments.score / len(pdb_aa_seq):.2f}, " - f"num={len(best_alignments)})\n" - f"{str(best_alignment).strip()}" + f"num={len(best_alignments)})" ) # Map each AA to a dict of (codon->count)