From 43180b6036e0f3c05bd96e953b88726690e67f22 Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg <aviv.rosenberg@gmail.com>
Date: Sun, 26 May 2024 19:58:02 +0300
Subject: [PATCH] prec: fix wrong assignment of indices for unmodeled residues

---
 src/pp5/align.py | 10 ++++------
 src/pp5/prec.py  | 40 ++++++++++++++++++++++++++++++++++------
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/src/pp5/align.py b/src/pp5/align.py
index 284a734..d27e649 100644
--- a/src/pp5/align.py
+++ b/src/pp5/align.py
@@ -26,7 +26,7 @@
 
 with warnings.catch_warnings():
     warnings.simplefilter("ignore", BiopythonExperimentalWarning)
-    from Bio.Align import substitution_matrices, PairwiseAligner
+    from Bio.Align import substitution_matrices, PairwiseAligner, Alignment
 
 from Bio.AlignIO import MultipleSeqAlignment as MSA
 from Bio.SeqRecord import SeqRecord
@@ -63,7 +63,7 @@ def pairwise_alignment_map(
     tgt_seq: str,
     open_gap_score: float = -10,
     extend_gap_score: float = -0.5,
-) -> Tuple[float, Dict[int, int]]:
+) -> Tuple[Alignment, Dict[int, int]]:
     """
     Aligns between two AA sequences and produces a map from the indices of
     the source sequence to the indices of the target sequence.
@@ -74,7 +74,7 @@ def pairwise_alignment_map(
     :param open_gap_score: Penalty for opening a gap in the alignment.
     :param extend_gap_score: Penalty for extending a gap by one residue.
     :return: A tuple with two elements:
-    -  The alignment score, higher is better.
+    -  The alignment object produced by the aligner.
     -  A dict mapping from an index in the source sequence to the corresponding
        index in the target sequence.
     """
@@ -93,8 +93,6 @@ def pairwise_alignment_map(
 
     multi_alignments = aligner.align(src_seq, tgt_seq)
     alignment = sorted(multi_alignments, key=lambda a: a.score)[-1]
-    score = alignment.score
-    # LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}")
 
     # Alignment contains two tuples each of length N (for N matching sub-sequences)
     # (
@@ -117,7 +115,7 @@ def pairwise_alignment_map(
                 continue
             src_to_tgt.append((src_start + j, tgt_start + j))
 
-    return score, dict(src_to_tgt)
+    return alignment, dict(src_to_tgt)
 
 
 def multiseq_align(
diff --git a/src/pp5/prec.py b/src/pp5/prec.py
index f56e410..d7dfe6a 100644
--- a/src/pp5/prec.py
+++ b/src/pp5/prec.py
@@ -883,9 +883,13 @@ def __init__(
         ]
 
         # Add un-modelled residues by aligning to the canonical PDB sequence.
-        _, meta_to_struct_idx = pairwise_alignment_map(
+        meta_to_struct_seq_alignment, meta_to_struct_idx = pairwise_alignment_map(
             pdb_meta_aa_seq, pdb_modelled_aa_seq
         )
+        LOGGER.info(
+            f"{self}: Canonical to structure seq alignment:\n"
+            f"{str(meta_to_struct_seq_alignment).strip()}"
+        )
         matching_residues: List[Residue] = []  # residues both in modelled and in meta
         missing_residues: List[Residue] = []  # residues only in meta
         for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)):
@@ -898,8 +902,33 @@ def __init__(
                 # This residue is not modelled (missing from the structure), need to add
                 missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx]
                 missing_res_name = ACIDS_1TO3[missing_res_name_single]
+
+                # We need to determine the residue sequence index for the missing
+                # residue. It needs to be consistent with the sequence index of the
+                # modelled residues.
+                if len(matching_residues) > 0:
+                    # We have previous matching residues, so we can infer the index
+                    # based on the last matching residue.
+                    missing_idx = matching_residues[-1].get_id()[1] + 1
+                else:
+                    # We have no matching residues yet. We need to determine how far
+                    # we are from the first one, take its index and subtract the
+                    # distance.
+                    first_matching_meta_idx = next(iter(meta_to_struct_idx))
+                    dist_to_first_matching = first_matching_meta_idx - curr_meta_seq_idx
+                    assert dist_to_first_matching >= 0
+                    first_matching_struct_idx = meta_to_struct_idx[
+                        first_matching_meta_idx
+                    ]
+                    first_matching_modelled_res = modelled_residues[
+                        first_matching_struct_idx
+                    ]
+                    missing_idx = (
+                        first_matching_modelled_res.get_id()[1] - dist_to_first_matching
+                    )
+
                 curr_residue = Residue(
-                    (" ", curr_meta_seq_idx, ICODE_UNMODELED_RES), missing_res_name, 0
+                    (" ", missing_idx, ICODE_UNMODELED_RES), missing_res_name, 0
                 )
                 missing_residues.append(curr_residue)
 
@@ -954,10 +983,10 @@ def __init__(
                 )
 
         # Align PDB sequence to UNP
-        unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map(
+        unp_alignment, pdb_to_unp_idx = pairwise_alignment_map(
             all_aa_seq, self.unp_rec.sequence
         )
-        LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}")
+        LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment.score}")
 
         # Create a ResidueRecord holding all data we need per residue
         residue_recs = []
@@ -1270,8 +1299,7 @@ def _find_dna_alignment(
             f"{self}: Translated DNA to PDB alignment "
             f"(norm_score="
             f"{best_alignments.score / len(pdb_aa_seq):.2f}, "
-            f"num={len(best_alignments)})\n"
-            f"{str(best_alignment).strip()}"
+            f"num={len(best_alignments)})"
         )
 
         # Map each AA to a dict of (codon->count)