Skip to content

Commit

Permalink
prec: fix wrong assignment of indices for unmodeled residues
Browse files Browse the repository at this point in the history
  • Loading branch information
avivrosenberg committed May 26, 2024
1 parent c406360 commit 43180b6
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 12 deletions.
10 changes: 4 additions & 6 deletions src/pp5/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

with warnings.catch_warnings():
warnings.simplefilter("ignore", BiopythonExperimentalWarning)
from Bio.Align import substitution_matrices, PairwiseAligner
from Bio.Align import substitution_matrices, PairwiseAligner, Alignment

from Bio.AlignIO import MultipleSeqAlignment as MSA
from Bio.SeqRecord import SeqRecord
Expand Down Expand Up @@ -63,7 +63,7 @@ def pairwise_alignment_map(
tgt_seq: str,
open_gap_score: float = -10,
extend_gap_score: float = -0.5,
) -> Tuple[float, Dict[int, int]]:
) -> Tuple[Alignment, Dict[int, int]]:
"""
Aligns between two AA sequences and produces a map from the indices of
the source sequence to the indices of the target sequence.
Expand All @@ -74,7 +74,7 @@ def pairwise_alignment_map(
:param open_gap_score: Penalty for opening a gap in the alignment.
:param extend_gap_score: Penalty for extending a gap by one residue.
:return: A tuple with two elements:
- The alignment score, higher is better.
- The alignment object produced by the aligner.
- A dict mapping from an index in the source sequence to the corresponding
index in the target sequence.
"""
Expand All @@ -93,8 +93,6 @@ def pairwise_alignment_map(

multi_alignments = aligner.align(src_seq, tgt_seq)
alignment = sorted(multi_alignments, key=lambda a: a.score)[-1]
score = alignment.score
# LOGGER.info(f"{self}: PDB to UNP sequence alignment score={alignment.score}")

# Alignment contains two tuples each of length N (for N matching sub-sequences)
# (
Expand All @@ -117,7 +115,7 @@ def pairwise_alignment_map(
continue
src_to_tgt.append((src_start + j, tgt_start + j))

return score, dict(src_to_tgt)
return alignment, dict(src_to_tgt)


def multiseq_align(
Expand Down
40 changes: 34 additions & 6 deletions src/pp5/prec.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,9 +883,13 @@ def __init__(
]

# Add un-modelled residues by aligning to the canonical PDB sequence.
_, meta_to_struct_idx = pairwise_alignment_map(
meta_to_struct_seq_alignment, meta_to_struct_idx = pairwise_alignment_map(
pdb_meta_aa_seq, pdb_modelled_aa_seq
)
LOGGER.info(
f"{self}: Canonical to structure seq alignment:\n"
f"{str(meta_to_struct_seq_alignment).strip()}"
)
matching_residues: List[Residue] = [] # residues both in modelled and in meta
missing_residues: List[Residue] = [] # residues only in meta
for curr_meta_seq_idx in range(len(pdb_meta_aa_seq)):
Expand All @@ -898,8 +902,33 @@ def __init__(
# This residue is not modelled (missing from the structure), need to add
missing_res_name_single = pdb_meta_aa_seq[curr_meta_seq_idx]
missing_res_name = ACIDS_1TO3[missing_res_name_single]

# We need to determine the residue sequence index for the missing
# residue. It needs to be consistent with the sequence index of the
# modelled residues.
if len(matching_residues) > 0:
# We have previous matching residues, so we can infer the index
# based on the last matching residue.
missing_idx = matching_residues[-1].get_id()[1] + 1
else:
# We have no matching residues yet. We need to determine how far
# we are from the first one, take its index and subtract the
# distance.
first_matching_meta_idx = next(iter(meta_to_struct_idx))
dist_to_first_matching = first_matching_meta_idx - curr_meta_seq_idx
assert dist_to_first_matching >= 0
first_matching_struct_idx = meta_to_struct_idx[
first_matching_meta_idx
]
first_matching_modelled_res = modelled_residues[
first_matching_struct_idx
]
missing_idx = (
first_matching_modelled_res.get_id()[1] - dist_to_first_matching
)

curr_residue = Residue(
(" ", curr_meta_seq_idx, ICODE_UNMODELED_RES), missing_res_name, 0
(" ", missing_idx, ICODE_UNMODELED_RES), missing_res_name, 0
)
missing_residues.append(curr_residue)

Expand Down Expand Up @@ -954,10 +983,10 @@ def __init__(
)

# Align PDB sequence to UNP
unp_alignment_score, pdb_to_unp_idx = pairwise_alignment_map(
unp_alignment, pdb_to_unp_idx = pairwise_alignment_map(
all_aa_seq, self.unp_rec.sequence
)
LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment_score}")
LOGGER.info(f"{self}: PDB to UNP alignment score={unp_alignment.score}")

# Create a ResidueRecord holding all data we need per residue
residue_recs = []
Expand Down Expand Up @@ -1270,8 +1299,7 @@ def _find_dna_alignment(
f"{self}: Translated DNA to PDB alignment "
f"(norm_score="
f"{best_alignments.score / len(pdb_aa_seq):.2f}, "
f"num={len(best_alignments)})\n"
f"{str(best_alignment).strip()}"
f"num={len(best_alignments)})"
)

# Map each AA to a dict of (codon->count)
Expand Down

0 comments on commit 43180b6

Please sign in to comment.