Repository URL to install this package:
|
Version:
1.87 ▾
|
# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Map residues of two structures to each other based on a FASTA alignment."""
from typing import Optional
import warnings
from Bio.Align import Alignment, MultipleSeqAlignment, PairwiseAligner
from Bio.Data import PDBData
from Bio.PDB import Selection
from Bio.PDB.Model import Model
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Residue import Residue
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
class StructureAlignment:
"""Class to align two structures based on an alignment of their sequences."""
def __init__(
self,
fasta_align: MultipleSeqAlignment | Alignment | None = None,
m1: Model | None = None,
m2: Model | None = None,
si: int = 0,
sj: int = 1,
aligner: PairwiseAligner | None = None,
) -> None:
"""Initialize.
Attributes:
- fasta_align - Alignment object / MSA object or None (if None, one will be generated automatically)
- m1, m2 - two models (Bio.PDB.Model.Model objects). Their default values are set to None to maintain legacy code, but they CANNOT be None
- si, sj - the sequences in the Alignment object that correspond to the structures
- aligner - Optional aligner, mutually exclusive with fasta_align, allows for customization of automatic
alignment, otherwise blastp defaults will be used
"""
# Check that fasta_align and aligner are not both provided
if fasta_align is not None and aligner is not None:
raise ValueError(
"fasta_align and aligner cannot be both provided, fasta_align uses precomputed alignments",
"while the aligner is to allow a custom alignment tool to be used for automatically aligning",
"input sequences",
)
self.aligner = aligner
# Validate that models are provided
if m1 is None or m2 is None:
raise ValueError("Both m1 and m2 models must be provided")
if fasta_align is None:
fasta_align = self._generate_alignment_from_models(
m1, m2
) # MultipleSeqAlignment
else:
# if fasta_align is explicitly provided, raise DeprecationWarning letting user
# know that fasta_align is no longer required
warnings.warn(
"fasta_align is no longer a required argument (will be automatically computed "
"if not provided), and the function signature will change in a future release",
DeprecationWarning,
stacklevel=2,
)
if isinstance(fasta_align, MultipleSeqAlignment):
ncolumns = fasta_align.get_alignment_length()
elif isinstance(fasta_align, Alignment):
_, ncolumns = fasta_align.shape
else:
raise ValueError(
f"Could not infer length from alignment object: {fasta_align}. "
"Alignment must be of type MultipleSeqAlignment or Alignment."
)
# Get the residues in the models
rl1 = Selection.unfold_entities(m1, "R")
rl2 = Selection.unfold_entities(m2, "R")
# Residue positions
p1 = 0
p2 = 0
# Map equivalent residues to each other
map12 = {}
map21 = {}
# List of residue pairs (None if -)
duos = []
for i in range(ncolumns):
aa1 = fasta_align[si, i]
aa2 = fasta_align[sj, i]
if aa1 != "-":
# Position in seq1 is not -
while True:
# Loop until an aa is found
r1 = rl1[p1]
p1 = p1 + 1
if is_aa(r1):
break
self._test_equivalence(r1, aa1)
else:
r1 = None
if aa2 != "-":
# Position in seq2 is not -
while True:
# Loop until an aa is found
r2 = rl2[p2]
p2 = p2 + 1
if is_aa(r2):
break
self._test_equivalence(r2, aa2)
else:
r2 = None
if r1:
# Map residue in seq1 to its equivalent in seq2
map12[r1] = r2
if r2:
# Map residue in seq2 to its equivalent in seq1
map21[r2] = r1
# Append aligned pair (r is None if gap)
duos.append((r1, r2))
self.map12 = map12
self.map21 = map21
self.duos = duos
def _generate_alignment_from_models(self, m1, m2) -> Alignment:
"""Generate a MultipleSeqAlignment from two protein models .
Uses Bio.Align.PairwiseAligner to create a proper sequence alignment
with gaps, rather than simply placing sequences side-by-side.
This ensures that homologous residues are properly aligned.
"""
seq1_record = self._extract_sequence_from_model(m1, "structure1")
seq2_record = self._extract_sequence_from_model(m2, "structure2")
if self.aligner is not None:
assert isinstance(
self.aligner, PairwiseAligner
), f"custom aligner must be a PairwiseAligner object, not {type(self.aligner)}"
else:
aligner = PairwiseAligner("blastp")
alignments = aligner.align(seq1_record.seq, seq2_record.seq)
best_alignment = alignments[0]
return best_alignment
def _extract_sequence_from_model(self, model, seq_id):
"""Extract amino acid sequence from a protein model."""
residues = Selection.unfold_entities(model, "R")
sequence = ""
for residue in residues:
if isinstance(residue, Residue) and is_aa(residue):
resname = residue.get_resname()
aa_code = PDBData.protein_letters_3to1_extended.get(resname, "X")
sequence += aa_code
return SeqRecord(Seq(sequence), id=seq_id)
def _test_equivalence(self, r1, aa1):
"""Test if aa in sequence fits aa in structure (PRIVATE)."""
resname = r1.get_resname()
resname = PDBData.protein_letters_3to1_extended[resname]
assert aa1 == resname
def get_maps(self):
"""Map residues between the structures.
Return two dictionaries that map a residue in one structure to
the equivealent residue in the other structure.
"""
return self.map12, self.map21
def get_iterator(self):
"""Create an iterator over all residue pairs."""
for i in range(len(self.duos)):
yield self.duos[i]