Skip to content

Commit

Permalink
Trim non-represented residues from IHM sequence
Browse files Browse the repository at this point in the history
PDB policy is for amino acid Entity sequences to be polymers (so
they should include any loops or other gaps in the midst of the
sequence) but for the termini to be trimmed of any not-modeled
residues. Add a function to, after the PMI system is built,
modify the Entity sequence to remove any parts that are not
included in any representation. This will change the seq_id numbering
if any N-terminal residues are removed. If this happens, set the
existing PMI-to-IHM offset accordingly, and renumber any
existing objects.
  • Loading branch information
benmwebb committed Sep 27, 2024
1 parent 95043d1 commit 86a8bbb
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 33 deletions.
88 changes: 73 additions & 15 deletions pyext/src/mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -1186,10 +1186,12 @@ def __set_name(self, val):
class Entity(ihm.Entity):
"""A single entity in the system.
This functions identically to the base ihm.Entity class, but it
allows identifying residues by either the IHM numbering scheme
(seq_id, which is always contiguous starting at 1) or by the PMI
scheme (which is similar, but may not start at 1 if the sequence in
the FASTA file has one or more N-terminal gaps"""
allows identifying residues by either the PMI numbering scheme
(which is always contiguous starting at 1, covering the entire
sequence in the FASTA files, or the IHM scheme (seq_id, which also
starts at 1, but which only covers the modeled subset of the full
sequence, with non-modeled N-terminal or C-terminal residues
removed)."""
def __init__(self, sequence, pmi_offset, *args, **keys):
# Offset between PMI numbering and IHM; <pmi_#> = <ihm_#> + pmi_offset
# (pmi_offset is also the number of N-terminal gaps in the FASTA file)
Expand All @@ -1209,10 +1211,12 @@ def pmi_range(self, res_id_begin, res_id_end):
class AsymUnit(ihm.AsymUnit):
"""A single asymmetric unit in the system.
This functions identically to the base ihm.AsymUnit class, but it
allows identifying residues by either the IHM numbering scheme
(seq_id, which is always contiguous starting at 1) or by the PMI
scheme (which is similar, but may not start at 1 if the sequence in
the FASTA file has one or more N-terminal gaps"""
allows identifying residues by either the PMI numbering scheme
(which is always contiguous starting at 1, covering the entire
sequence in the FASTA files, or the IHM scheme (seq_id, which also
starts at 1, but which only covers the modeled subset of the full
sequence, with non-modeled N-terminal or C-terminal residues
removed)."""

def __init__(self, entity, *args, **keys):
super().__init__(
Expand Down Expand Up @@ -1375,18 +1379,15 @@ def add_component_sequence(self, state, name, seq, asym_name=None,
if asym_name is None:
asym_name = name

def get_offset(seq):
# Count length of gaps at start of sequence
for i in range(len(seq)):
if seq[i] != '-':
return seq[i:], i
seq, offset = get_offset(seq)
if name in self.sequence_dict:
if self.sequence_dict[name] != seq:
raise ValueError("Sequence mismatch for component %s" % name)
else:
self.sequence_dict[name] = seq
self.entities.add(name, seq, offset, alphabet)
# Offset is always zero to start with; this may be modified
# later in finalize_build() if any non-modeled N-terminal
# residues are removed
self.entities.add(name, seq, 0, alphabet)
if asym_name in self.asym_units:
if self.asym_units[asym_name] is None:
# Set up a new asymmetric unit for this component
Expand All @@ -1396,6 +1397,12 @@ def get_offset(seq):
self.asym_units[asym_name] = asym
state.modeled_assembly.append(self.asym_units[asym_name])

def finalize_build(self):
"""Called immediately after the PMI system is built"""
for entity in self.system.entities:
_trim_unrep_termini(entity, self.system.asym_units,
self.system.orphan_representations)

def _add_restraint_model_fits(self):
"""Add fits to restraints for all known models"""
for group, m in self.system._all_models():
Expand Down Expand Up @@ -1740,3 +1747,54 @@ def parse_file(self, filename):
elif line.startswith('# ncenters: '):
ret['number_of_gaussians'] = int(line[12:])
return ret


def _trim_unrep_termini(entity, asyms, representations):
"""Trim Entity sequence to only cover represented residues.
PDB policy is for amino acid Entity sequences to be polymers (so
they should include any loops or other gaps in the midst of the
sequence) but for the termini to be trimmed of any not-modeled
residues. Here, we modify the Entity sequence to remove any parts
that are not included in any representation. This may change the
numbering if any N-terminal residues are removed, and thus the offset
between PMI and IHM numbering, as both count from 1."""
rep_range = None
for rep in representations:
for seg in rep:
if seg.asym_unit.entity is entity:
seg_range = seg.asym_unit.seq_id_range
if rep_range is None:
rep_range = list(seg_range)
else:
rep_range[0] = min(rep_range[0], seg_range[0])
rep_range[1] = max(rep_range[1], seg_range[1])
# Do nothing if no representations or if everything is represented
if rep_range is None or rep_range == [1, len(entity.sequence)]:
return
# Update offset (= number of unrepresented N terminal entity residues)
# in entity and all asyms
pmi_offset = rep_range[0] - 1
entity.pmi_offset = pmi_offset
for asym in asyms:
if asym.entity is entity:
asym.auth_seq_id_map = entity.pmi_offset
# Modify entity sequence to only include represented residues
entity.sequence = entity.sequence[rep_range[0] - 1:rep_range[1]]

# If N terminal deletions, we must fix the numbering of all segments
# and starting models
if pmi_offset != 0:
for rep in representations:
for seg in rep:
if seg.asym_unit.entity is entity:
seg_range = seg.asym_unit.seq_id_range
seg.asym_unit.seq_id_range = (seg_range[0] - pmi_offset,
seg_range[1] - pmi_offset)
if seg.starting_model:
model = seg.starting_model
seg_range = model.asym_unit.seq_id_range
model.asym_unit.seq_id_range = \
(seg_range[0] - pmi_offset,
seg_range[1] - pmi_offset)
model.offset = model.offset - pmi_offset
2 changes: 2 additions & 0 deletions pyext/src/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def build(self, **kwargs):
for state in self.states:
state.build(**kwargs)
self.built = True
for po in self._protocol_output:
po.finalize_build()
return self.hier

def add_protocol_output(self, p):
Expand Down
39 changes: 24 additions & 15 deletions test/test_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,11 +661,19 @@ def test_starting_model_dumper(self):
nup84_2.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84_2.add_representation(resolutions=[1])

nup85 = st2.create_molecule("Nup85", "SELM", "B")
nup85 = st2.create_molecule("Nup85", "GGGGSELMGG", "B")
# Residues S, E should be residues 8, 9 in test.nup85.pdb; map them
# to FASTA sequence
nup85.add_structure(self.get_input_file_name('test.nup85.pdb'), 'A',
res_range=(8, 9), offset=-7)
nup85.add_representation(resolutions=[1])
res_range=(8, 9), offset=-3)
nup85.add_representation(residues=nup85[4:8], resolutions=[1])
_ = s.build()
# Since we didn't represent the first 4 or last 2 residues in nup85,
# the IHM sequence should just be "SELM", and the PDB offset should be
# modified to match (from -3 to -7)
self.assertEqual(''.join(x.code
for x in po.system.entities[1].sequence),
'SELM')

self.assign_entity_asym_ids(po.system)

Expand Down Expand Up @@ -1214,26 +1222,27 @@ class DummyRestraint:
s.add_protocol_output(po)
state = s.create_state()
po_state = state._protocol_output[0][1]
nup84 = state.create_molecule("Nup84", "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84.add_representation(nup84.get_atomic_residues(), resolutions=[1])
nup84.add_representation(nup84.get_non_atomic_residues(),
resolutions=[10])
# First 10 residues are not represented, so PMI residue indexes
# range from 11 to 14, but IHM seq_ids from 1 to 4
nup84 = state.create_molecule("Nup84", "P" * 10 + "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
offset=10)
nup84.add_representation(nup84[10:], resolutions=[1])
hier = s.build()

r = DummyRestraint()
r.dataset = DummyDataset()
r.dataset._id = 42
xl_group = po.get_cross_link_group(r)
ex_xl = po.add_experimental_cross_link(1, 'Nup84',
2, 'Nup84', xl_group)
_ = po.add_experimental_cross_link(1, 'Nup84',
3, 'Nup84', xl_group)
ex_xl = po.add_experimental_cross_link(11, 'Nup84',
12, 'Nup84', xl_group)
_ = po.add_experimental_cross_link(11, 'Nup84',
13, 'Nup84', xl_group)
# Duplicates should be ignored
po.add_experimental_cross_link(1, 'Nup84', 3, 'Nup84', xl_group)
po.add_experimental_cross_link(11, 'Nup84', 13, 'Nup84', xl_group)
# Non-modeled component should be ignored
nm_ex_xl = po.add_experimental_cross_link(1, 'Nup85',
2, 'Nup84', xl_group)
nm_ex_xl = po.add_experimental_cross_link(11, 'Nup85',
12, 'Nup84', xl_group)
self.assertEqual(nm_ex_xl, None)
sel = IMP.atom.Selection(hier, molecule='Nup84',
resolution=1)
Expand Down
76 changes: 73 additions & 3 deletions test/test_mmcif_pmi2.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def test_entity(self):
w = ihm.format.CifWriter(fh)
d = ihm.dumper._EntityDumper()
d.finalize(po.system)
self.assertEqual(''.join(x.code
for x in po.system.entities[0].sequence),
'MELS')
d.dump(po.system, w)
out = fh.getvalue()
self.assertEqual(out, """#
Expand All @@ -146,17 +149,37 @@ def test_entity(self):
#
""")

def test_entity_trimmed(self):
"""Test that Entity sequence is trimmed to represented region"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84 = state.create_molecule("Nup84", "GGGMELSGGGG", "A")
# Only "MELS" is represented, so the output entity should be of
# length 4 and weigh 532, not the full PMI sequence
nup84.add_representation(residues=nup84[3:7], resolutions=[1])
hier = s.build()
self.assertEqual(''.join(x.code
for x in po.system.entities[0].sequence),
'MELS')
self.assertAlmostEqual(po.system.entities[0].formula_weight,
532.606, delta=1e-2)

def test_model_representation(self):
"""Test ModelRepresentationDumper with PMI2-style init"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84 = state.create_molecule("Nup84", "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84.add_representation(resolutions=[1])
nup84 = state.create_molecule("Nup84", "GGGGMELSGG", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
offset=4)
nup84.add_representation(resolutions=[1], residues=nup84[4:8])
hier = s.build()
print([x.code for x in po.system.entities[0].sequence])
fh = StringIO()
w = ihm.format.CifWriter(fh)
self.assign_entity_asym_ids(po.system)
Expand Down Expand Up @@ -196,5 +219,52 @@ def test_model_representation(self):
#
""")

def test_poly_seq_scheme_trimmed(self):
"""Test that non-modeled termini are trimmed in scheme tables"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84_a = state.create_molecule("Nup84a", "PWYACMELSWAG", "A")
nup84_a.add_representation(residues=nup84_a[4:7], resolutions=[1])
nup84_b = state.create_molecule("Nup84b", "PWYACMELSWAG", "A")
nup84_b.add_representation(residues=nup84_b[2:5], resolutions=[1])
hier = s.build()
self.assertEqual(len(po.system.entities), 1)
self.assertEqual(po.system.entities[0].pmi_offset, 2)
fh = StringIO()
w = ihm.format.CifWriter(fh)
self.assign_entity_asym_ids(po.system)
self.assign_range_ids(po.system)
d = ihm.dumper._PolySeqSchemeDumper()
d.dump(po.system, w)
out = fh.getvalue()
self.assertEqual(out, """#
loop_
_pdbx_poly_seq_scheme.asym_id
_pdbx_poly_seq_scheme.entity_id
_pdbx_poly_seq_scheme.seq_id
_pdbx_poly_seq_scheme.mon_id
_pdbx_poly_seq_scheme.pdb_seq_num
_pdbx_poly_seq_scheme.auth_seq_num
_pdbx_poly_seq_scheme.pdb_mon_id
_pdbx_poly_seq_scheme.auth_mon_id
_pdbx_poly_seq_scheme.pdb_strand_id
_pdbx_poly_seq_scheme.pdb_ins_code
A 1 1 TYR 3 3 TYR TYR A .
A 1 2 ALA 4 4 ALA ALA A .
A 1 3 CYS 5 5 CYS CYS A .
A 1 4 MET 6 6 MET MET A .
A 1 5 GLU 7 7 GLU GLU A .
B 1 1 TYR 3 3 TYR TYR B .
B 1 2 ALA 4 4 ALA ALA B .
B 1 3 CYS 5 5 CYS CYS B .
B 1 4 MET 6 6 MET MET B .
B 1 5 GLU 7 7 GLU GLU B .
#
""")


if __name__ == '__main__':
IMP.test.main()

0 comments on commit 86a8bbb

Please sign in to comment.