Skip to content

Commit

Permalink
Add method to add UniProt reference
Browse files Browse the repository at this point in the history
Keep track of any stated UniProt accession for PMI
molecules (either provided explicitly or read from
the FASTA file) and provide a method to fill in the
IHM reference tables using this accession.
  • Loading branch information
benmwebb committed Dec 5, 2024
1 parent a4bb2ff commit 4a4cd0d
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 11 deletions.
39 changes: 33 additions & 6 deletions pyext/src/mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import ihm.representation
import ihm.geometry
import ihm.cross_linkers
import ihm.reference


def _assign_id(obj, seen_objs, obj_by_id):
Expand Down Expand Up @@ -1082,7 +1083,7 @@ def _get_alphabet(self, alphabet):
else:
raise TypeError("Don't know how to handle %s" % alphabet)

def add(self, component_name, sequence, offset, alphabet):
def add(self, component_name, sequence, offset, alphabet, uniprot):
def entity_seq(sequence):
# Map X to UNK
if 'X' in sequence:
Expand All @@ -1095,7 +1096,8 @@ def entity_seq(sequence):
d = component_name.split("@")[0].split(".")[0]
entity = Entity(entity_seq(sequence), description=d,
pmi_offset=offset,
alphabet=self._get_alphabet(alphabet))
alphabet=self._get_alphabet(alphabet),
uniprot=uniprot)
self.system.entities.append(entity)
self._sequence_dict[sequence] = entity
self[component_name] = self._sequence_dict[sequence]
Expand Down Expand Up @@ -1198,11 +1200,18 @@ class Entity(ihm.Entity):
removed). The actual offset (which is the integer to be added to the
IHM numbering to get PMI numbering, or equivalently the number of
not-represented N-terminal residues in the PMI sequence) is
available in the `pmi_offset` member."""
def __init__(self, sequence, pmi_offset, *args, **keys):
available in the `pmi_offset` member.
If a UniProt accession was provided for the sequence (either when
State.create_molecule() was called, or in the FASTA alignment file
header) then that is available in the `uniprot` member, and can be
added to the IHM system with the add_uniprot_reference method.
"""
def __init__(self, sequence, pmi_offset, uniprot=None, *args, **keys):
    """Create the entity and record the PMI-specific bookkeeping.

    `sequence`, together with any extra positional or keyword
    arguments, is passed through unchanged to the base class
    constructor. `pmi_offset` and `uniprot` are stored as members of
    the same name. `uniprot` defaults to None (no accession stated)
    so that callers that do not know about UniProt accessions can
    omit it.
    """
    # Offset between PMI numbering and IHM; <pmi_#> = <ihm_#> + pmi_offset
    # (pmi_offset is also the number of N-terminal gaps in the FASTA file)
    self.pmi_offset = pmi_offset
    # Stated UniProt accession (or None); read by add_uniprot_reference()
    self.uniprot = uniprot
    super().__init__(sequence, *args, **keys)

def pmi_residue(self, res_id):
Expand All @@ -1214,6 +1223,24 @@ def pmi_range(self, res_id_begin, res_id_end):
off = self.pmi_offset
return self(res_id_begin - off, res_id_end - off)

def add_uniprot_reference(self):
    """Add UniProt accession (if available) to the IHM system.

    If a UniProt accession was provided for the sequence (either when
    State.create_molecule() was called, or in the FASTA alignment file
    header), then look this up at the UniProt web site (requires
    network access) to get full information, and add it to the IHM
    system. If the IMP and UniProt sequences are not identical, then
    the returned object may need to be modified by specifying an
    alignment and/or single-point mutations.

    Returns the newly-added reference object, or None (and nothing is
    looked up or added) if no accession is set for this entity.
    """
    if not self.uniprot:
        # No accession stated (None or empty); nothing to look up
        return None
    print('Adding UniProt accession %s reference for entity %s'
          % (self.uniprot, self.description))
    ref = ihm.reference.UniProtSequence.from_accession(self.uniprot)
    self.references.append(ref)
    return ref


class AsymUnit(ihm.AsymUnit):
"""A single asymmetric unit in the system. This roughly corresponds to
Expand Down Expand Up @@ -1397,7 +1424,7 @@ def create_component(self, state, name, modeled, asym_name=None):
self.all_modeled_components.append(name)

def add_component_sequence(self, state, name, seq, asym_name=None,
alphabet=None):
alphabet=None, uniprot=None):
if asym_name is None:
asym_name = name

Expand All @@ -1409,7 +1436,7 @@ def add_component_sequence(self, state, name, seq, asym_name=None,
# Offset is always zero to start with; this may be modified
# later in finalize_build() if any non-modeled N-terminal
# residues are removed
self.entities.add(name, seq, 0, alphabet)
self.entities.add(name, seq, 0, alphabet, uniprot)
if asym_name in self.asym_units:
if self.asym_units[asym_name] is None:
# Set up a new asymmetric unit for this component
Expand Down
3 changes: 2 additions & 1 deletion pyext/src/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,8 @@ def _build_protocol_output(self):
asym_name=self._name_with_copy)
po.add_component_sequence(state, name, self.sequence,
asym_name=self._name_with_copy,
alphabet=self.alphabet)
alphabet=self.alphabet,
uniprot=self.uniprot)

def _finalize_build(self):
# For clones, pass the representation of the original molecule
Expand Down
30 changes: 26 additions & 4 deletions test/test_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,21 +315,43 @@ def test_cif_entities(self):
"""Test _EntityMapper class"""
system = ihm.System()
c = IMP.pmi.mmcif._EntityMapper(system)
c.add('foo', 'MELS', 0, alphabet=None)
c.add('bar', 'SELM', 0, alphabet=IMP.pmi.alphabets.amino_acid)
c.add('foo_2', 'MELS', 0, alphabet=None)
c.add('foo', 'MELS', 0, alphabet=None, uniprot=None)
c.add('bar', 'SELM', 0, alphabet=IMP.pmi.alphabets.amino_acid,
uniprot='baracc')
c.add('foo_2', 'MELS', 0, alphabet=None, uniprot=None)
self.assertRaises(TypeError, c.add, 'baz', 'MELSXX', 0,
alphabet='garbage')
alphabet='garbage', uniprot=None)
self.assertEqual(len(system.entities), 2)
self.assertIs(c['foo'], c['foo_2'])
self.assertIsNot(c['foo'], c['bar'])
a = system.entities
self.assertEqual(len(a), 2)
self.assertEqual(a[0].description, 'foo')
self.assertIsNone(a[0].uniprot)
self.assertEqual(''.join(x.code for x in a[0].sequence), 'MELS')
self.assertEqual(a[1].description, 'bar')
self.assertEqual(a[1].uniprot, 'baracc')
self.assertEqual(''.join(x.code for x in a[1].sequence), 'SELM')

def test_entity_add_uniprot_reference(self):
    """Test Entity.add_uniprot_reference()"""
    from unittest import mock

    system = ihm.System()
    c = IMP.pmi.mmcif._EntityMapper(system)
    c.add('foo', 'MELS', 0, alphabet=None, uniprot=None)
    c.add('bar', 'SELM', 0, alphabet=None, uniprot='baracc')

    def mock_from_acc(acc):
        return "mock+" + acc

    # Mock out UniProtSequence.from_accession so that no network access
    # is needed. patch.object puts back exactly the attribute that was
    # on the class, even if an assertion below fails; saving
    # `Class.from_accession` by hand and assigning it back would store
    # whatever attribute lookup returned rather than the original entry.
    with mock.patch.object(ihm.reference.UniProtSequence,
                           'from_accession', new=mock_from_acc):
        # No accession: nothing is looked up and None is returned
        ref = c['foo'].add_uniprot_reference()
        self.assertIsNone(ref)
        # Accession given: the looked-up object is returned and is
        # also recorded on the entity
        ref = c['bar'].add_uniprot_reference()
        self.assertEqual(ref, 'mock+baracc')
        self.assertIn(ref, c['bar'].references)

def test_all_datasets_all_group(self):
"""Test AllDatasets.get_all_group()"""
s = ihm.System()
Expand Down

0 comments on commit 4a4cd0d

Please sign in to comment.