Skip to content

Commit

Permalink
Squashed 'modules/pmi/' changes from e610c5f057..e469c4f9fe
Browse files Browse the repository at this point in the history
e469c4f9fe Pass UniProt accession through to Molecule
91646dd985 Extract UniProt accessions from FASTA headers
4ece080c1b Add max version to placate CMake

git-subtree-dir: modules/pmi
git-subtree-split: e469c4f9fe2718fba6497ba53b06e6fa7ff2f341
  • Loading branch information
benmwebb committed Sep 21, 2023
1 parent 61efd79 commit 61a29bf
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 11 deletions.
2 changes: 1 addition & 1 deletion modules/pmi/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Are we running cmake from this directory (out of tree build) ?
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8.12...3.6.0)
project(imp_module)

if(POLICY CMP0058)
Expand Down
10 changes: 6 additions & 4 deletions modules/pmi/pyext/src/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,12 +683,14 @@ def add_state(self, reader, keep_chain_id=False, fasta_name_map=None,
fasta_flag = copy[0].fasta_flag
if fasta_flag in self._alphabets:
alphabet = self._alphabets[fasta_flag]
seq = IMP.pmi.topology.Sequences(
copy[0].fasta_file, fasta_name_map)[copy[0].fasta_id]
seqs = IMP.pmi.topology.Sequences(
copy[0].fasta_file, fasta_name_map)
seq = seqs[copy[0].fasta_id]
print("BuildSystem.add_state: molecule %s sequence has "
"%s residues" % (molname, len(seq)))
orig_mol = state.create_molecule(molname, seq, chain_id,
alphabet=alphabet)
orig_mol = state.create_molecule(
molname, seq, chain_id, alphabet=alphabet,
uniprot=seqs.uniprot.get(copy[0].fasta_id))
mol = orig_mol
numchain += 1
else:
Expand Down
30 changes: 25 additions & 5 deletions modules/pmi/pyext/src/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,13 +267,15 @@ def get_molecule(self, name, copy_num=0):
return self.molecules[name][copy_num]

def create_molecule(self, name, sequence='', chain_id='',
alphabet=IMP.pmi.alphabets.amino_acid):
alphabet=IMP.pmi.alphabets.amino_acid,
uniprot=None):
"""Create a new Molecule within this State
@param name the name of the molecule (string);
it must not be already used
@param sequence sequence (string)
@param chain_id Chain ID to assign to this molecule
@param alphabet Mapping from FASTA codes to residue types
@param uniprot UniProt accession, if available
"""
# check whether the molecule name is already assigned
if name in self.molecules:
Expand All @@ -290,7 +292,7 @@ def create_molecule(self, name, sequence='', chain_id='',
"is desired." % name, IMP.pmi.ParameterWarning)

mol = Molecule(self, name, sequence, chain_id, copy_num=0,
alphabet=alphabet)
alphabet=alphabet, uniprot=uniprot)
self.molecules[name] = [mol]
return mol

Expand Down Expand Up @@ -401,7 +403,8 @@ class Molecule(_SystemBase):
"""

def __init__(self, state, name, sequence, chain_id, copy_num,
mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid):
mol_to_clone=None, alphabet=IMP.pmi.alphabets.amino_acid,
uniprot=None):
"""The user should not call this directly; instead call
State.create_molecule()
Expand All @@ -423,6 +426,7 @@ def __init__(self, state, name, sequence, chain_id, copy_num,
self.alphabet = alphabet
self.representations = [] # list of stuff to build
self._pdb_elements = []
self.uniprot = uniprot
# residues with representation
self._represented = IMP.pmi.tools.OrderedSet()
# helps you place beads by storing structure
Expand All @@ -439,6 +443,8 @@ def __init__(self, state, name, sequence, chain_id, copy_num,
self.chain = IMP.atom.Chain.setup_particle(self.hier, chain_id)
self.chain.set_sequence(self.sequence)
self.chain.set_chain_type(alphabet.get_chain_type())
if self.uniprot:
self.chain.set_uniprot_accession(self.uniprot)
# create TempResidues from the sequence (if passed)
self.residues = []
for ns, s in enumerate(sequence):
Expand Down Expand Up @@ -997,13 +1003,23 @@ def find_nearest_coord(self, query):
class Sequences(object):
"""A dictionary-like wrapper for reading and storing sequence data.
Keys are FASTA sequence names, and each value a string of one-letter
codes."""
codes.
The FASTA header may contain multiple fields split by pipe (|)
characters. If so, the FASTA sequence name is the first field and
the second field (if present) is the UniProt accession.
For example, ">cop9|Q13098" yields a FASTA sequence name of "cop9"
and UniProt accession of "Q13098".
"""
def __init__(self, fasta_fn, name_map=None):
"""Read a FASTA file and extract all the requested sequences
@param fasta_fn sequence file
@param name_map dictionary mapping the FASTA name to final stored name
"""
# Mapping from sequence name to primary sequence
self.sequences = IMP.pmi.tools.OrderedDict()
# Mapping from sequence name to UniProt accession, if available
self.uniprot = {}
self.read_sequences(fasta_fn, name_map)

def __len__(self):
Expand Down Expand Up @@ -1040,13 +1056,17 @@ def read_sequences(self, fasta_fn, name_map=None):
if line.startswith('>'):
if seq is not None:
self.sequences[code] = seq.strip('*')
code = line.rstrip()[1:]
spl = line[1:].split('|')
code = spl[0].strip()
if name_map is not None:
try:
code = name_map[code]
except KeyError:
pass
seq = ''
if len(spl) >= 2:
up_accession = spl[1].strip()
self.uniprot[code] = up_accession
else:
line = line.rstrip()
if line: # Skip blank lines
Expand Down
8 changes: 8 additions & 0 deletions modules/pmi/test/input/uniprot.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>1WCM:A
MVGQQYSS
>1WCM:B
AADESAPIT
>test1|acc1
AADESAPIT
>test2|acc2
AADESAPIT
5 changes: 4 additions & 1 deletion modules/pmi/test/medium_test_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,14 +421,17 @@ def test_create_molecules(self):
m1 = st.create_molecule("Prot1", sequence=seqs["Prot1"])
self.assertRaises(ValueError, st.create_molecule, "Prot1",
sequence=seqs["Prot1"])
m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"])
m2 = st.create_molecule("Prot2", sequence=seqs["Prot2"],
uniprot="testup")
self.assertEqual(m1.get_hierarchy().get_parent(), st.get_hierarchy())
self.assertEqual(m2.get_hierarchy().get_parent(), st.get_hierarchy())
self.assertEqual(m1.model, st.model)
self.assertEqual(m2.model, st.model)
self.assertEqual(m1.get_name(), "Prot1")
self.assertEqual(m2.get_name(), "Prot2")
self.assertEqual(len(st.get_hierarchy().get_children()), 2)
self.assertIsNone(m1.uniprot)
self.assertEqual(m2.uniprot, 'testup')

# create state 2 with one molecule
st2 = s.create_state()
Expand Down
9 changes: 9 additions & 0 deletions modules/pmi/test/test_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ def test_sequences_name_map(self):
self.assertNotIn('1WCM:A', s)
self.assertEqual(s['foo'], 'MVGQQYSS')

def test_sequences_uniprot(self):
"""Test reading sequences with UniProt info from a FASTA file"""
s = IMP.pmi.topology.Sequences(
self.get_input_file_name('uniprot.fasta'))
self.assertIn('test1', s)
self.assertIn('test2', s)
self.assertEqual(s.uniprot['test1'], 'acc1')
self.assertEqual(s.uniprot['test2'], 'acc2')

def test_sequences_bad(self):
"""Test reading sequences from an invalid FASTA file"""
fname = 'test_sequences_bad.fasta'
Expand Down

0 comments on commit 61a29bf

Please sign in to comment.