From 7c81f435ba2cd52e153cc42735fd0103d6c6ed40 Mon Sep 17 00:00:00 2001
From: Martin Engqvist
Date: Mon, 25 Jan 2021 16:09:53 +0100
Subject: [PATCH] Fill out README file, correct spelling errors in parser.py and add one EC html page as example

---
 README.md                    |   128 +-
 brenparse/data/1.1.3.15.html | 20514 +++++++++++++++++++++++++++++++++
 brenparse/parser.py          |   380 +-
 3 files changed, 20641 insertions(+), 381 deletions(-)
 create mode 100644 brenparse/data/1.1.3.15.html

diff --git a/README.md b/README.md
index 2f0699e..6c0965b 100755
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Description of brenparse library
-The aim of this package is to parse BRENDA html files to obtain data therein.
+Much of the data in the BRENDA database (https://www.brenda-enzymes.org/) is available through their SOAP client, but not all. The aim of this package is to address this issue by parsing BRENDA html files to obtain the data therein. One needs to first download the html pages that one wishes to parse and this library can then be used to extract data.
 
 ## Installation
@@ -9,19 +9,133 @@ Download repository and unzip (alternatively fork or clone), cd to the project b
 pip3 install -e .
 ```
 
-If using an anaconda environment you may have to first locate the anaconda pip using whereis.
-```
-whereis pip
+__If using an anaconda environment__ you may have to first locate the anaconda pip using whereis.
+```bash
+>>> whereis pip
 ```
 Locate the appropriate file path (the one that has anaconda and the correct environment in the filepath) and run the modified command. For example:
+```bash
+>>> /home/username/anaconda3/envs/py37/bin/pip install -e .
 ```
-/home/username/anaconda3/envs/py37/bin/pip install -e .
+
+__If _not_ using an anaconda environment__ simply install using pip:
+
+```bash
+>>> pip install -e .
 ```
-The library should now be available for loading in all your python scripts.
+The dependency beautifulsoup4 will be installed automatically (scripts were tested using beautifulsoup4 version 4.9.3). The library should now be available for loading in all your python scripts.
 
 ## Requirements
-**Placeholder**
+* Unix system
+* python3
+* beautifulsoup4
+
+
+# How to use the brenparse library
+The BRENDA database is parsed one html page at a time, with each page holding information for a single EC class (for example https://www.brenda-enzymes.org/enzyme.php?ecno=1.1.3.15). This page needs to be downloaded and stored locally. The path to this file represents the input to brenparse.
+
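+One possible way to fetch such a page from the command line is sketched below. The exact command is an assumption rather than part of the library; a browser-like user agent is supplied because BRENDA may not serve pages to the default wget agent.
+
+```bash
+>>> wget 'https://www.brenda-enzymes.org/enzyme.php?ecno=1.1.3.15' -U 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' -O 1.1.3.15.html
+```
+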
+In the examples below the bundled example page is used; its path is available as parser.EXAMPLE_PAGE.
+
+```python3
+>>> from brenparse import parser
+>>> filepath = parser.EXAMPLE_PAGE
+>>> soup_obj = parser.open_ec(filepath)
+```
+
+The generated soup object is subsequently passed to functions that parse the individual tables.
+
+## Parsing BRENDA tables
+The various tables in BRENDA contain a differing number of columns and the output obtained after parsing is therefore different. See each entry below for details.
+
+### The organism table
+The organism table represents a special case as there are no data values. The data is returned as a dictionary with organism names as keys holding a list of UniProt identifiers as values. By default only records with UniProt identifiers are returned (i.e. leaving out records without identifiers).
+
+```python3
+>>> parser.Organism(soup_obj).get_data() # Parsing the ORGANISM table in BRENDA
+{'Mus musculus': ['Q9NYQ2', 'Q9WU19'], 'Pachysandra terminalis': ['Q19U05'], 'Phaeodactylum tricornutum': ['B7FUG8'], 'Rattus norvegicus': ['Q07523'], 'Homo sapiens': ['Q9NYQ3', 'Q9UJM8'], 'Lactococcus lactis': ['Q9CG58'], 'Streptococcus iniae': ['A9QH69'], 'Arabidopsis thaliana': ['Q24JJ8', 'Q56ZN0', 'Q9LJH5', 'Q9LRR9'], 'Oryza sativa': ['Q10CE4']}
+```
+
+If the uid_orgs_only variable in get_data() is set to False then all records are returned.
+
+```python3
+>>> parser.Organism(soup_obj).get_data(uid_orgs_only=False) # Parsing the ORGANISM table in BRENDA
+{'Amaranthus retroflexus': [], 'Brassica rapa': [], 'Cucumis sativus': [], 'Geotrichum candidum': [], 'Homo sapiens': ['Q9NYQ3', 'Q9UJM8'], 'Lactococcus lactis': ['Q9CG58'], 'Mammalia': [], 'Mus musculus': ['Q9NYQ2', 'Q9WU19'], 'Mycolicibacterium smegmatis': [], 'Oceanimonas doudoroffii': [], 'Pachysandra terminalis': ['Q19U05'], 'Phaeodactylum tricornutum': ['B7FUG8'], 'Pseudomonas stutzeri': [], 'Rattus norvegicus': ['Q07523'], 'Rattus sp': [], 'Spinacia oleracea': [], 'Streptococcus iniae': ['A9QH69'], 'Streptococcus pneumoniae': [], 'Sus scrofa': [], 'Triticum aestivum': [], 'Zea mays': [], 'Aerococcus viridans': [], 'Arabidopsis thaliana': ['Q24JJ8', 'Q56ZN0', 'Q9LJH5', 'Q9LRR9'], 'Carica papaya': [], 'Gallus gallus': [], 'Glycine max': [], 'Lathyrus sativus': [], 'Mesostigma viride': [], 'Nicotiana tabacum': [], 'Oryza sativa': ['Q10CE4'], 'Pediococcus sp': [], 'Plant': [], 'Rana pipiens': [], 'Roseobacter sp': [], 'Streptococcus cristatus': [], 'Streptococcus pyogenes': [], 'Tetrahymena pyriformis': [], 'Vigna unguiculata': []}
+```
+
+
+### Three-level tables
+The three-level tables have the structure: value, organism, uniprot_id. The value can be numeric or text, depending on the table. The data is returned as a dictionary of organism names as keys, holding dictionaries as values wherein each UniProt identifier holds a list of values. By default only records with UniProt identifiers are returned (i.e. leaving out records without identifiers).
+
+```python3
+>>> parser.TemperatureOptimum(soup_obj).get_data() # Parsing the TEMPERATURE OPTIMUM table in BRENDA
+{'Arabidopsis thaliana': {'Q24JJ8': [25.0], 'Q9LJH5': [25.0], 'Q9LRR9': [25.0]}}
+```
+
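+The nested dictionaries can be traversed like any other Python dict. As a minimal sketch (not part of the library), the mean of all reported values per organism could be computed from the output above like this:
+
+```python3
+>>> data = parser.TemperatureOptimum(soup_obj).get_data()
+>>> for org, by_uid in data.items():
+...     values = [v for vals in by_uid.values() for v in vals]
+...     print(org, sum(values) / len(values))
+...
+Arabidopsis thaliana 25.0
+```
+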
+If the uid_orgs_only variable in get_data() is set to False then all records are returned. For each organism, records without identifiers are collected under the "unknown" key.
+
+```python3
+>>> parser.TemperatureOptimum(soup_obj).get_data(uid_orgs_only=False) # Parsing the TEMPERATURE OPTIMUM table in BRENDA
+{'Homo sapiens': {'unknown': [30.0, 37.0]}, '4 entries': {'unknown': [25.0]}, 'Aerococcus viridans': {'unknown': [25.0]}, 'Arabidopsis thaliana': {'Q24JJ8': [25.0], 'Q9LJH5': [25.0], 'Q9LRR9': [25.0]}, 'Pediococcus sp': {'unknown': [25.0]}, 'Rattus norvegicus': {'unknown': [25.0]}}
+```
+
+
+A full list of the three-level tables:
+```python3
+>>> TemperatureOptimum(soup_obj).get_data() # Parsing the TEMPERATURE OPTIMUM table in BRENDA
+>>> Cofactor(soup_obj).get_data() # Parsing the COFACTOR table in BRENDA
+>>> MetalsAndIons(soup_obj).get_data() # Parsing the METALS and IONS table in BRENDA
+>>> Inhibitors(soup_obj).get_data() # Parsing the INHIBITORS table in BRENDA
+>>> ActivatingCompound(soup_obj).get_data() # Parsing the ACTIVATING COMPOUND table in BRENDA
+>>> SpecificActivity(soup_obj).get_data() # Parsing the SPECIFIC ACTIVITY table in BRENDA
+>>> PhOptimum(soup_obj).get_data() # Parsing the pH OPTIMUM table in BRENDA
+>>> PhRange(soup_obj).get_data() # Parsing the pH RANGE table in BRENDA
+>>> TemperatureRange(soup_obj).get_data() # Parsing the TEMPERATURE RANGE table in BRENDA
+>>> PhStability(soup_obj).get_data() # Parsing the pH STABILITY table in BRENDA
+>>> TemperatureStability(soup_obj).get_data() # Parsing the TEMPERATURE STABILITY table in BRENDA
+```
+
+### Four-level tables
+The four-level tables have the structure: value, comment, organism, uniprot_id. The value is always numeric. The data is returned as a dictionary of organism names as keys, holding nested dictionaries as values. The structure is dictionary[organism][uniprot_id][substrate] where the last level holds lists containing the values. By default only records with UniProt identifiers are returned (i.e. leaving out records without identifiers).
+
+```python3
+>>> parser.Km(soup_obj).get_data() # Parsing the KM VALUE [mM] table in BRENDA
+{'Arabidopsis thaliana': {'Q24JJ8': [25.0], 'Q9LJH5': [25.0], 'Q9LRR9': [25.0]}}
+```
+
+If the uid_orgs_only variable in get_data() is set to False then all records are returned. For each organism, records without identifiers are collected under the "unknown" key.
+ +```python3 +>>> parser.Km(soup_obj).get_data(uid_orgs_only=False) # Parsing the KM VALUE [mM] table in BRENDA +{'Homo sapiens': {'unknown': {'2,6-dichlorophenolindophenol': [0.033], '2-Hydroxyoctanoate': [0.045], '2-oxo-octanoate': [0.04], 'glycolate': [0.0056, 0.12, 0.141, 0.2, 0.23, 0.32, 2.0], 'L-Mandelate': [1.5], 'glyoxylate': [2.2, 3.4], 'L-lactate': [16.5], 'O2': [0.44, 0.59, 0.64]}}, 'Rattus sp': {'unknown': {'2-mercaptoethanol-glyoxylate adduct': [0.75], 'Bromopyruvate': [4.4], 'DL-2-hydroxy-3-heptynoate': [0.38], 'DL-2-hydroxy-3-octynoate': [0.14], 'DL-methionine': [4.0], 'DL-vinylglycolate': [10.0], 'N-acetylcysteamine-glyoxylate adduct': [0.4], 'pantetheine-glyoxylate adduct': [0.7], 'coenzyme A-glyoxylate adduct': [2.2], 'DL-2-hydroxy-3-butynoate': [4.0], 'DL-2-hydroxy-3-hexynoate': [7.0], 'DL-2-hydroxy-3-pentynoate': [9.0], 'DL-2-hydroxyisocaproate': [0.6], 'DL-alpha-phenyllactate': [71.0], 'DL-lactate': [27.0], 'DL-phenyllactate': [0.1], 'L-alpha-hydroxyphenyllactate': [1.9], 'L-lysine': [90.0], 'L-methionine': [53.0], 'propane-1,3-dithiol-glyoxylate adduct': [0.03], 'DL-2-hydroxy-4-methylthiobutanoic acid': [0.7, 1.1], 'DL-2-hydroxycaproate': [0.15, 0.25, 1, 1.34, 3.2], 'DL-2-hydroxyisovalerate': [0.6, 8.0], 'DL-3-chlorolactate': [0.7, 0.8, 28.0], 'glycolate': [0.22, 0.24, 0.5, 2.1], 'L-2-Hydroxyisocaproate': [0.3, 0.32, 0.7, 0.9, 1.24, 1.26, 1.65], 'L-leucine': [5.3, 6.0, 6.4, 15.0], 'L-Mandelate': [0.16, 0.23, 0.4, 0.8], 'L-Phenyllactate': [0.09, 0.13], 'DL-2-hydroxybutyrate': [0.6, 0.6, 1.0, 1.2, 2.04, 3, 2.5, 12.7, 14.0], 'DL-2-hydroxyvalerate': [0.25, 0.35, 0.6, 13.0], 'glyoxylate': [1.41, 1.78], 'L-lactate': [1.8, 3.4, 4.68, 4.7, 6, 6.1, 8.5], 'L-tryptophan': [35.0, 40.0], 'O2': [0.3, 0.46]}}, 'Sus scrofa': {'unknown': {'dichlorophenolindophenol': [0.28], 'L-beta-Phenyllactate': [2.2], 'L-2-hydroxy-beta-methylvalerate': [2.4], 'glycolate': [0.31, 0.42], 'L-2-Hydroxyisocaproate': [0.68, 2.5], 'L-lactate': [16.0]}}, 'Dl-2-hydroxy-4-methylthiobutanoic acid': {'unknown': {'': [1]}}, 'Dl-2-hydroxycaproate': {'unknown': {'': [2]}}, 'Dl-2-hydroxyisovalerate': {'unknown': {'': [4]}}, 'Dl-3-chlorolactate': {'unknown': {'': [14]}}, 'Dl-alpha-hydroxy-n-valerate': {'unknown': {'': [8]}}, 'Dl-glycerate': {'unknown': {'': [29]}}, 'Glycolate': {'unknown': {'': [1]}}, 'Rattus norvegicus': {'unknown': {'L-2-hydroxy octanoate': [0.046], 'L-2-hydroxy palmitate': [1.36], '(S)-lactate': [0.0052]}}, 'Gallus gallus': {'unknown': {'L-2-hydroxy-4-methylthiobutanoic acid': [1.82], 'glycolate': [0.1]}}, 'L-2-hydroxyisocaproate': {'unknown': {'': [1]}}, 'Aerococcus viridans': {'unknown': {'L-alpha-hydroxy-isovalerate': [125.0], 'L-alpha-hydroxy-beta-methylvalerate': [140.0], 'DL-alpha-hydroxy-n-valerate': [5.5, 10.0], 'DL-glycerate': [5.0, 53.0], 'L-Mandelate': [0.3, 20.0], '(S)-lactate': [0.157, 0.175, 0.529, 0.863, 0.87, 0.94, 6.75, 7.5, 24.3, 25.5, 47.6, 50.7, 103.0], 'DL-alpha-hydroxy-n-butyrate': [18.0, 27.0], 'L-lactate': [0.34, 0.94], 'O2': [0.022, 0.029, 0.03, 0.16, 0.16]}}, 'L-leucine': {'unknown': {'': [10]}}, 'L-mandelate': {'unknown': {'': [10]}}, 'L-phenyllactate': {'unknown': {'': [0]}}, '(s)-lactate': {'unknown': {'': [52]}}, 'Dl-2-hydroxybutyrate': {'unknown': {'': [7]}}, 'Dl-2-hydroxyvalerate': {'unknown': {'': [7]}}, 'Dl-alpha-hydroxy-n-butyrate': {'unknown': {'': [22]}}, 'Spinacia oleracea': {'unknown': {'glycerate': [7.14]}}, 'Glyoxylate': {'unknown': {'': [2]}}, 'L-lactate': {'unknown': {'': [8]}}, 'L-tryptophan': {'unknown': {'': [38]}}, 'O2': {'unknown': {'': [0]}}, 
'Amaranthus retroflexus': {'unknown': {'glycolate': [0.02, 0.058]}}, 'Zea mays': {'unknown': {'glycolate': [0.02, 0.056]}}, 'Glycine max': {'unknown': {'glycolate': [0.06]}}, 'Mesostigma viride': {'unknown': {'glycolate': [0.3], 'L-lactate': [9.3]}}, 'Geotrichum candidum': {'unknown': {'(S)-lactate': [3.6]}}} +``` + +A full list of the four-level tables: +```python3 +>>> Km(soub_obj) # Parsing the KM VALUE [mM] table in BRENDA +>>> Kcat(soub_obj) # Parsing the TURNOVER NUMBER [1/s] table in BRENDA +>>> KcatDivKm(soub_obj) # Parsing the TURNOVER NUMBER [1/s] table in BRENDA +``` + + +### Five-level tables +The five-level tables have the structure: value1, value2, comment, organism, uniprot_id. The value is always text. The data is returned as a dictionary of organism names as keys, holding nested dictionaries as values. The structure is dictionary[organism][uniprot_id]["sub"/"prod"] where the last level stands for "substrate" or "product" and holds lists containing the text strings. By default only records with UniProt identifiers are returned (i.e. leaving out records without identifiers) + +```python3 +>>> parser.NaturalSubstrate(soup_obj).get_data() # Parsing the NATURAL SUBSTRATE table in BRENDA +{'Arabidopsis thaliana': {'Q24JJ8': [{'sub': ['2-hydroxycaprylate', 'O2'], 'prod': ['2-oxocaprylate', 'H2O2']}, {'sub': ['2-hydroxycaproate', 'O2'], 'prod': ['2-oxocaproate', 'H2O2']}, {'sub': ['2-hydroxypalmitate', 'O2'], 'prod': ['2-oxopalmitate', 'H2O2']}, {'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['L-lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}], 'Q9LJH5': [{'sub': ['2-hydroxycaprylate', 'O2'], 'prod': ['2-oxocaprylate', 'H2O2']}, {'sub': ['2-hydroxycaproate', 'O2'], 'prod': ['2-oxocaproate', 'H2O2']}, {'sub': ['2-hydroxypalmitate', 'O2'], 'prod': ['2-oxopalmitate', 'H2O2']}, {'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['L-lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}], 'Q9LRR9': [{'sub': ['2-hydroxycaprylate', 'O2'], 'prod': ['2-oxocaprylate', 'H2O2']}, {'sub': ['2-hydroxycaproate', 'O2'], 'prod': ['2-oxocaproate', 'H2O2']}, {'sub': ['2-hydroxypalmitate', 'O2'], 'prod': ['2-oxopalmitate', 'H2O2']}, {'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['L-lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}], 'Q56ZN0': [{'sub': ['glycolate', 'O2'], 'prod': ['glyoxylate', 'H2O2']}]}, 'Phaeodactylum tricornutum': {'B7FUG8': [{'sub': ['glycolate', 'acceptor'], 'prod': ['glyoxylate', 'reduced acceptor']}]}, 'Lactococcus lactis': {'Q9CG58': [{'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}, {'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}]}, 'Streptococcus iniae': {'A9QH69': [{'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}], 'A9QH71': [{'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}, {'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}]}, 'Homo sapiens': {'Q9NYQ3': [{'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}]}, 'Mus musculus': {'Q9NYQ2': [{'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}], 'Q9WU19': [{'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['glycolate', 'O2'], 'prod': ['glyoxylate', 'H2O2']}, {'sub': ['glycolate', 'O2'], 'prod': ['glyoxylate', 'H2O2']}]}, 'Rattus norvegicus': 
{'Q07523': [{'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}]}, 'Oryza sativa': {'Q10CE4': [{'sub': ['glycolate', 'O2'], 'prod': ['glyoxylate', 'H2O2']}]}} +``` + +If the uid_orgs_only variable in get_data() is set to False then all records are returned. For each organism, records without identifiers are collected under the "unknown" key. + +```python3 +>>> parser.NaturalSubstrate(soup_obj).get_data(uid_orgs_only=False) # Parsing the SUBSTRATE table in BRENDA +{'5 entries': {'unknown': [{'sub': ['(S)-lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}, {'sub': ['lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}, {'sub': ['DL-2-hydroxyisovalerate', 'O2'], 'prod': ['2-oxoisovalerate', 'H2O2']}]}, 'Arabidopsis thaliana': {'Q24JJ8': [{'sub': ['2-hydroxycaprylate', 'O2'], 'prod': ['2-oxocaprylate', 'H2O2']}, {'sub': ['2-hydroxycaproate', 'O2'], 'prod': ['2-oxocaproate', 'H2O2']}, {'sub': ['2-hydroxypalmitate', 'O2'], 'prod': ['2-oxopalmitate', 'H2O2']}, {'sub': ['an (S)-2-hydroxy carboxylate', 'O2'], 'prod': ['a 2-oxo carboxylate', 'H2O2']}, {'sub': ['L-lactate', 'O2'], 'prod': ['pyruvate', 'H2O2']}], ... +``` + +A full list of the five-level tables: +```python3 +>>> Substrate(soub_obj) # Parsing the SUBSTRATE table in BRENDA +>>> NaturalSubstrate(soub_obj) # Parsing the NATURAL SUBSTRATE table in BRENDA +``` diff --git a/brenparse/data/1.1.3.15.html b/brenparse/data/1.1.3.15.html new file mode 100644 index 0000000..a80029b --- /dev/null +++ b/brenparse/data/1.1.3.15.html @@ -0,0 +1,20514 @@ + + + BRENDA - Information on EC 1.1.3.15 - (S)-2-hydroxy-acid oxidase + + + + + + + + + + + + + + +
+Information on EC 1.1.3.15 - (S)-2-hydroxy-acid oxidase
+for references in articles please use BRENDA:EC1.1.3.15
+EC Tree: 1 Oxidoreductases > 1.1 Acting on the CH-OH group of donors > 1.1.3 With oxygen as acceptor > 1.1.3.15 (S)-2-hydroxy-acid oxidase
+IUBMB Comments: A flavoprotein (FMN). Exists as two major isoenzymes; the A form preferentially oxidizes short-chain aliphatic hydroxy acids, and was previously listed as EC 1.1.3.1, glycolate oxidase; the B form preferentially oxidizes long-chain and aromatic hydroxy acids. The rat isoenzyme B also acts as EC 1.4.3.2, L-amino-acid oxidase.
+The enzyme appears in viruses and cellular organisms
+Synonyms: (L)-2-HAOX, (S)-2-hydroxy-acid oxidase, peroxisomal, 2-hydroxy acid oxidase, GLO, Glo1, GLO3, Glo4, glycolate oxidase, GO, GO1, more
+REACTION table columns: REACTION | REACTION DIAGRAM | COMMENTARY | ORGANISM | UNIPROT | LITERATURE
+an (S)-2-hydroxy carboxylate + O2 = a 2-oxo carboxylate + H2O2
[... remainder of the 20,514-line BRENDA page (markup, scripts and empty table scaffolding) not reproduced here ...]
+ + + + \ No newline at end of file diff --git a/brenparse/parser.py b/brenparse/parser.py index 9253a6f..2b38400 100644 --- a/brenparse/parser.py +++ b/brenparse/parser.py @@ -9,8 +9,9 @@ from os.path import join, exists from bs4 import BeautifulSoup import re +from pkg_resources import resource_stream - +EXAMPLE_PAGE = resource_stream(__name__, 'data/1.1.3.15.html').name def open_ec(filepath): ''' @@ -113,7 +114,7 @@ class _ThreeLevelDiv(_BrendaBaseClass): The divs have different "depths", different number of cells before I get to the UNIPROT ID. This class can parse divs that has a depth of three. "numeric" determines whether the first value in the table is expected to be numeric or not. - The expcted structure is "value, organism, uniprot_id" + The expected structure is "value, organism, uniprot_id" ''' def __init__(self, soup_instance, numeric): _BrendaBaseClass.__init__(self, soup_instance) @@ -258,7 +259,7 @@ class _FourLevelDiv(_BrendaBaseClass): The divs have different "depths", different number of cells before I get to the UNIPROT ID. This class can parse divs that has a depth of four. "numeric" determines whether the first value in the table is expected to be numeric or not. - The expcted structure is "value, information, organism, uniprot_id" + The expected structure is "value, information, organism, uniprot_id" ''' def __init__(self, soup_instance, numeric): _BrendaBaseClass.__init__(self, soup_instance) @@ -413,7 +414,7 @@ class _FiveLevelDiv(_BrendaBaseClass): The divs have different "depths", different number of cells before I get to the UNIPROT ID. This class can parse divs that has a depth of five. "numeric" determines whether the first value in the table is expected to be numeric or not. - The expcted structure is "value1, value2, information, organism, uniprot_id" + The expected structure is "value1, value2, information, organism, uniprot_id" ''' def __init__(self, soup_instance, numeric): _BrendaBaseClass.__init__(self, soup_instance) @@ -817,7 +818,7 @@ def __init__(self, soup_instance): class KcatDivKm(_FourLevelDiv): ''' - Parsing the TURNOVER NUMBER [1/s] table in BRENDA. + Parsing the kcat/KM VALUE [1/mMs-1] table in BRENDA. 
''' def __init__(self, soup_instance): _FourLevelDiv.__init__(self, soup_instance, numeric=True) @@ -862,372 +863,3 @@ def __init__(self, soup_instance): ### other ### -# -# -# -# si = open_ec('/data/Work/projects/sampling-1-1-3-n/data/raw_external/BRENDA_html/html_data/1.1.1.3.html') -# x = Cofactor(si) -# data = x.get_data() -# print(data) - - - -# -# -# -# def get_data(): -# mycmd = "wget 'http://brenda-enzymes.org/all_enzymes.php' -U 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' --referer='http://brenda-enzymes.org' -O %s --no-clobber" % (join(RAW_FOLDER, 'all_enzymes.php.html')) -# os.system(mycmd) -# -# -# #open the list of EC numbers and find all -# filepath = join(RAW_FOLDER, 'all_enzymes.php.html') -# with open(filepath, 'r') as f: -# data = f.read() -# -# all_ec = set(re.findall('[0-9]+\.[0-9]+\.[0-9]+\.[0-9a-zA-Z]+', data)) -# -# total = len(list(all_ec)) -# print('Number of EC: %s' % total) -# -# #process each of these -# counter = 0 -# for ec in sorted(list(all_ec)): -# -# if counter % 500 == 0: -# print('%s of %s processed' % (counter, total)) -# counter +=1 -# -# # Skip files that exist -# if isfile(join(RAW_FOLDER, 'sequences', '%s.csv' % ec)): -# if os.path.getsize(join(RAW_FOLDER, 'sequences', '%s.csv' % ec)) > 2: -# continue -# # elif isfile(join(RAW_FOLDER, 'sequences', '%s.fasta' % ec)): -# # continue -# -# ##download html file if it does not exist -# mycmd = "wget 'http://brenda-enzymes.org/enzyme.php?ecno=%s' -U 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' --referer='https://www.brenda-enzymes.org/ecexplorer.php?browser=1&f[nodes]=21,1&f[action]=open&f[change]=22' -O %s " % (ec, join(RAW_FOLDER, 'html', '%s.html' % ec)) -# os.system(mycmd) -# -# #download sequences for ec number, if the file does not exist -# mycmd = "wget 'https://www.brenda-enzymes.org/sequences.php?download=allcsv&ec=%s' -U 'Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0' --referer='https://www.brenda-enzymes.org/ecexplorer.php?browser=1&f[nodes]=21,1&f[action]=open&f[change]=22' -O %s " % (ec, join(RAW_FOLDER, 'sequences', '%s.csv' % ec)) -# os.system(mycmd) -# -# time.sleep(1) -# -# -# def make_fasta(): -# '''Convert BENDA csv file to FASTA''' -# -# counter = 0 -# files = os.listdir(join(RAW_FOLDER, 'sequences')) -# for fi in files: -# -# # skip non-csv files -# if not fi.endswith('.csv'): -# continue -# -# if counter % 500 == 0: -# print('%s processed' % (counter)) -# counter +=1 -# -# infile = join(RAW_FOLDER, 'sequences', fi) -# outfile = join(RAW_FOLDER, 'sequences', fi.replace('.csv', '.fasta')) -# -# # skip files that have been converted -# if isfile(outfile): -# if os.path.getsize(outfile) > 2: -# continue -# -# with open(infile, 'r' ,encoding='ISO-8859-1') as f: -# firstline = f.readline() -# -# #there are four types of document formats expected -# if firstline.strip() == '': -# # make file but keep it empty -# with open(outfile, 'w') as fo: -# fo.write('\n') -# continue -# -# elif len(firstline.split('\t')) == 7 and len(firstline.split('\t')[2].split('.')) == 4: -# line = firstline -# -# elif firstline.strip() == '#This file is tab stop separated': -# header_line = f.readline() # get rid of header -# -# # make sure header is ok. I.e. 
skip files where the third line is not the header -# if not header_line.startswith('Accession_Code'): -# print(fi) -# print(header_line) -# line = f.readline() -# -# elif firstline.startswith('Accession_Code'): -# line = f.readline() -# -# else: -# print(fi) -# print(firstline) -# continue -# -# with open(outfile, 'w', encoding='utf-8') as fo: -# #write data to fasta file -# header = ';'.join(line.split('\t')[:-1]) -# seq = line.split('\t')[-1].strip() -# fo.write('>%s\n%s\n' % (header, seq)) -# lastline = line -# -# for line in f: -# line = line.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') -# -# #skip lines that are exactly the same (there seems to be some duplications) -# if lastline == line: -# continue -# -# #write data to fasta file -# header = ';'.join(line.split('\t')[:-1]) -# seq = line.split('\t')[-1].strip() -# fo.write('>%s\n%s\n' % (header, seq)) -# lastline = line -# -# def compress_data(): -# # compress all .html files, remove the uncompressed ones -# mycmd = "zip -jm %s %s" % (join(RAW_FOLDER, 'html', 'html_data.zip'), join(RAW_FOLDER, 'html', '*')) -# os.system(mycmd) -# -# # compress all .csv files, remove uncompressed ones -# mycmd = "zip -jm %s %s" % (join(RAW_FOLDER, 'sequences', 'sequence_data_csv.zip'), join(RAW_FOLDER, 'sequences', '*.csv')) -# os.system(mycmd) -# -# # compress all .fasta files, remove the uncompressed ones -# mycmd = "zip -jm %s %s" % (join(RAW_FOLDER, 'sequences', 'sequence_data.zip'), join(RAW_FOLDER, 'sequences', '*.fasta')) -# os.system(mycmd) -# - - - - - - - - -# -# def get_all_orgs(self): -# ''' -# Get all organism data from BRENDA -# ''' -# -# #open the list of EC numbers and find all -# filepath = join(RAW_FOLDER, 'all_enzymes.php.html') -# with open(filepath, 'r') as f: -# data = f.read() -# all_ec = set(re.findall('[0-9]+\.[0-9]+\.[0-9]+\.[0-9a-zA-Z]+', data)) -# -# total = len(list(all_ec)) -# print('Number of EC: %s' % total) -# -# #process each of these -# data = {} -# counter = 0 -# for ec in sorted(list(all_ec)): -# if ec.startswith('1.1.1'): -# print(ec) -# counter +=1 -# if counter % 1000 == 0: -# print('%s of %s processed' % (counter, total)) -# div_data = get_organism_divs_from_data(ec) -# #if div_data is not None: -# data[ec] = div_data -# -# -# #count how many -# with open('test.tsv', 'w') as f: -# f.write('ec\tuniprot_identifiers\n') -# -# all_uid = [] -# for ec in sorted(data.keys()): -# ec_uids = [] -# -# if data[ec] is None: -# f.write('%s\t%s\n' % (ec, 0)) -# else: -# for org in data[ec].keys(): -# if data[ec][org] is not None: -# all_uid.extend(data[ec][org]) -# -# ec_uids.extend(data[ec][org]) -# f.write('%s\t%s\n' % (ec, len(set(ec_uids)))) -# print(len(set(all_uid))) -# -# -# -# def get_all_uniprot_id(): -# ''' -# Use regex to get all the uniprot identifiers. -# Intended as an alternate method that does not depend on parsing the html. 
-# ''' -# -# #open the list of EC numbers and find all -# filepath = join(RAW_FOLDER, 'all_enzymes.php.html') -# with open(filepath, 'r') as f: -# data = f.read() -# all_ec = set(re.findall('[0-9]+\.[0-9]+\.[0-9]+\.[0-9a-zA-Z]+', data)) -# -# total = len(list(all_ec)) -# print('Number of EC: %s' % total) -# -# #process each of these -# data = {} -# counter = 0 -# for ec in sorted(list(all_ec)): -# #if ec.startswith('1.1.3'): -# print(ec) -# html_doc = join(RAW_FOLDER, '%s.html' % ec) -# -# #read the html page -# with open(html_doc, 'r') as f: -# document = f.read() -# -# #http://www.uniprot.org/help/accession_numbers -# m = re.findall('>([OPQ][0-9](?:[A-Z0-9]){3}[0-9])<|>([A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})<', document) -# data[ec] = m -# -# #count how many -# with open('test.tsv', 'w') as f: -# f.write('ec\tuniprot_identifiers\n') -# -# all_uid = [] -# for ec in sorted(data.keys()): -# if data[ec] == []: -# f.write('%s\t%s\n' % (ec, 0)) -# else: -# all_uid.extend(data[ec]) -# f.write('%s\t%s\n' % (ec, len(set(data[ec])))) -# print(len(set(all_uid))) -# -# -# get_all_uniprot_id() -# -# -# def get_all(folder_path, table_class): -# '''Get all temperature data from BRENDA''' -# -# #open the list of EC numbers and find all -# filepath = join(folder_path, 'all_enzymes.php.html') -# with open(filepath, 'r') as f: -# data = f.read() -# all_ec = set(re.findall('[0-9]+\.[0-9]+\.[0-9]+\.[0-9a-zA-Z]+', data)) -# -# total = len(list(all_ec)) -# print('Number of EC: %s' % total) -# -# #process each of these -# data = {} -# counter = 0 -# for ec in list(all_ec): -# counter +=1 -# if counter % 1000 == 0: -# print('%s of %s processed' % (counter, total)) -# -# soup = open_ec(filepath) -# -# brenda_obj = table_class(soup) -# -# data[ec] = brenda_obj.get_data() -# -# return data - # - # #save as shelve - # sh = shelve.open(join(FINAL_FOLDER, '1_temperature_optimum_data.db')) - # sh['data'] = data - # sh.close() -# -# -# def make_flatfile(): -# '''Make a tab-delimited flatfile of data''' -# sh = shelve.open(join(FINAL_FOLDER, '1_temperature_optimum_data.db')) -# data = sh['data'] -# sh.close() -# -# with open(join(FINAL_FOLDER, '1_temperature_optimum_data.tsv'), 'w') as f: -# f.write('ec\torganism\ttemperature\tuniprot_id\n') -# for ec in sorted(data.keys()): -# if data[ec] is None: -# continue -# for org in sorted(data[ec]): -# for uniprot_id in sorted(data[ec][org]): -# temps = data[ec][org][uniprot_id] -# temperature = int(round(sum(temps)/len(temps))) -# f.write('%s\t%s\t%s\t%s\n' % (ec, org.lower().replace(' ', '_'), temperature, uniprot_id)) -# -# -# -# def get_sequences(): -# '''For each uniprot_id, get the sequence''' -# sh = shelve.open(join(FINAL_FOLDER, '1_temperature_optimum_data.db')) -# data = sh['data'] -# sh.close() -# -# for ec in sorted(data.keys()): -# if data[ec] is None: -# continue -# for org in sorted(data[ec]): -# for uniprot_id in sorted(data[ec][org]): -# url = 'http://www.uniprot.org/uniprot/%s.fasta' % uniprot_id -# dlfile(folder=join(DATA_BASE_FOLDER, 'raw_external/', 'uniprot_records'), filename='%s.fasta' % uniprot_id, url=url) -# -# -# -# def make_fasta_files(): -# '''Combine uniprot records into fasta files. 
Annotate with temperature''' -# sh = shelve.open(join(FINAL_FOLDER, '1_temperature_optimum_data.db')) -# data = sh['data'] -# sh.close() -# -# orgs_in_training_set = [] -# all_orgs = [] -# -# #for each record in folder -# folder=join(DATA_BASE_FOLDER, 'raw_external/', 'uniprot_records') -# all_files = os.listdir(folder) -# for fi in sorted(all_files): -# with open(join(folder, fi), 'r') as f: -# header = f.readline() -# seq = f.read() -# uniprot_id = fi.replace('.fasta', '') -# -# #parse out orgname -# org = re.search('OS=[\[\]a-zA-Z]+\s[a-zA-Z]+', header).group(0) -# org = org.replace('OS=', '').replace('[', '').replace(']', '') -# -# #pair with its measured temperature -# for ec in data.keys(): -# if data[ec] is None: -# continue -# if data[ec].get(org, {}).get(uniprot_id) is not None: -# temps = data[ec][org][uniprot_id] -# temperature = int(round(sum(temps)/len(temps))) -# break -# -# #craft the output -# out_record = '>%s;%s\n%s' % (org, temperature, seq) -# -# #check whether this organism is in my growth data -# if org_temp_new.in_data(org) is True: -# #make one fasta file with organisms that were in my dataset -# orgs_in_training_set.append(out_record) -# -# #make one fasta file with all records -# all_orgs.append(out_record) -# -# -# with open(join(FINAL_FOLDER, 'orgs_in_training_set.fasta'), 'w') as f: -# f.write('\n'.join(orgs_in_training_set)) -# -# with open(join(FINAL_FOLDER, 'all_orgs.fasta'), 'w') as f: -# f.write('\n'.join(all_orgs)) - - -#get_all() -#make_flatfile() -#get_sequences() -#make_fasta_files()