-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #125 from padix-key/gecos
Color schemes from Gecos software
- Loading branch information
Showing
28 changed files
with
969 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
""" | ||
Biotite color schemes for protein sequences | ||
=========================================== | ||
This script shows the same multiple protein sequence alignment | ||
in the different color schemes available in *Biotite*. | ||
- **rainbow** - Default color scheme in *Biotite* | ||
- **clustalx** - Default color scheme of the *ClustalX* software | ||
- Color schemes generated with the software *Gecos* [1]_: | ||
- **flower** - Light color scheme, based on *BLOSUM62* | ||
- **blossom** - Light color scheme with high contrast, based on | ||
*BLOSUM62*, depicts symbol similarity worse than *flower* | ||
- **spring** - Light color scheme, based on *BLOSUM62*, | ||
with alanine fixed to gray | ||
- **wither** - Dark color scheme, analogous to *blossom* | ||
- **autumn** - Dark color scheme, analogous to *spring* | ||
- **sunset** - Red-green color vision deficiency adapated color | ||
scheme, based on *BLOSUM62* | ||
- **ocean** - Blue shifted, light color scheme, | ||
based on *BLOSUM62* | ||
- Color schemes adapted from *JalView* [2]_: | ||
- **zappo** - Color scheme that depicts physicochemical properties | ||
- **taylor** - Color scheme invented by Willie Taylor | ||
- **buried** - Color scheme depicting the *buried index* | ||
- **hydrophobicity** - Color scheme depicting hydrophobicity | ||
- **prophelix** - Color scheme depicting secondary structure | ||
propensities | ||
- **propstrand** - Color scheme depicting secondary structure | ||
propensities | ||
- **propturn** - Color scheme depicting secondary structure | ||
propensities | ||
.. [1] P Kunzmann, B Mayer, | ||
"Substitution matrix based color schemes for sequence alignment | ||
visualizations." | ||
Unpublished | ||
.. [2] M Clamp, J Cuff, SM Searle, GJ Barton, | ||
"The Jalview Java alignment editor." | ||
Bioinformatics, 20, 426-427 (2004). | ||
""" | ||
|
||
# Code source: Patrick Kunzmann | ||
# License: BSD 3 clause | ||
|
||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
from matplotlib.gridspec import GridSpec | ||
import biotite | ||
import biotite.sequence as seq | ||
import biotite.sequence.io.fasta as fasta | ||
import biotite.sequence.align as align | ||
import biotite.sequence.graphics as graphics | ||
import biotite.database.entrez as entrez | ||
|
||
|
||
# Generate example alignment | ||
# (the same as in the bacterial luciferase example) | ||
query = entrez.SimpleQuery("luxA", "Gene Name") \ | ||
& entrez.SimpleQuery("srcdb_swiss-prot", "Properties") | ||
uids = entrez.search(query, db_name="protein") | ||
file_name = entrez.fetch_single_file( | ||
uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta" | ||
) | ||
fasta_file = fasta.FastaFile() | ||
fasta_file.read(file_name) | ||
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()] | ||
matrix = align.SubstitutionMatrix.std_protein_matrix() | ||
alignment, order, _, _ = align.align_multiple(sequences, matrix) | ||
# Order alignment according to the guide tree | ||
alignment = alignment[:, order] | ||
alignment = alignment[220:300] | ||
|
||
# Get color scheme names | ||
alphabet = seq.ProteinSequence.alphabet | ||
schemes = [ | ||
"rainbow", "clustalx", | ||
"flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean", | ||
"zappo", "taylor", "buried", "hydrophobicity", | ||
"prophelix", "propstrand", "propturn" | ||
] | ||
count = len(schemes) | ||
# Assert that this example displays all available amino acid color schemes | ||
all_schemes = graphics.list_color_scheme_names(alphabet) | ||
assert set(schemes) == set(all_schemes) | ||
|
||
|
||
# Visualize each scheme using the example alignment | ||
fig = plt.figure(figsize=(8.0, count*2.0)) | ||
gridspec = GridSpec(2, count) | ||
for i, name in enumerate(schemes): | ||
for j, color_symbols in enumerate([False, True]): | ||
ax = fig.add_subplot(count, 2, 2*i + j + 1) | ||
if j == 0: | ||
ax.set_ylabel(name) | ||
alignment_part = alignment[:40] | ||
else: | ||
alignment_part = alignment[40:] | ||
graphics.plot_alignment_type_based( | ||
ax, alignment_part, symbols_per_line=len(alignment_part), | ||
color_scheme=name, color_symbols=color_symbols, symbol_size=8 | ||
) | ||
fig.tight_layout() | ||
fig.subplots_adjust(wspace=0) | ||
plt.show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
The protein block substitution matrix, the protein block reference angles | ||
and the RSMDA calculation were taken from the PBxplore software, | ||
governed by the following license: | ||
|
||
|
||
The MIT License (MIT) | ||
|
||
Copyright (c) 2013 Poulain, A. G. de Brevern | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
""" | ||
Structural alignment of lysozyme variants using 'Protein Blocks' | ||
================================================================ | ||
In this example we perform a structural alignment of mutiple lysozyme | ||
variants from different organisms. | ||
A feasible approach to perfrom such a multiple structure alignment is the | ||
usage of a structural alphabet: | ||
At first the structure is translated into a sequence that represents | ||
the structure. | ||
Then the sequences can be aligned with the standard sequence alignment | ||
techniques, using the substitution matrix of the structural alphabet. | ||
In this example, the structural alphabet we will use is called | ||
*protein blocks* (PBs) [1]_ [2]_: | ||
There are 16 different PBs, represented by the symbols ``a`` to ``p``. | ||
Each one depicts a different set of the backbone dihedral angles of a | ||
peptide 5-mer. | ||
To assign a PB to an amino acid, the 5-mer centered on the respective | ||
residue is taken, its backbone dihedral angles are calculated and the | ||
PB with the least deviation to this set of angles is chosen. | ||
.. [1] AG de Brevern, C Etchebest and S Hazout, | ||
"Bayesian probabilistic approach for predicting backbone structures | ||
in terms of protein blocks." | ||
Proteins, 41, 271-288 (2000). | ||
.. [2] J Barnoud, H Santuz, P Craveur, AP Joseph, | ||
V Jallu, AG de Brevern, P Poulain, | ||
"PBxplore: a tool to analyze local protein structure and deformability | ||
with Protein Blocks." | ||
PeerJ, 5, (2017). | ||
""" | ||
|
||
# Code source: Patrick Kunzmann | ||
# License: BSD 3 clause | ||
|
||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
import biotite | ||
import biotite.sequence as seq | ||
import biotite.sequence.align as align | ||
import biotite.sequence.graphics as graphics | ||
import biotite.structure as struc | ||
import biotite.structure.io.mmtf as mmtf | ||
import biotite.database.rcsb as rcsb | ||
|
||
|
||
# PB alphabet | ||
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop") | ||
|
||
# PB substitution matrix, adapted from PBxplore | ||
matrix_str = """ | ||
a b c d e f g h i j k l m n o p | ||
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 | ||
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 | ||
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 | ||
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 | ||
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 | ||
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 | ||
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 | ||
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 | ||
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 | ||
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 | ||
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 | ||
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 | ||
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 | ||
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 | ||
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 | ||
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 | ||
""" | ||
|
||
# PB reference angles, adapted from PBxplore | ||
ref_angles = np.array([ | ||
[ 41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68], | ||
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90], | ||
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63], | ||
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16], | ||
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40], | ||
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51], | ||
[ 0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98], | ||
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88], | ||
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01], | ||
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82], | ||
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16], | ||
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74], | ||
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19], | ||
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90], | ||
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23], | ||
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91], | ||
]) | ||
|
||
|
||
# Fetch animal lysoyzme structures | ||
lyso_files = rcsb.fetch( | ||
["1REX", "1AKI", "1DKJ", "1GD6"], | ||
format="mmtf", target_path=biotite.temp_dir() | ||
) | ||
organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"] | ||
|
||
# Create a PB sequence from each structure | ||
pb_seqs = [] | ||
for file_name in lyso_files: | ||
file = mmtf.MMTFFile() | ||
file.read(file_name) | ||
# Take only the first model into account | ||
array = mmtf.get_structure(file, model=1) | ||
# Remove everything but the first protein chain | ||
array = array[struc.filter_amino_acids(array)] | ||
array = array[array.chain_id == array.chain_id[0]] | ||
|
||
# Calculate backbone dihedral angles, | ||
# as the PBs are determined from them | ||
phi, psi, omega = struc.dihedral_backbone(array) | ||
# A PB requires the 8 phi/psi angles of 5 amino acids, | ||
# centered on the amino acid to calculate the PB for | ||
# Hence, the PBs are not defined for the two amino acids | ||
# at each terminus | ||
pb_angles = np.full((len(phi)-4, 8), np.nan) | ||
pb_angles[:, 0] = psi[ : -4] | ||
pb_angles[:, 1] = phi[1 : -3] | ||
pb_angles[:, 2] = psi[1 : -3] | ||
pb_angles[:, 3] = phi[2 : -2] | ||
pb_angles[:, 4] = psi[2 : -2] | ||
pb_angles[:, 5] = phi[3 : -1] | ||
pb_angles[:, 6] = psi[3 : -1] | ||
pb_angles[:, 7] = phi[4 : ] | ||
pb_angles = np.rad2deg(pb_angles) | ||
|
||
# Angle RMSD of all reference angles with all actual angles | ||
rmsda = np.sum( | ||
( | ||
( | ||
ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180 | ||
) % 360 - 180 | ||
)**2, | ||
axis=-1 | ||
) | ||
# Chose PB, where the RMSDA to the reference angle is lowest | ||
# Due to the definition of Biotite symbol codes | ||
# the index of the chosen PB is directly the symbol code | ||
pb_seq_code = np.argmin(rmsda, axis=0) | ||
# Put the array of symbol codes into actual sequence objects | ||
pb_sequence = seq.GeneralSequence(pb_alphabet) | ||
pb_sequence.code = pb_seq_code | ||
pb_seqs.append(pb_sequence) | ||
|
||
# Perfrom a multiple sequence alignment of the PB sequences | ||
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str) | ||
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict) | ||
alignment, order, _, _ = align.align_multiple( | ||
pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False | ||
) | ||
|
||
# Visualize the alignment | ||
# Order alignment according to guide tree | ||
alignment = alignment[:, order.tolist()] | ||
labels = [organisms[i] for i in order] | ||
fig = plt.figure(figsize=(8.0, 4.0)) | ||
ax = fig.add_subplot(111) | ||
# The color scheme was generated with the 'Gecos' software | ||
graphics.plot_alignment_type_based( | ||
ax, alignment, labels=labels, symbols_per_line=45, spacing=2, | ||
show_numbers=True, color_scheme="flower" | ||
) | ||
# Organism names in italic | ||
ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"}) | ||
fig.tight_layout() | ||
plt.show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.