Skip to content

Commit

Permalink
Merge pull request #125 from padix-key/gecos
Browse files Browse the repository at this point in the history
Color schemes from Gecos software
  • Loading branch information
padix-key authored Aug 16, 2019
2 parents 016df09 + 8cf1f07 commit bed7b6d
Show file tree
Hide file tree
Showing 28 changed files with 969 additions and 19 deletions.
30 changes: 22 additions & 8 deletions doc/examples/scripts/sequence/color_schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
import biotite.sequence.graphics as graphics
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.patches import Rectangle

def plot_colors(ax, alphabet):
x_space=0.1
y_space=0.3
scheme_names = graphics.list_color_scheme_names(alphabet)
scheme_names = sorted(graphics.list_color_scheme_names(alphabet))
scheme_names.reverse()
schemes = [graphics.get_color_scheme(name, alphabet)
for name in scheme_names]
for i, scheme in enumerate(schemes):
Expand All @@ -36,15 +38,27 @@ def plot_colors(ax, alphabet):
ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.xaxis.set_ticks_position("none")
ax.yaxis.set_ticks_position("none")
ax.xaxis.set_ticks_position("none")
ax.yaxis.set_ticks_position("none")

figure = plt.figure()
ax = figure.add_subplot(211)
nuc_alphabet = seq.NucleotideSequence.alphabet_amb
prot_alphabet = seq.ProteinSequence.alphabet
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")

figure = plt.figure(figsize=(8.0, 5.0))
gs = GridSpec(
3, 1,
height_ratios=[len(graphics.list_color_scheme_names(alphabet))
for alphabet in (nuc_alphabet, prot_alphabet, pb_alphabet)],
)
ax = figure.add_subplot(gs[0,0])
ax.set_title("Nucleotide color schemes")
plot_colors(ax, seq.NucleotideSequence.alphabet)
ax = figure.add_subplot(212)
plot_colors(ax, nuc_alphabet)
ax = figure.add_subplot(gs[1,0])
ax.set_title("Protein color schemes")
plot_colors(ax, seq.ProteinSequence.alphabet)
plot_colors(ax, prot_alphabet)
ax = figure.add_subplot(gs[2,0])
ax.set_title("Protein block color schemes")
plot_colors(ax, pb_alphabet)
plt.tight_layout()
plt.show()
106 changes: 106 additions & 0 deletions doc/examples/scripts/sequence/color_schemes_protein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Biotite color schemes for protein sequences
===========================================
This script shows the same multiple protein sequence alignment
in the different color schemes available in *Biotite*.
- **rainbow** - Default color scheme in *Biotite*
- **clustalx** - Default color scheme of the *ClustalX* software
- Color schemes generated with the software *Gecos* [1]_:
- **flower** - Light color scheme, based on *BLOSUM62*
- **blossom** - Light color scheme with high contrast, based on
*BLOSUM62*, depicts symbol similarity worse than *flower*
- **spring** - Light color scheme, based on *BLOSUM62*,
with alanine fixed to gray
- **wither** - Dark color scheme, analogous to *blossom*
- **autumn** - Dark color scheme, analogous to *spring*
- **sunset** - Red-green color vision deficiency adapated color
scheme, based on *BLOSUM62*
- **ocean** - Blue shifted, light color scheme,
based on *BLOSUM62*
- Color schemes adapted from *JalView* [2]_:
- **zappo** - Color scheme that depicts physicochemical properties
- **taylor** - Color scheme invented by Willie Taylor
- **buried** - Color scheme depicting the *buried index*
- **hydrophobicity** - Color scheme depicting hydrophobicity
- **prophelix** - Color scheme depicting secondary structure
propensities
- **propstrand** - Color scheme depicting secondary structure
propensities
- **propturn** - Color scheme depicting secondary structure
propensities
.. [1] P Kunzmann, B Mayer,
"Substitution matrix based color schemes for sequence alignment
visualizations."
Unpublished
.. [2] M Clamp, J Cuff, SM Searle, GJ Barton,
"The Jalview Java alignment editor."
Bioinformatics, 20, 426-427 (2004).
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


# Generate example alignment
# (the same as in the bacterial luciferase example)
query = entrez.SimpleQuery("luxA", "Gene Name") \
& entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
uids = entrez.search(query, db_name="protein")
file_name = entrez.fetch_single_file(
uids, biotite.temp_file("fasta"), db_name="protein", ret_type="fasta"
)
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
sequences = [seq.ProteinSequence(seq_str) for seq_str in fasta_file.values()]
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, _, _ = align.align_multiple(sequences, matrix)
# Order alignment according to the guide tree
alignment = alignment[:, order]
alignment = alignment[220:300]

# Get color scheme names
alphabet = seq.ProteinSequence.alphabet
schemes = [
"rainbow", "clustalx",
"flower", "blossom", "spring", "wither", "autumn", "sunset", "ocean",
"zappo", "taylor", "buried", "hydrophobicity",
"prophelix", "propstrand", "propturn"
]
count = len(schemes)
# Assert that this example displays all available amino acid color schemes
all_schemes = graphics.list_color_scheme_names(alphabet)
assert set(schemes) == set(all_schemes)


# Visualize each scheme using the example alignment
fig = plt.figure(figsize=(8.0, count*2.0))
gridspec = GridSpec(2, count)
for i, name in enumerate(schemes):
for j, color_symbols in enumerate([False, True]):
ax = fig.add_subplot(count, 2, 2*i + j + 1)
if j == 0:
ax.set_ylabel(name)
alignment_part = alignment[:40]
else:
alignment_part = alignment[40:]
graphics.plot_alignment_type_based(
ax, alignment_part, symbols_per_line=len(alignment_part),
color_scheme=name, color_symbols=color_symbols, symbol_size=8
)
fig.tight_layout()
fig.subplots_adjust(wspace=0)
plt.show()
2 changes: 1 addition & 1 deletion doc/examples/scripts/sequence/hcn_hydropathy.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def hydropathy_to_color(hydropathy, colormap):
ax = fig.add_subplot(111)
# Color the symbols instead of the background
graphics.plot_alignment_type_based(
ax, alignment[:600], labels=names, show_numbers=True, color_symbols=True,
ax, alignment[:600], labels=names, show_numbers=True,
color_scheme=colorscheme
)

Expand Down
26 changes: 26 additions & 0 deletions doc/examples/scripts/structure/pb_alignment.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
The protein block substitution matrix, the protein block reference angles
and the RSMDA calculation were taken from the PBxplore software,
governed by the following license:


The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
169 changes: 169 additions & 0 deletions doc/examples/scripts/structure/pb_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""
Structural alignment of lysozyme variants using 'Protein Blocks'
================================================================
In this example we perform a structural alignment of mutiple lysozyme
variants from different organisms.
A feasible approach to perfrom such a multiple structure alignment is the
usage of a structural alphabet:
At first the structure is translated into a sequence that represents
the structure.
Then the sequences can be aligned with the standard sequence alignment
techniques, using the substitution matrix of the structural alphabet.
In this example, the structural alphabet we will use is called
*protein blocks* (PBs) [1]_ [2]_:
There are 16 different PBs, represented by the symbols ``a`` to ``p``.
Each one depicts a different set of the backbone dihedral angles of a
peptide 5-mer.
To assign a PB to an amino acid, the 5-mer centered on the respective
residue is taken, its backbone dihedral angles are calculated and the
PB with the least deviation to this set of angles is chosen.
.. [1] AG de Brevern, C Etchebest and S Hazout,
"Bayesian probabilistic approach for predicting backbone structures
in terms of protein blocks."
Proteins, 41, 271-288 (2000).
.. [2] J Barnoud, H Santuz, P Craveur, AP Joseph,
V Jallu, AG de Brevern, P Poulain,
"PBxplore: a tool to analyze local protein structure and deformability
with Protein Blocks."
PeerJ, 5, (2017).
"""

# Code source: Patrick Kunzmann
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
import biotite
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.structure as struc
import biotite.structure.io.mmtf as mmtf
import biotite.database.rcsb as rcsb


# PB alphabet
pb_alphabet = seq.LetterAlphabet("abcdefghijklmnop")

# PB substitution matrix, adapted from PBxplore
matrix_str = """
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
"""

# PB reference angles, adapted from PBxplore
ref_angles = np.array([
[ 41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
[108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
[-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
[141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
[133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
[116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
[ 0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
[119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
[130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
[114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
[117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
[139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
[-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
[-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
[-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
[-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
])


# Fetch animal lysoyzme structures
lyso_files = rcsb.fetch(
["1REX", "1AKI", "1DKJ", "1GD6"],
format="mmtf", target_path=biotite.temp_dir()
)
organisms = ["H. sapiens", "G. gallus", "C. viginianus", "B. mori"]

# Create a PB sequence from each structure
pb_seqs = []
for file_name in lyso_files:
file = mmtf.MMTFFile()
file.read(file_name)
# Take only the first model into account
array = mmtf.get_structure(file, model=1)
# Remove everything but the first protein chain
array = array[struc.filter_amino_acids(array)]
array = array[array.chain_id == array.chain_id[0]]

# Calculate backbone dihedral angles,
# as the PBs are determined from them
phi, psi, omega = struc.dihedral_backbone(array)
# A PB requires the 8 phi/psi angles of 5 amino acids,
# centered on the amino acid to calculate the PB for
# Hence, the PBs are not defined for the two amino acids
# at each terminus
pb_angles = np.full((len(phi)-4, 8), np.nan)
pb_angles[:, 0] = psi[ : -4]
pb_angles[:, 1] = phi[1 : -3]
pb_angles[:, 2] = psi[1 : -3]
pb_angles[:, 3] = phi[2 : -2]
pb_angles[:, 4] = psi[2 : -2]
pb_angles[:, 5] = phi[3 : -1]
pb_angles[:, 6] = psi[3 : -1]
pb_angles[:, 7] = phi[4 : ]
pb_angles = np.rad2deg(pb_angles)

# Angle RMSD of all reference angles with all actual angles
rmsda = np.sum(
(
(
ref_angles[:, np.newaxis] - pb_angles[np.newaxis, :] + 180
) % 360 - 180
)**2,
axis=-1
)
# Chose PB, where the RMSDA to the reference angle is lowest
# Due to the definition of Biotite symbol codes
# the index of the chosen PB is directly the symbol code
pb_seq_code = np.argmin(rmsda, axis=0)
# Put the array of symbol codes into actual sequence objects
pb_sequence = seq.GeneralSequence(pb_alphabet)
pb_sequence.code = pb_seq_code
pb_seqs.append(pb_sequence)

# Perfrom a multiple sequence alignment of the PB sequences
matrix_dict = align.SubstitutionMatrix.dict_from_str(matrix_str)
matrix = align.SubstitutionMatrix(pb_alphabet, pb_alphabet, matrix_dict)
alignment, order, _, _ = align.align_multiple(
pb_seqs, matrix, gap_penalty=(-500,-100), terminal_penalty=False
)

# Visualize the alignment
# Order alignment according to guide tree
alignment = alignment[:, order.tolist()]
labels = [organisms[i] for i in order]
fig = plt.figure(figsize=(8.0, 4.0))
ax = fig.add_subplot(111)
# The color scheme was generated with the 'Gecos' software
graphics.plot_alignment_type_based(
ax, alignment, labels=labels, symbols_per_line=45, spacing=2,
show_numbers=True, color_scheme="flower"
)
# Organism names in italic
ax.set_yticklabels(ax.get_yticklabels(), fontdict={"fontstyle":"italic"})
fig.tight_layout()
plt.show()
20 changes: 19 additions & 1 deletion doc/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@
Extensions
==========

None yet.
This page shows *Biotite*'s extension packages:
These are packages that build upon *Biotite*'s functionality, but are separate
Python packages that are developed independently.

.. raw:: html

<div
class="sphx-glr-thumbcontainer"
tooltip="An automatic generator for alignment color schemes based on substitution matrices"
>

.. only:: html

.. figure:: https://gecos.biotite-python.org/_downloads/3ebefd489b8ac7424d85cd4967cb06b1/gecos_icon.svg
:target: https://gecos.biotite-python.org/

`Gecos - Generated color schemes <https://gecos.biotite-python.org/>`_

.. raw:: html

</div>
Loading

0 comments on commit bed7b6d

Please sign in to comment.