diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..3d63f71 Binary files /dev/null and b/.DS_Store differ diff --git a/.ipynb_checkpoints/check_k3l_test-checkpoint.ipynb b/.ipynb_checkpoints/check_k3l_test-checkpoint.ipynb new file mode 100644 index 0000000..1b4d044 --- /dev/null +++ b/.ipynb_checkpoints/check_k3l_test-checkpoint.ipynb @@ -0,0 +1,1242 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 62, + "id": "fbb00f64-ecc8-412d-ae57-9a4036440f6a", + "metadata": {}, + "outputs": [], + "source": [ + "# check that k3l_test contains appropriate variants for k3l\n", + "import pandas as pd\n", + "from Bio.Seq import Seq" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "e154ddf7-fe49-461b-9e1e-35142b170139", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesub_window_namewtpositioniupaccodon_subiupac_aasynonymous_codonsno_stop_codonsprimer...sub_windowforward_primerforward_primer_tmforward_primer_gcforward_primer_lenreverse_primer_namereverse_primerreverse_primer_tmreverse_primer_gcreverse_primer_len
0window_1-1_GCG11HCGwindow_1-1GCG11HCGGCG11HCGTPS0.00.0TTTGTTATTCGTTGCCCAATHCGggwgaygtwatwaarGGCAGAGT......HCGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
1window_1-1_GCG11GDGwindow_1-1GCG11GDGGCG11GDGEGV0.00.0TTTGTTATTCGTTGCCCAATGDGggwgaygtwatwaarGGCAGAGT......GDGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
2window_1-1_GGT12HGTwindow_1-1GGT12HGTGGT12HGTSRC0.00.0TTTGTTATTCGTTGCCCAATgcwHGTgaygtwatwaarGGCAGAGT......gcwHGTgaygtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
3window_1-1_GGT12GHTwindow_1-1GGT12GHTGGT12GHTDAV0.00.0TTTGTTATTCGTTGCCCAATgcwGHTgaygtwatwaarGGCAGAGT......gcwGHTgaygtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
4window_1-1_GAT13HATwindow_1-1GAT13HATGAT13HATNHY0.00.0TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGT......gcwggwHATgtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " name sub_window_name wt position iupac codon_sub \\\n", + "0 window_1-1_GCG11HCG window_1-1 GCG 11 HCG GCG11HCG \n", + "1 window_1-1_GCG11GDG window_1-1 GCG 11 GDG GCG11GDG \n", + "2 window_1-1_GGT12HGT window_1-1 GGT 12 HGT GGT12HGT \n", + "3 window_1-1_GGT12GHT window_1-1 GGT 12 GHT GGT12GHT \n", + "4 window_1-1_GAT13HAT window_1-1 GAT 13 HAT GAT13HAT \n", + "\n", + " iupac_aa synonymous_codons no_stop_codons \\\n", + "0 TPS 0.0 0.0 \n", + "1 EGV 0.0 0.0 \n", + "2 SRC 0.0 0.0 \n", + "3 DAV 0.0 0.0 \n", + "4 NHY 0.0 0.0 \n", + "\n", + " primer ... sub_window \\\n", + "0 TTTGTTATTCGTTGCCCAATHCGggwgaygtwatwaarGGCAGAGT... ... HCGggwgaygtwatwaar \n", + "1 TTTGTTATTCGTTGCCCAATGDGggwgaygtwatwaarGGCAGAGT... ... GDGggwgaygtwatwaar \n", + "2 TTTGTTATTCGTTGCCCAATgcwHGTgaygtwatwaarGGCAGAGT... ... gcwHGTgaygtwatwaar \n", + "3 TTTGTTATTCGTTGCCCAATgcwGHTgaygtwatwaarGGCAGAGT... ... gcwGHTgaygtwatwaar \n", + "4 TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGT... ... gcwggwHATgtwatwaar \n", + "\n", + " forward_primer forward_primer_tm forward_primer_gc \\\n", + "0 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "1 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "2 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "3 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "4 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "\n", + " forward_primer_len reverse_primer_name reverse_primer \\\n", + "0 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "1 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "2 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "3 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "4 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "\n", + " reverse_primer_tm reverse_primer_gc reverse_primer_len \n", + "0 55.1 36.0 25 \n", + "1 55.1 36.0 25 \n", + "2 55.1 36.0 25 \n", + "3 55.1 36.0 25 \n", + "4 55.1 36.0 25 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = 
pd.read_csv('k3l_test.tsv', sep='\\t')\n", + "df.fillna('', inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "94820b6e-8399-42d0-a9e9-57fdc15967d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
positioniupac_aawt
011TPSEGVGCG
112SRCDAVGGT
213NHYAGVGAT
314ILEAGGTA
415VLKTRATA
\n", + "
" + ], + "text/plain": [ + " position iupac_aa wt\n", + "0 11 TPSEGV GCG\n", + "1 12 SRCDAV GGT\n", + "2 13 NHYAGV GAT\n", + "3 14 ILEAG GTA\n", + "4 15 VLKTR ATA" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df.groupby('position')['iupac_aa'].apply(list).reset_index()\n", + "map_dict = dict(zip(df.position, df.wt))\n", + "df1['wt'] =df1.position.map(map_dict)\n", + "df1.iupac_aa = df1.iupac_aa.str.join('')\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "a55033c8-a3df-4472-a3a3-f5fb15db9600", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'str' object has no attribute 'append'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/fq/q1cflf795cgbnmbhdyn8d9nntyw5jt/T/ipykernel_23793/3362652463.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m'ACGT'\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'J'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'append'" + ] + } + ], + "source": [ + "'ACGT'.append('J')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "7dec58b0-3c8d-4f1e-9a38-c368962f93a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'TPSEGV'" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the missense variants for the wt codon\n", + "def aa_missense_variants(codon):\n", + " \n", + " nucleotides = 'ACGT'\n", + " wt_aa = str(Seq(codon).translate())\n", + " aa_list = []\n", + " for position in range(3): \n", + " for n 
in nucleotides:\n", + " new_codon = codon[:position] + n + codon[position + 1:]\n", + " new_aa = str(Seq(new_codon).translate())\n", + " if new_aa != wt_aa:\n", + " aa_list.append(new_aa)\n", + " else: \n", + " continue\n", + " return ''.join(aa_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a1fa0e8-75a8-4b1f-850c-ab46ebb1987f", + "metadata": {}, + "outputs": [], + "source": [ + "# take sets and remove iupac_aa from wt_aa (reciprocal, may have synonymous in the iupac and stops in the wt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b519e16-beb1-4200-9cc4-98b6aa50ca55", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2bd569a4-ee28-4f39-8c17-664c35d18313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Seq('XGDVIK')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check that synonymous variants look correct\n", + "Seq('HCGggwgaygtwatwaar').translate()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9fe36459-2a34-4186-a977-87a7e15c914b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
codonaapositionmissense_nucmissense_codonsmissense_aamissense_iupacmissense_iupac_codonsele_codonssele_aa...syn_boolsyn_codonssyn_aasyn_iupac_codonno_stop_codonsno_stop_aano_stop_iupac_codonno_stop_syn_codonsno_stop_syn_aano_stop_syn_iupac_codon
0AAAK0CGTCAA GAA TAA*QEBBAACAA GAA TAAQE*...FalseBAACAA GAAQESAACAA GAAQESAA
1AAAK1CGTACA AGA ATATRIBABAACA AGA ATATRI...FalseABAACA AGA ATATRIABAACA AGA ATATRIABA
2AAAK2CTAAC AATNYAAYAATN...TrueAAG AATKNAAKAATNAATAAG AATKNAAK
3AACN0CGTCAC GAC TACHDYBBACCAC GAC TACHDY...FalseBACCAC GAC TACHDYBACCAC GAC TACHDYBAC
4AACN1CGTACC AGC ATCTSIBABCACC AGC ATCTSI...FalseABCACC AGC ATCTSIABCACC AGC ATCTSIABC
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " codon aa position missense_nuc missense_codons missense_aa missense_iupac \\\n", + "0 AAA K 0 CGT CAA GAA TAA *QE B \n", + "1 AAA K 1 CGT ACA AGA ATA TRI B \n", + "2 AAA K 2 CT AAC AAT N Y \n", + "3 AAC N 0 CGT CAC GAC TAC HDY B \n", + "4 AAC N 1 CGT ACC AGC ATC TSI B \n", + "\n", + " missense_iupac_codon sele_codons sele_aa ... syn_bool syn_codons syn_aa \\\n", + "0 BAA CAA GAA TAA QE* ... False \n", + "1 ABA ACA AGA ATA TRI ... False \n", + "2 AAY AAT N ... True AAG AAT KN \n", + "3 BAC CAC GAC TAC HDY ... False \n", + "4 ABC ACC AGC ATC TSI ... False \n", + "\n", + " syn_iupac_codon no_stop_codons no_stop_aa no_stop_iupac_codon \\\n", + "0 BAA CAA GAA QE SAA \n", + "1 ABA ACA AGA ATA TRI ABA \n", + "2 AAK AAT N AAT \n", + "3 BAC CAC GAC TAC HDY BAC \n", + "4 ABC ACC AGC ATC TSI ABC \n", + "\n", + " no_stop_syn_codons no_stop_syn_aa no_stop_syn_iupac_codon \n", + "0 CAA GAA QE SAA \n", + "1 ACA AGA ATA TRI ABA \n", + "2 AAG AAT KN AAK \n", + "3 CAC GAC TAC HDY BAC \n", + "4 ACC AGC ATC TSI ABC \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add the aa info to the iupacs\n", + "df=pd.read_csv('data/final_codon_table.csv')\n", + "df.fillna('', inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "a2e76ec8-0b3e-41d8-b96d-4fb11b11c9e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'QE*'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# function that translates iupac codon into all AAs\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "rev_iupac_dict = {value:key for key,value in iupac_dict.items()}\n", + "\n", + "\n", + "def iupac_to_aa(iupac_codon):\n", + " \"\"\"Return string of AAs 
encoded by input iupac missense codon\"\"\"\n", + " aa_list = []\n", + " for i,n in enumerate(list(codon)):\n", + " if n in list('ACGT'):\n", + " continue\n", + " for new_nuc in rev_iupac_dict[n]:\n", + " new_codon = codon[:i] + new_nuc + codon[i + 1:]\n", + " aa_list.append(str(Seq(new_codon).translate()))\n", + " return ''.join(aa_list)\n", + "\n", + "iupac_to_aa('BAA')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "28392453-7e13-4a09-bd4b-6dc845c6d6ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Q', 'E', '*']" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aas" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4fec7487-869f-4b43-a76b-bb5affab59a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AAA': {'iupac': ['BAA', 'ABA', 'AAT']},\n", + " 'AAC': {'iupac': ['BAC', 'ABC', 'AAA']},\n", + " 'AAG': {'iupac': ['BAG', 'ABG', 'AAT']},\n", + " 'AAT': {'iupac': ['BAT', 'ABT', 'AAA']},\n", + " 'ACA': {'iupac': ['BCA', 'ADA']},\n", + " 'ACC': {'iupac': ['BCC', 'AWC']},\n", + " 'ACG': {'iupac': ['BCG', 'ADG']},\n", + " 'ACT': {'iupac': ['BCT', 'AWT']},\n", + " 'AGA': {'iupac': ['KGA', 'AHA', 'AGT']},\n", + " 'AGC': {'iupac': ['KGC', 'AHC', 'AGA']},\n", + " 'AGG': {'iupac': ['KGG', 'AHG', 'AGT']},\n", + " 'AGT': {'iupac': ['KGT', 'AHT', 'AGA']},\n", + " 'ATA': {'iupac': ['KTA', 'AVA', 'ATG']},\n", + " 'ATC': {'iupac': ['BTC', 'AVC', 'ATG']},\n", + " 'ATG': {'iupac': ['KTG', 'AVG', 'ATT']},\n", + " 'ATT': {'iupac': ['BTT', 'AVT', 'ATG']},\n", + " 'CAA': {'iupac': ['DAA', 'CBA', 'CAT']},\n", + " 'CAC': {'iupac': ['DAC', 'CBC', 'CAA']},\n", + " 'CAG': {'iupac': ['DAG', 'CBG', 'CAT']},\n", + " 'CAT': {'iupac': ['DAT', 'CBT', 'CAA']},\n", + " 'CCA': {'iupac': ['DCA', 'CDA']},\n", + " 'CCC': {'iupac': ['DCC', 'CDC']},\n", + " 'CCG': {'iupac': ['DCG', 'CDG']},\n", + " 'CCT': {'iupac': ['DCT', 'CDT']},\n", + " 'CGA': 
{'iupac': ['KGA', 'CHA']},\n", + " 'CGC': {'iupac': ['DGC', 'CHC']},\n", + " 'CGG': {'iupac': ['KGG', 'CHG']},\n", + " 'CGT': {'iupac': ['DGT', 'CHT']},\n", + " 'CTA': {'iupac': ['RTA', 'CVA']},\n", + " 'CTC': {'iupac': ['DTC', 'CVC']},\n", + " 'CTG': {'iupac': ['RTG', 'CVG']},\n", + " 'CTT': {'iupac': ['DTT', 'CVT']},\n", + " 'GAA': {'iupac': ['HAA', 'GBA', 'GAT']},\n", + " 'GAC': {'iupac': ['HAC', 'GBC', 'GAA']},\n", + " 'GAG': {'iupac': ['HAG', 'GBG', 'GAT']},\n", + " 'GAT': {'iupac': ['HAT', 'GBT', 'GAA']},\n", + " 'GCA': {'iupac': ['HCA', 'GDA']},\n", + " 'GCC': {'iupac': ['HCC', 'GDC']},\n", + " 'GCG': {'iupac': ['HCG', 'GDG']},\n", + " 'GCT': {'iupac': ['HCT', 'GDT']},\n", + " 'GGA': {'iupac': ['WGA', 'GHA']},\n", + " 'GGC': {'iupac': ['HGC', 'GHC']},\n", + " 'GGG': {'iupac': ['WGG', 'GHG']},\n", + " 'GGT': {'iupac': ['HGT', 'GHT']},\n", + " 'GTA': {'iupac': ['WTA', 'GVA']},\n", + " 'GTC': {'iupac': ['HTC', 'GVC']},\n", + " 'GTG': {'iupac': ['WTG', 'GVG']},\n", + " 'GTT': {'iupac': ['HTT', 'GVT']},\n", + " 'TAA': {'iupac': ['VAA', 'TYA', 'TAT']},\n", + " 'TAC': {'iupac': ['VAC', 'TBC', 'TAA']},\n", + " 'TAG': {'iupac': ['VAG', 'TBG', 'TAT']},\n", + " 'TAT': {'iupac': ['VAT', 'TBT', 'TAA']},\n", + " 'TCA': {'iupac': ['VCA', 'TWA']},\n", + " 'TCC': {'iupac': ['VCC', 'TDC']},\n", + " 'TCG': {'iupac': ['VCG', 'TDG']},\n", + " 'TCT': {'iupac': ['VCT', 'TDT']},\n", + " 'TGA': {'iupac': ['RGA', 'TYA', 'TGK']},\n", + " 'TGC': {'iupac': ['SGC', 'THC', 'TGR']},\n", + " 'TGG': {'iupac': ['RGG', 'THG', 'TGM']},\n", + " 'TGT': {'iupac': ['SGT', 'THT', 'TGR']},\n", + " 'TTA': {'iupac': ['RTA', 'TMA', 'TTT']},\n", + " 'TTC': {'iupac': ['RTC', 'TVC', 'TTG']},\n", + " 'TTG': {'iupac': ['RTG', 'TVG', 'TTT']},\n", + " 'TTT': {'iupac': ['RTT', 'TVT', 'TTG']}}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"21eb3a9a-231e-4ce3-b4ca-c364a68dc07d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3895e8f2-99c3-4632-a520-51ea78390360", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc9eed9-3ebf-4fc2-8213-c811a63816dd", + "metadata": {}, + "outputs": [], + "source": [ + "for key,value in temp_dict.items():\n", + " temp_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value]))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a355cbbc-595d-4fc1-b5a1-7766c1045840", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AAA': {0: {'sele_iupac_codon': 'BAA', 'sele_aa': 'QE*'},\n", + " 1: {'sele_iupac_codon': 'ABA', 'sele_aa': 'TRI'},\n", + " 2: {'sele_iupac_codon': 'AAT', 'sele_aa': 'N'}},\n", + " 'AAC': {3: {'sele_iupac_codon': 'BAC', 'sele_aa': 'HDY'},\n", + " 4: {'sele_iupac_codon': 'ABC', 'sele_aa': 'TSI'},\n", + " 5: {'sele_iupac_codon': 'AAA', 'sele_aa': 'K'}},\n", + " 'AAG': {6: {'sele_iupac_codon': 'BAG', 'sele_aa': 'QE*'},\n", + " 7: {'sele_iupac_codon': 'ABG', 'sele_aa': 'TRM'},\n", + " 8: {'sele_iupac_codon': 'AAT', 'sele_aa': 'N'}},\n", + " 'AAT': {9: {'sele_iupac_codon': 'BAT', 'sele_aa': 'HDY'},\n", + " 10: {'sele_iupac_codon': 'ABT', 'sele_aa': 'TSI'},\n", + " 11: {'sele_iupac_codon': 'AAA', 'sele_aa': 'K'}},\n", + " 'ACA': {12: {'sele_iupac_codon': 'BCA', 'sele_aa': 'PAS'},\n", + " 13: {'sele_iupac_codon': 'ADA', 'sele_aa': 'KRI'},\n", + " 14: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'ACC': {15: {'sele_iupac_codon': 'BCC', 'sele_aa': 'PAS'},\n", + " 16: {'sele_iupac_codon': 'AWC', 'sele_aa': 'NI'},\n", + " 17: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'ACG': {18: {'sele_iupac_codon': 'BCG', 'sele_aa': 'PAS'},\n", + " 19: {'sele_iupac_codon': 'ADG', 'sele_aa': 'KRM'},\n", + " 20: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 
'ACT': {21: {'sele_iupac_codon': 'BCT', 'sele_aa': 'PAS'},\n", + " 22: {'sele_iupac_codon': 'AWT', 'sele_aa': 'NI'},\n", + " 23: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'AGA': {24: {'sele_iupac_codon': 'KGA', 'sele_aa': 'G*'},\n", + " 25: {'sele_iupac_codon': 'AHA', 'sele_aa': 'KTI'},\n", + " 26: {'sele_iupac_codon': 'AGT', 'sele_aa': 'S'}},\n", + " 'AGC': {27: {'sele_iupac_codon': 'KGC', 'sele_aa': 'GC'},\n", + " 28: {'sele_iupac_codon': 'AHC', 'sele_aa': 'NTI'},\n", + " 29: {'sele_iupac_codon': 'AGA', 'sele_aa': 'R'}},\n", + " 'AGG': {30: {'sele_iupac_codon': 'KGG', 'sele_aa': 'GW'},\n", + " 31: {'sele_iupac_codon': 'AHG', 'sele_aa': 'KTM'},\n", + " 32: {'sele_iupac_codon': 'AGT', 'sele_aa': 'S'}},\n", + " 'AGT': {33: {'sele_iupac_codon': 'KGT', 'sele_aa': 'GC'},\n", + " 34: {'sele_iupac_codon': 'AHT', 'sele_aa': 'NTI'},\n", + " 35: {'sele_iupac_codon': 'AGA', 'sele_aa': 'R'}},\n", + " 'ATA': {36: {'sele_iupac_codon': 'KTA', 'sele_aa': 'LV'},\n", + " 37: {'sele_iupac_codon': 'AVA', 'sele_aa': 'KTR'},\n", + " 38: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'ATC': {39: {'sele_iupac_codon': 'BTC', 'sele_aa': 'LVF'},\n", + " 40: {'sele_iupac_codon': 'AVC', 'sele_aa': 'NTS'},\n", + " 41: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'ATG': {42: {'sele_iupac_codon': 'KTG', 'sele_aa': 'LV'},\n", + " 43: {'sele_iupac_codon': 'AVG', 'sele_aa': 'KTR'},\n", + " 44: {'sele_iupac_codon': 'ATT', 'sele_aa': 'I'}},\n", + " 'ATT': {45: {'sele_iupac_codon': 'BTT', 'sele_aa': 'LVF'},\n", + " 46: {'sele_iupac_codon': 'AVT', 'sele_aa': 'NTS'},\n", + " 47: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'CAA': {48: {'sele_iupac_codon': 'DAA', 'sele_aa': 'KE*'},\n", + " 49: {'sele_iupac_codon': 'CBA', 'sele_aa': 'PRL'},\n", + " 50: {'sele_iupac_codon': 'CAT', 'sele_aa': 'H'}},\n", + " 'CAC': {51: {'sele_iupac_codon': 'DAC', 'sele_aa': 'NDY'},\n", + " 52: {'sele_iupac_codon': 'CBC', 'sele_aa': 'PRL'},\n", + " 53: {'sele_iupac_codon': 'CAA', 'sele_aa': 
'Q'}},\n", + " 'CAG': {54: {'sele_iupac_codon': 'DAG', 'sele_aa': 'KE*'},\n", + " 55: {'sele_iupac_codon': 'CBG', 'sele_aa': 'PRL'},\n", + " 56: {'sele_iupac_codon': 'CAT', 'sele_aa': 'H'}},\n", + " 'CAT': {57: {'sele_iupac_codon': 'DAT', 'sele_aa': 'NDY'},\n", + " 58: {'sele_iupac_codon': 'CBT', 'sele_aa': 'PRL'},\n", + " 59: {'sele_iupac_codon': 'CAA', 'sele_aa': 'Q'}},\n", + " 'CCA': {60: {'sele_iupac_codon': 'DCA', 'sele_aa': 'TAS'},\n", + " 61: {'sele_iupac_codon': 'CDA', 'sele_aa': 'QRL'},\n", + " 62: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCC': {63: {'sele_iupac_codon': 'DCC', 'sele_aa': 'TAS'},\n", + " 64: {'sele_iupac_codon': 'CDC', 'sele_aa': 'HRL'},\n", + " 65: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCG': {66: {'sele_iupac_codon': 'DCG', 'sele_aa': 'TAS'},\n", + " 67: {'sele_iupac_codon': 'CDG', 'sele_aa': 'QRL'},\n", + " 68: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCT': {69: {'sele_iupac_codon': 'DCT', 'sele_aa': 'TAS'},\n", + " 70: {'sele_iupac_codon': 'CDT', 'sele_aa': 'HRL'},\n", + " 71: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGA': {72: {'sele_iupac_codon': 'KGA', 'sele_aa': 'G*'},\n", + " 73: {'sele_iupac_codon': 'CHA', 'sele_aa': 'QPL'},\n", + " 74: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGC': {75: {'sele_iupac_codon': 'DGC', 'sele_aa': 'SGC'},\n", + " 76: {'sele_iupac_codon': 'CHC', 'sele_aa': 'HPL'},\n", + " 77: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGG': {78: {'sele_iupac_codon': 'KGG', 'sele_aa': 'GW'},\n", + " 79: {'sele_iupac_codon': 'CHG', 'sele_aa': 'QPL'},\n", + " 80: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGT': {81: {'sele_iupac_codon': 'DGT', 'sele_aa': 'SGC'},\n", + " 82: {'sele_iupac_codon': 'CHT', 'sele_aa': 'HPL'},\n", + " 83: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTA': {84: {'sele_iupac_codon': 'RTA', 'sele_aa': 'IV'},\n", + " 85: {'sele_iupac_codon': 'CVA', 'sele_aa': 'QPR'},\n", + " 86: {'sele_iupac_codon': nan, 
'sele_aa': nan}},\n", + " 'CTC': {87: {'sele_iupac_codon': 'DTC', 'sele_aa': 'IVF'},\n", + " 88: {'sele_iupac_codon': 'CVC', 'sele_aa': 'HPR'},\n", + " 89: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTG': {90: {'sele_iupac_codon': 'RTG', 'sele_aa': 'MV'},\n", + " 91: {'sele_iupac_codon': 'CVG', 'sele_aa': 'QPR'},\n", + " 92: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTT': {93: {'sele_iupac_codon': 'DTT', 'sele_aa': 'IVF'},\n", + " 94: {'sele_iupac_codon': 'CVT', 'sele_aa': 'HPR'},\n", + " 95: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GAA': {96: {'sele_iupac_codon': 'HAA', 'sele_aa': 'KQ*'},\n", + " 97: {'sele_iupac_codon': 'GBA', 'sele_aa': 'AGV'},\n", + " 98: {'sele_iupac_codon': 'GAT', 'sele_aa': 'D'}},\n", + " 'GAC': {99: {'sele_iupac_codon': 'HAC', 'sele_aa': 'NHY'},\n", + " 100: {'sele_iupac_codon': 'GBC', 'sele_aa': 'AGV'},\n", + " 101: {'sele_iupac_codon': 'GAA', 'sele_aa': 'E'}},\n", + " 'GAG': {102: {'sele_iupac_codon': 'HAG', 'sele_aa': 'KQ*'},\n", + " 103: {'sele_iupac_codon': 'GBG', 'sele_aa': 'AGV'},\n", + " 104: {'sele_iupac_codon': 'GAT', 'sele_aa': 'D'}},\n", + " 'GAT': {105: {'sele_iupac_codon': 'HAT', 'sele_aa': 'NHY'},\n", + " 106: {'sele_iupac_codon': 'GBT', 'sele_aa': 'AGV'},\n", + " 107: {'sele_iupac_codon': 'GAA', 'sele_aa': 'E'}},\n", + " 'GCA': {108: {'sele_iupac_codon': 'HCA', 'sele_aa': 'TPS'},\n", + " 109: {'sele_iupac_codon': 'GDA', 'sele_aa': 'EGV'},\n", + " 110: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCC': {111: {'sele_iupac_codon': 'HCC', 'sele_aa': 'TPS'},\n", + " 112: {'sele_iupac_codon': 'GDC', 'sele_aa': 'DGV'},\n", + " 113: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCG': {114: {'sele_iupac_codon': 'HCG', 'sele_aa': 'TPS'},\n", + " 115: {'sele_iupac_codon': 'GDG', 'sele_aa': 'EGV'},\n", + " 116: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCT': {117: {'sele_iupac_codon': 'HCT', 'sele_aa': 'TPS'},\n", + " 118: {'sele_iupac_codon': 'GDT', 'sele_aa': 'DGV'},\n", + " 
119: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGA': {120: {'sele_iupac_codon': 'WGA', 'sele_aa': 'R*'},\n", + " 121: {'sele_iupac_codon': 'GHA', 'sele_aa': 'EAV'},\n", + " 122: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGC': {123: {'sele_iupac_codon': 'HGC', 'sele_aa': 'SRC'},\n", + " 124: {'sele_iupac_codon': 'GHC', 'sele_aa': 'DAV'},\n", + " 125: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGG': {126: {'sele_iupac_codon': 'WGG', 'sele_aa': 'RW'},\n", + " 127: {'sele_iupac_codon': 'GHG', 'sele_aa': 'EAV'},\n", + " 128: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGT': {129: {'sele_iupac_codon': 'HGT', 'sele_aa': 'SRC'},\n", + " 130: {'sele_iupac_codon': 'GHT', 'sele_aa': 'DAV'},\n", + " 131: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTA': {132: {'sele_iupac_codon': 'WTA', 'sele_aa': 'IL'},\n", + " 133: {'sele_iupac_codon': 'GVA', 'sele_aa': 'EAG'},\n", + " 134: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTC': {135: {'sele_iupac_codon': 'HTC', 'sele_aa': 'ILF'},\n", + " 136: {'sele_iupac_codon': 'GVC', 'sele_aa': 'DAG'},\n", + " 137: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTG': {138: {'sele_iupac_codon': 'WTG', 'sele_aa': 'ML'},\n", + " 139: {'sele_iupac_codon': 'GVG', 'sele_aa': 'EAG'},\n", + " 140: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTT': {141: {'sele_iupac_codon': 'HTT', 'sele_aa': 'ILF'},\n", + " 142: {'sele_iupac_codon': 'GVT', 'sele_aa': 'DAG'},\n", + " 143: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TAA': {144: {'sele_iupac_codon': 'VAA', 'sele_aa': 'KQE'},\n", + " 145: {'sele_iupac_codon': 'TYA', 'sele_aa': 'SL'},\n", + " 146: {'sele_iupac_codon': 'TAT', 'sele_aa': 'Y'}},\n", + " 'TAC': {147: {'sele_iupac_codon': 'VAC', 'sele_aa': 'NHD'},\n", + " 148: {'sele_iupac_codon': 'TBC', 'sele_aa': 'SCF'},\n", + " 149: {'sele_iupac_codon': 'TAA', 'sele_aa': '*'}},\n", + " 'TAG': {150: {'sele_iupac_codon': 'VAG', 'sele_aa': 'KQE'},\n", + " 151: {'sele_iupac_codon': 
'TBG', 'sele_aa': 'SWL'},\n", + " 152: {'sele_iupac_codon': 'TAT', 'sele_aa': 'Y'}},\n", + " 'TAT': {153: {'sele_iupac_codon': 'VAT', 'sele_aa': 'NHD'},\n", + " 154: {'sele_iupac_codon': 'TBT', 'sele_aa': 'SCF'},\n", + " 155: {'sele_iupac_codon': 'TAA', 'sele_aa': '*'}},\n", + " 'TCA': {156: {'sele_iupac_codon': 'VCA', 'sele_aa': 'TPA'},\n", + " 157: {'sele_iupac_codon': 'TWA', 'sele_aa': '*L'},\n", + " 158: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCC': {159: {'sele_iupac_codon': 'VCC', 'sele_aa': 'TPA'},\n", + " 160: {'sele_iupac_codon': 'TDC', 'sele_aa': 'YCF'},\n", + " 161: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCG': {162: {'sele_iupac_codon': 'VCG', 'sele_aa': 'TPA'},\n", + " 163: {'sele_iupac_codon': 'TDG', 'sele_aa': '*WL'},\n", + " 164: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCT': {165: {'sele_iupac_codon': 'VCT', 'sele_aa': 'TPA'},\n", + " 166: {'sele_iupac_codon': 'TDT', 'sele_aa': 'YCF'},\n", + " 167: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TGA': {168: {'sele_iupac_codon': 'RGA', 'sele_aa': 'RG'},\n", + " 169: {'sele_iupac_codon': 'TYA', 'sele_aa': 'SL'},\n", + " 170: {'sele_iupac_codon': 'TGK', 'sele_aa': 'CW'}},\n", + " 'TGC': {171: {'sele_iupac_codon': 'SGC', 'sele_aa': 'RG'},\n", + " 172: {'sele_iupac_codon': 'THC', 'sele_aa': 'YSF'},\n", + " 173: {'sele_iupac_codon': 'TGR', 'sele_aa': '*W'}},\n", + " 'TGG': {174: {'sele_iupac_codon': 'RGG', 'sele_aa': 'RG'},\n", + " 175: {'sele_iupac_codon': 'THG', 'sele_aa': '*SL'},\n", + " 176: {'sele_iupac_codon': 'TGM', 'sele_aa': '*C'}},\n", + " 'TGT': {177: {'sele_iupac_codon': 'SGT', 'sele_aa': 'RG'},\n", + " 178: {'sele_iupac_codon': 'THT', 'sele_aa': 'YSF'},\n", + " 179: {'sele_iupac_codon': 'TGR', 'sele_aa': '*W'}},\n", + " 'TTA': {180: {'sele_iupac_codon': 'RTA', 'sele_aa': 'IV'},\n", + " 181: {'sele_iupac_codon': 'TMA', 'sele_aa': '*S'},\n", + " 182: {'sele_iupac_codon': 'TTT', 'sele_aa': 'F'}},\n", + " 'TTC': {183: {'sele_iupac_codon': 'RTC', 
'sele_aa': 'IV'},\n", + " 184: {'sele_iupac_codon': 'TVC', 'sele_aa': 'YSC'},\n", + " 185: {'sele_iupac_codon': 'TTG', 'sele_aa': 'L'}},\n", + " 'TTG': {186: {'sele_iupac_codon': 'RTG', 'sele_aa': 'MV'},\n", + " 187: {'sele_iupac_codon': 'TVG', 'sele_aa': '*SW'},\n", + " 188: {'sele_iupac_codon': 'TTT', 'sele_aa': 'F'}},\n", + " 'TTT': {189: {'sele_iupac_codon': 'RTT', 'sele_aa': 'IV'},\n", + " 190: {'sele_iupac_codon': 'TVT', 'sele_aa': 'YSC'},\n", + " 191: {'sele_iupac_codon': 'TTG', 'sele_aa': 'L'}}}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('codon')[['sele_iupac_codon','sele_aa']].apply(lambda x: x.to_dict(orient='index')).to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e69cf6e-2acc-4677-885a-c31ef092063e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80855e33-087e-4bfc-9ebf-31d67afa92d3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "fe40cc88-bde0-4e98-bbc3-8b2fd68d30e3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
positioniupacwt
011[HCG, GDG]GCG
112[HGT, GHT]GGT
213[HAT, GBT, GAA]GAT
314[WTA, GVA]GTA
415[KTA, AVA, ATG]ATA
\n", + "
" + ], + "text/plain": [ + " position iupac wt\n", + "0 11 [HCG, GDG] GCG\n", + "1 12 [HGT, GHT] GGT\n", + "2 13 [HAT, GBT, GAA] GAT\n", + "3 14 [WTA, GVA] GTA\n", + "4 15 [KTA, AVA, ATG] ATA" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check that iupac encodes all AAs for given position\n", + "\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "rev_iupac_dict = {value:key for key,value in iupac_dict.items()}\n", + "\n", + "df1 = df.groupby('position')['iupac'].apply(list).reset_index()\n", + "map_dict = dict(zip(df.position, df.wt))\n", + "df1['wt'] =df1.position.map(map_dict)\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea536e8-8331-4ed1-a459-22ecba07f126", + "metadata": {}, + "outputs": [], + "source": [ + "# get iupac aas\n", + "def iupac_aas(row):\n", + " iupac_codons = row['iupac']\n", + " for iupac in iupac_codons" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d71ecc0b-37b0-40e2-b36c-f288044d90ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{11: 'GCG',\n", + " 12: 'GGT',\n", + " 13: 'GAT',\n", + " 14: 'GTA',\n", + " 15: 'ATA',\n", + " 16: 'AAG',\n", + " 17: 'GGC',\n", + " 18: 'AGA',\n", + " 43: 'AGT',\n", + " 44: 'GTT',\n", + " 45: 'AAG',\n", + " 46: 'ATG',\n", + " 47: 'CAT',\n", + " 48: 'ATG',\n", + " 49: 'GAT',\n", + " 50: 'AGA',\n", + " 51: 'TAT',\n", + " 71: 'GAT',\n", + " 72: 'TAT',\n", + " 73: 'ACA',\n", + " 74: 'AAA',\n", + " 75: 'GGA',\n", + " 76: 'TAT',\n", + " 77: 'ATA',\n", + " 78: 'GAT',\n", + " 79: 'GTC',\n", + " 80: 'AAT',\n", + " 81: 'TAC',\n", + " 82: 'AAA',\n", + " 83: 'AGG',\n", + " 84: 'ATG',\n", + " 85: 'TGT',\n", + " 86: 'AGA',\n", + " 87: 'CAT',\n", + " 88: 'CAA'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "map_dict = dict(zip(df.position, df.wt))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f81b314-2635-42b2-98f8-422ed96826a0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/final_codon_table.csv b/.ipynb_checkpoints/final_codon_table-checkpoint.csv similarity index 100% rename from final_codon_table.csv rename to .ipynb_checkpoints/final_codon_table-checkpoint.csv diff --git a/.ipynb_checkpoints/main-checkpoint.ipynb b/.ipynb_checkpoints/main-checkpoint.ipynb index 6cd6698..09e0254 100644 --- a/.ipynb_checkpoints/main-checkpoint.ipynb +++ b/.ipynb_checkpoints/main-checkpoint.ipynb @@ -1,13 +1,14 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, - "id": "af77e84f-70ae-4906-87fc-b30f93696adf", + "cell_type": "markdown", + "id": "84354767-b851-46dc-814a-3c03b0b0c48d", "metadata": {}, - "outputs": [], "source": [ - "# maybe the dictionaries should be saved as a pkl file, or simple csv file" + "TODO\n", + "- need to duplicate into script\n", + "- need to locate the codon dictionaries from the main_package directory\n", + "- stop codon variable" ] }, { @@ -27,7 +28,7 @@ "import numpy as np\n", "import pandas as pd\n", "#import random\n", - "import main_package # my package\n" + "import main_package # my package" ] }, { @@ -82,6 +83,7 @@ "parser.add_argument(\"--melt_temp\", help=\"Melting temp of fwd primer\", type=int, default=50)\n", "parser.add_argument(\"--rev_melt_temp\", help=\"Melting temp of rev primer\", type=int, default=55)\n", 
"parser.add_argument(\"--syn_snp_rate\", help=\"Percentage of synonymous SNPs 0-1\", type=float, default=.05)\n", + "parser.add_argument(\"--stop_rate\", help=\"Percentage of stop codon SNPs, default = keep 10% of stop SNPs\", type=float, default=.10)\n", "parser.add_argument(\"--rng_seed\", help=\"Set seed for repoducibly selecting synonymous codon sites\", type=int, default=42)\n", "parser.add_argument(\"--out_dir\", help=\"Local output directory e.g. 'data/'\", type=str, default=.05)\n", "args = parser.parse_args()" @@ -90,76 +92,20 @@ { "cell_type": "code", "execution_count": 2, - "id": "b4c392be", - "metadata": {}, - "outputs": [], - "source": [ - "# generate missense codon dictionary\n", - "#missense_dict = main_package.codon_table.iupac_missense_codon_dict(codon_table=args.codon_table)\n", - "#missense_dict = main_package.codon_table.iupac_missense_codon_dict(codon_table='Standard')\n", - "missense_dict = main_package.codon_table.selected_iupac_codons_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "22e96932", - "metadata": {}, - "outputs": [], - "source": [ - "# generate synonymous missense codon dictionary\n", - "#main_package.codon_table.iupac_synonymous_codon_dict(codon_table=args.codon_table)\n", - "#synonymous_dict = main_package.codon_table.iupac_synonymous_codon_dict(codon_table='Standard')\n", - "synonymous_missense_dict = main_package.codon_table.synonymous_iupac_codons_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7dca8b90-84b9-4dd7-9b85-9a3f31c1a3c7", + "id": "dac45e16-75b2-4207-9f31-2a015d090d24", "metadata": {}, "outputs": [], "source": [ + "# generate iupac codon dictionaries to generate doped primers\n", + "missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict = main_package.codon_table.iupac_codon_dicts()\n", + "\n", "# generate yeast synonymous codon dictionary (no missense variants)\n", "yeast_synonymous_dict = main_package.codon_table.synonymous_yeast_codons_dict()" ] }, 
{ "cell_type": "code", - "execution_count": 5, - "id": "d5f1bb72", - "metadata": {}, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "expected an indented block (2364269451.py, line 15)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"/var/folders/fq/q1cflf795cgbnmbhdyn8d9nntyw5jt/T/ipykernel_9275/2364269451.py\"\u001b[0;36m, line \u001b[0;32m15\u001b[0m\n\u001b[0;31m # drop-in missense sub-window\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n" - ] - } - ], - "source": [ - "# parse .gb file, loop for each matching feature \"window\"\n", - "for window in file:\n", - " # check that window is divisible by 3, codons\n", - " # check for upstream homology and downstream primer space (20bp, 40bp)\n", - "\n", - " # assign sub-window start index value\n", - " # define which codons will contain synonymous controls at 5% frequency (based on wt)\n", - " # define synonymous codons (based on vector)\n", - " # (create all the codon variants immediately, then define subwindows based on primers)\n", - " \n", - " # begin sub-window loop:\n", - " # homology arm\n", - " # primer design\n", - " # redefine sub-window start index\n", - " # drop-in missense sub-window" - ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "a7ed0c7d", "metadata": {}, "outputs": [], @@ -180,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "8294e447-8d42-45d7-991f-4ae9b1eaf188", "metadata": {}, "outputs": [ @@ -194,29 +140,16 @@ } ], "source": [ - "# get codon positions\n", + "# get start and stop of gene for codon positions\n", "for feature in wt_file.features:\n", " if feature.type == 'gene':\n", " gene_start = feature.location.start.position\n", - " gene_end = feature.location.end.position\n", - " print(gene_start)\n", - " print(gene_end)" + " gene_end = feature.location.end.position" ] }, { "cell_type": 
"code", - "execution_count": 55, - "id": "8ed1b7bd-4781-4276-80e3-36a7742ec491", - "metadata": {}, - "outputs": [], - "source": [ - "# create dataframe\n", - "df = pd.DataFrame()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "c22b4eed", "metadata": {}, "outputs": [], @@ -235,45 +168,42 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "74cc27ed-1256-44c2-8b54-4eabcee29f3f", "metadata": {}, "outputs": [], "source": [ - "\n", + "# hardcode test variables\n", "homo_len = 20 ### args.homo_len\n", "rev_melt_temp = 55 ### args.rev_melt_temp\n", "primer_len = 60 ### args.primer_len\n", "melt_temp = 50 ### args.melt_temp\n", - "syn_snp_rate = .05 ### args.syn_snp_rate" + "syn_snp_rate = .05 ### args.syn_snp_rate\n", + "stop_rate = .1 ### args.stop_rate\n", + "output_prefix = 'k3l_test' # maybe have a prefix for output" ] }, { "cell_type": "code", - "execution_count": 59, - "id": "9d651f93", + "execution_count": 10, + "id": "57df82ff-a49c-4075-8691-c8284c2528b1", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"\\n'primer':,\\n'primer_tm':,\\n'primer_gc':,\\n'primer_len':,\\n'rev_temp':,\\n'rev_gc':,\\n'rev_primer_len':, \\n\"" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### loop through the window\n", "\n", - "# setup RNG in the loop, maybe this needs to be moved\n", + "# create empty dataframe\n", + "df = pd.DataFrame()\n", + "\n", + "# set RNG with seed \n", "rng = np.random.RandomState(42)\n", "\n", "# this needs to be fixed (user input? 
yaml?)\n", "targ_windows = ['window_1', 'window_2', 'window_3']\n", "\n", + "# setup .fa output, truncate if file exists\n", + "file = open(f\"{output_prefix}.fa\",'w+')\n", + "\n", "for feature in wt_file.features:\n", " if feature.type not in targ_windows:\n", " continue\n", @@ -292,26 +222,34 @@ " start_index = feature.location.start.position\n", " window_end = feature.location.end.position\n", " \n", - " # loop for each mini_window\n", - " mini_win = 1\n", + " # loop for each sub_window\n", + " sub_window_n = 1\n", " while start_index < window_end: # this could be an issue to toggle\n", " data_dict = {}\n", + " data_dict['start_index'] = start_index\n", + " data_dict['sub_window_name'] = {str(feature.type)}-{sub_window_n}\n", " \n", " # 1. homology arm\n", + " # INPUT: data dictionary, args (homo_len)\n", + " # OUTPUT: data dictionary\n", " homology_arm = vector_seq[start_index - homo_len:start_index] ### args.homo_len\n", " data_dict['homology_arm'] = homology_arm\n", " \n", " # 2. reverse primer\n", + " # INPUT: data dictionary\n", + " # OUTPUT: data dictionary\n", " reverse_seq = str(Seq(vector_seq[:start_index]).reverse_complement())\n", " reverse_primer = reverse_seq[:15]\n", " while mt.Tm_NN(reverse_primer) < rev_melt_temp: ### args.rev_melt_temp\n", " reverse_primer = reverse_seq[:len(reverse_primer)+1] ### args.rev_melt_temp\n", " data_dict['reverse_primer'] = reverse_primer\n", " \n", - " reverse_primer_name = f'rev_{str(feature.type)}-{mini_win}'\n", + " reverse_primer_name = f'rev_{str(feature.type)}-{sub_window_n}'\n", " data_dict['reverse_primer_name'] = reverse_primer_name\n", " \n", " # 3. forward primer\n", + " # INPUT: data dictionary, start index\n", + " # OUTPUT: \n", " primer_end = start_index + (primer_len - homo_len) ### args.primer_len homo_len\n", " if primer_end > window_end:\n", " primer_end == window_end\n", @@ -358,6 +296,8 @@ " data_dict['forward_primer'] = forward_primer\n", " \n", " # 4. 
variant window\n", + " # INPUT: data dictionary, .tsv output, .fa output\n", + " # OUTPUT: dataframe, .fa file lines\n", " mut_len = (primer_start) - start_index\n", " mut_end = start_index + mut_len\n", "\n", @@ -369,269 +309,87 @@ " vect_list = codons_list(vector_seq[start_index:mut_end])\n", "\n", " # generate synonymous vector codon list (top 2 codons for yeast)\n", - " synonymous_win = [yeast_synonymous_dict[i] for i in vect_list]\n", + " synonymous_win = [yeast_synonymous_dict[i].lower() for i in vect_list]\n", " \n", " # generate iupac missense codons list (with synonymous codons) \n", " doped_codons = []\n", " for i, wt_codon in enumerate(wt_list): \n", - " syn_bool = rng.choice([True, False], p=[syn_snp_rate, 1-syn_snp_rate])\n", - " data_dict['synonymous_included'] = syn_bool\n", - " if n < syn_snp_rate: ### args.syn_snp_rate\n", - " doped_codons.append(synonymous_missense_dict[wt_codon])\n", + " syn_bool = rng.choice([True, False], p=[syn_snp_rate, 1-syn_snp_rate]) ### args.syn_snp_rate\n", + " data_dict['synonymous_codons'] = syn_bool\n", + " \n", + " no_stop_bool = rng.choice([True, False], p=[stop_rate, 1-stop_rate]) ### args.stop_rate\n", + " data_dict['no_stop_codons'] = no_stop_bool\n", + " # missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict\n", + " if syn_bool and no_stop_bool:\n", + " # use no_stop_syn_dictionary\n", + " doped_codons.append(no_stop_syn_dict[wt_codon])\n", + " elif syn_bool and not no_stop_bool:\n", + " # use synonymous_dictionary\n", + " doped_codons.append(synonymous_dict[wt_codon])\n", + " elif no_stop_bool and not syn_bool: \n", + " # use no_stop_dict\n", + " doped_codons.append(no_stop_dict[wt_codon])\n", " else:\n", + " # use missense dict\n", " doped_codons.append(missense_dict[wt_codon])\n", - "\n", + " \n", " # generate the mut primer and all info\n", " for i, iupac_list in enumerate(doped_codons):\n", - " res_pos = int((((start_index-gene_start)/3)+1)+i)\n", + " aa_position = 
int((((start_index-gene_start)/3)+1)+i)\n", " for iupac_codon in iupac_list:\n", "\n", - " codon_sub = wt_list[i] + str(res_pos) + iupac_codon\n", + " codon_sub = wt_list[i] + str(aa_position) + iupac_codon\n", " variant_win = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:])\n", - " primer_name = f'{str(feature.type)}-{mini_win}_{codon_sub}'\n", + " primer_name = f'{str(feature.type)}-{sub_window_n}_{codon_sub}'\n", " primer = homology_arm + variant_win + forward_primer\n", " \n", - " dict_keys = ['name','codon_sub','wt','pos','iupac']\n", - " dict_values = [primer_name, codon_sub, wt_list[i], res_pos, iupac_codon]\n", + " # drop iupac_codon into sub_window\n", + " sub_window = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:])\n", + " full_primer = homology_arm + sub_window + forward_primer\n", + " \n", + " dict_keys = ['name','codon_sub','wt','position','iupac', 'sub_window', 'primer']\n", + " dict_values = [primer_name, codon_sub, wt_list[i], aa_position, iupac_codon, sub_window, full_primer]\n", " for (key,value) in zip(dict_keys,dict_values):\n", " data_dict[key] = value\n", "\n", " # append values to dataframe\n", - " df.append(data_dict, ignore_index=True)\n", + " df = df.append(data_dict, ignore_index=True)\n", + " \n", + " # write out to .fa\n", + " file.writelines([f\">{primer_name}\\n\", f\"{full_primer}\\n\"])\n", " \n", " # reset the start index for the next mini-window\n", " start_index = primer_start\n", - " mini_win += 1 \n", - "\n", - "dict_keys = ['name','codon_sub','wt','pos','iupac','syn_bool','homo','variant_win','primer','rev_primer_name','rev_primer'] \n", + " sub_window_n += 1 \n", " \n", - "# organize data into .tsv \n", + "file.close()\n", "\n", - "# to add\n", - "\"\"\"\n", - "'primer':,\n", - "'primer_tm':,\n", - "'primer_gc':,\n", - "'primer_len':,\n", - "'rev_temp':,\n", - "'rev_gc':,\n", - "'rev_primer_len':, \n", - "\"\"\"\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": 
"52608f32-3365-43e9-b60a-2543e4455067", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "f35f500a-fefc-4cfb-8a70-0587857aeaaf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1920" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "window_end" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "3ba47ee2-86b4-4405-9acc-663a50d78d98", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "70.0" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(start_index-gene_start)/3\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1661f4a3-8d06-4f18-9f81-40fa7f79b0ed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq('TATTCGTTGCCCAAT')" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Seq('ATTGGGCAACGAATA').reverse_complement()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "a09b5daa-bcc6-4d8a-903e-846785ec0b53", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"'CGAACGCCAGCAAGACGTAGCCCAGCGCGTCGGCCGCCATGCCGGCGATAATGGCCTGCTTCTCGCCGAAACGTTTGGTGGCGGGACCAGTGACGAAGGCTTGAGCGAGGGCGTGCAAGATTCCGAATACCGCAAGCGACAGGCCGATCATCGTCGCGCTCCAGCGAAAGCGGTCCTCGCCGAAAATGACCCAGAGCGCTGCCGGCACCTGTCCTACGAGTTGCATGATAAAGAAGACAGTCATAAGTGCGGCGACGATAGTCATGCCCCGCGCCCACCGGAAGGAGCTGACTGGGTTGAAGGCTCTCAAGGGCATCGGTCGACGCTCTCCCTTATGCGACTCCTGCATTAGGAAGCAGCCCAGTAGTAGGTTGAGGCCGTTGAGCACCGCCGCCGCAAGGAATGGTGCATGCAAGGAGATGGCGCCCAACAGTCCCCCGGCCACGGGGCCTGCCACCATACCCACGCCGAAACAAGCGCTCATGAGCCCGAAGTGGCGAGCCCGATCTTCCCCATCGGTGATGTCGGCGATATAGGCGCCAGCAACCGCACCTGTGGCGCCGGTGATGCCGGCCACGATGCGTCCGGCGTAGAGGATCCACAGGACGGGTGTGGTCGCCATGATCGCGTAGTCGATAGTGGCTCCAAGTAGCGAAGCGAGCAGGACTGGGCGGCGGCCAAAGCGGTCGGACAGTGCTCCGAGAACGGGTGCGCATAGAAATTGCATCAACGCATATAGCGCTAGCAGCACGCCATAGTGACTGGCGATGCTGTCGGAATGGACGATATCCCGCAAGAGGCCCGGCAGTACCGGCATAACCAAGCCTATGCCTACAGCATCCAGGGTGACGGTGCCGAGGATGACGATGAGCGCATTGTTAGATTTCATACACGGTGCCTGACTGCGTTAGCAATTTAACTGTGATAAACTACCGCATTAAAGCTTATCGATGAGCTCCTTATGCGGATCTGTAGCAGCTGTCATTATCAATACTGCCATTTCAAAGAATACGTAAATAATTAATAGTAGTGATTTTCCTAACTTTATTTAGTCAAAAAATTAGCCTTTTAATTCTGCTGTAACCCGTACATGCCCAAAATAGGGGGCGGGTTACACAGAATATATAACATCGTAGGTGTCTGGGTGAACAGTTTATTCCTGGCATCCACTAAATATAATGGAGCCCGCTTTTTAAGCTGGCATCCAGAAAAAAAAAGAATCCCAGCACCAAAATATTGTTTTCTTCACCAACCATCAGTTCATAGGTCCATTCTCTTAGCGCAACTACAGAGAACAGGGGCACAAACAGGCAAAAAACGGGCACAACCTCAATGGAGTGATGCAACCTGCCTGGAGTAAATGATGACACAAGGCAATTGACCCACGCATGTATCTATCTCATTTTCTTACACCTTCTATTACCTTCTGCTCTCTCTGATTTGGAAAAAGCTGAAAAAAAAGGTTGAAACCAGTTCCCTGAAATTATTCCCCTACTTGACTAATAAGTATATAAAGACGGTAGGTATTGATTGTAATTCTGTAAATCTATTTCTTAAACTTCTTAAATTCTACTTTTATAGTTAGTCTTTTTTTTAGTTTTAAAACACCAAGAACTTAGTTTCGAATAAACACACATAAACAAACAAAACGCGTCCATGCTTGCATTTTGTTATTCGTTGCCCAATGCGGGTGATGTAATAAAGGGCAGAGTATACGAGAAGGATTATGCTCTATATATTTATCTTTTTGACTATCCTCACTTTGAAGCTATCTTGGCAGAGAGTGTTAAGATGCATATGGATAGATATGTTGAATATAGGGATAAACTGGTAGGGAAAACTGTAAAAGTTAAAGTGATTAGAGTTGATTATACAAAAGGATATATAGATGTCAATTACAAAAGGATGTGTAGACATCAATAGTAGCTGTCGAGTCGCAGCTCTATATAAACTCATTTACTTATGTAGGAATAAAGAGTATCATCTTTCAAAGTTAGC
CGAGCATACAGATGGGTCTGTCNNNNNAANNNNNAANNNNNTTNNNNNGCGGCCGCTGATTGTTGTACAGAAACTTGGTCAC'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vector_seq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ee773f3-46b3-4cef-8441-a284f450fc40", - "metadata": {}, - "outputs": [], - "source": [ - "win_list\n", - "homo_list\n", - "rev_primer_list\n", - "fwd_primer_list\n", - "mut_windows_list\n", - "mut_name_list\n", - "full_window_list\n", - "full_name_list\n", - "full_primer_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e123580f", - "metadata": {}, - "outputs": [], - "source": [ - "# need to double-check the conditional logic on the indices" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "cf375e02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'GATTATACAAAAGGATATATAGATGTCAATTACAAAAGGATGTGTAGACATCAA'" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wt_seq[start_index:window_end]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "add9a201", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1923" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "feature.location.start.position" + "# polish dataframe\n", + "df['position'] = df['position'].astype(int)\n", + "df.drop(columns=['start_index'], inplace=True)\n", + "\n", + "df['forward_primer_tm'] = df['forward_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['forward_primer_gc'] = df['forward_primer'].apply(GC).round(1)\n", + "df['forward_primer_len'] = df['forward_primer'].str.len()\n", + "\n", + "df['reverse_primer_tm'] = df['reverse_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['reverse_primer_gc'] = df['reverse_primer'].apply(GC).round(1)\n", + "df['reverse_primer_len'] = 
df['reverse_primer'].str.len()\n", + "\n", + "cols = ['name','sub_window_name','wt','position','iupac','codon_sub','synonymous_codons','no_stop_codons','primer','homology_arm','sub_window','forward_primer','forward_primer_tm','forward_primer_gc','forward_primer_len','reverse_primer','reverse_primer_name','reverse_primer_tm','reverse_primer_gc','reverse_primer_len']\n", + "df = df[cols]\n", + "\n", + "# save dataframe as .tsv\n", + "df.to_csv(f'{output_prefix}.tsv', index=False, sep='\\t')" ] }, { "cell_type": "code", "execution_count": null, - "id": "3096168c", + "id": "d1ca397e-cfc5-4c72-bf0a-de554217481d", "metadata": {}, "outputs": [], - "source": [ - "# SKETCH: aggregate into dataframe" - ] + "source": [] } ], "metadata": { diff --git a/.ipynb_checkpoints/main-script-checkpoint.ipynb b/.ipynb_checkpoints/main-script-checkpoint.ipynb new file mode 100644 index 0000000..f3f711b --- /dev/null +++ b/.ipynb_checkpoints/main-script-checkpoint.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "08ba2028-5b81-4d9b-93fc-37efa999a608", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "import argparse\n", + "from Bio.SeqUtils import GC\n", + "from Bio.SeqUtils import MeltingTemp as mt\n", + "from Bio.Seq import Seq\n", + "from Bio import SeqIO\n", + "import math\n", + "import numpy as np\n", + "import pandas as pd\n", + "import main_package # my package\n", + "\n", + "# parse arguments\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument(\"wt\", help=\"Genbank file path containing wild type (WT) sequence\", type=str)\n", + "parser.add_argument(\"o\", help=\"Output prefix\", type=str)\n", + "parser.add_argument(\"--vector\", help=\"Genbank file path containing vector sequence\", type=str, default=False)\n", + "parser.add_argument(\"--codon_table\", help=\"Specify codon table to use\", type=str, default='Standard')\n", + "parser.add_argument(\"--homo_len\", help=\"Length of homology arm in 
fwd primer\", type=int, default=20)\n", + "parser.add_argument(\"--oligo_len\", help=\"Ideal max total length of oligo\", type=int, default=60)\n", + "parser.add_argument(\"--melt_temp\", help=\"Melting temp of fwd primer\", type=int, default=50)\n", + "parser.add_argument(\"--rev_melt_temp\", help=\"Melting temp of rev primer\", type=int, default=55)\n", + "parser.add_argument(\"--syn_snp_rate\", help=\"Percentage of synonymous SNPs 0-1\", type=float, default=.05)\n", + "parser.add_argument(\"--stop_rate\", help=\"Percentage of stop codon SNPs, default = keep 10% of stop SNPs\", type=float, default=.10)\n", + "parser.add_argument(\"--rng_seed\", help=\"Set seed for repoducibly selecting synonymous codon sites\", type=int, default=42)\n", + "parser.add_argument(\"--out_dir\", help='Local output directory e.g. \"data\"', type=str)\n", + "args = parser.parse_args()\n", + "\n", + "# parse genbank files\n", + "wt_file = SeqIO.read(args.wt, 'genbank')\n", + "\n", + "# check for vector file\n", + "if not args.vector:\n", + " args.vector = args.wt\n", + "vector_file = SeqIO.read(vector_input, 'genbank')\n", + " \n", + "wt_seq = str(wt_file.seq.upper())\n", + "vector_seq = str(vector_file.seq.upper())\n", + "\n", + "# ERROR CHECKS\n", + "if len(wt_seq) != len(vector_seq):\n", + " print('ERROR: WildType and Vector GenBank sequences are not of equal length')\n", + " return\n", + "# check for -20 bp homology\n", + "# check that the strand is going forward\n", + "\n", + "# get start and stop of gene for codon positions\n", + "for feature in wt_file.features:\n", + " if feature.type == 'gene':\n", + " gene_start = feature.location.start.position\n", + " gene_end = feature.location.end.position\n", + "\n", + "# setup seq_data\n", + "seq_data = {}\n", + "seq_data['wt_seq'] = wt_seq\n", + "seq_data['vector_seq'] = vector_seq\n", + "seq_data['gene_start'] = gene_start\n", + "seq_data['gene_end'] = gene_end\n", + "seq_data['fasta_file'] = []\n", + "seq_data['df'] = 
pd.DataFrame()\n", + "seq_data['rng'] = np.random.RandomState(42)\n", + "\n", + "# this needs to be fixed (user input? yaml?)\n", + "targ_windows = ['window_1', 'window_2', 'window_3']\n", + "\n", + "for feature in wt_file.features:\n", + " if feature.type not in targ_windows:\n", + " continue\n", + " \n", + " start_index = feature.location.start.position\n", + " window_end = feature.location.end.position\n", + " \n", + " # loop for each sub_window\n", + " sub_window_n = 1\n", + " while start_index < window_end: # this could be an issue to toggle\n", + " data_dict = {}\n", + " data_dict['start_index'] = start_index\n", + " data_dict['window_end'] = window_end\n", + " data_dict['sub_window_name'] = {str(feature.type)}-{sub_window_n}\n", + " \n", + " # 1. homology arm\n", + " data_dict = main_package.primer_design.homology_arm(seq_data, data_dict, args)\n", + " \n", + " # 2. reverse primer\n", + " data_dict = main_package.primer_design.reverse_primer(seq_data, data_dict, args)\n", + " \n", + " # 3. forward primer\n", + " data_dict = forward_primer(seq_data, data_dict, args)\n", + " \n", + " # 4. 
variant window\n", + " seq_data, data_dict = main_package.primer_design.sub_window(seq_data, data_dict, args)\n", + " \n", + " # reset the start index for the next mini-window\n", + " start_index = primer_start\n", + " sub_window_n += 1 \n", + "\n", + "# setup .fa output, truncate if file exists\n", + "file = open(f\"{output_prefix}.fa\",'w+')\n", + "file.writelines(seq_data['fasta_file'])\n", + "file.close()\n", + "\n", + "# polish dataframe\n", + "df = seq_data['df']\n", + "df['position'] = df['position'].astype(int)\n", + "\n", + "df['forward_primer_tm'] = df['forward_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['forward_primer_gc'] = df['forward_primer'].apply(GC).round(1)\n", + "df['forward_primer_len'] = df['forward_primer'].str.len()\n", + "\n", + "df['reverse_primer_tm'] = df['reverse_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['reverse_primer_gc'] = df['reverse_primer'].apply(GC).round(1)\n", + "df['reverse_primer_len'] = df['reverse_primer'].str.len()\n", + "\n", + "cols = ['name','sub_window_name','wt','position','iupac','codon_sub','synonymous_codons','no_stop_codons','primer','homology_arm','sub_window','forward_primer','forward_primer_tm','forward_primer_gc','forward_primer_len','reverse_primer','reverse_primer_name','reverse_primer_tm','reverse_primer_gc','reverse_primer_len']\n", + "df = df[cols]\n", + "\n", + "# save dataframe as .tsv\n", + "df.to_csv(f'{output_prefix}.tsv', index=False, sep='\\t')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "alignparse-environment", + "language": "python", + "name": "alignparse-environment" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/check_k3l_test.ipynb b/check_k3l_test.ipynb new file mode 100644 
index 0000000..1313e4a --- /dev/null +++ b/check_k3l_test.ipynb @@ -0,0 +1,2540 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 197, + "id": "fbb00f64-ecc8-412d-ae57-9a4036440f6a", + "metadata": {}, + "outputs": [], + "source": [ + "# check that k3l_test contains appropirate variants for k3l\n", + "import pandas as pd\n", + "from Bio.Seq import Seq" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "26ba439a-d6c4-4037-82a1-cd8c07350940", + "metadata": {}, + "outputs": [], + "source": [ + "# read script output\n", + "input_file = 'k3l_test.tsv'\n", + "df = pd.read_csv(input_file, sep='\\t')\n", + "df.fillna('', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "63e6329d-bd91-4b54-8c8a-670fb45e183d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesub_window_namewtpositioniupaccodon_subiupac_aasynonymous_codonsno_stop_codonsprimer...sub_windowforward_primerforward_primer_tmforward_primer_gcforward_primer_lenreverse_primer_namereverse_primerreverse_primer_tmreverse_primer_gcreverse_primer_len
4window_1-1_GAT13HATwindow_1-1GAT13HATGAT13HATNHY0.00.0TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGT......gcwggwHATgtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
5window_1-1_GAT13GBTwindow_1-1GAT13GBTGAT13GBTAGV0.00.0TTTGTTATTCGTTGCCCAATgcwggwGBTgtwatwaarGGCAGAGT......gcwggwGBTgtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
6window_1-1_GAT13GAAwindow_1-1GAT13GAAGAT13GAAE0.00.0TTTGTTATTCGTTGCCCAATgcwggwGAAgtwatwaarGGCAGAGT......gcwggwGAAgtwatwaarGGCAGAGTATACGAGAAGGATT51.345.522rev_window_1-1ATTGGGCAACGAATAACAAAATGCA55.136.025
\n", + "

3 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " name sub_window_name wt position iupac codon_sub \\\n", + "4 window_1-1_GAT13HAT window_1-1 GAT 13 HAT GAT13HAT \n", + "5 window_1-1_GAT13GBT window_1-1 GAT 13 GBT GAT13GBT \n", + "6 window_1-1_GAT13GAA window_1-1 GAT 13 GAA GAT13GAA \n", + "\n", + " iupac_aa synonymous_codons no_stop_codons \\\n", + "4 NHY 0.0 0.0 \n", + "5 AGV 0.0 0.0 \n", + "6 E 0.0 0.0 \n", + "\n", + " primer ... sub_window \\\n", + "4 TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGT... ... gcwggwHATgtwatwaar \n", + "5 TTTGTTATTCGTTGCCCAATgcwggwGBTgtwatwaarGGCAGAGT... ... gcwggwGBTgtwatwaar \n", + "6 TTTGTTATTCGTTGCCCAATgcwggwGAAgtwatwaarGGCAGAGT... ... gcwggwGAAgtwatwaar \n", + "\n", + " forward_primer forward_primer_tm forward_primer_gc \\\n", + "4 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "5 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "6 GGCAGAGTATACGAGAAGGATT 51.3 45.5 \n", + "\n", + " forward_primer_len reverse_primer_name reverse_primer \\\n", + "4 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "6 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA \n", + "\n", + " reverse_primer_tm reverse_primer_gc reverse_primer_len \n", + "4 55.1 36.0 25 \n", + "5 55.1 36.0 25 \n", + "6 55.1 36.0 25 \n", + "\n", + "[3 rows x 21 columns]" + ] + }, + "execution_count": 199, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.query('position == 13')" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "1005e0a7-f88d-4d96-a82f-2af7975d45c8", + "metadata": {}, + "outputs": [], + "source": [ + "# gather iupac-encoded aa missense variants by position\n", + "df1 = df.groupby('position')['iupac_aa'].apply(list).reset_index()\n", + "map_dict = dict(zip(df.position, df.wt))\n", + "df1['wt_codon'] =df1.position.map(map_dict)\n", + "df1.iupac_aa = df1.iupac_aa.str.join('').str.split('')\n", + "df1.iupac_aa = df1.iupac_aa.apply(lambda x: set(x))\n", + "df1.iupac_aa.apply(lambda x: x.remove(''))\n", 
+ "df1['wt_aa'] = df1.wt_codon.apply(lambda x: str(Seq(x).translate()))" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "73d853dc-ab2d-4161-8977-103a95824fdc", + "metadata": {}, + "outputs": [], + "source": [ + "# get the missense variants for the wt codon\n", + "def aa_missense_variants(codon): \n", + " nucleotides = 'ACGT'\n", + " wt_aa = str(Seq(codon).translate())\n", + " missense_aa = []\n", + " for position in range(3): \n", + " for n in nucleotides:\n", + " new_codon = codon[:position] + n + codon[position + 1:]\n", + " new_aa = str(Seq(new_codon).translate())\n", + " if new_aa != wt_aa:\n", + " missense_aa.append(new_aa)\n", + " else: \n", + " continue\n", + " return set(missense_aa)\n", + "df1['wt_missense'] = df1.wt_codon.apply(aa_missense_variants)\n", + "\n", + "# take sets and remove iupac_aa from wt_aa (reciprocal, may have synonymous in the iupac and stops in the wt)\n", + "#df1['difference'] = (df1.wt_aa - df1.iupac_aa)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "6583af6b-0349-4e05-ba94-d97f11346d4f", + "metadata": {}, + "outputs": [], + "source": [ + "df1['diff_1'] = df1.wt_missense - df1.iupac_aa\n", + "df1['diff_2'] = df1.iupac_aa - df1.wt_missense\n", + "\n", + "df1['sym_diff'] = df1.apply(lambda x: x['iupac_aa'].symmetric_difference(x['wt_missense']), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "afa9f8d5-429b-4756-9ed6-4b22ea0f4839", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
positioniupac_aawt_codonwt_aawt_missensediff_1diff_2sym_diff
011{S, G, P, V, E, T}GCGA{G, S, P, V, E, T}{}{}{}
112{S, C, D, R, V, A}GGTG{S, C, D, R, V, A}{}{}{}
213{G, N, V, A, E, Y, H}GATD{G, N, V, A, E, Y, H}{}{}{}
314{G, I, L, A, E}GTAV{G, I, L, A, E}{}{}{}
415{K, V, R, L, M, T}ATAI{K, V, R, L, M, T}{}{}{}
516{Q, N, K, R, *, E, M, T}AAGK{Q, N, R, *, E, M, T}{}{K}{K}
617{S, C, D, R, V, A}GGCG{S, C, D, R, V, A}{}{}{}
718{G, S, I, K, *, T}AGAR{G, S, I, K, *, T}{}{}{}
843{G, I, N, C, R, T}AGTS{G, I, N, C, R, T}{}{}{}
944{G, I, D, L, A, F}GTTV{G, I, D, L, A, F}{}{}{}
1045{Q, N, R, *, E, M, T}AAGK{Q, N, R, *, E, M, T}{}{}{}
1146{I, K, V, R, L, T}ATGM{I, K, V, R, L, T}{}{}{}
1247{Q, N, P, D, R, L, Y}CATH{Q, N, P, D, R, L, Y}{}{}{}
1348{I, K, V, R, L, T}ATGM{I, K, V, R, L, T}{}{}{}
1449{G, N, V, A, E, Y, H}GATD{G, N, V, A, E, Y, H}{}{}{}
1550{G, S, I, K, *, T}AGAR{G, S, I, K, *, T}{}{}{}
1651{S, N, C, D, *, F, H}TATY{S, N, C, D, *, F, H}{}{}{}
1771{G, N, V, A, E, Y, H}GATD{G, N, V, A, E, Y, H}{}{}{}
1872{S, N, C, D, F, H}TATY{S, N, C, D, *, F, H}{*}{}{*}
1973{S, I, K, P, R, A}ACAT{S, I, K, P, R, A}{}{}{}
2074{Q, I, N, R, *, E, T}AAAK{Q, I, N, R, *, E, T}{}{}{}
2175{G, V, R, *, A, E}GGAG{V, R, *, A, E}{}{G}{G}
2276{S, N, C, D, *, F, H}TATY{S, N, C, D, *, F, H}{}{}{}
2377{K, V, R, L, M, T}ATAI{K, V, R, L, M, T}{}{}{}
2478{G, N, V, A, E, Y, H}GATD{G, N, V, A, E, Y, H}{}{}{}
2579{G, I, D, L, A, F}GTCV{G, I, D, L, A, F}{}{}{}
2680{S, I, K, D, T, Y, H}AATN{S, I, K, D, T, Y, H}{}{}{}
2781{S, N, C, D, *, F, H}TACY{S, N, C, D, *, F, H}{}{}{}
2882{Q, I, N, R, *, E, T}AAAK{Q, I, N, R, *, E, T}{}{}{}
2983{G, W, S, K, R, M, T}AGGR{G, W, S, K, M, T}{}{R}{R}
3084{I, K, V, R, L, T}ATGM{I, K, V, R, L, T}{}{}{}
3185{G, S, W, R, *, F, Y}TGTC{S, G, W, R, *, F, Y}{}{}{}
3286{G, S, I, K, *, T}AGAR{G, S, I, K, *, T}{}{}{}
3387{Q, N, P, D, R, L, Y}CATH{Q, N, P, D, R, L, Y}{}{}{}
3488{K, P, R, *, L, E, H}CAAQ{K, P, R, *, L, E, H}{}{}{}
\n", + "
" + ], + "text/plain": [ + " position iupac_aa wt_codon wt_aa wt_missense \\\n", + "0 11 {S, G, P, V, E, T} GCG A {G, S, P, V, E, T} \n", + "1 12 {S, C, D, R, V, A} GGT G {S, C, D, R, V, A} \n", + "2 13 {G, N, V, A, E, Y, H} GAT D {G, N, V, A, E, Y, H} \n", + "3 14 {G, I, L, A, E} GTA V {G, I, L, A, E} \n", + "4 15 {K, V, R, L, M, T} ATA I {K, V, R, L, M, T} \n", + "5 16 {Q, N, K, R, *, E, M, T} AAG K {Q, N, R, *, E, M, T} \n", + "6 17 {S, C, D, R, V, A} GGC G {S, C, D, R, V, A} \n", + "7 18 {G, S, I, K, *, T} AGA R {G, S, I, K, *, T} \n", + "8 43 {G, I, N, C, R, T} AGT S {G, I, N, C, R, T} \n", + "9 44 {G, I, D, L, A, F} GTT V {G, I, D, L, A, F} \n", + "10 45 {Q, N, R, *, E, M, T} AAG K {Q, N, R, *, E, M, T} \n", + "11 46 {I, K, V, R, L, T} ATG M {I, K, V, R, L, T} \n", + "12 47 {Q, N, P, D, R, L, Y} CAT H {Q, N, P, D, R, L, Y} \n", + "13 48 {I, K, V, R, L, T} ATG M {I, K, V, R, L, T} \n", + "14 49 {G, N, V, A, E, Y, H} GAT D {G, N, V, A, E, Y, H} \n", + "15 50 {G, S, I, K, *, T} AGA R {G, S, I, K, *, T} \n", + "16 51 {S, N, C, D, *, F, H} TAT Y {S, N, C, D, *, F, H} \n", + "17 71 {G, N, V, A, E, Y, H} GAT D {G, N, V, A, E, Y, H} \n", + "18 72 {S, N, C, D, F, H} TAT Y {S, N, C, D, *, F, H} \n", + "19 73 {S, I, K, P, R, A} ACA T {S, I, K, P, R, A} \n", + "20 74 {Q, I, N, R, *, E, T} AAA K {Q, I, N, R, *, E, T} \n", + "21 75 {G, V, R, *, A, E} GGA G {V, R, *, A, E} \n", + "22 76 {S, N, C, D, *, F, H} TAT Y {S, N, C, D, *, F, H} \n", + "23 77 {K, V, R, L, M, T} ATA I {K, V, R, L, M, T} \n", + "24 78 {G, N, V, A, E, Y, H} GAT D {G, N, V, A, E, Y, H} \n", + "25 79 {G, I, D, L, A, F} GTC V {G, I, D, L, A, F} \n", + "26 80 {S, I, K, D, T, Y, H} AAT N {S, I, K, D, T, Y, H} \n", + "27 81 {S, N, C, D, *, F, H} TAC Y {S, N, C, D, *, F, H} \n", + "28 82 {Q, I, N, R, *, E, T} AAA K {Q, I, N, R, *, E, T} \n", + "29 83 {G, W, S, K, R, M, T} AGG R {G, W, S, K, M, T} \n", + "30 84 {I, K, V, R, L, T} ATG M {I, K, V, R, L, T} \n", + "31 85 {G, S, W, R, *, F, Y} TGT C {S, G, W, R, *, 
F, Y} \n", + "32 86 {G, S, I, K, *, T} AGA R {G, S, I, K, *, T} \n", + "33 87 {Q, N, P, D, R, L, Y} CAT H {Q, N, P, D, R, L, Y} \n", + "34 88 {K, P, R, *, L, E, H} CAA Q {K, P, R, *, L, E, H} \n", + "\n", + " diff_1 diff_2 sym_diff \n", + "0 {} {} {} \n", + "1 {} {} {} \n", + "2 {} {} {} \n", + "3 {} {} {} \n", + "4 {} {} {} \n", + "5 {} {K} {K} \n", + "6 {} {} {} \n", + "7 {} {} {} \n", + "8 {} {} {} \n", + "9 {} {} {} \n", + "10 {} {} {} \n", + "11 {} {} {} \n", + "12 {} {} {} \n", + "13 {} {} {} \n", + "14 {} {} {} \n", + "15 {} {} {} \n", + "16 {} {} {} \n", + "17 {} {} {} \n", + "18 {*} {} {*} \n", + "19 {} {} {} \n", + "20 {} {} {} \n", + "21 {} {G} {G} \n", + "22 {} {} {} \n", + "23 {} {} {} \n", + "24 {} {} {} \n", + "25 {} {} {} \n", + "26 {} {} {} \n", + "27 {} {} {} \n", + "28 {} {} {} \n", + "29 {} {R} {R} \n", + "30 {} {} {} \n", + "31 {} {} {} \n", + "32 {} {} {} \n", + "33 {} {} {} \n", + "34 {} {} {} " + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdfa6aaa-3d36-4a02-a180-b8dd9c9eea72", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e7309f8-8029-4a6d-b988-66270dbaedb8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "572ec73a-c225-42c4-94ec-40875eea7ae6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba0678b8-918d-4e09-bc90-c34edc203c3e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5b62334-4a0c-4f69-8359-a521caaa73ec", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "90016c3f-8bcc-4194-8fa4-614163ea42bd", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "'NHY'" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# there's an issue with this function in codon_table module\n", + "# test with \"GAT\" position 13\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "rev_iupac_dict = {value:key for key,value in iupac_dict.items()}\n", + "\n", + "def iupac_to_aa(iupac_codon):\n", + " \"\"\"Return string of AAs encoded by input iupac missense codon\"\"\"\n", + " aa_list = []\n", + " for i,n in enumerate(list(iupac_codon)):\n", + " if n in list('ACGT'):\n", + " continue\n", + " for new_nuc in rev_iupac_dict[n]:\n", + " new_codon = iupac_codon[:i] + new_nuc + iupac_codon[i + 1:]\n", + " aa_list.append(str(Seq(new_codon).translate()))\n", + " return ''.join(aa_list)\n", + "\n", + "iupac_to_aa('HAT')" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "14c17ab5-55c6-4157-ac3f-8bdc28acac81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'AGV'" + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iupac_to_aa('GBT')" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "3d73d64a-d41e-4007-be90-f0e116f624ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "''" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iupac_to_aa('GAA')" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "26e42017-1afb-4f9e-a4e8-8c0783fe9450", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['AAT', 'CAT', 'TAT']" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import itertools\n", + "a = [['A','C','T'],['A'],['T']]\n", + "[''.join(i) for i in 
list(itertools.product(*a))]" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "id": "13050128-098c-4fc1-9c86-7fd791465a1b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['A', 'C', 'T']" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(rev_iupac_dict['H'])" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "b11b20ac-2512-4f51-bb5a-0f68143e9c43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'NHY'" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def new_iupac_to_aa(iupac_codon):\n", + " \"\"\"Return string of AAs encoded by input iupac missense codon\"\"\"\n", + " nuc_lists = [list(rev_iupac_dict[n]) for n in iupac_codon]\n", + " codon_list = [''.join(i) for i in list(itertools.product(*nuc_lists))]\n", + " aa_list = [str(Seq(codon).translate()) for codon in codon_list]\n", + " return ''.join(aa_list)\n", + "\n", + "new_iupac_to_aa('HAT')" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "060f7628-ebc4-4dae-90e0-bcbf4ecda962", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'AGV'" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_iupac_to_aa('GBT')" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "07324e81-efe3-4565-8d27-60dec557170c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'E'" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_iupac_to_aa('GAA')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f22f43e7-0dec-443c-ad4a-bd0a4c865d8d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "95dec735-6e31-4869-8d15-14eb2c8833cf", + "metadata": {}, 
+ "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
positioniupac_aawt_codonwt_aawt_missensedifferencedifference_2contains_wt_aa
011[, T, P, S, E, G, V, ]GCGA[T, P, S, E, G, V]{A}{, S, G, P, V, E, T}True
112[, S, R, C, D, A, V, ]GGTG[S, R, C, D, A, V]{G}{, S, C, D, R, V, A}True
213[, N, H, Y, A, G, V, ]GATD[N, H, Y, A, G, V, E, E]{D}{, G, N, V, A, Y, H}True
314[, I, L, E, A, G, ]GTAV[I, L, L, E, A, G]{V}{, G, I, L, A, E}True
415[, V, L, K, T, R, ]ATAI[L, V, L, K, T, R, M]{I}{, K, V, R, L, T}True
516[, Q, E, *, T, R, M, K, N, ]AAGK[Q, E, *, T, R, M, N, N]{}{, Q, N, R, *, E, M, T}False
617[, S, R, C, D, A, V, ]GGCG[S, R, C, D, A, V]{G}{, S, C, D, R, V, A}True
718[, G, *, K, T, I, ]AGAR[G, *, K, T, I, S, S]{R}{, G, I, K, *, T}True
843[, G, C, N, T, I, ]AGTS[R, G, C, N, T, I, R, R]{S}{, G, I, N, C, T}True
944[, I, L, F, D, A, G, ]GTTV[I, L, F, D, A, G]{V}{, G, I, D, L, A, F}True
1045[, Q, E, *, T, R, M, ]AAGK[Q, E, *, T, R, M, N, N]{K}{, Q, R, *, E, M, T}True
1146[, V, L, K, T, R, ]ATGM[L, V, L, K, T, R, I, I, I]{M}{, K, V, R, L, T}True
1247[, N, D, Y, P, R, L, ]CATH[N, D, Y, P, R, L, Q, Q]{H}{, N, P, D, R, L, Y}True
1348[, V, L, K, T, R, ]ATGM[L, V, L, K, T, R, I, I, I]{M}{, K, V, R, L, T}True
1449[, N, H, Y, A, G, V, ]GATD[N, H, Y, A, G, V, E, E]{D}{, G, N, V, A, Y, H}True
1550[, G, *, K, T, I, ]AGAR[G, *, K, T, I, S, S]{R}{, G, I, K, *, T}True
1651[, N, H, D, S, C, F, ]TATY[N, H, D, S, C, F, *, *]{Y}{, S, N, C, D, F, H}True
1771[, N, H, Y, A, G, V, ]GATD[N, H, Y, A, G, V, E, E]{D}{, G, N, V, A, Y, H}True
1872[, N, H, D, S, C, F, ]TATY[N, H, D, S, C, F, *, *]{Y}{, S, N, C, D, F, H}True
1973[, P, A, S, K, R, I, ]ACAT[P, A, S, K, R, I]{T}{, S, I, K, P, R, A}True
2074[, Q, E, *, T, R, I, ]AAAK[Q, E, *, T, R, I, N, N]{K}{, Q, I, R, *, E, T}True
2175[, R, *, E, A, V, ]GGAG[R, R, *, E, A, V]{G}{, V, R, *, A, E}True
2276[, N, H, D, S, C, F, ]TATY[N, H, D, S, C, F, *, *]{Y}{, S, N, C, D, F, H}True
2377[, V, L, K, T, R, ]ATAI[L, V, L, K, T, R, M]{I}{, K, V, R, L, T}True
2478[, N, H, Y, A, G, V, ]GATD[N, H, Y, A, G, V, E, E]{D}{, G, N, V, A, Y, H}True
2579[, I, L, F, D, A, G, ]GTCV[I, L, F, D, A, G]{V}{, G, I, D, L, A, F}True
2680[, H, D, Y, T, S, I, ]AATN[H, D, Y, T, S, I, K, K]{N}{, S, I, D, T, Y, H}True
2781[, N, H, D, S, C, F, ]TACY[N, H, D, S, C, F, *, *]{Y}{, S, N, C, D, F, H}True
2882[, Q, E, *, T, R, I, ]AAAK[Q, E, *, T, R, I, N, N]{K}{, Q, I, R, *, E, T}True
2983[, G, W, K, T, M, R, S, ]AGGR[G, W, K, T, M, S, S]{}{, G, W, S, K, M, T}False
3084[, V, L, K, T, R, ]ATGM[L, V, L, K, T, R, I, I, I]{M}{, K, V, R, L, T}True
3185[, R, G, Y, S, F, *, W, ]TGTC[S, R, G, Y, S, F, *, W]{C}{, G, S, W, R, *, F, Y}True
3286[, G, *, K, T, I, ]AGAR[G, *, K, T, I, S, S]{R}{, G, I, K, *, T}True
3387[, N, D, Y, P, R, L, ]CATH[N, D, Y, P, R, L, Q, Q]{H}{, N, P, D, R, L, Y}True
3488[, K, E, *, P, R, L, ]CAAQ[K, E, *, P, R, L, H, H]{Q}{, K, P, R, *, L, E}True
\n", + "
" + ], + "text/plain": [ + " position iupac_aa wt_codon wt_aa \\\n", + "0 11 [, T, P, S, E, G, V, ] GCG A \n", + "1 12 [, S, R, C, D, A, V, ] GGT G \n", + "2 13 [, N, H, Y, A, G, V, ] GAT D \n", + "3 14 [, I, L, E, A, G, ] GTA V \n", + "4 15 [, V, L, K, T, R, ] ATA I \n", + "5 16 [, Q, E, *, T, R, M, K, N, ] AAG K \n", + "6 17 [, S, R, C, D, A, V, ] GGC G \n", + "7 18 [, G, *, K, T, I, ] AGA R \n", + "8 43 [, G, C, N, T, I, ] AGT S \n", + "9 44 [, I, L, F, D, A, G, ] GTT V \n", + "10 45 [, Q, E, *, T, R, M, ] AAG K \n", + "11 46 [, V, L, K, T, R, ] ATG M \n", + "12 47 [, N, D, Y, P, R, L, ] CAT H \n", + "13 48 [, V, L, K, T, R, ] ATG M \n", + "14 49 [, N, H, Y, A, G, V, ] GAT D \n", + "15 50 [, G, *, K, T, I, ] AGA R \n", + "16 51 [, N, H, D, S, C, F, ] TAT Y \n", + "17 71 [, N, H, Y, A, G, V, ] GAT D \n", + "18 72 [, N, H, D, S, C, F, ] TAT Y \n", + "19 73 [, P, A, S, K, R, I, ] ACA T \n", + "20 74 [, Q, E, *, T, R, I, ] AAA K \n", + "21 75 [, R, *, E, A, V, ] GGA G \n", + "22 76 [, N, H, D, S, C, F, ] TAT Y \n", + "23 77 [, V, L, K, T, R, ] ATA I \n", + "24 78 [, N, H, Y, A, G, V, ] GAT D \n", + "25 79 [, I, L, F, D, A, G, ] GTC V \n", + "26 80 [, H, D, Y, T, S, I, ] AAT N \n", + "27 81 [, N, H, D, S, C, F, ] TAC Y \n", + "28 82 [, Q, E, *, T, R, I, ] AAA K \n", + "29 83 [, G, W, K, T, M, R, S, ] AGG R \n", + "30 84 [, V, L, K, T, R, ] ATG M \n", + "31 85 [, R, G, Y, S, F, *, W, ] TGT C \n", + "32 86 [, G, *, K, T, I, ] AGA R \n", + "33 87 [, N, D, Y, P, R, L, ] CAT H \n", + "34 88 [, K, E, *, P, R, L, ] CAA Q \n", + "\n", + " wt_missense difference difference_2 \\\n", + "0 [T, P, S, E, G, V] {A} {, S, G, P, V, E, T} \n", + "1 [S, R, C, D, A, V] {G} {, S, C, D, R, V, A} \n", + "2 [N, H, Y, A, G, V, E, E] {D} {, G, N, V, A, Y, H} \n", + "3 [I, L, L, E, A, G] {V} {, G, I, L, A, E} \n", + "4 [L, V, L, K, T, R, M] {I} {, K, V, R, L, T} \n", + "5 [Q, E, *, T, R, M, N, N] {} {, Q, N, R, *, E, M, T} \n", + "6 [S, R, C, D, A, V] {G} {, S, C, D, R, V, A} \n", + "7 [G, *, 
K, T, I, S, S] {R} {, G, I, K, *, T} \n", + "8 [R, G, C, N, T, I, R, R] {S} {, G, I, N, C, T} \n", + "9 [I, L, F, D, A, G] {V} {, G, I, D, L, A, F} \n", + "10 [Q, E, *, T, R, M, N, N] {K} {, Q, R, *, E, M, T} \n", + "11 [L, V, L, K, T, R, I, I, I] {M} {, K, V, R, L, T} \n", + "12 [N, D, Y, P, R, L, Q, Q] {H} {, N, P, D, R, L, Y} \n", + "13 [L, V, L, K, T, R, I, I, I] {M} {, K, V, R, L, T} \n", + "14 [N, H, Y, A, G, V, E, E] {D} {, G, N, V, A, Y, H} \n", + "15 [G, *, K, T, I, S, S] {R} {, G, I, K, *, T} \n", + "16 [N, H, D, S, C, F, *, *] {Y} {, S, N, C, D, F, H} \n", + "17 [N, H, Y, A, G, V, E, E] {D} {, G, N, V, A, Y, H} \n", + "18 [N, H, D, S, C, F, *, *] {Y} {, S, N, C, D, F, H} \n", + "19 [P, A, S, K, R, I] {T} {, S, I, K, P, R, A} \n", + "20 [Q, E, *, T, R, I, N, N] {K} {, Q, I, R, *, E, T} \n", + "21 [R, R, *, E, A, V] {G} {, V, R, *, A, E} \n", + "22 [N, H, D, S, C, F, *, *] {Y} {, S, N, C, D, F, H} \n", + "23 [L, V, L, K, T, R, M] {I} {, K, V, R, L, T} \n", + "24 [N, H, Y, A, G, V, E, E] {D} {, G, N, V, A, Y, H} \n", + "25 [I, L, F, D, A, G] {V} {, G, I, D, L, A, F} \n", + "26 [H, D, Y, T, S, I, K, K] {N} {, S, I, D, T, Y, H} \n", + "27 [N, H, D, S, C, F, *, *] {Y} {, S, N, C, D, F, H} \n", + "28 [Q, E, *, T, R, I, N, N] {K} {, Q, I, R, *, E, T} \n", + "29 [G, W, K, T, M, S, S] {} {, G, W, S, K, M, T} \n", + "30 [L, V, L, K, T, R, I, I, I] {M} {, K, V, R, L, T} \n", + "31 [S, R, G, Y, S, F, *, W] {C} {, G, S, W, R, *, F, Y} \n", + "32 [G, *, K, T, I, S, S] {R} {, G, I, K, *, T} \n", + "33 [N, D, Y, P, R, L, Q, Q] {H} {, N, P, D, R, L, Y} \n", + "34 [K, E, *, P, R, L, H, H] {Q} {, K, P, R, *, L, E} \n", + "\n", + " contains_wt_aa \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True \n", + "5 False \n", + "6 True \n", + "7 True \n", + "8 True \n", + "9 True \n", + "10 True \n", + "11 True \n", + "12 True \n", + "13 True \n", + "14 True \n", + "15 True \n", + "16 True \n", + "17 True \n", + "18 True \n", + "19 True \n", + "20 True \n", + 
"21 True \n", + "22 True \n", + "23 True \n", + "24 True \n", + "25 True \n", + "26 True \n", + "27 True \n", + "28 True \n", + "29 False \n", + "30 True \n", + "31 True \n", + "32 True \n", + "33 True \n", + "34 True " + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1['difference_2'] = (df1.iupac_aa.apply(set) - df1.wt_aa.apply(set))\n", + "df1['contains_wt_aa'] = df1.apply(lambda x: x['wt_aa'] in x['difference'], axis=1)\n", + "\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "db215cdf-78f0-4956-809b-6635a9f196f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a', 't'}" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = {'', 'a', 't'}\n", + "a.remove('')\n", + "a\n" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "efdf424f-fee1-4bc7-8323-97e0b6908677", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 {E, G, S, T, P, V}\n", + "1 {S, C, V, D, R, A}\n", + "2 {G, N, V, Y, H, A}\n", + "3 {E, G, I, L, A}\n", + "4 {K, T, V, R, L}\n", + "5 {Q, N, K, R, *, E, M, T}\n", + "6 {S, C, V, D, R, A}\n", + "7 {G, I, K, T, *}\n", + "8 {G, I, N, T, C}\n", + "9 {G, F, I, D, L, A}\n", + "10 {E, Q, M, T, R, *}\n", + "11 {K, T, V, R, L}\n", + "12 {N, P, D, R, L, Y}\n", + "13 {K, T, V, R, L}\n", + "14 {G, N, V, Y, H, A}\n", + "15 {G, I, K, T, *}\n", + "16 {S, F, N, C, D, H}\n", + "17 {G, N, V, Y, H, A}\n", + "18 {S, F, N, C, D, H}\n", + "19 {S, I, K, P, R, A}\n", + "20 {E, Q, I, T, R, *}\n", + "21 {E, V, R, *, A}\n", + "22 {S, F, N, C, D, H}\n", + "23 {K, T, V, R, L}\n", + "24 {G, N, V, Y, H, A}\n", + "25 {G, F, I, D, L, A}\n", + "26 {S, I, T, D, H, Y}\n", + "27 {S, F, N, C, D, H}\n", + "28 {E, Q, I, T, R, *}\n", + "29 {G, W, S, M, K, T, R}\n", + "30 {K, T, V, R, L}\n", + "31 {G, S, W, F, Y, R, *}\n", + "32 {G, I, K, T, *}\n", + "33 {N, P, D, R, L, 
Y}\n", + "34 {E, K, P, R, *, L}\n", + "Name: iupac_aa, dtype: object" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "7bd0a9bc-bd29-4b71-b919-11a0271aeb5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + "5 False\n", + "6 False\n", + "7 False\n", + "8 False\n", + "9 False\n", + "10 False\n", + "11 False\n", + "12 False\n", + "13 False\n", + "14 False\n", + "15 False\n", + "16 False\n", + "17 False\n", + "18 False\n", + "19 False\n", + "20 False\n", + "21 False\n", + "22 False\n", + "23 False\n", + "24 False\n", + "25 False\n", + "26 False\n", + "27 False\n", + "28 False\n", + "29 False\n", + "30 False\n", + "31 False\n", + "32 False\n", + "33 False\n", + "34 False\n", + "dtype: bool" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.difference == df1.difference_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb14959-7901-4500-bb11-04c59c5bda3b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d46a8bc9-81a9-492f-9b41-5e22627b3f5f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2bd569a4-ee28-4f39-8c17-664c35d18313", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Seq('XGDVIK')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check that synonymous variants look correct\n", + "Seq('HCGggwgaygtwatwaar').translate()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9fe36459-2a34-4186-a977-87a7e15c914b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
codonaapositionmissense_nucmissense_codonsmissense_aamissense_iupacmissense_iupac_codonsele_codonssele_aa...syn_boolsyn_codonssyn_aasyn_iupac_codonno_stop_codonsno_stop_aano_stop_iupac_codonno_stop_syn_codonsno_stop_syn_aano_stop_syn_iupac_codon
0AAAK0CGTCAA GAA TAA*QEBBAACAA GAA TAAQE*...FalseBAACAA GAAQESAACAA GAAQESAA
1AAAK1CGTACA AGA ATATRIBABAACA AGA ATATRI...FalseABAACA AGA ATATRIABAACA AGA ATATRIABA
2AAAK2CTAAC AATNYAAYAATN...TrueAAG AATKNAAKAATNAATAAG AATKNAAK
3AACN0CGTCAC GAC TACHDYBBACCAC GAC TACHDY...FalseBACCAC GAC TACHDYBACCAC GAC TACHDYBAC
4AACN1CGTACC AGC ATCTSIBABCACC AGC ATCTSI...FalseABCACC AGC ATCTSIABCACC AGC ATCTSIABC
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " codon aa position missense_nuc missense_codons missense_aa missense_iupac \\\n", + "0 AAA K 0 CGT CAA GAA TAA *QE B \n", + "1 AAA K 1 CGT ACA AGA ATA TRI B \n", + "2 AAA K 2 CT AAC AAT N Y \n", + "3 AAC N 0 CGT CAC GAC TAC HDY B \n", + "4 AAC N 1 CGT ACC AGC ATC TSI B \n", + "\n", + " missense_iupac_codon sele_codons sele_aa ... syn_bool syn_codons syn_aa \\\n", + "0 BAA CAA GAA TAA QE* ... False \n", + "1 ABA ACA AGA ATA TRI ... False \n", + "2 AAY AAT N ... True AAG AAT KN \n", + "3 BAC CAC GAC TAC HDY ... False \n", + "4 ABC ACC AGC ATC TSI ... False \n", + "\n", + " syn_iupac_codon no_stop_codons no_stop_aa no_stop_iupac_codon \\\n", + "0 BAA CAA GAA QE SAA \n", + "1 ABA ACA AGA ATA TRI ABA \n", + "2 AAK AAT N AAT \n", + "3 BAC CAC GAC TAC HDY BAC \n", + "4 ABC ACC AGC ATC TSI ABC \n", + "\n", + " no_stop_syn_codons no_stop_syn_aa no_stop_syn_iupac_codon \n", + "0 CAA GAA QE SAA \n", + "1 ACA AGA ATA TRI ABA \n", + "2 AAG AAT KN AAK \n", + "3 CAC GAC TAC HDY BAC \n", + "4 ACC AGC ATC TSI ABC \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# add the aa info to the iupacs\n", + "df=pd.read_csv('data/final_codon_table.csv')\n", + "df.fillna('', inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "a2e76ec8-0b3e-41d8-b96d-4fb11b11c9e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'QE*'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# function that translates iupac codon into all AAs\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "rev_iupac_dict = {value:key for key,value in iupac_dict.items()}\n", + "\n", + "\n", + "def iupac_to_aa(iupac_codon):\n", + " \"\"\"Return string of AAs 
encoded by input iupac missense codon\"\"\"\n", + " aa_list = []\n", + " for i,n in enumerate(list(codon)):\n", + " if n in list('ACGT'):\n", + " continue\n", + " for new_nuc in rev_iupac_dict[n]:\n", + " new_codon = codon[:i] + new_nuc + codon[i + 1:]\n", + " aa_list.append(str(Seq(new_codon).translate()))\n", + " return ''.join(aa_list)\n", + "\n", + "iupac_to_aa('BAA')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "28392453-7e13-4a09-bd4b-6dc845c6d6ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Q', 'E', '*']" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aas" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4fec7487-869f-4b43-a76b-bb5affab59a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AAA': {'iupac': ['BAA', 'ABA', 'AAT']},\n", + " 'AAC': {'iupac': ['BAC', 'ABC', 'AAA']},\n", + " 'AAG': {'iupac': ['BAG', 'ABG', 'AAT']},\n", + " 'AAT': {'iupac': ['BAT', 'ABT', 'AAA']},\n", + " 'ACA': {'iupac': ['BCA', 'ADA']},\n", + " 'ACC': {'iupac': ['BCC', 'AWC']},\n", + " 'ACG': {'iupac': ['BCG', 'ADG']},\n", + " 'ACT': {'iupac': ['BCT', 'AWT']},\n", + " 'AGA': {'iupac': ['KGA', 'AHA', 'AGT']},\n", + " 'AGC': {'iupac': ['KGC', 'AHC', 'AGA']},\n", + " 'AGG': {'iupac': ['KGG', 'AHG', 'AGT']},\n", + " 'AGT': {'iupac': ['KGT', 'AHT', 'AGA']},\n", + " 'ATA': {'iupac': ['KTA', 'AVA', 'ATG']},\n", + " 'ATC': {'iupac': ['BTC', 'AVC', 'ATG']},\n", + " 'ATG': {'iupac': ['KTG', 'AVG', 'ATT']},\n", + " 'ATT': {'iupac': ['BTT', 'AVT', 'ATG']},\n", + " 'CAA': {'iupac': ['DAA', 'CBA', 'CAT']},\n", + " 'CAC': {'iupac': ['DAC', 'CBC', 'CAA']},\n", + " 'CAG': {'iupac': ['DAG', 'CBG', 'CAT']},\n", + " 'CAT': {'iupac': ['DAT', 'CBT', 'CAA']},\n", + " 'CCA': {'iupac': ['DCA', 'CDA']},\n", + " 'CCC': {'iupac': ['DCC', 'CDC']},\n", + " 'CCG': {'iupac': ['DCG', 'CDG']},\n", + " 'CCT': {'iupac': ['DCT', 'CDT']},\n", + " 'CGA': 
{'iupac': ['KGA', 'CHA']},\n", + " 'CGC': {'iupac': ['DGC', 'CHC']},\n", + " 'CGG': {'iupac': ['KGG', 'CHG']},\n", + " 'CGT': {'iupac': ['DGT', 'CHT']},\n", + " 'CTA': {'iupac': ['RTA', 'CVA']},\n", + " 'CTC': {'iupac': ['DTC', 'CVC']},\n", + " 'CTG': {'iupac': ['RTG', 'CVG']},\n", + " 'CTT': {'iupac': ['DTT', 'CVT']},\n", + " 'GAA': {'iupac': ['HAA', 'GBA', 'GAT']},\n", + " 'GAC': {'iupac': ['HAC', 'GBC', 'GAA']},\n", + " 'GAG': {'iupac': ['HAG', 'GBG', 'GAT']},\n", + " 'GAT': {'iupac': ['HAT', 'GBT', 'GAA']},\n", + " 'GCA': {'iupac': ['HCA', 'GDA']},\n", + " 'GCC': {'iupac': ['HCC', 'GDC']},\n", + " 'GCG': {'iupac': ['HCG', 'GDG']},\n", + " 'GCT': {'iupac': ['HCT', 'GDT']},\n", + " 'GGA': {'iupac': ['WGA', 'GHA']},\n", + " 'GGC': {'iupac': ['HGC', 'GHC']},\n", + " 'GGG': {'iupac': ['WGG', 'GHG']},\n", + " 'GGT': {'iupac': ['HGT', 'GHT']},\n", + " 'GTA': {'iupac': ['WTA', 'GVA']},\n", + " 'GTC': {'iupac': ['HTC', 'GVC']},\n", + " 'GTG': {'iupac': ['WTG', 'GVG']},\n", + " 'GTT': {'iupac': ['HTT', 'GVT']},\n", + " 'TAA': {'iupac': ['VAA', 'TYA', 'TAT']},\n", + " 'TAC': {'iupac': ['VAC', 'TBC', 'TAA']},\n", + " 'TAG': {'iupac': ['VAG', 'TBG', 'TAT']},\n", + " 'TAT': {'iupac': ['VAT', 'TBT', 'TAA']},\n", + " 'TCA': {'iupac': ['VCA', 'TWA']},\n", + " 'TCC': {'iupac': ['VCC', 'TDC']},\n", + " 'TCG': {'iupac': ['VCG', 'TDG']},\n", + " 'TCT': {'iupac': ['VCT', 'TDT']},\n", + " 'TGA': {'iupac': ['RGA', 'TYA', 'TGK']},\n", + " 'TGC': {'iupac': ['SGC', 'THC', 'TGR']},\n", + " 'TGG': {'iupac': ['RGG', 'THG', 'TGM']},\n", + " 'TGT': {'iupac': ['SGT', 'THT', 'TGR']},\n", + " 'TTA': {'iupac': ['RTA', 'TMA', 'TTT']},\n", + " 'TTC': {'iupac': ['RTC', 'TVC', 'TTG']},\n", + " 'TTG': {'iupac': ['RTG', 'TVG', 'TTT']},\n", + " 'TTT': {'iupac': ['RTT', 'TVT', 'TTG']}}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"21eb3a9a-231e-4ce3-b4ca-c364a68dc07d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3895e8f2-99c3-4632-a520-51ea78390360", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc9eed9-3ebf-4fc2-8213-c811a63816dd", + "metadata": {}, + "outputs": [], + "source": [ + "for key,value in temp_dict.items():\n", + " temp_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value]))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "a355cbbc-595d-4fc1-b5a1-7766c1045840", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AAA': {0: {'sele_iupac_codon': 'BAA', 'sele_aa': 'QE*'},\n", + " 1: {'sele_iupac_codon': 'ABA', 'sele_aa': 'TRI'},\n", + " 2: {'sele_iupac_codon': 'AAT', 'sele_aa': 'N'}},\n", + " 'AAC': {3: {'sele_iupac_codon': 'BAC', 'sele_aa': 'HDY'},\n", + " 4: {'sele_iupac_codon': 'ABC', 'sele_aa': 'TSI'},\n", + " 5: {'sele_iupac_codon': 'AAA', 'sele_aa': 'K'}},\n", + " 'AAG': {6: {'sele_iupac_codon': 'BAG', 'sele_aa': 'QE*'},\n", + " 7: {'sele_iupac_codon': 'ABG', 'sele_aa': 'TRM'},\n", + " 8: {'sele_iupac_codon': 'AAT', 'sele_aa': 'N'}},\n", + " 'AAT': {9: {'sele_iupac_codon': 'BAT', 'sele_aa': 'HDY'},\n", + " 10: {'sele_iupac_codon': 'ABT', 'sele_aa': 'TSI'},\n", + " 11: {'sele_iupac_codon': 'AAA', 'sele_aa': 'K'}},\n", + " 'ACA': {12: {'sele_iupac_codon': 'BCA', 'sele_aa': 'PAS'},\n", + " 13: {'sele_iupac_codon': 'ADA', 'sele_aa': 'KRI'},\n", + " 14: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'ACC': {15: {'sele_iupac_codon': 'BCC', 'sele_aa': 'PAS'},\n", + " 16: {'sele_iupac_codon': 'AWC', 'sele_aa': 'NI'},\n", + " 17: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'ACG': {18: {'sele_iupac_codon': 'BCG', 'sele_aa': 'PAS'},\n", + " 19: {'sele_iupac_codon': 'ADG', 'sele_aa': 'KRM'},\n", + " 20: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 
'ACT': {21: {'sele_iupac_codon': 'BCT', 'sele_aa': 'PAS'},\n", + " 22: {'sele_iupac_codon': 'AWT', 'sele_aa': 'NI'},\n", + " 23: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'AGA': {24: {'sele_iupac_codon': 'KGA', 'sele_aa': 'G*'},\n", + " 25: {'sele_iupac_codon': 'AHA', 'sele_aa': 'KTI'},\n", + " 26: {'sele_iupac_codon': 'AGT', 'sele_aa': 'S'}},\n", + " 'AGC': {27: {'sele_iupac_codon': 'KGC', 'sele_aa': 'GC'},\n", + " 28: {'sele_iupac_codon': 'AHC', 'sele_aa': 'NTI'},\n", + " 29: {'sele_iupac_codon': 'AGA', 'sele_aa': 'R'}},\n", + " 'AGG': {30: {'sele_iupac_codon': 'KGG', 'sele_aa': 'GW'},\n", + " 31: {'sele_iupac_codon': 'AHG', 'sele_aa': 'KTM'},\n", + " 32: {'sele_iupac_codon': 'AGT', 'sele_aa': 'S'}},\n", + " 'AGT': {33: {'sele_iupac_codon': 'KGT', 'sele_aa': 'GC'},\n", + " 34: {'sele_iupac_codon': 'AHT', 'sele_aa': 'NTI'},\n", + " 35: {'sele_iupac_codon': 'AGA', 'sele_aa': 'R'}},\n", + " 'ATA': {36: {'sele_iupac_codon': 'KTA', 'sele_aa': 'LV'},\n", + " 37: {'sele_iupac_codon': 'AVA', 'sele_aa': 'KTR'},\n", + " 38: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'ATC': {39: {'sele_iupac_codon': 'BTC', 'sele_aa': 'LVF'},\n", + " 40: {'sele_iupac_codon': 'AVC', 'sele_aa': 'NTS'},\n", + " 41: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'ATG': {42: {'sele_iupac_codon': 'KTG', 'sele_aa': 'LV'},\n", + " 43: {'sele_iupac_codon': 'AVG', 'sele_aa': 'KTR'},\n", + " 44: {'sele_iupac_codon': 'ATT', 'sele_aa': 'I'}},\n", + " 'ATT': {45: {'sele_iupac_codon': 'BTT', 'sele_aa': 'LVF'},\n", + " 46: {'sele_iupac_codon': 'AVT', 'sele_aa': 'NTS'},\n", + " 47: {'sele_iupac_codon': 'ATG', 'sele_aa': 'M'}},\n", + " 'CAA': {48: {'sele_iupac_codon': 'DAA', 'sele_aa': 'KE*'},\n", + " 49: {'sele_iupac_codon': 'CBA', 'sele_aa': 'PRL'},\n", + " 50: {'sele_iupac_codon': 'CAT', 'sele_aa': 'H'}},\n", + " 'CAC': {51: {'sele_iupac_codon': 'DAC', 'sele_aa': 'NDY'},\n", + " 52: {'sele_iupac_codon': 'CBC', 'sele_aa': 'PRL'},\n", + " 53: {'sele_iupac_codon': 'CAA', 'sele_aa': 
'Q'}},\n", + " 'CAG': {54: {'sele_iupac_codon': 'DAG', 'sele_aa': 'KE*'},\n", + " 55: {'sele_iupac_codon': 'CBG', 'sele_aa': 'PRL'},\n", + " 56: {'sele_iupac_codon': 'CAT', 'sele_aa': 'H'}},\n", + " 'CAT': {57: {'sele_iupac_codon': 'DAT', 'sele_aa': 'NDY'},\n", + " 58: {'sele_iupac_codon': 'CBT', 'sele_aa': 'PRL'},\n", + " 59: {'sele_iupac_codon': 'CAA', 'sele_aa': 'Q'}},\n", + " 'CCA': {60: {'sele_iupac_codon': 'DCA', 'sele_aa': 'TAS'},\n", + " 61: {'sele_iupac_codon': 'CDA', 'sele_aa': 'QRL'},\n", + " 62: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCC': {63: {'sele_iupac_codon': 'DCC', 'sele_aa': 'TAS'},\n", + " 64: {'sele_iupac_codon': 'CDC', 'sele_aa': 'HRL'},\n", + " 65: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCG': {66: {'sele_iupac_codon': 'DCG', 'sele_aa': 'TAS'},\n", + " 67: {'sele_iupac_codon': 'CDG', 'sele_aa': 'QRL'},\n", + " 68: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CCT': {69: {'sele_iupac_codon': 'DCT', 'sele_aa': 'TAS'},\n", + " 70: {'sele_iupac_codon': 'CDT', 'sele_aa': 'HRL'},\n", + " 71: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGA': {72: {'sele_iupac_codon': 'KGA', 'sele_aa': 'G*'},\n", + " 73: {'sele_iupac_codon': 'CHA', 'sele_aa': 'QPL'},\n", + " 74: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGC': {75: {'sele_iupac_codon': 'DGC', 'sele_aa': 'SGC'},\n", + " 76: {'sele_iupac_codon': 'CHC', 'sele_aa': 'HPL'},\n", + " 77: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGG': {78: {'sele_iupac_codon': 'KGG', 'sele_aa': 'GW'},\n", + " 79: {'sele_iupac_codon': 'CHG', 'sele_aa': 'QPL'},\n", + " 80: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CGT': {81: {'sele_iupac_codon': 'DGT', 'sele_aa': 'SGC'},\n", + " 82: {'sele_iupac_codon': 'CHT', 'sele_aa': 'HPL'},\n", + " 83: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTA': {84: {'sele_iupac_codon': 'RTA', 'sele_aa': 'IV'},\n", + " 85: {'sele_iupac_codon': 'CVA', 'sele_aa': 'QPR'},\n", + " 86: {'sele_iupac_codon': nan, 
'sele_aa': nan}},\n", + " 'CTC': {87: {'sele_iupac_codon': 'DTC', 'sele_aa': 'IVF'},\n", + " 88: {'sele_iupac_codon': 'CVC', 'sele_aa': 'HPR'},\n", + " 89: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTG': {90: {'sele_iupac_codon': 'RTG', 'sele_aa': 'MV'},\n", + " 91: {'sele_iupac_codon': 'CVG', 'sele_aa': 'QPR'},\n", + " 92: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'CTT': {93: {'sele_iupac_codon': 'DTT', 'sele_aa': 'IVF'},\n", + " 94: {'sele_iupac_codon': 'CVT', 'sele_aa': 'HPR'},\n", + " 95: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GAA': {96: {'sele_iupac_codon': 'HAA', 'sele_aa': 'KQ*'},\n", + " 97: {'sele_iupac_codon': 'GBA', 'sele_aa': 'AGV'},\n", + " 98: {'sele_iupac_codon': 'GAT', 'sele_aa': 'D'}},\n", + " 'GAC': {99: {'sele_iupac_codon': 'HAC', 'sele_aa': 'NHY'},\n", + " 100: {'sele_iupac_codon': 'GBC', 'sele_aa': 'AGV'},\n", + " 101: {'sele_iupac_codon': 'GAA', 'sele_aa': 'E'}},\n", + " 'GAG': {102: {'sele_iupac_codon': 'HAG', 'sele_aa': 'KQ*'},\n", + " 103: {'sele_iupac_codon': 'GBG', 'sele_aa': 'AGV'},\n", + " 104: {'sele_iupac_codon': 'GAT', 'sele_aa': 'D'}},\n", + " 'GAT': {105: {'sele_iupac_codon': 'HAT', 'sele_aa': 'NHY'},\n", + " 106: {'sele_iupac_codon': 'GBT', 'sele_aa': 'AGV'},\n", + " 107: {'sele_iupac_codon': 'GAA', 'sele_aa': 'E'}},\n", + " 'GCA': {108: {'sele_iupac_codon': 'HCA', 'sele_aa': 'TPS'},\n", + " 109: {'sele_iupac_codon': 'GDA', 'sele_aa': 'EGV'},\n", + " 110: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCC': {111: {'sele_iupac_codon': 'HCC', 'sele_aa': 'TPS'},\n", + " 112: {'sele_iupac_codon': 'GDC', 'sele_aa': 'DGV'},\n", + " 113: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCG': {114: {'sele_iupac_codon': 'HCG', 'sele_aa': 'TPS'},\n", + " 115: {'sele_iupac_codon': 'GDG', 'sele_aa': 'EGV'},\n", + " 116: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GCT': {117: {'sele_iupac_codon': 'HCT', 'sele_aa': 'TPS'},\n", + " 118: {'sele_iupac_codon': 'GDT', 'sele_aa': 'DGV'},\n", + " 
119: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGA': {120: {'sele_iupac_codon': 'WGA', 'sele_aa': 'R*'},\n", + " 121: {'sele_iupac_codon': 'GHA', 'sele_aa': 'EAV'},\n", + " 122: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGC': {123: {'sele_iupac_codon': 'HGC', 'sele_aa': 'SRC'},\n", + " 124: {'sele_iupac_codon': 'GHC', 'sele_aa': 'DAV'},\n", + " 125: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGG': {126: {'sele_iupac_codon': 'WGG', 'sele_aa': 'RW'},\n", + " 127: {'sele_iupac_codon': 'GHG', 'sele_aa': 'EAV'},\n", + " 128: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GGT': {129: {'sele_iupac_codon': 'HGT', 'sele_aa': 'SRC'},\n", + " 130: {'sele_iupac_codon': 'GHT', 'sele_aa': 'DAV'},\n", + " 131: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTA': {132: {'sele_iupac_codon': 'WTA', 'sele_aa': 'IL'},\n", + " 133: {'sele_iupac_codon': 'GVA', 'sele_aa': 'EAG'},\n", + " 134: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTC': {135: {'sele_iupac_codon': 'HTC', 'sele_aa': 'ILF'},\n", + " 136: {'sele_iupac_codon': 'GVC', 'sele_aa': 'DAG'},\n", + " 137: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTG': {138: {'sele_iupac_codon': 'WTG', 'sele_aa': 'ML'},\n", + " 139: {'sele_iupac_codon': 'GVG', 'sele_aa': 'EAG'},\n", + " 140: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'GTT': {141: {'sele_iupac_codon': 'HTT', 'sele_aa': 'ILF'},\n", + " 142: {'sele_iupac_codon': 'GVT', 'sele_aa': 'DAG'},\n", + " 143: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TAA': {144: {'sele_iupac_codon': 'VAA', 'sele_aa': 'KQE'},\n", + " 145: {'sele_iupac_codon': 'TYA', 'sele_aa': 'SL'},\n", + " 146: {'sele_iupac_codon': 'TAT', 'sele_aa': 'Y'}},\n", + " 'TAC': {147: {'sele_iupac_codon': 'VAC', 'sele_aa': 'NHD'},\n", + " 148: {'sele_iupac_codon': 'TBC', 'sele_aa': 'SCF'},\n", + " 149: {'sele_iupac_codon': 'TAA', 'sele_aa': '*'}},\n", + " 'TAG': {150: {'sele_iupac_codon': 'VAG', 'sele_aa': 'KQE'},\n", + " 151: {'sele_iupac_codon': 
'TBG', 'sele_aa': 'SWL'},\n", + " 152: {'sele_iupac_codon': 'TAT', 'sele_aa': 'Y'}},\n", + " 'TAT': {153: {'sele_iupac_codon': 'VAT', 'sele_aa': 'NHD'},\n", + " 154: {'sele_iupac_codon': 'TBT', 'sele_aa': 'SCF'},\n", + " 155: {'sele_iupac_codon': 'TAA', 'sele_aa': '*'}},\n", + " 'TCA': {156: {'sele_iupac_codon': 'VCA', 'sele_aa': 'TPA'},\n", + " 157: {'sele_iupac_codon': 'TWA', 'sele_aa': '*L'},\n", + " 158: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCC': {159: {'sele_iupac_codon': 'VCC', 'sele_aa': 'TPA'},\n", + " 160: {'sele_iupac_codon': 'TDC', 'sele_aa': 'YCF'},\n", + " 161: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCG': {162: {'sele_iupac_codon': 'VCG', 'sele_aa': 'TPA'},\n", + " 163: {'sele_iupac_codon': 'TDG', 'sele_aa': '*WL'},\n", + " 164: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TCT': {165: {'sele_iupac_codon': 'VCT', 'sele_aa': 'TPA'},\n", + " 166: {'sele_iupac_codon': 'TDT', 'sele_aa': 'YCF'},\n", + " 167: {'sele_iupac_codon': nan, 'sele_aa': nan}},\n", + " 'TGA': {168: {'sele_iupac_codon': 'RGA', 'sele_aa': 'RG'},\n", + " 169: {'sele_iupac_codon': 'TYA', 'sele_aa': 'SL'},\n", + " 170: {'sele_iupac_codon': 'TGK', 'sele_aa': 'CW'}},\n", + " 'TGC': {171: {'sele_iupac_codon': 'SGC', 'sele_aa': 'RG'},\n", + " 172: {'sele_iupac_codon': 'THC', 'sele_aa': 'YSF'},\n", + " 173: {'sele_iupac_codon': 'TGR', 'sele_aa': '*W'}},\n", + " 'TGG': {174: {'sele_iupac_codon': 'RGG', 'sele_aa': 'RG'},\n", + " 175: {'sele_iupac_codon': 'THG', 'sele_aa': '*SL'},\n", + " 176: {'sele_iupac_codon': 'TGM', 'sele_aa': '*C'}},\n", + " 'TGT': {177: {'sele_iupac_codon': 'SGT', 'sele_aa': 'RG'},\n", + " 178: {'sele_iupac_codon': 'THT', 'sele_aa': 'YSF'},\n", + " 179: {'sele_iupac_codon': 'TGR', 'sele_aa': '*W'}},\n", + " 'TTA': {180: {'sele_iupac_codon': 'RTA', 'sele_aa': 'IV'},\n", + " 181: {'sele_iupac_codon': 'TMA', 'sele_aa': '*S'},\n", + " 182: {'sele_iupac_codon': 'TTT', 'sele_aa': 'F'}},\n", + " 'TTC': {183: {'sele_iupac_codon': 'RTC', 
'sele_aa': 'IV'},\n", + " 184: {'sele_iupac_codon': 'TVC', 'sele_aa': 'YSC'},\n", + " 185: {'sele_iupac_codon': 'TTG', 'sele_aa': 'L'}},\n", + " 'TTG': {186: {'sele_iupac_codon': 'RTG', 'sele_aa': 'MV'},\n", + " 187: {'sele_iupac_codon': 'TVG', 'sele_aa': '*SW'},\n", + " 188: {'sele_iupac_codon': 'TTT', 'sele_aa': 'F'}},\n", + " 'TTT': {189: {'sele_iupac_codon': 'RTT', 'sele_aa': 'IV'},\n", + " 190: {'sele_iupac_codon': 'TVT', 'sele_aa': 'YSC'},\n", + " 191: {'sele_iupac_codon': 'TTG', 'sele_aa': 'L'}}}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('codon')[['sele_iupac_codon','sele_aa']].apply(lambda x: x.to_dict(orient='index')).to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e69cf6e-2acc-4677-885a-c31ef092063e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80855e33-087e-4bfc-9ebf-31d67afa92d3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "fe40cc88-bde0-4e98-bbc3-8b2fd68d30e3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
positioniupacwt
011[HCG, GDG]GCG
112[HGT, GHT]GGT
213[HAT, GBT, GAA]GAT
314[WTA, GVA]GTA
415[KTA, AVA, ATG]ATA
\n", + "
" + ], + "text/plain": [ + " position iupac wt\n", + "0 11 [HCG, GDG] GCG\n", + "1 12 [HGT, GHT] GGT\n", + "2 13 [HAT, GBT, GAA] GAT\n", + "3 14 [WTA, GVA] GTA\n", + "4 15 [KTA, AVA, ATG] ATA" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check that iupac encodes all AAs for given position\n", + "\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "rev_iupac_dict = {value:key for key,value in iupac_dict.items()}\n", + "\n", + "df1 = df.groupby('position')['iupac'].apply(list).reset_index()\n", + "map_dict = dict(zip(df.position, df.wt))\n", + "df1['wt'] =df1.position.map(map_dict)\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea536e8-8331-4ed1-a459-22ecba07f126", + "metadata": {}, + "outputs": [], + "source": [ + "# get iupac aas\n", + "def iupac_aas(row):\n", + " iupac_codons = row['iupac']\n", + " for iupac in iupac_codons" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d71ecc0b-37b0-40e2-b36c-f288044d90ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{11: 'GCG',\n", + " 12: 'GGT',\n", + " 13: 'GAT',\n", + " 14: 'GTA',\n", + " 15: 'ATA',\n", + " 16: 'AAG',\n", + " 17: 'GGC',\n", + " 18: 'AGA',\n", + " 43: 'AGT',\n", + " 44: 'GTT',\n", + " 45: 'AAG',\n", + " 46: 'ATG',\n", + " 47: 'CAT',\n", + " 48: 'ATG',\n", + " 49: 'GAT',\n", + " 50: 'AGA',\n", + " 51: 'TAT',\n", + " 71: 'GAT',\n", + " 72: 'TAT',\n", + " 73: 'ACA',\n", + " 74: 'AAA',\n", + " 75: 'GGA',\n", + " 76: 'TAT',\n", + " 77: 'ATA',\n", + " 78: 'GAT',\n", + " 79: 'GTC',\n", + " 80: 'AAT',\n", + " 81: 'TAC',\n", + " 82: 'AAA',\n", + " 83: 'AGG',\n", + " 84: 'ATG',\n", + " 85: 'TGT',\n", + " 86: 'AGA',\n", + " 87: 'CAT',\n", + " 88: 'CAA'}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "map_dict = dict(zip(df.position, df.wt))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f81b314-2635-42b2-98f8-422ed96826a0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv b/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv new file mode 100644 index 0000000..c183f47 --- /dev/null +++ b/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv @@ -0,0 +1,193 @@ +codon,aa,position,missense_nuc,missense_codons,missense_aa,missense_iupac,missense_iupac_codon,sele_codons,sele_aa,sele_iupac_codon,syn_bool,syn_codons,syn_aa,syn_iupac_codon,no_stop_codons,no_stop_aa,no_stop_iupac_codon,no_stop_syn_codons,no_stop_syn_aa,no_stop_syn_iupac_codon +AAA,K,0,CGT,CAA GAA TAA,*QE,B,BAA,CAA GAA TAA,QE*,BAA,False,,,BAA,CAA GAA,QE,SAA,CAA GAA,QE,SAA +AAA,K,1,CGT,ACA AGA ATA,TRI,B,ABA,ACA AGA ATA,TRI,ABA,False,,,ABA,ACA AGA ATA,TRI,ABA,ACA AGA ATA,TRI,ABA +AAA,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAG AAT,KN,AAK,AAT,N,AAT,AAG AAT,KN,AAK +AAC,N,0,CGT,CAC GAC TAC,HDY,B,BAC,CAC GAC TAC,HDY,BAC,False,,,BAC,CAC GAC TAC,HDY,BAC,CAC GAC TAC,HDY,BAC +AAC,N,1,CGT,ACC AGC ATC,TSI,B,ABC,ACC AGC ATC,TSI,ABC,False,,,ABC,ACC AGC ATC,TSI,ABC,ACC AGC ATC,TSI,ABC +AAC,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAT,KN,AAW,AAA,K,AAA,AAA AAT,KN,AAW +AAG,K,0,CGT,CAG GAG TAG,*QE,B,BAG,CAG GAG TAG,QE*,BAG,False,,,BAG,CAG GAG,QE,SAG,CAG GAG,QE,SAG +AAG,K,1,CGT,ACG AGG ATG,TRM,B,ABG,ACG AGG ATG,TRM,ABG,False,,,ABG,ACG AGG ATG,TRM,ABG,ACG AGG ATG,TRM,ABG 
+AAG,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAA AAT,KN,AAW,AAT,N,AAT,AAA AAT,KN,AAW +AAT,N,0,CGT,CAT GAT TAT,HDY,B,BAT,CAT GAT TAT,HDY,BAT,False,,,BAT,CAT GAT TAT,HDY,BAT,CAT GAT TAT,HDY,BAT +AAT,N,1,CGT,ACT AGT ATT,TSI,B,ABT,ACT AGT ATT,TSI,ABT,False,,,ABT,ACT AGT ATT,TSI,ABT,ACT AGT ATT,TSI,ABT +AAT,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAC,KN,AAM,AAA,K,AAA,AAA AAC,KN,AAM +ACA,T,0,CGT,CCA GCA TCA,SAP,B,BCA,CCA GCA TCA,PAS,BCA,False,,,BCA,CCA GCA TCA,PAS,BCA,CCA GCA TCA,PAS,BCA +ACA,T,1,AGT,AAA AGA ATA,RIK,D,ADA,AAA AGA ATA,KRI,ADA,False,,,ADA,AAA AGA ATA,KRI,ADA,AAA AGA ATA,KRI,ADA +ACA,T,2,,,,A,ACA,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACC,T,0,CGT,CCC GCC TCC,SAP,B,BCC,CCC GCC TCC,PAS,BCC,False,,,BCC,CCC GCC TCC,PAS,BCC,CCC GCC TCC,PAS,BCC +ACC,T,1,AGT,AAC AGC ATC,NSI,D,ADC,AAC ATC,NI,AWC,False,,,AWC,AAC ATC,NI,AWC,AAC ATC,NI,AWC +ACC,T,2,,,,C,ACC,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACG,T,0,CGT,CCG GCG TCG,SAP,B,BCG,CCG GCG TCG,PAS,BCG,False,,,BCG,CCG GCG TCG,PAS,BCG,CCG GCG TCG,PAS,BCG +ACG,T,1,AGT,AAG AGG ATG,RMK,D,ADG,AAG AGG ATG,KRM,ADG,False,,,ADG,AAG AGG ATG,KRM,ADG,AAG AGG ATG,KRM,ADG +ACG,T,2,,,,G,ACG,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACT,T,0,CGT,CCT GCT TCT,SAP,B,BCT,CCT GCT TCT,PAS,BCT,False,,,BCT,CCT GCT TCT,PAS,BCT,CCT GCT TCT,PAS,BCT +ACT,T,1,AGT,AAT AGT ATT,NSI,D,ADT,AAT ATT,NI,AWT,False,,,AWT,AAT ATT,NI,AWT,AAT ATT,NI,AWT +ACT,T,2,,,,T,ACT,,,,True,ACA,T,ACA,,,,ACA,T,ACA +AGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,False,,,KGA,GGA,G,GGA,GGA,G,GGA +AGA,R,1,ACT,AAA ACA ATA,TIK,H,AHA,AAA ACA ATA,KTI,AHA,False,,,AHA,AAA ACA ATA,KTI,AHA,AAA ACA ATA,KTI,AHA +AGA,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGG AGT,RS,AGK,AGT,S,AGT,AGG AGT,RS,AGK +AGC,S,0,CGT,CGC GGC TGC,RCG,B,BGC,GGC TGC,GC,KGC,False,,,KGC,GGC TGC,GC,KGC,GGC TGC,GC,KGC +AGC,S,1,ACT,AAC ACC ATC,NTI,H,AHC,AAC ACC ATC,NTI,AHC,False,,,AHC,AAC ACC ATC,NTI,AHC,AAC ACC ATC,NTI,AHC +AGC,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGT,RS,AGW,AGA,R,AGA,AGA AGT,RS,AGW +AGG,R,0,GT,GGG TGG,WG,K,KGG,GGG 
TGG,GW,KGG,False,,,KGG,GGG TGG,GW,KGG,GGG TGG,GW,KGG +AGG,R,1,ACT,AAG ACG ATG,TMK,H,AHG,AAG ACG ATG,KTM,AHG,False,,,AHG,AAG ACG ATG,KTM,AHG,AAG ACG ATG,KTM,AHG +AGG,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGA AGT,RS,AGW,AGT,S,AGT,AGA AGT,RS,AGW +AGT,S,0,CGT,CGT GGT TGT,RCG,B,BGT,GGT TGT,GC,KGT,False,,,KGT,GGT TGT,GC,KGT,GGT TGT,GC,KGT +AGT,S,1,ACT,AAT ACT ATT,NTI,H,AHT,AAT ACT ATT,NTI,AHT,False,,,AHT,AAT ACT ATT,NTI,AHT,AAT ACT ATT,NTI,AHT +AGT,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGC,RS,AGM,AGA,R,AGA,AGA AGC,RS,AGM +ATA,I,0,CGT,CTA GTA TTA,LV,B,BTA,TTA GTA,LV,KTA,False,,,KTA,TTA GTA,LV,KTA,TTA GTA,LV,KTA +ATA,I,1,ACG,AAA ACA AGA,TRK,V,AVA,AAA ACA AGA,KTR,AVA,False,,,AVA,AAA ACA AGA,KTR,AVA,AAA ACA AGA,KTR,AVA +ATA,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATC,I,0,CGT,CTC GTC TTC,LFV,B,BTC,CTC GTC TTC,LVF,BTC,False,,,BTC,CTC GTC TTC,LVF,BTC,CTC GTC TTC,LVF,BTC +ATC,I,1,ACG,AAC ACC AGC,NST,V,AVC,AAC ACC AGC,NTS,AVC,False,,,AVC,AAC ACC AGC,NTS,AVC,AAC ACC AGC,NTS,AVC +ATC,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATG,M,0,CGT,CTG GTG TTG,LV,B,BTG,TTA GTA,LV,KTG,False,,,KTG,TTA GTA,LV,KTG,TTA GTA,LV,KTG +ATG,M,1,ACG,AAG ACG AGG,TRK,V,AVG,AAG ACG AGG,KTR,AVG,False,,,AVG,AAG ACG AGG,KTR,AVG,AAG ACG AGG,KTR,AVG +ATG,M,2,ACT,ATA ATC ATT,I,H,ATH,ATT,I,ATT,True,ATG ATT,MI,ATK,ATT,I,ATT,ATG ATT,MI,ATK +ATT,I,0,CGT,CTT GTT TTT,LFV,B,BTT,CTT GTT TTT,LVF,BTT,False,,,BTT,CTT GTT TTT,LVF,BTT,CTT GTT TTT,LVF,BTT +ATT,I,1,ACG,AAT ACT AGT,NST,V,AVT,AAT ACT AGT,NTS,AVT,False,,,AVT,AAT ACT AGT,NTS,AVT,AAT ACT AGT,NTS,AVT +ATT,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATA ATG,IM,ATR,ATG,M,ATG,ATA ATG,IM,ATR +CAA,Q,0,AGT,AAA GAA TAA,*EK,D,DAA,AAA GAA TAA,KE*,DAA,False,,,DAA,AAA GAA,KE,RAA,AAA GAA,KE,RAA +CAA,Q,1,CGT,CCA CGA CTA,RLP,B,CBA,CCA CGA CTA,PRL,CBA,False,,,CBA,CCA CGA CTA,PRL,CBA,CCA CGA CTA,PRL,CBA +CAA,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAG CAT,QH,CAK,CAT,H,CAT,CAG CAT,QH,CAK +CAC,H,0,AGT,AAC GAC TAC,NDY,D,DAC,AAC 
GAC TAC,NDY,DAC,False,,,DAC,AAC GAC TAC,NDY,DAC,AAC GAC TAC,NDY,DAC +CAC,H,1,CGT,CCC CGC CTC,RLP,B,CBC,CCC CGC CTC,PRL,CBC,False,,,CBC,CCC CGC CTC,PRL,CBC,CCC CGC CTC,PRL,CBC +CAC,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAT,QH,CAW,CAA,Q,CAA,CAA CAT,QH,CAW +CAG,Q,0,AGT,AAG GAG TAG,*EK,D,DAG,AAG GAG TAG,KE*,DAG,False,,,DAG,AAG GAG,KE,RAG,AAG GAG,KE,RAG +CAG,Q,1,CGT,CCG CGG CTG,RLP,B,CBG,CCG CGG CTG,PRL,CBG,False,,,CBG,CCG CGG CTG,PRL,CBG,CCG CGG CTG,PRL,CBG +CAG,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAA CAT,QH,CAW,CAT,H,CAT,CAA CAT,QH,CAW +CAT,H,0,AGT,AAT GAT TAT,NDY,D,DAT,AAT GAT TAT,NDY,DAT,False,,,DAT,AAT GAT TAT,NDY,DAT,AAT GAT TAT,NDY,DAT +CAT,H,1,CGT,CCT CGT CTT,RLP,B,CBT,CCT CGT CTT,PRL,CBT,False,,,CBT,CCT CGT CTT,PRL,CBT,CCT CGT CTT,PRL,CBT +CAT,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAC,QH,CAM,CAA,Q,CAA,CAA CAC,QH,CAM +CCA,P,0,AGT,ACA GCA TCA,TSA,D,DCA,ACA GCA TCA,TAS,DCA,False,,,DCA,ACA GCA TCA,TAS,DCA,ACA GCA TCA,TAS,DCA +CCA,P,1,AGT,CAA CGA CTA,RQL,D,CDA,CAA CGA CTA,QRL,CDA,False,,,CDA,CAA CGA CTA,QRL,CDA,CAA CGA CTA,QRL,CDA +CCA,P,2,,,,A,CCA,,,,True,CCT,P,CCT,,,,CCT,P,CCT +CCC,P,0,AGT,ACC GCC TCC,TSA,D,DCC,ACC GCC TCC,TAS,DCC,False,,,DCC,ACC GCC TCC,TAS,DCC,ACC GCC TCC,TAS,DCC +CCC,P,1,AGT,CAC CGC CTC,RHL,D,CDC,CAC CGC CTC,HRL,CDC,False,,,CDC,CAC CGC CTC,HRL,CDC,CAC CGC CTC,HRL,CDC +CCC,P,2,,,,C,CCC,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCG,P,0,AGT,ACG GCG TCG,TSA,D,DCG,ACG GCG TCG,TAS,DCG,False,,,DCG,ACG GCG TCG,TAS,DCG,ACG GCG TCG,TAS,DCG +CCG,P,1,AGT,CAG CGG CTG,RQL,D,CDG,CAG CGG CTG,QRL,CDG,False,,,CDG,CAG CGG CTG,QRL,CDG,CAG CGG CTG,QRL,CDG +CCG,P,2,,,,G,CCG,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCT,P,0,AGT,ACT GCT TCT,TSA,D,DCT,ACT GCT TCT,TAS,DCT,False,,,DCT,ACT GCT TCT,TAS,DCT,ACT GCT TCT,TAS,DCT +CCT,P,1,AGT,CAT CGT CTT,RHL,D,CDT,CAT CGT CTT,HRL,CDT,False,,,CDT,CAT CGT CTT,HRL,CDT,CAT CGT CTT,HRL,CDT +CCT,P,2,,,,T,CCT,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,True,AGA GGA TGA,RG*,DGA,GGA,G,GGA,AGA GGA,RG,RGA 
+CGA,R,1,ACT,CAA CCA CTA,LQP,H,CHA,CAA CCA CTA,QPL,CHA,False,,,CHA,CAA CCA CTA,QPL,CHA,CAA CCA CTA,QPL,CHA +CGA,R,2,,,,A,CGA,,,,False,,,,,,,,, +CGC,R,0,AGT,AGC GGC TGC,SCG,D,DGC,AGC GGC TGC,SGC,DGC,False,,,DGC,AGC GGC TGC,SGC,DGC,AGC GGC TGC,SGC,DGC +CGC,R,1,ACT,CAC CCC CTC,LHP,H,CHC,CAC CCC CTC,HPL,CHC,False,,,CHC,CAC CCC CTC,HPL,CHC,CAC CCC CTC,HPL,CHC +CGC,R,2,,,,C,CGC,,,,True,CGT,R,CGT,,,,CGT,R,CGT +CGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,True,AGG GGG TGG,RGW,DGG,GGG TGG,GW,KGG,AGG GGG TGG,RGW,DGG +CGG,R,1,ACT,CAG CCG CTG,LQP,H,CHG,CAG CCG CTG,QPL,CHG,False,,,CHG,CAG CCG CTG,QPL,CHG,CAG CCG CTG,QPL,CHG +CGG,R,2,,,,G,CGG,,,,False,,,,,,,,, +CGT,R,0,AGT,AGT GGT TGT,SCG,D,DGT,AGT GGT TGT,SGC,DGT,False,,,DGT,AGT GGT TGT,SGC,DGT,AGT GGT TGT,SGC,DGT +CGT,R,1,ACT,CAT CCT CTT,LHP,H,CHT,CAT CCT CTT,HPL,CHT,False,,,CHT,CAT CCT CTT,HPL,CHT,CAT CCT CTT,HPL,CHT +CGT,R,2,,,,T,CGT,,,,True,CGA,R,CGA,,,,CGA,R,CGA +CTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,True,ATA GTA TTA,IVL,DTA,ATA GTA,IV,RTA,ATA GTA TTA,IVL,DTA +CTA,L,1,ACG,CAA CCA CGA,RQP,V,CVA,CAA CCA CGA,QPR,CVA,False,,,CVA,CAA CCA CGA,QPR,CVA,CAA CCA CGA,QPR,CVA +CTA,L,2,,,,A,CTA,,,,False,,,,,,,,, +CTC,L,0,AGT,ATC GTC TTC,FIV,D,DTC,ATC GTC TTC,IVF,DTC,False,,,DTC,ATC GTC TTC,IVF,DTC,ATC GTC TTC,IVF,DTC +CTC,L,1,ACG,CAC CCC CGC,RHP,V,CVC,CAC CCC CGC,HPR,CVC,False,,,CVC,CAC CCC CGC,HPR,CVC,CAC CCC CGC,HPR,CVC +CTC,L,2,,,,C,CTC,,,,True,CTA,L,CTA,,,,CTA,L,CTA +CTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,True,ATG GTG TTG,MVL,DTG,ATG GTG,MV,RTG,ATG GTG TTG,MVL,DTG +CTG,L,1,ACG,CAG CCG CGG,RQP,V,CVG,CAG CCG CGG,QPR,CVG,False,,,CVG,CAG CCG CGG,QPR,CVG,CAG CCG CGG,QPR,CVG +CTG,L,2,,,,G,CTG,,,,False,,,,,,,,, +CTT,L,0,AGT,ATT GTT TTT,FIV,D,DTT,ATT GTT TTT,IVF,DTT,False,,,DTT,ATT GTT TTT,IVF,DTT,ATT GTT TTT,IVF,DTT +CTT,L,1,ACG,CAT CCT CGT,RHP,V,CVT,CAT CCT CGT,HPR,CVT,False,,,CVT,CAT CCT CGT,HPR,CVT,CAT CCT CGT,HPR,CVT +CTT,L,2,,,,T,CTT,,,,True,CTA,L,CTA,,,,CTA,L,CTA +GAA,E,0,ACT,AAA CAA TAA,*QK,H,HAA,AAA CAA 
TAA,KQ*,HAA,False,,,HAA,AAA CAA,KQ,MAA,AAA CAA,KQ,MAA +GAA,E,1,CGT,GCA GGA GTA,VAG,B,GBA,GCA GGA GTA,AGV,GBA,False,,,GBA,GCA GGA GTA,AGV,GBA,GCA GGA GTA,AGV,GBA +GAA,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAG GAT,ED,GAK,GAT,D,GAT,GAG GAT,ED,GAK +GAC,D,0,ACT,AAC CAC TAC,NHY,H,HAC,AAC CAC TAC,NHY,HAC,False,,,HAC,AAC CAC TAC,NHY,HAC,AAC CAC TAC,NHY,HAC +GAC,D,1,CGT,GCC GGC GTC,VAG,B,GBC,GCC GGC GTC,AGV,GBC,False,,,GBC,GCC GGC GTC,AGV,GBC,GCC GGC GTC,AGV,GBC +GAC,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAT,ED,GAW,GAA,E,GAA,GAA GAT,ED,GAW +GAG,E,0,ACT,AAG CAG TAG,*QK,H,HAG,AAG CAG TAG,KQ*,HAG,False,,,HAG,AAG CAG,KQ,MAG,AAG CAG,KQ,MAG +GAG,E,1,CGT,GCG GGG GTG,VAG,B,GBG,GCG GGG GTG,AGV,GBG,False,,,GBG,GCG GGG GTG,AGV,GBG,GCG GGG GTG,AGV,GBG +GAG,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAA GAT,ED,GAW,GAT,D,GAT,GAA GAT,ED,GAW +GAT,D,0,ACT,AAT CAT TAT,NHY,H,HAT,AAT CAT TAT,NHY,HAT,False,,,HAT,AAT CAT TAT,NHY,HAT,AAT CAT TAT,NHY,HAT +GAT,D,1,CGT,GCT GGT GTT,VAG,B,GBT,GCT GGT GTT,AGV,GBT,False,,,GBT,GCT GGT GTT,AGV,GBT,GCT GGT GTT,AGV,GBT +GAT,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAC,ED,GAM,GAA,E,GAA,GAA GAC,ED,GAM +GCA,A,0,ACT,ACA CCA TCA,TSP,H,HCA,ACA CCA TCA,TPS,HCA,False,,,HCA,ACA CCA TCA,TPS,HCA,ACA CCA TCA,TPS,HCA +GCA,A,1,AGT,GAA GGA GTA,GVE,D,GDA,GAA GGA GTA,EGV,GDA,False,,,GDA,GAA GGA GTA,EGV,GDA,GAA GGA GTA,EGV,GDA +GCA,A,2,,,,A,GCA,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCC,A,0,ACT,ACC CCC TCC,TSP,H,HCC,ACC CCC TCC,TPS,HCC,False,,,HCC,ACC CCC TCC,TPS,HCC,ACC CCC TCC,TPS,HCC +GCC,A,1,AGT,GAC GGC GTC,VDG,D,GDC,GAC GGC GTC,DGV,GDC,False,,,GDC,GAC GGC GTC,DGV,GDC,GAC GGC GTC,DGV,GDC +GCC,A,2,,,,C,GCC,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCG,A,0,ACT,ACG CCG TCG,TSP,H,HCG,ACG CCG TCG,TPS,HCG,False,,,HCG,ACG CCG TCG,TPS,HCG,ACG CCG TCG,TPS,HCG +GCG,A,1,AGT,GAG GGG GTG,GVE,D,GDG,GAG GGG GTG,EGV,GDG,False,,,GDG,GAG GGG GTG,EGV,GDG,GAG GGG GTG,EGV,GDG +GCG,A,2,,,,G,GCG,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCT,A,0,ACT,ACT CCT TCT,TSP,H,HCT,ACT CCT TCT,TPS,HCT,False,,,HCT,ACT CCT 
TCT,TPS,HCT,ACT CCT TCT,TPS,HCT +GCT,A,1,AGT,GAT GGT GTT,VDG,D,GDT,GAT GGT GTT,DGV,GDT,False,,,GDT,GAT GGT GTT,DGV,GDT,GAT GGT GTT,DGV,GDT +GCT,A,2,,,,T,GCT,,,,True,GCA,A,GCA,,,,GCA,A,GCA +GGA,G,0,ACT,AGA CGA TGA,R*,H,HGA,AGA TGA,R*,WGA,False,,,WGA,AGA,R,AGA,AGA,R,AGA +GGA,G,1,ACT,GAA GCA GTA,VAE,H,GHA,GAA GCA GTA,EAV,GHA,False,,,GHA,GAA GCA GTA,EAV,GHA,GAA GCA GTA,EAV,GHA +GGA,G,2,,,,A,GGA,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGC,G,0,ACT,AGC CGC TGC,RSC,H,HGC,AGC CGC TGC,SRC,HGC,False,,,HGC,AGC CGC TGC,SRC,HGC,AGC CGC TGC,SRC,HGC +GGC,G,1,ACT,GAC GCC GTC,ADV,H,GHC,GAC GCC GTC,DAV,GHC,False,,,GHC,GAC GCC GTC,DAV,GHC,GAC GCC GTC,DAV,GHC +GGC,G,2,,,,C,GGC,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGG,G,0,ACT,AGG CGG TGG,RW,H,HGG,AGG TGG,RW,WGG,False,,,WGG,AGG TGG,RW,WGG,AGG TGG,RW,WGG +GGG,G,1,ACT,GAG GCG GTG,VAE,H,GHG,GAG GCG GTG,EAV,GHG,False,,,GHG,GAG GCG GTG,EAV,GHG,GAG GCG GTG,EAV,GHG +GGG,G,2,,,,G,GGG,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGT,G,0,ACT,AGT CGT TGT,RSC,H,HGT,AGT CGT TGT,SRC,HGT,False,,,HGT,AGT CGT TGT,SRC,HGT,AGT CGT TGT,SRC,HGT +GGT,G,1,ACT,GAT GCT GTT,ADV,H,GHT,GAT GCT GTT,DAV,GHT,False,,,GHT,GAT GCT GTT,DAV,GHT,GAT GCT GTT,DAV,GHT +GGT,G,2,,,,T,GGT,,,,True,GGA,G,GGA,,,,GGA,G,GGA +GTA,V,0,ACT,ATA CTA TTA,LI,H,HTA,ATA TTA,IL,WTA,False,,,WTA,ATA TTA,IL,WTA,ATA TTA,IL,WTA +GTA,V,1,ACG,GAA GCA GGA,GAE,V,GVA,GAA GCA GGA,EAG,GVA,False,,,GVA,GAA GCA GGA,EAG,GVA,GAA GCA GGA,EAG,GVA +GTA,V,2,,,,A,GTA,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTC,V,0,ACT,ATC CTC TTC,LFI,H,HTC,ATC CTC TTC,ILF,HTC,False,,,HTC,ATC CTC TTC,ILF,HTC,ATC CTC TTC,ILF,HTC +GTC,V,1,ACG,GAC GCC GGC,ADG,V,GVC,GAC GCC GGC,DAG,GVC,False,,,GVC,GAC GCC GGC,DAG,GVC,GAC GCC GGC,DAG,GVC +GTC,V,2,,,,C,GTC,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTG,V,0,ACT,ATG CTG TTG,LM,H,HTG,ATG TTG,ML,WTG,False,,,WTG,ATG TTG,ML,WTG,ATG TTG,ML,WTG +GTG,V,1,ACG,GAG GCG GGG,GAE,V,GVG,GAG GCG GGG,EAG,GVG,False,,,GVG,GAG GCG GGG,EAG,GVG,GAG GCG GGG,EAG,GVG +GTG,V,2,,,,G,GTG,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTT,V,0,ACT,ATT CTT TTT,LFI,H,HTT,ATT CTT 
TTT,ILF,HTT,False,,,HTT,ATT CTT TTT,ILF,HTT,ATT CTT TTT,ILF,HTT +GTT,V,1,ACG,GAT GCT GGT,ADG,V,GVT,GAT GCT GGT,DAG,GVT,False,,,GVT,GAT GCT GGT,DAG,GVT,GAT GCT GGT,DAG,GVT +GTT,V,2,,,,T,GTT,,,,True,GTA,V,GTA,,,,GTA,V,GTA +TAA,*,0,ACG,AAA CAA GAA,QEK,V,VAA,AAA CAA GAA,KQE,VAA,False,,,VAA,AAA CAA GAA,KQE,VAA,AAA CAA GAA,KQE,VAA +TAA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,False,,,TYA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TAA,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAG TAT,*Y,TAK,TAT,Y,TAT,TAT,Y,TAT +TAC,Y,0,ACG,AAC CAC GAC,NHD,V,VAC,AAC CAC GAC,NHD,VAC,False,,,VAC,AAC CAC GAC,NHD,VAC,AAC CAC GAC,NHD,VAC +TAC,Y,1,CGT,TCC TGC TTC,SFC,B,TBC,TCC TGC TTC,SCF,TBC,False,,,TBC,TCC TGC TTC,SCF,TBC,TCC TGC TTC,SCF,TBC +TAC,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAT,*Y,TAW,,,,TAT,Y,TAT +TAG,*,0,ACG,AAG CAG GAG,QEK,V,VAG,AAG CAG GAG,KQE,VAG,False,,,VAG,AAG CAG GAG,KQE,VAG,AAG CAG GAG,KQE,VAG +TAG,*,1,CGT,TCG TGG TTG,LSW,B,TBG,TCG TGG TTG,SWL,TBG,False,,,TBG,TCG TGG TTG,SWL,TBG,TCG TGG TTG,SWL,TBG +TAG,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAA TAT,*Y,TAW,TAT,Y,TAT,TAT,Y,TAT +TAT,Y,0,ACG,AAT CAT GAT,NHD,V,VAT,AAT CAT GAT,NHD,VAT,False,,,VAT,AAT CAT GAT,NHD,VAT,AAT CAT GAT,NHD,VAT +TAT,Y,1,CGT,TCT TGT TTT,SFC,B,TBT,TCT TGT TTT,SCF,TBT,False,,,TBT,TCT TGT TTT,SCF,TBT,TCT TGT TTT,SCF,TBT +TAT,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAC,*Y,TAM,,,,TAC,Y,TAC +TCA,S,0,ACG,ACA CCA GCA,TAP,V,VCA,ACA CCA GCA,TPA,VCA,False,,,VCA,ACA CCA GCA,TPA,VCA,ACA CCA GCA,TPA,VCA +TCA,S,1,AGT,TAA TGA TTA,L*,D,TDA,TAA TTA,*L,TWA,False,,,TWA,TTA,L,TTA,TTA,L,TTA +TCA,S,2,,,,A,TCA,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCC,S,0,ACG,ACC CCC GCC,TAP,V,VCC,ACC CCC GCC,TPA,VCC,False,,,VCC,ACC CCC GCC,TPA,VCC,ACC CCC GCC,TPA,VCC +TCC,S,1,AGT,TAC TGC TTC,FCY,D,TDC,TAC TGC TTC,YCF,TDC,False,,,TDC,TAC TGC TTC,YCF,TDC,TAC TGC TTC,YCF,TDC +TCC,S,2,,,,C,TCC,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCG,S,0,ACG,ACG CCG GCG,TAP,V,VCG,ACG CCG GCG,TPA,VCG,False,,,VCG,ACG CCG GCG,TPA,VCG,ACG CCG GCG,TPA,VCG +TCG,S,1,AGT,TAG TGG TTG,L*W,D,TDG,TAG 
TGG TTG,*WL,TDG,False,,,TDG,TGG TTG,WL,TKG,TGG TTG,WL,TKG +TCG,S,2,,,,G,TCG,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCT,S,0,ACG,ACT CCT GCT,TAP,V,VCT,ACT CCT GCT,TPA,VCT,False,,,VCT,ACT CCT GCT,TPA,VCT,ACT CCT GCT,TPA,VCT +TCT,S,1,AGT,TAT TGT TTT,FCY,D,TDT,TAT TGT TTT,YCF,TDT,False,,,TDT,TAT TGT TTT,YCF,TDT,TAT TGT TTT,YCF,TDT +TCT,S,2,,,,T,TCT,,,,True,TCA,S,TCA,,,,TCA,S,TCA +TGA,*,0,ACG,AGA CGA GGA,RG,V,VGA,AGA GGA,RG,RGA,False,,,RGA,AGA GGA,RG,RGA,AGA GGA,RG,RGA +TGA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,True,TAA TCA TTA,*SL,THA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TGA,*,2,CGT,TGC TGG TGT,CW,B,TGB,TGT TGG,CW,TGK,False,,,TGK,TGT TGG,CW,TGK,TGT TGG,CW,TGK +TGC,C,0,ACG,AGC CGC GGC,RSG,V,VGC,CGC GGC,RG,SGC,False,,,SGC,CGC GGC,RG,SGC,CGC GGC,RG,SGC +TGC,C,1,ACT,TAC TCC TTC,SFY,H,THC,TAC TCC TTC,YSF,THC,False,,,THC,TAC TCC TTC,YSF,THC,TAC TCC TTC,YSF,THC +TGC,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGG TGT,*WC,TGD,TGG,W,TGG,TGG TGT,WC,TGK +TGG,W,0,ACG,AGG CGG GGG,RG,V,VGG,AGG GGG,RG,RGG,False,,,RGG,AGG GGG,RG,RGG,AGG GGG,RG,RGG +TGG,W,1,ACT,TAG TCG TTG,L*S,H,THG,TAG TCG TTG,*SL,THG,False,,,THG,TCG TTG,SL,TYG,TCG TTG,SL,TYG +TGG,W,2,ACT,TGA TGC TGT,*C,H,TGH,TGA TGC,*C,TGM,True,TGC TGG TGT,CWC,TGB,TGC,C,TGC,TGC TGG TGT,CWC,TGB +TGT,C,0,ACG,AGT CGT GGT,RSG,V,VGT,CGT GGT,RG,SGT,False,,,SGT,CGT GGT,RG,SGT,CGT GGT,RG,SGT +TGT,C,1,ACT,TAT TCT TTT,SFY,H,THT,TAT TCT TTT,YSF,THT,False,,,THT,TAT TCT TTT,YSF,THT,TAT TCT TTT,YSF,THT +TGT,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGC TGG,*CW,TGV,TGG,W,TGG,TGC TGG,CW,TGS +TTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,False,,,RTA,ATA GTA,IV,RTA,ATA GTA,IV,RTA +TTA,L,1,ACG,TAA TCA TGA,*S,V,TVA,TAA TCA,*S,TMA,False,,,TMA,TCA,S,TCA,TCA,S,TCA +TTA,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTG TTT,LF,TTK,TTT,F,TTT,TTG TTT,LF,TTK +TTC,F,0,ACG,ATC CTC GTC,LIV,V,VTC,ATC GTC,IV,RTC,False,,,RTC,ATC GTC,IV,RTC,ATC GTC,IV,RTC +TTC,F,1,ACG,TAC TCC TGC,SCY,V,TVC,TAC TCC TGC,YSC,TVC,False,,,TVC,TAC TCC TGC,YSC,TVC,TAC TCC TGC,YSC,TVC +TTC,F,2,AG,TTA 
TTG,L,R,TTR,TTG,L,TTG,True,TTG TTT,LF,TTK,TTG,L,TTG,TTG TTT,LF,TTK +TTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,False,,,RTG,ATG GTG,MV,RTG,ATG GTG,MV,RTG +TTG,L,1,ACG,TAG TCG TGG,*SW,V,TVG,TAG TCG TGG,*SW,TVG,False,,,TVG,TCG TGG,SW,TSG,TCG TGG,SW,TSG +TTG,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTA TTT,LF,TTW,TTT,F,TTT,TTA TTT,LF,TTW +TTT,F,0,ACG,ATT CTT GTT,LIV,V,VTT,ATT GTT,IV,RTT,False,,,RTT,ATT GTT,IV,RTT,ATT GTT,IV,RTT +TTT,F,1,ACG,TAT TCT TGT,SCY,V,TVT,TAT TCT TGT,YSC,TVT,False,,,TVT,TAT TCT TGT,YSC,TVT,TAT TCT TGT,YSC,TVT +TTT,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTC TTG,FL,TTS,TTG,L,TTG,TTC TTG,FL,TTS diff --git a/data/.ipynb_checkpoints/missense_codon_table-checkpoint.ipynb b/data/.ipynb_checkpoints/missense_codon_table-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/data/.ipynb_checkpoints/missense_codon_table-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/bespoke_codon_table.csv b/data/bespoke_codon_table.csv similarity index 100% rename from bespoke_codon_table.csv rename to data/bespoke_codon_table.csv diff --git a/data/final_codon_table.csv b/data/final_codon_table.csv new file mode 100644 index 0000000..c183f47 --- /dev/null +++ b/data/final_codon_table.csv @@ -0,0 +1,193 @@ +codon,aa,position,missense_nuc,missense_codons,missense_aa,missense_iupac,missense_iupac_codon,sele_codons,sele_aa,sele_iupac_codon,syn_bool,syn_codons,syn_aa,syn_iupac_codon,no_stop_codons,no_stop_aa,no_stop_iupac_codon,no_stop_syn_codons,no_stop_syn_aa,no_stop_syn_iupac_codon +AAA,K,0,CGT,CAA GAA TAA,*QE,B,BAA,CAA GAA TAA,QE*,BAA,False,,,BAA,CAA GAA,QE,SAA,CAA GAA,QE,SAA +AAA,K,1,CGT,ACA AGA ATA,TRI,B,ABA,ACA AGA ATA,TRI,ABA,False,,,ABA,ACA AGA ATA,TRI,ABA,ACA AGA ATA,TRI,ABA +AAA,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAG AAT,KN,AAK,AAT,N,AAT,AAG AAT,KN,AAK +AAC,N,0,CGT,CAC GAC TAC,HDY,B,BAC,CAC GAC TAC,HDY,BAC,False,,,BAC,CAC GAC TAC,HDY,BAC,CAC GAC TAC,HDY,BAC 
+AAC,N,1,CGT,ACC AGC ATC,TSI,B,ABC,ACC AGC ATC,TSI,ABC,False,,,ABC,ACC AGC ATC,TSI,ABC,ACC AGC ATC,TSI,ABC +AAC,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAT,KN,AAW,AAA,K,AAA,AAA AAT,KN,AAW +AAG,K,0,CGT,CAG GAG TAG,*QE,B,BAG,CAG GAG TAG,QE*,BAG,False,,,BAG,CAG GAG,QE,SAG,CAG GAG,QE,SAG +AAG,K,1,CGT,ACG AGG ATG,TRM,B,ABG,ACG AGG ATG,TRM,ABG,False,,,ABG,ACG AGG ATG,TRM,ABG,ACG AGG ATG,TRM,ABG +AAG,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAA AAT,KN,AAW,AAT,N,AAT,AAA AAT,KN,AAW +AAT,N,0,CGT,CAT GAT TAT,HDY,B,BAT,CAT GAT TAT,HDY,BAT,False,,,BAT,CAT GAT TAT,HDY,BAT,CAT GAT TAT,HDY,BAT +AAT,N,1,CGT,ACT AGT ATT,TSI,B,ABT,ACT AGT ATT,TSI,ABT,False,,,ABT,ACT AGT ATT,TSI,ABT,ACT AGT ATT,TSI,ABT +AAT,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAC,KN,AAM,AAA,K,AAA,AAA AAC,KN,AAM +ACA,T,0,CGT,CCA GCA TCA,SAP,B,BCA,CCA GCA TCA,PAS,BCA,False,,,BCA,CCA GCA TCA,PAS,BCA,CCA GCA TCA,PAS,BCA +ACA,T,1,AGT,AAA AGA ATA,RIK,D,ADA,AAA AGA ATA,KRI,ADA,False,,,ADA,AAA AGA ATA,KRI,ADA,AAA AGA ATA,KRI,ADA +ACA,T,2,,,,A,ACA,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACC,T,0,CGT,CCC GCC TCC,SAP,B,BCC,CCC GCC TCC,PAS,BCC,False,,,BCC,CCC GCC TCC,PAS,BCC,CCC GCC TCC,PAS,BCC +ACC,T,1,AGT,AAC AGC ATC,NSI,D,ADC,AAC ATC,NI,AWC,False,,,AWC,AAC ATC,NI,AWC,AAC ATC,NI,AWC +ACC,T,2,,,,C,ACC,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACG,T,0,CGT,CCG GCG TCG,SAP,B,BCG,CCG GCG TCG,PAS,BCG,False,,,BCG,CCG GCG TCG,PAS,BCG,CCG GCG TCG,PAS,BCG +ACG,T,1,AGT,AAG AGG ATG,RMK,D,ADG,AAG AGG ATG,KRM,ADG,False,,,ADG,AAG AGG ATG,KRM,ADG,AAG AGG ATG,KRM,ADG +ACG,T,2,,,,G,ACG,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACT,T,0,CGT,CCT GCT TCT,SAP,B,BCT,CCT GCT TCT,PAS,BCT,False,,,BCT,CCT GCT TCT,PAS,BCT,CCT GCT TCT,PAS,BCT +ACT,T,1,AGT,AAT AGT ATT,NSI,D,ADT,AAT ATT,NI,AWT,False,,,AWT,AAT ATT,NI,AWT,AAT ATT,NI,AWT +ACT,T,2,,,,T,ACT,,,,True,ACA,T,ACA,,,,ACA,T,ACA +AGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,False,,,KGA,GGA,G,GGA,GGA,G,GGA +AGA,R,1,ACT,AAA ACA ATA,TIK,H,AHA,AAA ACA ATA,KTI,AHA,False,,,AHA,AAA ACA ATA,KTI,AHA,AAA ACA ATA,KTI,AHA +AGA,R,2,CT,AGC 
AGT,S,Y,AGY,AGT,S,AGT,True,AGG AGT,RS,AGK,AGT,S,AGT,AGG AGT,RS,AGK +AGC,S,0,CGT,CGC GGC TGC,RCG,B,BGC,GGC TGC,GC,KGC,False,,,KGC,GGC TGC,GC,KGC,GGC TGC,GC,KGC +AGC,S,1,ACT,AAC ACC ATC,NTI,H,AHC,AAC ACC ATC,NTI,AHC,False,,,AHC,AAC ACC ATC,NTI,AHC,AAC ACC ATC,NTI,AHC +AGC,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGT,RS,AGW,AGA,R,AGA,AGA AGT,RS,AGW +AGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,False,,,KGG,GGG TGG,GW,KGG,GGG TGG,GW,KGG +AGG,R,1,ACT,AAG ACG ATG,TMK,H,AHG,AAG ACG ATG,KTM,AHG,False,,,AHG,AAG ACG ATG,KTM,AHG,AAG ACG ATG,KTM,AHG +AGG,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGA AGT,RS,AGW,AGT,S,AGT,AGA AGT,RS,AGW +AGT,S,0,CGT,CGT GGT TGT,RCG,B,BGT,GGT TGT,GC,KGT,False,,,KGT,GGT TGT,GC,KGT,GGT TGT,GC,KGT +AGT,S,1,ACT,AAT ACT ATT,NTI,H,AHT,AAT ACT ATT,NTI,AHT,False,,,AHT,AAT ACT ATT,NTI,AHT,AAT ACT ATT,NTI,AHT +AGT,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGC,RS,AGM,AGA,R,AGA,AGA AGC,RS,AGM +ATA,I,0,CGT,CTA GTA TTA,LV,B,BTA,TTA GTA,LV,KTA,False,,,KTA,TTA GTA,LV,KTA,TTA GTA,LV,KTA +ATA,I,1,ACG,AAA ACA AGA,TRK,V,AVA,AAA ACA AGA,KTR,AVA,False,,,AVA,AAA ACA AGA,KTR,AVA,AAA ACA AGA,KTR,AVA +ATA,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATC,I,0,CGT,CTC GTC TTC,LFV,B,BTC,CTC GTC TTC,LVF,BTC,False,,,BTC,CTC GTC TTC,LVF,BTC,CTC GTC TTC,LVF,BTC +ATC,I,1,ACG,AAC ACC AGC,NST,V,AVC,AAC ACC AGC,NTS,AVC,False,,,AVC,AAC ACC AGC,NTS,AVC,AAC ACC AGC,NTS,AVC +ATC,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATG,M,0,CGT,CTG GTG TTG,LV,B,BTG,TTA GTA,LV,KTG,False,,,KTG,TTA GTA,LV,KTG,TTA GTA,LV,KTG +ATG,M,1,ACG,AAG ACG AGG,TRK,V,AVG,AAG ACG AGG,KTR,AVG,False,,,AVG,AAG ACG AGG,KTR,AVG,AAG ACG AGG,KTR,AVG +ATG,M,2,ACT,ATA ATC ATT,I,H,ATH,ATT,I,ATT,True,ATG ATT,MI,ATK,ATT,I,ATT,ATG ATT,MI,ATK +ATT,I,0,CGT,CTT GTT TTT,LFV,B,BTT,CTT GTT TTT,LVF,BTT,False,,,BTT,CTT GTT TTT,LVF,BTT,CTT GTT TTT,LVF,BTT +ATT,I,1,ACG,AAT ACT AGT,NST,V,AVT,AAT ACT AGT,NTS,AVT,False,,,AVT,AAT ACT AGT,NTS,AVT,AAT ACT AGT,NTS,AVT 
+ATT,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATA ATG,IM,ATR,ATG,M,ATG,ATA ATG,IM,ATR +CAA,Q,0,AGT,AAA GAA TAA,*EK,D,DAA,AAA GAA TAA,KE*,DAA,False,,,DAA,AAA GAA,KE,RAA,AAA GAA,KE,RAA +CAA,Q,1,CGT,CCA CGA CTA,RLP,B,CBA,CCA CGA CTA,PRL,CBA,False,,,CBA,CCA CGA CTA,PRL,CBA,CCA CGA CTA,PRL,CBA +CAA,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAG CAT,QH,CAK,CAT,H,CAT,CAG CAT,QH,CAK +CAC,H,0,AGT,AAC GAC TAC,NDY,D,DAC,AAC GAC TAC,NDY,DAC,False,,,DAC,AAC GAC TAC,NDY,DAC,AAC GAC TAC,NDY,DAC +CAC,H,1,CGT,CCC CGC CTC,RLP,B,CBC,CCC CGC CTC,PRL,CBC,False,,,CBC,CCC CGC CTC,PRL,CBC,CCC CGC CTC,PRL,CBC +CAC,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAT,QH,CAW,CAA,Q,CAA,CAA CAT,QH,CAW +CAG,Q,0,AGT,AAG GAG TAG,*EK,D,DAG,AAG GAG TAG,KE*,DAG,False,,,DAG,AAG GAG,KE,RAG,AAG GAG,KE,RAG +CAG,Q,1,CGT,CCG CGG CTG,RLP,B,CBG,CCG CGG CTG,PRL,CBG,False,,,CBG,CCG CGG CTG,PRL,CBG,CCG CGG CTG,PRL,CBG +CAG,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAA CAT,QH,CAW,CAT,H,CAT,CAA CAT,QH,CAW +CAT,H,0,AGT,AAT GAT TAT,NDY,D,DAT,AAT GAT TAT,NDY,DAT,False,,,DAT,AAT GAT TAT,NDY,DAT,AAT GAT TAT,NDY,DAT +CAT,H,1,CGT,CCT CGT CTT,RLP,B,CBT,CCT CGT CTT,PRL,CBT,False,,,CBT,CCT CGT CTT,PRL,CBT,CCT CGT CTT,PRL,CBT +CAT,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAC,QH,CAM,CAA,Q,CAA,CAA CAC,QH,CAM +CCA,P,0,AGT,ACA GCA TCA,TSA,D,DCA,ACA GCA TCA,TAS,DCA,False,,,DCA,ACA GCA TCA,TAS,DCA,ACA GCA TCA,TAS,DCA +CCA,P,1,AGT,CAA CGA CTA,RQL,D,CDA,CAA CGA CTA,QRL,CDA,False,,,CDA,CAA CGA CTA,QRL,CDA,CAA CGA CTA,QRL,CDA +CCA,P,2,,,,A,CCA,,,,True,CCT,P,CCT,,,,CCT,P,CCT +CCC,P,0,AGT,ACC GCC TCC,TSA,D,DCC,ACC GCC TCC,TAS,DCC,False,,,DCC,ACC GCC TCC,TAS,DCC,ACC GCC TCC,TAS,DCC +CCC,P,1,AGT,CAC CGC CTC,RHL,D,CDC,CAC CGC CTC,HRL,CDC,False,,,CDC,CAC CGC CTC,HRL,CDC,CAC CGC CTC,HRL,CDC +CCC,P,2,,,,C,CCC,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCG,P,0,AGT,ACG GCG TCG,TSA,D,DCG,ACG GCG TCG,TAS,DCG,False,,,DCG,ACG GCG TCG,TAS,DCG,ACG GCG TCG,TAS,DCG +CCG,P,1,AGT,CAG CGG CTG,RQL,D,CDG,CAG CGG CTG,QRL,CDG,False,,,CDG,CAG CGG CTG,QRL,CDG,CAG CGG CTG,QRL,CDG 
+CCG,P,2,,,,G,CCG,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCT,P,0,AGT,ACT GCT TCT,TSA,D,DCT,ACT GCT TCT,TAS,DCT,False,,,DCT,ACT GCT TCT,TAS,DCT,ACT GCT TCT,TAS,DCT +CCT,P,1,AGT,CAT CGT CTT,RHL,D,CDT,CAT CGT CTT,HRL,CDT,False,,,CDT,CAT CGT CTT,HRL,CDT,CAT CGT CTT,HRL,CDT +CCT,P,2,,,,T,CCT,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,True,AGA GGA TGA,RG*,DGA,GGA,G,GGA,AGA GGA,RG,RGA +CGA,R,1,ACT,CAA CCA CTA,LQP,H,CHA,CAA CCA CTA,QPL,CHA,False,,,CHA,CAA CCA CTA,QPL,CHA,CAA CCA CTA,QPL,CHA +CGA,R,2,,,,A,CGA,,,,False,,,,,,,,, +CGC,R,0,AGT,AGC GGC TGC,SCG,D,DGC,AGC GGC TGC,SGC,DGC,False,,,DGC,AGC GGC TGC,SGC,DGC,AGC GGC TGC,SGC,DGC +CGC,R,1,ACT,CAC CCC CTC,LHP,H,CHC,CAC CCC CTC,HPL,CHC,False,,,CHC,CAC CCC CTC,HPL,CHC,CAC CCC CTC,HPL,CHC +CGC,R,2,,,,C,CGC,,,,True,CGT,R,CGT,,,,CGT,R,CGT +CGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,True,AGG GGG TGG,RGW,DGG,GGG TGG,GW,KGG,AGG GGG TGG,RGW,DGG +CGG,R,1,ACT,CAG CCG CTG,LQP,H,CHG,CAG CCG CTG,QPL,CHG,False,,,CHG,CAG CCG CTG,QPL,CHG,CAG CCG CTG,QPL,CHG +CGG,R,2,,,,G,CGG,,,,False,,,,,,,,, +CGT,R,0,AGT,AGT GGT TGT,SCG,D,DGT,AGT GGT TGT,SGC,DGT,False,,,DGT,AGT GGT TGT,SGC,DGT,AGT GGT TGT,SGC,DGT +CGT,R,1,ACT,CAT CCT CTT,LHP,H,CHT,CAT CCT CTT,HPL,CHT,False,,,CHT,CAT CCT CTT,HPL,CHT,CAT CCT CTT,HPL,CHT +CGT,R,2,,,,T,CGT,,,,True,CGA,R,CGA,,,,CGA,R,CGA +CTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,True,ATA GTA TTA,IVL,DTA,ATA GTA,IV,RTA,ATA GTA TTA,IVL,DTA +CTA,L,1,ACG,CAA CCA CGA,RQP,V,CVA,CAA CCA CGA,QPR,CVA,False,,,CVA,CAA CCA CGA,QPR,CVA,CAA CCA CGA,QPR,CVA +CTA,L,2,,,,A,CTA,,,,False,,,,,,,,, +CTC,L,0,AGT,ATC GTC TTC,FIV,D,DTC,ATC GTC TTC,IVF,DTC,False,,,DTC,ATC GTC TTC,IVF,DTC,ATC GTC TTC,IVF,DTC +CTC,L,1,ACG,CAC CCC CGC,RHP,V,CVC,CAC CCC CGC,HPR,CVC,False,,,CVC,CAC CCC CGC,HPR,CVC,CAC CCC CGC,HPR,CVC +CTC,L,2,,,,C,CTC,,,,True,CTA,L,CTA,,,,CTA,L,CTA +CTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,True,ATG GTG TTG,MVL,DTG,ATG GTG,MV,RTG,ATG GTG TTG,MVL,DTG +CTG,L,1,ACG,CAG CCG CGG,RQP,V,CVG,CAG CCG 
CGG,QPR,CVG,False,,,CVG,CAG CCG CGG,QPR,CVG,CAG CCG CGG,QPR,CVG +CTG,L,2,,,,G,CTG,,,,False,,,,,,,,, +CTT,L,0,AGT,ATT GTT TTT,FIV,D,DTT,ATT GTT TTT,IVF,DTT,False,,,DTT,ATT GTT TTT,IVF,DTT,ATT GTT TTT,IVF,DTT +CTT,L,1,ACG,CAT CCT CGT,RHP,V,CVT,CAT CCT CGT,HPR,CVT,False,,,CVT,CAT CCT CGT,HPR,CVT,CAT CCT CGT,HPR,CVT +CTT,L,2,,,,T,CTT,,,,True,CTA,L,CTA,,,,CTA,L,CTA +GAA,E,0,ACT,AAA CAA TAA,*QK,H,HAA,AAA CAA TAA,KQ*,HAA,False,,,HAA,AAA CAA,KQ,MAA,AAA CAA,KQ,MAA +GAA,E,1,CGT,GCA GGA GTA,VAG,B,GBA,GCA GGA GTA,AGV,GBA,False,,,GBA,GCA GGA GTA,AGV,GBA,GCA GGA GTA,AGV,GBA +GAA,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAG GAT,ED,GAK,GAT,D,GAT,GAG GAT,ED,GAK +GAC,D,0,ACT,AAC CAC TAC,NHY,H,HAC,AAC CAC TAC,NHY,HAC,False,,,HAC,AAC CAC TAC,NHY,HAC,AAC CAC TAC,NHY,HAC +GAC,D,1,CGT,GCC GGC GTC,VAG,B,GBC,GCC GGC GTC,AGV,GBC,False,,,GBC,GCC GGC GTC,AGV,GBC,GCC GGC GTC,AGV,GBC +GAC,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAT,ED,GAW,GAA,E,GAA,GAA GAT,ED,GAW +GAG,E,0,ACT,AAG CAG TAG,*QK,H,HAG,AAG CAG TAG,KQ*,HAG,False,,,HAG,AAG CAG,KQ,MAG,AAG CAG,KQ,MAG +GAG,E,1,CGT,GCG GGG GTG,VAG,B,GBG,GCG GGG GTG,AGV,GBG,False,,,GBG,GCG GGG GTG,AGV,GBG,GCG GGG GTG,AGV,GBG +GAG,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAA GAT,ED,GAW,GAT,D,GAT,GAA GAT,ED,GAW +GAT,D,0,ACT,AAT CAT TAT,NHY,H,HAT,AAT CAT TAT,NHY,HAT,False,,,HAT,AAT CAT TAT,NHY,HAT,AAT CAT TAT,NHY,HAT +GAT,D,1,CGT,GCT GGT GTT,VAG,B,GBT,GCT GGT GTT,AGV,GBT,False,,,GBT,GCT GGT GTT,AGV,GBT,GCT GGT GTT,AGV,GBT +GAT,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAC,ED,GAM,GAA,E,GAA,GAA GAC,ED,GAM +GCA,A,0,ACT,ACA CCA TCA,TSP,H,HCA,ACA CCA TCA,TPS,HCA,False,,,HCA,ACA CCA TCA,TPS,HCA,ACA CCA TCA,TPS,HCA +GCA,A,1,AGT,GAA GGA GTA,GVE,D,GDA,GAA GGA GTA,EGV,GDA,False,,,GDA,GAA GGA GTA,EGV,GDA,GAA GGA GTA,EGV,GDA +GCA,A,2,,,,A,GCA,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCC,A,0,ACT,ACC CCC TCC,TSP,H,HCC,ACC CCC TCC,TPS,HCC,False,,,HCC,ACC CCC TCC,TPS,HCC,ACC CCC TCC,TPS,HCC +GCC,A,1,AGT,GAC GGC GTC,VDG,D,GDC,GAC GGC GTC,DGV,GDC,False,,,GDC,GAC GGC GTC,DGV,GDC,GAC GGC 
GTC,DGV,GDC +GCC,A,2,,,,C,GCC,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCG,A,0,ACT,ACG CCG TCG,TSP,H,HCG,ACG CCG TCG,TPS,HCG,False,,,HCG,ACG CCG TCG,TPS,HCG,ACG CCG TCG,TPS,HCG +GCG,A,1,AGT,GAG GGG GTG,GVE,D,GDG,GAG GGG GTG,EGV,GDG,False,,,GDG,GAG GGG GTG,EGV,GDG,GAG GGG GTG,EGV,GDG +GCG,A,2,,,,G,GCG,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCT,A,0,ACT,ACT CCT TCT,TSP,H,HCT,ACT CCT TCT,TPS,HCT,False,,,HCT,ACT CCT TCT,TPS,HCT,ACT CCT TCT,TPS,HCT +GCT,A,1,AGT,GAT GGT GTT,VDG,D,GDT,GAT GGT GTT,DGV,GDT,False,,,GDT,GAT GGT GTT,DGV,GDT,GAT GGT GTT,DGV,GDT +GCT,A,2,,,,T,GCT,,,,True,GCA,A,GCA,,,,GCA,A,GCA +GGA,G,0,ACT,AGA CGA TGA,R*,H,HGA,AGA TGA,R*,WGA,False,,,WGA,AGA,R,AGA,AGA,R,AGA +GGA,G,1,ACT,GAA GCA GTA,VAE,H,GHA,GAA GCA GTA,EAV,GHA,False,,,GHA,GAA GCA GTA,EAV,GHA,GAA GCA GTA,EAV,GHA +GGA,G,2,,,,A,GGA,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGC,G,0,ACT,AGC CGC TGC,RSC,H,HGC,AGC CGC TGC,SRC,HGC,False,,,HGC,AGC CGC TGC,SRC,HGC,AGC CGC TGC,SRC,HGC +GGC,G,1,ACT,GAC GCC GTC,ADV,H,GHC,GAC GCC GTC,DAV,GHC,False,,,GHC,GAC GCC GTC,DAV,GHC,GAC GCC GTC,DAV,GHC +GGC,G,2,,,,C,GGC,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGG,G,0,ACT,AGG CGG TGG,RW,H,HGG,AGG TGG,RW,WGG,False,,,WGG,AGG TGG,RW,WGG,AGG TGG,RW,WGG +GGG,G,1,ACT,GAG GCG GTG,VAE,H,GHG,GAG GCG GTG,EAV,GHG,False,,,GHG,GAG GCG GTG,EAV,GHG,GAG GCG GTG,EAV,GHG +GGG,G,2,,,,G,GGG,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGT,G,0,ACT,AGT CGT TGT,RSC,H,HGT,AGT CGT TGT,SRC,HGT,False,,,HGT,AGT CGT TGT,SRC,HGT,AGT CGT TGT,SRC,HGT +GGT,G,1,ACT,GAT GCT GTT,ADV,H,GHT,GAT GCT GTT,DAV,GHT,False,,,GHT,GAT GCT GTT,DAV,GHT,GAT GCT GTT,DAV,GHT +GGT,G,2,,,,T,GGT,,,,True,GGA,G,GGA,,,,GGA,G,GGA +GTA,V,0,ACT,ATA CTA TTA,LI,H,HTA,ATA TTA,IL,WTA,False,,,WTA,ATA TTA,IL,WTA,ATA TTA,IL,WTA +GTA,V,1,ACG,GAA GCA GGA,GAE,V,GVA,GAA GCA GGA,EAG,GVA,False,,,GVA,GAA GCA GGA,EAG,GVA,GAA GCA GGA,EAG,GVA +GTA,V,2,,,,A,GTA,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTC,V,0,ACT,ATC CTC TTC,LFI,H,HTC,ATC CTC TTC,ILF,HTC,False,,,HTC,ATC CTC TTC,ILF,HTC,ATC CTC TTC,ILF,HTC +GTC,V,1,ACG,GAC GCC GGC,ADG,V,GVC,GAC GCC 
GGC,DAG,GVC,False,,,GVC,GAC GCC GGC,DAG,GVC,GAC GCC GGC,DAG,GVC +GTC,V,2,,,,C,GTC,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTG,V,0,ACT,ATG CTG TTG,LM,H,HTG,ATG TTG,ML,WTG,False,,,WTG,ATG TTG,ML,WTG,ATG TTG,ML,WTG +GTG,V,1,ACG,GAG GCG GGG,GAE,V,GVG,GAG GCG GGG,EAG,GVG,False,,,GVG,GAG GCG GGG,EAG,GVG,GAG GCG GGG,EAG,GVG +GTG,V,2,,,,G,GTG,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTT,V,0,ACT,ATT CTT TTT,LFI,H,HTT,ATT CTT TTT,ILF,HTT,False,,,HTT,ATT CTT TTT,ILF,HTT,ATT CTT TTT,ILF,HTT +GTT,V,1,ACG,GAT GCT GGT,ADG,V,GVT,GAT GCT GGT,DAG,GVT,False,,,GVT,GAT GCT GGT,DAG,GVT,GAT GCT GGT,DAG,GVT +GTT,V,2,,,,T,GTT,,,,True,GTA,V,GTA,,,,GTA,V,GTA +TAA,*,0,ACG,AAA CAA GAA,QEK,V,VAA,AAA CAA GAA,KQE,VAA,False,,,VAA,AAA CAA GAA,KQE,VAA,AAA CAA GAA,KQE,VAA +TAA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,False,,,TYA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TAA,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAG TAT,*Y,TAK,TAT,Y,TAT,TAT,Y,TAT +TAC,Y,0,ACG,AAC CAC GAC,NHD,V,VAC,AAC CAC GAC,NHD,VAC,False,,,VAC,AAC CAC GAC,NHD,VAC,AAC CAC GAC,NHD,VAC +TAC,Y,1,CGT,TCC TGC TTC,SFC,B,TBC,TCC TGC TTC,SCF,TBC,False,,,TBC,TCC TGC TTC,SCF,TBC,TCC TGC TTC,SCF,TBC +TAC,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAT,*Y,TAW,,,,TAT,Y,TAT +TAG,*,0,ACG,AAG CAG GAG,QEK,V,VAG,AAG CAG GAG,KQE,VAG,False,,,VAG,AAG CAG GAG,KQE,VAG,AAG CAG GAG,KQE,VAG +TAG,*,1,CGT,TCG TGG TTG,LSW,B,TBG,TCG TGG TTG,SWL,TBG,False,,,TBG,TCG TGG TTG,SWL,TBG,TCG TGG TTG,SWL,TBG +TAG,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAA TAT,*Y,TAW,TAT,Y,TAT,TAT,Y,TAT +TAT,Y,0,ACG,AAT CAT GAT,NHD,V,VAT,AAT CAT GAT,NHD,VAT,False,,,VAT,AAT CAT GAT,NHD,VAT,AAT CAT GAT,NHD,VAT +TAT,Y,1,CGT,TCT TGT TTT,SFC,B,TBT,TCT TGT TTT,SCF,TBT,False,,,TBT,TCT TGT TTT,SCF,TBT,TCT TGT TTT,SCF,TBT +TAT,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAC,*Y,TAM,,,,TAC,Y,TAC +TCA,S,0,ACG,ACA CCA GCA,TAP,V,VCA,ACA CCA GCA,TPA,VCA,False,,,VCA,ACA CCA GCA,TPA,VCA,ACA CCA GCA,TPA,VCA +TCA,S,1,AGT,TAA TGA TTA,L*,D,TDA,TAA TTA,*L,TWA,False,,,TWA,TTA,L,TTA,TTA,L,TTA +TCA,S,2,,,,A,TCA,,,,True,TCT,S,TCT,,,,TCT,S,TCT 
+TCC,S,0,ACG,ACC CCC GCC,TAP,V,VCC,ACC CCC GCC,TPA,VCC,False,,,VCC,ACC CCC GCC,TPA,VCC,ACC CCC GCC,TPA,VCC +TCC,S,1,AGT,TAC TGC TTC,FCY,D,TDC,TAC TGC TTC,YCF,TDC,False,,,TDC,TAC TGC TTC,YCF,TDC,TAC TGC TTC,YCF,TDC +TCC,S,2,,,,C,TCC,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCG,S,0,ACG,ACG CCG GCG,TAP,V,VCG,ACG CCG GCG,TPA,VCG,False,,,VCG,ACG CCG GCG,TPA,VCG,ACG CCG GCG,TPA,VCG +TCG,S,1,AGT,TAG TGG TTG,L*W,D,TDG,TAG TGG TTG,*WL,TDG,False,,,TDG,TGG TTG,WL,TKG,TGG TTG,WL,TKG +TCG,S,2,,,,G,TCG,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCT,S,0,ACG,ACT CCT GCT,TAP,V,VCT,ACT CCT GCT,TPA,VCT,False,,,VCT,ACT CCT GCT,TPA,VCT,ACT CCT GCT,TPA,VCT +TCT,S,1,AGT,TAT TGT TTT,FCY,D,TDT,TAT TGT TTT,YCF,TDT,False,,,TDT,TAT TGT TTT,YCF,TDT,TAT TGT TTT,YCF,TDT +TCT,S,2,,,,T,TCT,,,,True,TCA,S,TCA,,,,TCA,S,TCA +TGA,*,0,ACG,AGA CGA GGA,RG,V,VGA,AGA GGA,RG,RGA,False,,,RGA,AGA GGA,RG,RGA,AGA GGA,RG,RGA +TGA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,True,TAA TCA TTA,*SL,THA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TGA,*,2,CGT,TGC TGG TGT,CW,B,TGB,TGT TGG,CW,TGK,False,,,TGK,TGT TGG,CW,TGK,TGT TGG,CW,TGK +TGC,C,0,ACG,AGC CGC GGC,RSG,V,VGC,CGC GGC,RG,SGC,False,,,SGC,CGC GGC,RG,SGC,CGC GGC,RG,SGC +TGC,C,1,ACT,TAC TCC TTC,SFY,H,THC,TAC TCC TTC,YSF,THC,False,,,THC,TAC TCC TTC,YSF,THC,TAC TCC TTC,YSF,THC +TGC,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGG TGT,*WC,TGD,TGG,W,TGG,TGG TGT,WC,TGK +TGG,W,0,ACG,AGG CGG GGG,RG,V,VGG,AGG GGG,RG,RGG,False,,,RGG,AGG GGG,RG,RGG,AGG GGG,RG,RGG +TGG,W,1,ACT,TAG TCG TTG,L*S,H,THG,TAG TCG TTG,*SL,THG,False,,,THG,TCG TTG,SL,TYG,TCG TTG,SL,TYG +TGG,W,2,ACT,TGA TGC TGT,*C,H,TGH,TGA TGC,*C,TGM,True,TGC TGG TGT,CWC,TGB,TGC,C,TGC,TGC TGG TGT,CWC,TGB +TGT,C,0,ACG,AGT CGT GGT,RSG,V,VGT,CGT GGT,RG,SGT,False,,,SGT,CGT GGT,RG,SGT,CGT GGT,RG,SGT +TGT,C,1,ACT,TAT TCT TTT,SFY,H,THT,TAT TCT TTT,YSF,THT,False,,,THT,TAT TCT TTT,YSF,THT,TAT TCT TTT,YSF,THT +TGT,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGC TGG,*CW,TGV,TGG,W,TGG,TGC TGG,CW,TGS +TTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,False,,,RTA,ATA 
GTA,IV,RTA,ATA GTA,IV,RTA +TTA,L,1,ACG,TAA TCA TGA,*S,V,TVA,TAA TCA,*S,TMA,False,,,TMA,TCA,S,TCA,TCA,S,TCA +TTA,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTG TTT,LF,TTK,TTT,F,TTT,TTG TTT,LF,TTK +TTC,F,0,ACG,ATC CTC GTC,LIV,V,VTC,ATC GTC,IV,RTC,False,,,RTC,ATC GTC,IV,RTC,ATC GTC,IV,RTC +TTC,F,1,ACG,TAC TCC TGC,SCY,V,TVC,TAC TCC TGC,YSC,TVC,False,,,TVC,TAC TCC TGC,YSC,TVC,TAC TCC TGC,YSC,TVC +TTC,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTG TTT,LF,TTK,TTG,L,TTG,TTG TTT,LF,TTK +TTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,False,,,RTG,ATG GTG,MV,RTG,ATG GTG,MV,RTG +TTG,L,1,ACG,TAG TCG TGG,*SW,V,TVG,TAG TCG TGG,*SW,TVG,False,,,TVG,TCG TGG,SW,TSG,TCG TGG,SW,TSG +TTG,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTA TTT,LF,TTW,TTT,F,TTT,TTA TTT,LF,TTW +TTT,F,0,ACG,ATT CTT GTT,LIV,V,VTT,ATT GTT,IV,RTT,False,,,RTT,ATT GTT,IV,RTT,ATT GTT,IV,RTT +TTT,F,1,ACG,TAT TCT TGT,SCY,V,TVT,TAT TCT TGT,YSC,TVT,False,,,TVT,TAT TCT TGT,YSC,TVT,TAT TCT TGT,YSC,TVT +TTT,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTC TTG,FL,TTS,TTG,L,TTG,TTC TTG,FL,TTS diff --git a/data/missense_codon_table.ipynb b/data/missense_codon_table.ipynb new file mode 100644 index 0000000..59a21fe --- /dev/null +++ b/data/missense_codon_table.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "83691daa-92cf-4438-ba48-d4eeb44cc597", + "metadata": {}, + "outputs": [], + "source": [ + "# make missense codon tables\n", + "\n", + "# this uses a corrected function to make missense variants\n", + "\n", + "from itertools import product\n", + "from Bio.Seq import Seq\n", + "import pandas as pd\n", + "\n", + "iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'}\n", + "\n", + "\n", + "# AA LEVEL (64 entries)\n", + "nucleotides = 'ACGT'\n", + "\n", + "codon_list = []\n", + "aa_list = []\n", + "iupac_codon_list = []\n", + "missense_aa_list = []\n", + "\n", + "\n", + "for codon in list(''.join(w) for w 
in product(nucleotides, repeat=3)):\n", + " aa = Seq(codon).translate(table='Standard')[0]\n", + " new_aas = []\n", + " iupac_codon = []\n", + " \n", + " # loop through each position in codon\n", + " for position in range(3):\n", + " iupac_n = []\n", + " \n", + " # looping through each nucleotide\n", + " for n in nucleotides:\n", + " new_codon = codon[:position] + n + codon[position + 1:]\n", + " new_aa = Seq(new_codon).translate(table='Standard')[0]\n", + " if new_aa != aa:\n", + " new_aas.append(new_aa)\n", + " iupac_n.append(n)\n", + " else:\n", + " continue\n", + " \n", + " # iupac symbols\n", + " if not iupac_n: # check if the list is empty\n", + " iupac_codon.append(codon[position])\n", + " else:\n", + " for i in iupac_dict.keys():\n", + " if set(i) == set(iupac_n):\n", + " iupac_codon.append(iupac_dict[i])\n", + " \n", + " # make assignments here\n", + " codon_list.append(codon)\n", + " aa_list.append(aa)\n", + " iupac_codon_list.append(''.join(iupac_codon))\n", + " missense_aa_list.append(''.join(set(new_aas)))\n", + "\n", + "# create AA Level dictionary\n", + "aa_level_dict = {\n", + " 'codon':codon_list, \n", + " 'aa':aa_list, \n", + " 'iupac_codon':iupac_codon_list,\n", + " 'missense_aa':missense_aa_list \n", + "}\n", + "\n", + "df = pd.DataFrame.from_dict(aa_level_dict)\n", + "df.to_csv('data/aa_missense_table.csv', index=False)\n", + "\n", + "# CODON LEVEL\n", + "# df: codon, aa, position, nucleotides, missense_codons, missense_aa, iupac, iupac_codon\n", + "nucleotides = 'ACGT'\n", + "\n", + "codon_list = []\n", + "aa_list = []\n", + "position_list = []\n", + "nucleotides_list = []\n", + "missense_codons_list = []\n", + "missense_aa_list = []\n", + "iupac_list = []\n", + "iupac_codon_list = []\n", + "\n", + "for codon in list(''.join(w) for w in product(nucleotides, repeat=3)):\n", + " aa = Seq(codon).translate(table='Standard')[0]\n", + " \n", + " # loop through each position in codon\n", + " for position in range(3): \n", + " new_aas = []\n", + " 
iupac_n = []\n", + " new_codons = []\n", + " # looping through each nucleotide\n", + " for i in nucleotides:\n", + " new_codon = codon[:position] + i + codon[position + 1:]\n", + " new_aa = Seq(new_codon).translate(table='Standard')[0]\n", + " if not new_aa == aa:\n", + " new_aas.append(new_aa)\n", + " iupac_n.append(i)\n", + " new_codons.append(new_codon)\n", + " else:\n", + " continue\n", + " \n", + " #iupac\n", + " if not iupac_n: # check if iupac_n is empty\n", + " iupac = codon[position]\n", + " else:\n", + " for i in iupac_dict.keys():\n", + " if set(i) == set(iupac_n):\n", + " iupac = iupac_dict[i]\n", + " \n", + " # make assignments here\n", + " codon_list.append(codon)\n", + " aa_list.append(aa)\n", + " position_list.append(position)\n", + " nucleotides_list.append(''.join(iupac_n))\n", + " missense_codons_list.append(' '.join(new_codons))\n", + " missense_aa_list.append(''.join(set(new_aas)))\n", + " \n", + " \n", + " for i in iupac_dict.keys():\n", + " if set(i) == set(iupac_n):\n", + " iupac = iupac_dict[i]\n", + " iupac_list.append(iupac)\n", + " iupac_codon_list.append(codon[:position] + iupac + codon[position + 1:])\n", + "\n", + "# create dictionary\n", + "codon_level_dict = {\n", + " 'codon':codon_list, \n", + " 'aa':aa_list, \n", + " 'position':position_list,\n", + " 'missense_nucleotides':nucleotides_list,\n", + " 'missense_codons':missense_codons_list,\n", + " 'missense_aa':missense_aa_list,\n", + " 'iupac':iupac_list,\n", + " 'iupac_codon':iupac_codon_list\n", + "}\n", + "\n", + "df = pd.DataFrame.from_dict(codon_level_dict)\n", + "df.to_csv('data/codon_missense_table.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": 
"ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/yeast_synonymous_codon_table.csv b/data/yeast_synonymous_codon_table.csv similarity index 100% rename from yeast_synonymous_codon_table.csv rename to data/yeast_synonymous_codon_table.csv diff --git a/dms_codon_table_v2.csv b/dms_codon_table_v2.csv deleted file mode 100644 index 621de68..0000000 --- a/dms_codon_table_v2.csv +++ /dev/null @@ -1,193 +0,0 @@ -codon,aa,position,all_missense_aa,all_missense_nucleotides,all_missense_codons,all_iupac,all_iupac_codon,sele_missense_codons,syn_iupac_codon,syn_codon,sele_missense_nucleotides,sele_iupac,sele_iupac_codon,sele_aa,syn_aa,syn_missense_codons -AAA,K,0,*QE,CGT,CAA GAA TAA,B,BAA,CAA GAA TAA,BAA,,CGT,B,BAA,*QE,*QE,CAA GAA TAA -AAA,K,1,TRI,CGT,ACA AGA ATA,B,ABA,ACA AGA ATA,ABA,,CGT,B,ABA,RTI,RTI,ACA AGA ATA -AAA,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAK,AAG,T,T,AAT,KN,KN,AAG AAT -AAC,N,0,HDY,CGT,CAC GAC TAC,B,BAC,CAC GAC TAC,BAC,,CGT,B,BAC,DYH,DYH,CAC GAC TAC -AAC,N,1,TSI,CGT,ACC AGC ATC,B,ABC,ACC AGC ATC,ABC,,CGT,B,ABC,STI,STI,ACC AGC ATC -AAC,N,2,K,AG,AAA AAG,R,AAR,AAA,AAW,AAT,A,A,AAA,KN,KN,AAA AAT -AAG,K,0,*QE,CGT,CAG GAG TAG,B,BAG,CAG GAG TAG,BAG,,CGT,B,BAG,*QE,*QE,CAG GAG TAG -AAG,K,1,TRM,CGT,ACG AGG ATG,B,ABG,ACG AGG ATG,ABG,,CGT,B,ABG,RTM,RTM,ACG AGG ATG -AAG,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAW,AAA,T,T,AAT,KN,KN,AAA AAT -AAT,N,0,HDY,CGT,CAT GAT TAT,B,BAT,CAT GAT TAT,BAT,,CGT,B,BAT,DYH,DYH,CAT GAT TAT -AAT,N,1,TSI,CGT,ACT AGT ATT,B,ABT,ACT AGT ATT,ABT,,CGT,B,ABT,STI,STI,ACT AGT ATT -AAT,N,2,K,AG,AAA AAG,R,AAR,AAA,AAM,AAC,A,A,AAA,KN,KN,AAA AAC -ACA,T,0,SAP,CGT,CCA GCA TCA,B,BCA,CCA GCA TCA,BCA,,CGT,B,BCA,ASP,ASP,CCA GCA TCA -ACA,T,1,RIK,AGT,AAA AGA ATA,D,ADA,AAA AGA ATA,ADA,,AGT,D,ADA,KRI,KRI,AAA AGA ATA -ACA,T,2,,,,A,ACA,,ACT,,,,,T,T,ACT -ACC,T,0,SAP,CGT,CCC GCC TCC,B,BCC,CCC GCC TCC,BCC,,CGT,B,BCC,ASP,ASP,CCC GCC TCC -ACC,T,1,NSI,AGT,AAC AGC ATC,D,ADC,AAC ATC,ADC,,AT,W,AWC,NSI,NSI,AAC AGC ATC -ACC,T,2,,,,C,ACC,,ACT,,,,,T,T,ACT 
-ACG,T,0,SAP,CGT,CCG GCG TCG,B,BCG,CCG GCG TCG,BCG,,CGT,B,BCG,ASP,ASP,CCG GCG TCG -ACG,T,1,RMK,AGT,AAG AGG ATG,D,ADG,AAG AGG ATG,ADG,,AGT,D,ADG,KMR,KMR,AAG AGG ATG -ACG,T,2,,,,G,ACG,,ACT,,,,,T,T,ACT -ACT,T,0,SAP,CGT,CCT GCT TCT,B,BCT,CCT GCT TCT,BCT,,CGT,B,BCT,ASP,ASP,CCT GCT TCT -ACT,T,1,NSI,AGT,AAT AGT ATT,D,ADT,AAT ATT,ADT,,AT,W,AWT,NSI,NSI,AAT AGT ATT -ACT,T,2,,,,T,ACT,,ACA,,,,,T,T,ACA -AGA,R,0,*G,GT,GGA TGA,K,KGA,GGA TGA,KGA,,GT,K,KGA,*G,*G,GGA TGA -AGA,R,1,TIK,ACT,AAA ACA ATA,H,AHA,AAA ACA ATA,AHA,,ACT,H,AHA,KTI,KTI,AAA ACA ATA -AGA,R,2,S,CT,AGC AGT,Y,AGY,AGT,AGK,AGG,T,T,AGT,RS,RS,AGG AGT -AGC,S,0,RCG,CGT,CGC GGC TGC,B,BGC,GGC TGC,BGC,,GT,K,KGC,RCG,RCG,CGC GGC TGC -AGC,S,1,NTI,ACT,AAC ACC ATC,H,AHC,AAC ACC ATC,AHC,,ACT,H,AHC,NTI,NTI,AAC ACC ATC -AGC,S,2,R,AG,AGA AGG,R,AGR,AGA,AGW,AGT,A,A,AGA,RS,RS,AGA AGT -AGG,R,0,WG,GT,GGG TGG,K,KGG,GGG TGG,KGG,,GT,K,KGG,WG,WG,GGG TGG -AGG,R,1,TMK,ACT,AAG ACG ATG,H,AHG,AAG ACG ATG,AHG,,ACT,H,AHG,KTM,KTM,AAG ACG ATG -AGG,R,2,S,CT,AGC AGT,Y,AGY,AGT,AGW,AGA,T,T,AGT,RS,RS,AGA AGT -AGT,S,0,RCG,CGT,CGT GGT TGT,B,BGT,GGT TGT,BGT,,GT,K,KGT,RCG,RCG,CGT GGT TGT -AGT,S,1,NTI,ACT,AAT ACT ATT,H,AHT,AAT ACT ATT,AHT,,ACT,H,AHT,NTI,NTI,AAT ACT ATT -AGT,S,2,R,AG,AGA AGG,R,AGR,AGA,AGM,AGC,A,A,AGA,RS,RS,AGA AGC -ATA,I,0,LV,CGT,CTA GTA TTA,B,BTA,TTA GTA,BTA,,TG,K,KTA,VL,VL,CTA GTA TTA -ATA,I,1,TRK,ACG,AAA ACA AGA,V,AVA,AAA ACA AGA,AVA,,ACG,V,AVA,KTR,KTR,AAA ACA AGA -ATA,I,2,M,G,ATG,G,ATG,ATG,ATK,ATT,G,G,ATG,MI,MI,ATG ATT -ATC,I,0,LFV,CGT,CTC GTC TTC,B,BTC,CTC GTC TTC,BTC,,CGT,B,BTC,VFL,VFL,CTC GTC TTC -ATC,I,1,NST,ACG,AAC ACC AGC,V,AVC,AAC ACC AGC,AVC,,ACG,V,AVC,NTS,NTS,AAC ACC AGC -ATC,I,2,M,G,ATG,G,ATG,ATG,ATK,ATT,G,G,ATG,MI,MI,ATG ATT -ATG,M,0,LV,CGT,CTG GTG TTG,B,BTG,TTA GTA,BTG,,TG,K,KTG,VL,VL,CTG GTG TTG -ATG,M,1,TRK,ACG,AAG ACG AGG,V,AVG,AAG ACG AGG,AVG,,ACG,V,AVG,KTR,KTR,AAG ACG AGG -ATG,M,2,I,ACT,ATA ATC ATT,H,ATH,ATT,ATK,ATG,T,T,ATT,MI,MI,ATG ATT -ATT,I,0,LFV,CGT,CTT GTT TTT,B,BTT,CTT GTT TTT,BTT,,CGT,B,BTT,VFL,VFL,CTT GTT TTT 
-ATT,I,1,NST,ACG,AAT ACT AGT,V,AVT,AAT ACT AGT,AVT,,ACG,V,AVT,NTS,NTS,AAT ACT AGT -ATT,I,2,M,G,ATG,G,ATG,ATG,ATR,ATA,G,G,ATG,MI,MI,ATA ATG -CAA,Q,0,*EK,AGT,AAA GAA TAA,D,DAA,AAA GAA TAA,DAA,,AGT,D,DAA,K*E,K*E,AAA GAA TAA -CAA,Q,1,RLP,CGT,CCA CGA CTA,B,CBA,CCA CGA CTA,CBA,,CGT,B,CBA,RLP,RLP,CCA CGA CTA -CAA,Q,2,H,CT,CAC CAT,Y,CAY,CAT,CAK,CAG,T,T,CAT,QH,QH,CAG CAT -CAC,H,0,NDY,AGT,AAC GAC TAC,D,DAC,AAC GAC TAC,DAC,,AGT,D,DAC,NDY,NDY,AAC GAC TAC -CAC,H,1,RLP,CGT,CCC CGC CTC,B,CBC,CCC CGC CTC,CBC,,CGT,B,CBC,RLP,RLP,CCC CGC CTC -CAC,H,2,Q,AG,CAA CAG,R,CAR,CAA,CAW,CAT,A,A,CAA,QH,QH,CAA CAT -CAG,Q,0,*EK,AGT,AAG GAG TAG,D,DAG,AAG GAG TAG,DAG,,AGT,D,DAG,K*E,K*E,AAG GAG TAG -CAG,Q,1,RLP,CGT,CCG CGG CTG,B,CBG,CCG CGG CTG,CBG,,CGT,B,CBG,RLP,RLP,CCG CGG CTG -CAG,Q,2,H,CT,CAC CAT,Y,CAY,CAT,CAW,CAA,T,T,CAT,QH,QH,CAA CAT -CAT,H,0,NDY,AGT,AAT GAT TAT,D,DAT,AAT GAT TAT,DAT,,AGT,D,DAT,NDY,NDY,AAT GAT TAT -CAT,H,1,RLP,CGT,CCT CGT CTT,B,CBT,CCT CGT CTT,CBT,,CGT,B,CBT,RLP,RLP,CCT CGT CTT -CAT,H,2,Q,AG,CAA CAG,R,CAR,CAA,CAM,CAC,A,A,CAA,QH,QH,CAA CAC -CCA,P,0,TSA,AGT,ACA GCA TCA,D,DCA,ACA GCA TCA,DCA,,AGT,D,DCA,ATS,ATS,ACA GCA TCA -CCA,P,1,RQL,AGT,CAA CGA CTA,D,CDA,CAA CGA CTA,CDA,,AGT,D,CDA,RQL,RQL,CAA CGA CTA -CCA,P,2,,,,A,CCA,,CCT,,,,,P,P,CCT -CCC,P,0,TSA,AGT,ACC GCC TCC,D,DCC,ACC GCC TCC,DCC,,AGT,D,DCC,ATS,ATS,ACC GCC TCC -CCC,P,1,RHL,AGT,CAC CGC CTC,D,CDC,CAC CGC CTC,CDC,,AGT,D,CDC,RLH,RLH,CAC CGC CTC -CCC,P,2,,,,C,CCC,,CCA,,,,,P,P,CCA -CCG,P,0,TSA,AGT,ACG GCG TCG,D,DCG,ACG GCG TCG,DCG,,AGT,D,DCG,ATS,ATS,ACG GCG TCG -CCG,P,1,RQL,AGT,CAG CGG CTG,D,CDG,CAG CGG CTG,CDG,,AGT,D,CDG,RQL,RQL,CAG CGG CTG -CCG,P,2,,,,G,CCG,,CCA,,,,,P,P,CCA -CCT,P,0,TSA,AGT,ACT GCT TCT,D,DCT,ACT GCT TCT,DCT,,AGT,D,DCT,ATS,ATS,ACT GCT TCT -CCT,P,1,RHL,AGT,CAT CGT CTT,D,CDT,CAT CGT CTT,CDT,,AGT,D,CDT,RLH,RLH,CAT CGT CTT -CCT,P,2,,,,T,CCT,,CCA,,,,,P,P,CCA -CGA,R,0,*G,GT,GGA TGA,K,KGA,GGA TGA,DGA,AGA,GT,K,KGA,R*G,R*G,AGA GGA TGA -CGA,R,1,LQP,ACT,CAA CCA CTA,H,CHA,CAA CCA CTA,CHA,,ACT,H,CHA,QLP,QLP,CAA CCA CTA 
-CGA,R,2,,,,A,CGA,,CGA,,,,,R,R,CGA -CGC,R,0,SCG,AGT,AGC GGC TGC,D,DGC,AGC GGC TGC,DGC,,AGT,D,DGC,CSG,CSG,AGC GGC TGC -CGC,R,1,LHP,ACT,CAC CCC CTC,H,CHC,CAC CCC CTC,CHC,,ACT,H,CHC,LPH,LPH,CAC CCC CTC -CGC,R,2,,,,C,CGC,,CGT,,,,,R,R,CGT -CGG,R,0,WG,GT,GGG TGG,K,KGG,GGG TGG,DGG,AGG,GT,K,KGG,RWG,RWG,AGG GGG TGG -CGG,R,1,LQP,ACT,CAG CCG CTG,H,CHG,CAG CCG CTG,CHG,,ACT,H,CHG,QLP,QLP,CAG CCG CTG -CGG,R,2,,,,G,CGG,,CGG,,,,,R,R,CGG -CGT,R,0,SCG,AGT,AGT GGT TGT,D,DGT,AGT GGT TGT,DGT,,AGT,D,DGT,CSG,CSG,AGT GGT TGT -CGT,R,1,LHP,ACT,CAT CCT CTT,H,CHT,CAT CCT CTT,CHT,,ACT,H,CHT,LPH,LPH,CAT CCT CTT -CGT,R,2,,,,T,CGT,,CGA,,,,,R,R,CGA -CTA,L,0,IV,AG,ATA GTA,R,RTA,ATA GTA,DTA,TTA,AG,R,RTA,VLI,VLI,ATA GTA TTA -CTA,L,1,RQP,ACG,CAA CCA CGA,V,CVA,CAA CCA CGA,CVA,,ACG,V,CVA,RQP,RQP,CAA CCA CGA -CTA,L,2,,,,A,CTA,,CTA,,,,,L,L,CTA -CTC,L,0,FIV,AGT,ATC GTC TTC,D,DTC,ATC GTC TTC,DTC,,AGT,D,DTC,VFI,VFI,ATC GTC TTC -CTC,L,1,RHP,ACG,CAC CCC CGC,V,CVC,CAC CCC CGC,CVC,,ACG,V,CVC,RPH,RPH,CAC CCC CGC -CTC,L,2,,,,C,CTC,,CTA,,,,,L,L,CTA -CTG,L,0,VM,AG,ATG GTG,R,RTG,ATG GTG,DTG,TTG,AG,R,RTG,MVL,MVL,ATG GTG TTG -CTG,L,1,RQP,ACG,CAG CCG CGG,V,CVG,CAG CCG CGG,CVG,,ACG,V,CVG,RQP,RQP,CAG CCG CGG -CTG,L,2,,,,G,CTG,,CTG,,,,,L,L,CTG -CTT,L,0,FIV,AGT,ATT GTT TTT,D,DTT,ATT GTT TTT,DTT,,AGT,D,DTT,VFI,VFI,ATT GTT TTT -CTT,L,1,RHP,ACG,CAT CCT CGT,V,CVT,CAT CCT CGT,CVT,,ACG,V,CVT,RPH,RPH,CAT CCT CGT -CTT,L,2,,,,T,CTT,,CTA,,,,,L,L,CTA -GAA,E,0,*QK,ACT,AAA CAA TAA,H,HAA,AAA CAA TAA,HAA,,ACT,H,HAA,K*Q,K*Q,AAA CAA TAA -GAA,E,1,VAG,CGT,GCA GGA GTA,B,GBA,GCA GGA GTA,GBA,,CGT,B,GBA,AVG,AVG,GCA GGA GTA -GAA,E,2,D,CT,GAC GAT,Y,GAY,GAT,GAK,GAG,T,T,GAT,DE,DE,GAG GAT -GAC,D,0,NHY,ACT,AAC CAC TAC,H,HAC,AAC CAC TAC,HAC,,ACT,H,HAC,NYH,NYH,AAC CAC TAC -GAC,D,1,VAG,CGT,GCC GGC GTC,B,GBC,GCC GGC GTC,GBC,,CGT,B,GBC,AVG,AVG,GCC GGC GTC -GAC,D,2,E,AG,GAA GAG,R,GAR,GAA,GAW,GAT,A,A,GAA,DE,DE,GAA GAT -GAG,E,0,*QK,ACT,AAG CAG TAG,H,HAG,AAG CAG TAG,HAG,,ACT,H,HAG,K*Q,K*Q,AAG CAG TAG -GAG,E,1,VAG,CGT,GCG GGG GTG,B,GBG,GCG GGG 
GTG,GBG,,CGT,B,GBG,AVG,AVG,GCG GGG GTG -GAG,E,2,D,CT,GAC GAT,Y,GAY,GAT,GAW,GAA,T,T,GAT,DE,DE,GAA GAT -GAT,D,0,NHY,ACT,AAT CAT TAT,H,HAT,AAT CAT TAT,HAT,,ACT,H,HAT,NYH,NYH,AAT CAT TAT -GAT,D,1,VAG,CGT,GCT GGT GTT,B,GBT,GCT GGT GTT,GBT,,CGT,B,GBT,AVG,AVG,GCT GGT GTT -GAT,D,2,E,AG,GAA GAG,R,GAR,GAA,GAM,GAC,A,A,GAA,DE,DE,GAA GAC -GCA,A,0,TSP,ACT,ACA CCA TCA,H,HCA,ACA CCA TCA,HCA,,ACT,H,HCA,STP,STP,ACA CCA TCA -GCA,A,1,GVE,AGT,GAA GGA GTA,D,GDA,GAA GGA GTA,GDA,,AGT,D,GDA,VGE,VGE,GAA GGA GTA -GCA,A,2,,,,A,GCA,,GCT,,,,,A,A,GCT -GCC,A,0,TSP,ACT,ACC CCC TCC,H,HCC,ACC CCC TCC,HCC,,ACT,H,HCC,STP,STP,ACC CCC TCC -GCC,A,1,VDG,AGT,GAC GGC GTC,D,GDC,GAC GGC GTC,GDC,,AGT,D,GDC,VDG,VDG,GAC GGC GTC -GCC,A,2,,,,C,GCC,,GCT,,,,,A,A,GCT -GCG,A,0,TSP,ACT,ACG CCG TCG,H,HCG,ACG CCG TCG,HCG,,ACT,H,HCG,STP,STP,ACG CCG TCG -GCG,A,1,GVE,AGT,GAG GGG GTG,D,GDG,GAG GGG GTG,GDG,,AGT,D,GDG,VGE,VGE,GAG GGG GTG -GCG,A,2,,,,G,GCG,,GCT,,,,,A,A,GCT -GCT,A,0,TSP,ACT,ACT CCT TCT,H,HCT,ACT CCT TCT,HCT,,ACT,H,HCT,STP,STP,ACT CCT TCT -GCT,A,1,VDG,AGT,GAT GGT GTT,D,GDT,GAT GGT GTT,GDT,,AGT,D,GDT,VDG,VDG,GAT GGT GTT -GCT,A,2,,,,T,GCT,,GCA,,,,,A,A,GCA -GGA,G,0,R*,ACT,AGA CGA TGA,H,HGA,AGA TGA,HGA,,AT,W,WGA,R*,R*,AGA CGA TGA -GGA,G,1,VAE,ACT,GAA GCA GTA,H,GHA,GAA GCA GTA,GHA,,ACT,H,GHA,AVE,AVE,GAA GCA GTA -GGA,G,2,,,,A,GGA,,GGT,,,,,G,G,GGT -GGC,G,0,RSC,ACT,AGC CGC TGC,H,HGC,AGC CGC TGC,HGC,,ACT,H,HGC,RCS,RCS,AGC CGC TGC -GGC,G,1,ADV,ACT,GAC GCC GTC,H,GHC,GAC GCC GTC,GHC,,ACT,H,GHC,ADV,ADV,GAC GCC GTC -GGC,G,2,,,,C,GGC,,GGT,,,,,G,G,GGT -GGG,G,0,RW,ACT,AGG CGG TGG,H,HGG,AGG TGG,HGG,,AT,W,WGG,RW,RW,AGG CGG TGG -GGG,G,1,VAE,ACT,GAG GCG GTG,H,GHG,GAG GCG GTG,GHG,,ACT,H,GHG,AVE,AVE,GAG GCG GTG -GGG,G,2,,,,G,GGG,,GGT,,,,,G,G,GGT -GGT,G,0,RSC,ACT,AGT CGT TGT,H,HGT,AGT CGT TGT,HGT,,ACT,H,HGT,RCS,RCS,AGT CGT TGT -GGT,G,1,ADV,ACT,GAT GCT GTT,H,GHT,GAT GCT GTT,GHT,,ACT,H,GHT,ADV,ADV,GAT GCT GTT -GGT,G,2,,,,T,GGT,,GGA,,,,,G,G,GGA -GTA,V,0,LI,ACT,ATA CTA TTA,H,HTA,ATA TTA,HTA,,AT,W,WTA,LI,LI,ATA CTA TTA -GTA,V,1,GAE,ACG,GAA 
GCA GGA,V,GVA,GAA GCA GGA,GVA,,ACG,V,GVA,AGE,AGE,GAA GCA GGA -GTA,V,2,,,,A,GTA,,GTT,,,,,V,V,GTT -GTC,V,0,LFI,ACT,ATC CTC TTC,H,HTC,ATC CTC TTC,HTC,,ACT,H,HTC,FLI,FLI,ATC CTC TTC -GTC,V,1,ADG,ACG,GAC GCC GGC,V,GVC,GAC GCC GGC,GVC,,ACG,V,GVC,ADG,ADG,GAC GCC GGC -GTC,V,2,,,,C,GTC,,GTT,,,,,V,V,GTT -GTG,V,0,LM,ACT,ATG CTG TTG,H,HTG,ATG TTG,HTG,,AT,W,WTG,ML,ML,ATG CTG TTG -GTG,V,1,GAE,ACG,GAG GCG GGG,V,GVG,GAG GCG GGG,GVG,,ACG,V,GVG,AGE,AGE,GAG GCG GGG -GTG,V,2,,,,G,GTG,,GTT,,,,,V,V,GTT -GTT,V,0,LFI,ACT,ATT CTT TTT,H,HTT,ATT CTT TTT,HTT,,ACT,H,HTT,FLI,FLI,ATT CTT TTT -GTT,V,1,ADG,ACG,GAT GCT GGT,V,GVT,GAT GCT GGT,GVT,,ACG,V,GVT,ADG,ADG,GAT GCT GGT -GTT,V,2,,,,T,GTT,,GTA,,,,,V,V,GTA -TAA,*,0,QEK,ACG,AAA CAA GAA,V,VAA,AAA CAA GAA,VAA,,ACG,V,VAA,KQE,KQE,AAA CAA GAA -TAA,*,1,LS,CT,TCA TTA,Y,TYA,TCA TTA,TYA,,CT,Y,TYA,LS,LS,TCA TTA -TAA,*,2,Y,CT,TAC TAT,Y,TAY,TAT,TAK,TAG,T,T,TAT,*Y,*Y,TAG TAT -TAC,Y,0,NHD,ACG,AAC CAC GAC,V,VAC,AAC CAC GAC,VAC,,ACG,V,VAC,NDH,NDH,AAC CAC GAC -TAC,Y,1,SFC,CGT,TCC TGC TTC,B,TBC,TCC TGC TTC,TBC,,CGT,B,TBC,CSF,CSF,TCC TGC TTC -TAC,Y,2,*,AG,TAA TAG,R,TAR,TAA,TAW,TAT,A,A,TAA,*Y,*Y,TAA TAT -TAG,*,0,QEK,ACG,AAG CAG GAG,V,VAG,AAG CAG GAG,VAG,,ACG,V,VAG,KQE,KQE,AAG CAG GAG -TAG,*,1,LSW,CGT,TCG TGG TTG,B,TBG,TCG TGG TTG,TBG,,CGT,B,TBG,WLS,WLS,TCG TGG TTG -TAG,*,2,Y,CT,TAC TAT,Y,TAY,TAT,TAW,TAA,T,T,TAT,*Y,*Y,TAA TAT -TAT,Y,0,NHD,ACG,AAT CAT GAT,V,VAT,AAT CAT GAT,VAT,,ACG,V,VAT,NDH,NDH,AAT CAT GAT -TAT,Y,1,SFC,CGT,TCT TGT TTT,B,TBT,TCT TGT TTT,TBT,,CGT,B,TBT,CSF,CSF,TCT TGT TTT -TAT,Y,2,*,AG,TAA TAG,R,TAR,TAA,TAM,TAC,A,A,TAA,*Y,*Y,TAA TAC -TCA,S,0,TAP,ACG,ACA CCA GCA,V,VCA,ACA CCA GCA,VCA,,ACG,V,VCA,ATP,ATP,ACA CCA GCA -TCA,S,1,L*,AGT,TAA TGA TTA,D,TDA,TAA TTA,TDA,,AT,W,TWA,*L,*L,TAA TGA TTA -TCA,S,2,,,,A,TCA,,TCT,,,,,S,S,TCT -TCC,S,0,TAP,ACG,ACC CCC GCC,V,VCC,ACC CCC GCC,VCC,,ACG,V,VCC,ATP,ATP,ACC CCC GCC -TCC,S,1,FCY,AGT,TAC TGC TTC,D,TDC,TAC TGC TTC,TDC,,AGT,D,TDC,CYF,CYF,TAC TGC TTC -TCC,S,2,,,,C,TCC,,TCT,,,,,S,S,TCT -TCG,S,0,TAP,ACG,ACG CCG 
GCG,V,VCG,ACG CCG GCG,VCG,,ACG,V,VCG,ATP,ATP,ACG CCG GCG -TCG,S,1,L*W,AGT,TAG TGG TTG,D,TDG,TAG TGG TTG,TDG,,AGT,D,TDG,W*L,W*L,TAG TGG TTG -TCG,S,2,,,,G,TCG,,TCT,,,,,S,S,TCT -TCT,S,0,TAP,ACG,ACT CCT GCT,V,VCT,ACT CCT GCT,VCT,,ACG,V,VCT,ATP,ATP,ACT CCT GCT -TCT,S,1,FCY,AGT,TAT TGT TTT,D,TDT,TAT TGT TTT,TDT,,AGT,D,TDT,CYF,CYF,TAT TGT TTT -TCT,S,2,,,,T,TCT,,TCA,,,,,S,S,TCA -TGA,*,0,RG,ACG,AGA CGA GGA,V,VGA,AGA GGA,VGA,,AG,R,RGA,RG,RG,AGA CGA GGA -TGA,*,1,LS,CT,TCA TTA,Y,TYA,TCA TTA,THA,TAA,CT,Y,TYA,L*S,L*S,TAA TCA TTA -TGA,*,2,CW,CGT,TGC TGG TGT,B,TGB,TGT TGG,TGB,,TG,K,TGK,WC,WC,TGC TGG TGT -TGC,C,0,RSG,ACG,AGC CGC GGC,V,VGC,CGC GGC,VGC,,CG,S,SGC,RSG,RSG,AGC CGC GGC -TGC,C,1,SFY,ACT,TAC TCC TTC,H,THC,TAC TCC TTC,THC,,ACT,H,THC,FSY,FSY,TAC TCC TTC -TGC,C,2,*W,AG,TGA TGG,R,TGR,TGA TGG,TGD,TGT,AG,R,TGR,W*C,W*C,TGA TGG TGT -TGG,W,0,RG,ACG,AGG CGG GGG,V,VGG,AGG GGG,VGG,,AG,R,RGG,RG,RG,AGG CGG GGG -TGG,W,1,L*S,ACT,TAG TCG TTG,H,THG,TAG TCG TTG,THG,,ACT,H,THG,L*S,L*S,TAG TCG TTG -TGG,W,2,*C,ACT,TGA TGC TGT,H,TGH,TGA TGC,TGV,TGG,AC,M,TGM,W*C,W*C,TGA TGC TGG -TGT,C,0,RSG,ACG,AGT CGT GGT,V,VGT,CGT GGT,VGT,,CG,S,SGT,RSG,RSG,AGT CGT GGT -TGT,C,1,SFY,ACT,TAT TCT TTT,H,THT,TAT TCT TTT,THT,,ACT,H,THT,FSY,FSY,TAT TCT TTT -TGT,C,2,*W,AG,TGA TGG,R,TGR,TGA TGG,TGV,TGC,AG,R,TGR,W*C,W*C,TGA TGC TGG -TTA,L,0,IV,AG,ATA GTA,R,RTA,ATA GTA,RTA,,AG,R,RTA,VI,VI,ATA GTA -TTA,L,1,*S,ACG,TAA TCA TGA,V,TVA,TAA TCA,TVA,,AC,M,TMA,*S,*S,TAA TCA TGA -TTA,L,2,F,CT,TTC TTT,Y,TTY,TTT,TTK,TTG,T,T,TTT,FL,FL,TTG TTT -TTC,F,0,LIV,ACG,ATC CTC GTC,V,VTC,ATC GTC,VTC,,AG,R,RTC,VLI,VLI,ATC CTC GTC -TTC,F,1,SCY,ACG,TAC TCC TGC,V,TVC,TAC TCC TGC,TVC,,ACG,V,TVC,CSY,CSY,TAC TCC TGC -TTC,F,2,L,AG,TTA TTG,R,TTR,TTG,TTK,TTT,G,G,TTG,FL,FL,TTG TTT -TTG,L,0,VM,AG,ATG GTG,R,RTG,ATG GTG,RTG,,AG,R,RTG,MV,MV,ATG GTG -TTG,L,1,*SW,ACG,TAG TCG TGG,V,TVG,TAG TCG TGG,TVG,,ACG,V,TVG,W*S,W*S,TAG TCG TGG -TTG,L,2,F,CT,TTC TTT,Y,TTY,TTT,TTW,TTA,T,T,TTT,FL,FL,TTA TTT -TTT,F,0,LIV,ACG,ATT CTT GTT,V,VTT,ATT GTT,VTT,,AG,R,RTT,VLI,VLI,ATT CTT 
GTT -TTT,F,1,SCY,ACG,TAT TCT TGT,V,TVT,TAT TCT TGT,TVT,,ACG,V,TVT,CSY,CSY,TAT TCT TGT -TTT,F,2,L,AG,TTA TTG,R,TTR,TTG,TTS,TTC,G,G,TTG,FL,FL,TTC TTG diff --git a/dms_no_stops.csv b/dms_no_stops.csv deleted file mode 100644 index 2a27d45..0000000 --- a/dms_no_stops.csv +++ /dev/null @@ -1,193 +0,0 @@ -codon,aa,position,all_missense_aa,all_missense_nucleotides,all_missense_codons,all_iupac,all_iupac_codon,sele_missense_codons,syn_iupac_codon,syn_codon,sele_missense_nucleotides,sele_iupac,sele_iupac_codon,sele_aa,syn_aa,syn_missense_codons,no_stop_codons,no_stop_iupac,no_stop_iupac_codon,no_stop_nucleotides,no_stop_aa -AAA,K,0,*QE,CGT,CAA GAA TAA,B,BAA,CAA GAA TAA,BAA,,CGT,B,BAA,QE*,*QE,CAA GAA TAA,CAA GAA,S,SAA,CG,QE -AAA,K,1,TRI,CGT,ACA AGA ATA,B,ABA,ACA AGA ATA,ABA,,CGT,B,ABA,TRI,RTI,ACA AGA ATA,ACA AGA ATA,B,ABA,CGT,TRI -AAA,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAK,AAG,T,T,AAT,N,KN,AAG AAT,AAT,T,AAT,T,N -AAC,N,0,HDY,CGT,CAC GAC TAC,B,BAC,CAC GAC TAC,BAC,,CGT,B,BAC,HDY,DYH,CAC GAC TAC,CAC GAC TAC,B,BAC,CGT,HDY -AAC,N,1,TSI,CGT,ACC AGC ATC,B,ABC,ACC AGC ATC,ABC,,CGT,B,ABC,TSI,STI,ACC AGC ATC,ACC AGC ATC,B,ABC,CGT,TSI -AAC,N,2,K,AG,AAA AAG,R,AAR,AAA,AAW,AAT,A,A,AAA,K,KN,AAA AAT,AAA,A,AAA,A,K -AAG,K,0,*QE,CGT,CAG GAG TAG,B,BAG,CAG GAG TAG,BAG,,CGT,B,BAG,QE*,*QE,CAG GAG TAG,CAG GAG,S,SAG,CG,QE -AAG,K,1,TRM,CGT,ACG AGG ATG,B,ABG,ACG AGG ATG,ABG,,CGT,B,ABG,TRM,RTM,ACG AGG ATG,ACG AGG ATG,B,ABG,CGT,TRM -AAG,K,2,N,CT,AAC AAT,Y,AAY,AAT,AAW,AAA,T,T,AAT,N,KN,AAA AAT,AAT,T,AAT,T,N -AAT,N,0,HDY,CGT,CAT GAT TAT,B,BAT,CAT GAT TAT,BAT,,CGT,B,BAT,HDY,DYH,CAT GAT TAT,CAT GAT TAT,B,BAT,CGT,HDY -AAT,N,1,TSI,CGT,ACT AGT ATT,B,ABT,ACT AGT ATT,ABT,,CGT,B,ABT,TSI,STI,ACT AGT ATT,ACT AGT ATT,B,ABT,CGT,TSI -AAT,N,2,K,AG,AAA AAG,R,AAR,AAA,AAM,AAC,A,A,AAA,K,KN,AAA AAC,AAA,A,AAA,A,K -ACA,T,0,SAP,CGT,CCA GCA TCA,B,BCA,CCA GCA TCA,BCA,,CGT,B,BCA,PAS,ASP,CCA GCA TCA,CCA GCA TCA,B,BCA,CGT,PAS -ACA,T,1,RIK,AGT,AAA AGA ATA,D,ADA,AAA AGA ATA,ADA,,AGT,D,ADA,KRI,KRI,AAA AGA ATA,AAA AGA ATA,D,ADA,AGT,KRI 
-ACA,T,2,,,,A,ACA,,ACT,,,,,,T,ACT,,,,, -ACC,T,0,SAP,CGT,CCC GCC TCC,B,BCC,CCC GCC TCC,BCC,,CGT,B,BCC,PAS,ASP,CCC GCC TCC,CCC GCC TCC,B,BCC,CGT,PAS -ACC,T,1,NSI,AGT,AAC AGC ATC,D,ADC,AAC ATC,ADC,,AT,W,AWC,NI,NSI,AAC AGC ATC,AAC ATC,W,AWC,AT,NI -ACC,T,2,,,,C,ACC,,ACT,,,,,,T,ACT,,,,, -ACG,T,0,SAP,CGT,CCG GCG TCG,B,BCG,CCG GCG TCG,BCG,,CGT,B,BCG,PAS,ASP,CCG GCG TCG,CCG GCG TCG,B,BCG,CGT,PAS -ACG,T,1,RMK,AGT,AAG AGG ATG,D,ADG,AAG AGG ATG,ADG,,AGT,D,ADG,KRM,KMR,AAG AGG ATG,AAG AGG ATG,D,ADG,AGT,KRM -ACG,T,2,,,,G,ACG,,ACT,,,,,,T,ACT,,,,, -ACT,T,0,SAP,CGT,CCT GCT TCT,B,BCT,CCT GCT TCT,BCT,,CGT,B,BCT,PAS,ASP,CCT GCT TCT,CCT GCT TCT,B,BCT,CGT,PAS -ACT,T,1,NSI,AGT,AAT AGT ATT,D,ADT,AAT ATT,ADT,,AT,W,AWT,NI,NSI,AAT AGT ATT,AAT ATT,W,AWT,AT,NI -ACT,T,2,,,,T,ACT,,ACA,,,,,,T,ACA,,,,, -AGA,R,0,*G,GT,GGA TGA,K,KGA,GGA TGA,KGA,,GT,K,KGA,G*,*G,GGA TGA,GGA,G,GGA,G,G -AGA,R,1,TIK,ACT,AAA ACA ATA,H,AHA,AAA ACA ATA,AHA,,ACT,H,AHA,KTI,KTI,AAA ACA ATA,AAA ACA ATA,H,AHA,ACT,KTI -AGA,R,2,S,CT,AGC AGT,Y,AGY,AGT,AGK,AGG,T,T,AGT,S,RS,AGG AGT,AGT,T,AGT,T,S -AGC,S,0,RCG,CGT,CGC GGC TGC,B,BGC,GGC TGC,BGC,,GT,K,KGC,GC,RCG,CGC GGC TGC,GGC TGC,K,KGC,GT,GC -AGC,S,1,NTI,ACT,AAC ACC ATC,H,AHC,AAC ACC ATC,AHC,,ACT,H,AHC,NTI,NTI,AAC ACC ATC,AAC ACC ATC,H,AHC,ACT,NTI -AGC,S,2,R,AG,AGA AGG,R,AGR,AGA,AGW,AGT,A,A,AGA,R,RS,AGA AGT,AGA,A,AGA,A,R -AGG,R,0,WG,GT,GGG TGG,K,KGG,GGG TGG,KGG,,GT,K,KGG,GW,WG,GGG TGG,GGG TGG,K,KGG,GT,GW -AGG,R,1,TMK,ACT,AAG ACG ATG,H,AHG,AAG ACG ATG,AHG,,ACT,H,AHG,KTM,KTM,AAG ACG ATG,AAG ACG ATG,H,AHG,ACT,KTM -AGG,R,2,S,CT,AGC AGT,Y,AGY,AGT,AGW,AGA,T,T,AGT,S,RS,AGA AGT,AGT,T,AGT,T,S -AGT,S,0,RCG,CGT,CGT GGT TGT,B,BGT,GGT TGT,BGT,,GT,K,KGT,GC,RCG,CGT GGT TGT,GGT TGT,K,KGT,GT,GC -AGT,S,1,NTI,ACT,AAT ACT ATT,H,AHT,AAT ACT ATT,AHT,,ACT,H,AHT,NTI,NTI,AAT ACT ATT,AAT ACT ATT,H,AHT,ACT,NTI -AGT,S,2,R,AG,AGA AGG,R,AGR,AGA,AGM,AGC,A,A,AGA,R,RS,AGA AGC,AGA,A,AGA,A,R -ATA,I,0,LV,CGT,CTA GTA TTA,B,BTA,TTA GTA,BTA,,TG,K,KTA,LV,VL,CTA GTA TTA,TTA GTA,K,KTA,TG,LV -ATA,I,1,TRK,ACG,AAA ACA 
AGA,V,AVA,AAA ACA AGA,AVA,,ACG,V,AVA,KTR,KTR,AAA ACA AGA,AAA ACA AGA,V,AVA,ACG,KTR -ATA,I,2,M,G,ATG,G,ATG,ATG,ATK,ATT,G,G,ATG,M,MI,ATG ATT,ATG,G,ATG,G,M -ATC,I,0,LFV,CGT,CTC GTC TTC,B,BTC,CTC GTC TTC,BTC,,CGT,B,BTC,LVF,VFL,CTC GTC TTC,CTC GTC TTC,B,BTC,CGT,LVF -ATC,I,1,NST,ACG,AAC ACC AGC,V,AVC,AAC ACC AGC,AVC,,ACG,V,AVC,NTS,NTS,AAC ACC AGC,AAC ACC AGC,V,AVC,ACG,NTS -ATC,I,2,M,G,ATG,G,ATG,ATG,ATK,ATT,G,G,ATG,M,MI,ATG ATT,ATG,G,ATG,G,M -ATG,M,0,LV,CGT,CTG GTG TTG,B,BTG,TTA GTA,BTG,,TG,K,KTG,LV,VL,CTG GTG TTG,TTA GTA,K,KTG,TG,LV -ATG,M,1,TRK,ACG,AAG ACG AGG,V,AVG,AAG ACG AGG,AVG,,ACG,V,AVG,KTR,KTR,AAG ACG AGG,AAG ACG AGG,V,AVG,ACG,KTR -ATG,M,2,I,ACT,ATA ATC ATT,H,ATH,ATT,ATK,ATG,T,T,ATT,I,MI,ATG ATT,ATT,T,ATT,T,I -ATT,I,0,LFV,CGT,CTT GTT TTT,B,BTT,CTT GTT TTT,BTT,,CGT,B,BTT,LVF,VFL,CTT GTT TTT,CTT GTT TTT,B,BTT,CGT,LVF -ATT,I,1,NST,ACG,AAT ACT AGT,V,AVT,AAT ACT AGT,AVT,,ACG,V,AVT,NTS,NTS,AAT ACT AGT,AAT ACT AGT,V,AVT,ACG,NTS -ATT,I,2,M,G,ATG,G,ATG,ATG,ATR,ATA,G,G,ATG,M,MI,ATA ATG,ATG,G,ATG,G,M -CAA,Q,0,*EK,AGT,AAA GAA TAA,D,DAA,AAA GAA TAA,DAA,,AGT,D,DAA,KE*,K*E,AAA GAA TAA,AAA GAA,R,RAA,AG,KE -CAA,Q,1,RLP,CGT,CCA CGA CTA,B,CBA,CCA CGA CTA,CBA,,CGT,B,CBA,PRL,RLP,CCA CGA CTA,CCA CGA CTA,B,CBA,CGT,PRL -CAA,Q,2,H,CT,CAC CAT,Y,CAY,CAT,CAK,CAG,T,T,CAT,H,QH,CAG CAT,CAT,T,CAT,T,H -CAC,H,0,NDY,AGT,AAC GAC TAC,D,DAC,AAC GAC TAC,DAC,,AGT,D,DAC,NDY,NDY,AAC GAC TAC,AAC GAC TAC,D,DAC,AGT,NDY -CAC,H,1,RLP,CGT,CCC CGC CTC,B,CBC,CCC CGC CTC,CBC,,CGT,B,CBC,PRL,RLP,CCC CGC CTC,CCC CGC CTC,B,CBC,CGT,PRL -CAC,H,2,Q,AG,CAA CAG,R,CAR,CAA,CAW,CAT,A,A,CAA,Q,QH,CAA CAT,CAA,A,CAA,A,Q -CAG,Q,0,*EK,AGT,AAG GAG TAG,D,DAG,AAG GAG TAG,DAG,,AGT,D,DAG,KE*,K*E,AAG GAG TAG,AAG GAG,R,RAG,AG,KE -CAG,Q,1,RLP,CGT,CCG CGG CTG,B,CBG,CCG CGG CTG,CBG,,CGT,B,CBG,PRL,RLP,CCG CGG CTG,CCG CGG CTG,B,CBG,CGT,PRL -CAG,Q,2,H,CT,CAC CAT,Y,CAY,CAT,CAW,CAA,T,T,CAT,H,QH,CAA CAT,CAT,T,CAT,T,H -CAT,H,0,NDY,AGT,AAT GAT TAT,D,DAT,AAT GAT TAT,DAT,,AGT,D,DAT,NDY,NDY,AAT GAT TAT,AAT GAT TAT,D,DAT,AGT,NDY -CAT,H,1,RLP,CGT,CCT 
CGT CTT,B,CBT,CCT CGT CTT,CBT,,CGT,B,CBT,PRL,RLP,CCT CGT CTT,CCT CGT CTT,B,CBT,CGT,PRL -CAT,H,2,Q,AG,CAA CAG,R,CAR,CAA,CAM,CAC,A,A,CAA,Q,QH,CAA CAC,CAA,A,CAA,A,Q -CCA,P,0,TSA,AGT,ACA GCA TCA,D,DCA,ACA GCA TCA,DCA,,AGT,D,DCA,TAS,ATS,ACA GCA TCA,ACA GCA TCA,D,DCA,AGT,TAS -CCA,P,1,RQL,AGT,CAA CGA CTA,D,CDA,CAA CGA CTA,CDA,,AGT,D,CDA,QRL,RQL,CAA CGA CTA,CAA CGA CTA,D,CDA,AGT,QRL -CCA,P,2,,,,A,CCA,,CCT,,,,,,P,CCT,,,,, -CCC,P,0,TSA,AGT,ACC GCC TCC,D,DCC,ACC GCC TCC,DCC,,AGT,D,DCC,TAS,ATS,ACC GCC TCC,ACC GCC TCC,D,DCC,AGT,TAS -CCC,P,1,RHL,AGT,CAC CGC CTC,D,CDC,CAC CGC CTC,CDC,,AGT,D,CDC,HRL,RLH,CAC CGC CTC,CAC CGC CTC,D,CDC,AGT,HRL -CCC,P,2,,,,C,CCC,,CCA,,,,,,P,CCA,,,,, -CCG,P,0,TSA,AGT,ACG GCG TCG,D,DCG,ACG GCG TCG,DCG,,AGT,D,DCG,TAS,ATS,ACG GCG TCG,ACG GCG TCG,D,DCG,AGT,TAS -CCG,P,1,RQL,AGT,CAG CGG CTG,D,CDG,CAG CGG CTG,CDG,,AGT,D,CDG,QRL,RQL,CAG CGG CTG,CAG CGG CTG,D,CDG,AGT,QRL -CCG,P,2,,,,G,CCG,,CCA,,,,,,P,CCA,,,,, -CCT,P,0,TSA,AGT,ACT GCT TCT,D,DCT,ACT GCT TCT,DCT,,AGT,D,DCT,TAS,ATS,ACT GCT TCT,ACT GCT TCT,D,DCT,AGT,TAS -CCT,P,1,RHL,AGT,CAT CGT CTT,D,CDT,CAT CGT CTT,CDT,,AGT,D,CDT,HRL,RLH,CAT CGT CTT,CAT CGT CTT,D,CDT,AGT,HRL -CCT,P,2,,,,T,CCT,,CCA,,,,,,P,CCA,,,,, -CGA,R,0,*G,GT,GGA TGA,K,KGA,GGA TGA,DGA,AGA,GT,K,KGA,G*,R*G,AGA GGA TGA,GGA,G,GGA,G,G -CGA,R,1,LQP,ACT,CAA CCA CTA,H,CHA,CAA CCA CTA,CHA,,ACT,H,CHA,QPL,QLP,CAA CCA CTA,CAA CCA CTA,H,CHA,ACT,QPL -CGA,R,2,,,,A,CGA,,CGA,,,,,,R,CGA,,,,, -CGC,R,0,SCG,AGT,AGC GGC TGC,D,DGC,AGC GGC TGC,DGC,,AGT,D,DGC,SGC,CSG,AGC GGC TGC,AGC GGC TGC,D,DGC,AGT,SGC -CGC,R,1,LHP,ACT,CAC CCC CTC,H,CHC,CAC CCC CTC,CHC,,ACT,H,CHC,HPL,LPH,CAC CCC CTC,CAC CCC CTC,H,CHC,ACT,HPL -CGC,R,2,,,,C,CGC,,CGT,,,,,,R,CGT,,,,, -CGG,R,0,WG,GT,GGG TGG,K,KGG,GGG TGG,DGG,AGG,GT,K,KGG,GW,RWG,AGG GGG TGG,GGG TGG,K,KGG,GT,GW -CGG,R,1,LQP,ACT,CAG CCG CTG,H,CHG,CAG CCG CTG,CHG,,ACT,H,CHG,QPL,QLP,CAG CCG CTG,CAG CCG CTG,H,CHG,ACT,QPL -CGG,R,2,,,,G,CGG,,CGG,,,,,,R,CGG,,,,, -CGT,R,0,SCG,AGT,AGT GGT TGT,D,DGT,AGT GGT TGT,DGT,,AGT,D,DGT,SGC,CSG,AGT GGT TGT,AGT 
GGT TGT,D,DGT,AGT,SGC -CGT,R,1,LHP,ACT,CAT CCT CTT,H,CHT,CAT CCT CTT,CHT,,ACT,H,CHT,HPL,LPH,CAT CCT CTT,CAT CCT CTT,H,CHT,ACT,HPL -CGT,R,2,,,,T,CGT,,CGA,,,,,,R,CGA,,,,, -CTA,L,0,IV,AG,ATA GTA,R,RTA,ATA GTA,DTA,TTA,AG,R,RTA,IV,VLI,ATA GTA TTA,ATA GTA,R,RTA,AG,IV -CTA,L,1,RQP,ACG,CAA CCA CGA,V,CVA,CAA CCA CGA,CVA,,ACG,V,CVA,QPR,RQP,CAA CCA CGA,CAA CCA CGA,V,CVA,ACG,QPR -CTA,L,2,,,,A,CTA,,CTA,,,,,,L,CTA,,,,, -CTC,L,0,FIV,AGT,ATC GTC TTC,D,DTC,ATC GTC TTC,DTC,,AGT,D,DTC,IVF,VFI,ATC GTC TTC,ATC GTC TTC,D,DTC,AGT,IVF -CTC,L,1,RHP,ACG,CAC CCC CGC,V,CVC,CAC CCC CGC,CVC,,ACG,V,CVC,HPR,RPH,CAC CCC CGC,CAC CCC CGC,V,CVC,ACG,HPR -CTC,L,2,,,,C,CTC,,CTA,,,,,,L,CTA,,,,, -CTG,L,0,VM,AG,ATG GTG,R,RTG,ATG GTG,DTG,TTG,AG,R,RTG,MV,MVL,ATG GTG TTG,ATG GTG,R,RTG,AG,MV -CTG,L,1,RQP,ACG,CAG CCG CGG,V,CVG,CAG CCG CGG,CVG,,ACG,V,CVG,QPR,RQP,CAG CCG CGG,CAG CCG CGG,V,CVG,ACG,QPR -CTG,L,2,,,,G,CTG,,CTG,,,,,,L,CTG,,,,, -CTT,L,0,FIV,AGT,ATT GTT TTT,D,DTT,ATT GTT TTT,DTT,,AGT,D,DTT,IVF,VFI,ATT GTT TTT,ATT GTT TTT,D,DTT,AGT,IVF -CTT,L,1,RHP,ACG,CAT CCT CGT,V,CVT,CAT CCT CGT,CVT,,ACG,V,CVT,HPR,RPH,CAT CCT CGT,CAT CCT CGT,V,CVT,ACG,HPR -CTT,L,2,,,,T,CTT,,CTA,,,,,,L,CTA,,,,, -GAA,E,0,*QK,ACT,AAA CAA TAA,H,HAA,AAA CAA TAA,HAA,,ACT,H,HAA,KQ*,K*Q,AAA CAA TAA,AAA CAA,M,MAA,AC,KQ -GAA,E,1,VAG,CGT,GCA GGA GTA,B,GBA,GCA GGA GTA,GBA,,CGT,B,GBA,AGV,AVG,GCA GGA GTA,GCA GGA GTA,B,GBA,CGT,AGV -GAA,E,2,D,CT,GAC GAT,Y,GAY,GAT,GAK,GAG,T,T,GAT,D,DE,GAG GAT,GAT,T,GAT,T,D -GAC,D,0,NHY,ACT,AAC CAC TAC,H,HAC,AAC CAC TAC,HAC,,ACT,H,HAC,NHY,NYH,AAC CAC TAC,AAC CAC TAC,H,HAC,ACT,NHY -GAC,D,1,VAG,CGT,GCC GGC GTC,B,GBC,GCC GGC GTC,GBC,,CGT,B,GBC,AGV,AVG,GCC GGC GTC,GCC GGC GTC,B,GBC,CGT,AGV -GAC,D,2,E,AG,GAA GAG,R,GAR,GAA,GAW,GAT,A,A,GAA,E,DE,GAA GAT,GAA,A,GAA,A,E -GAG,E,0,*QK,ACT,AAG CAG TAG,H,HAG,AAG CAG TAG,HAG,,ACT,H,HAG,KQ*,K*Q,AAG CAG TAG,AAG CAG,M,MAG,AC,KQ -GAG,E,1,VAG,CGT,GCG GGG GTG,B,GBG,GCG GGG GTG,GBG,,CGT,B,GBG,AGV,AVG,GCG GGG GTG,GCG GGG GTG,B,GBG,CGT,AGV -GAG,E,2,D,CT,GAC 
GAT,Y,GAY,GAT,GAW,GAA,T,T,GAT,D,DE,GAA GAT,GAT,T,GAT,T,D -GAT,D,0,NHY,ACT,AAT CAT TAT,H,HAT,AAT CAT TAT,HAT,,ACT,H,HAT,NHY,NYH,AAT CAT TAT,AAT CAT TAT,H,HAT,ACT,NHY -GAT,D,1,VAG,CGT,GCT GGT GTT,B,GBT,GCT GGT GTT,GBT,,CGT,B,GBT,AGV,AVG,GCT GGT GTT,GCT GGT GTT,B,GBT,CGT,AGV -GAT,D,2,E,AG,GAA GAG,R,GAR,GAA,GAM,GAC,A,A,GAA,E,DE,GAA GAC,GAA,A,GAA,A,E -GCA,A,0,TSP,ACT,ACA CCA TCA,H,HCA,ACA CCA TCA,HCA,,ACT,H,HCA,TPS,STP,ACA CCA TCA,ACA CCA TCA,H,HCA,ACT,TPS -GCA,A,1,GVE,AGT,GAA GGA GTA,D,GDA,GAA GGA GTA,GDA,,AGT,D,GDA,EGV,VGE,GAA GGA GTA,GAA GGA GTA,D,GDA,AGT,EGV -GCA,A,2,,,,A,GCA,,GCT,,,,,,A,GCT,,,,, -GCC,A,0,TSP,ACT,ACC CCC TCC,H,HCC,ACC CCC TCC,HCC,,ACT,H,HCC,TPS,STP,ACC CCC TCC,ACC CCC TCC,H,HCC,ACT,TPS -GCC,A,1,VDG,AGT,GAC GGC GTC,D,GDC,GAC GGC GTC,GDC,,AGT,D,GDC,DGV,VDG,GAC GGC GTC,GAC GGC GTC,D,GDC,AGT,DGV -GCC,A,2,,,,C,GCC,,GCT,,,,,,A,GCT,,,,, -GCG,A,0,TSP,ACT,ACG CCG TCG,H,HCG,ACG CCG TCG,HCG,,ACT,H,HCG,TPS,STP,ACG CCG TCG,ACG CCG TCG,H,HCG,ACT,TPS -GCG,A,1,GVE,AGT,GAG GGG GTG,D,GDG,GAG GGG GTG,GDG,,AGT,D,GDG,EGV,VGE,GAG GGG GTG,GAG GGG GTG,D,GDG,AGT,EGV -GCG,A,2,,,,G,GCG,,GCT,,,,,,A,GCT,,,,, -GCT,A,0,TSP,ACT,ACT CCT TCT,H,HCT,ACT CCT TCT,HCT,,ACT,H,HCT,TPS,STP,ACT CCT TCT,ACT CCT TCT,H,HCT,ACT,TPS -GCT,A,1,VDG,AGT,GAT GGT GTT,D,GDT,GAT GGT GTT,GDT,,AGT,D,GDT,DGV,VDG,GAT GGT GTT,GAT GGT GTT,D,GDT,AGT,DGV -GCT,A,2,,,,T,GCT,,GCA,,,,,,A,GCA,,,,, -GGA,G,0,R*,ACT,AGA CGA TGA,H,HGA,AGA TGA,HGA,,AT,W,WGA,R*,R*,AGA CGA TGA,AGA,A,AGA,A,R -GGA,G,1,VAE,ACT,GAA GCA GTA,H,GHA,GAA GCA GTA,GHA,,ACT,H,GHA,EAV,AVE,GAA GCA GTA,GAA GCA GTA,H,GHA,ACT,EAV -GGA,G,2,,,,A,GGA,,GGT,,,,,,G,GGT,,,,, -GGC,G,0,RSC,ACT,AGC CGC TGC,H,HGC,AGC CGC TGC,HGC,,ACT,H,HGC,SRC,RCS,AGC CGC TGC,AGC CGC TGC,H,HGC,ACT,SRC -GGC,G,1,ADV,ACT,GAC GCC GTC,H,GHC,GAC GCC GTC,GHC,,ACT,H,GHC,DAV,ADV,GAC GCC GTC,GAC GCC GTC,H,GHC,ACT,DAV -GGC,G,2,,,,C,GGC,,GGT,,,,,,G,GGT,,,,, -GGG,G,0,RW,ACT,AGG CGG TGG,H,HGG,AGG TGG,HGG,,AT,W,WGG,RW,RW,AGG CGG TGG,AGG TGG,W,WGG,AT,RW -GGG,G,1,VAE,ACT,GAG GCG GTG,H,GHG,GAG GCG 
GTG,GHG,,ACT,H,GHG,EAV,AVE,GAG GCG GTG,GAG GCG GTG,H,GHG,ACT,EAV -GGG,G,2,,,,G,GGG,,GGT,,,,,,G,GGT,,,,, -GGT,G,0,RSC,ACT,AGT CGT TGT,H,HGT,AGT CGT TGT,HGT,,ACT,H,HGT,SRC,RCS,AGT CGT TGT,AGT CGT TGT,H,HGT,ACT,SRC -GGT,G,1,ADV,ACT,GAT GCT GTT,H,GHT,GAT GCT GTT,GHT,,ACT,H,GHT,DAV,ADV,GAT GCT GTT,GAT GCT GTT,H,GHT,ACT,DAV -GGT,G,2,,,,T,GGT,,GGA,,,,,,G,GGA,,,,, -GTA,V,0,LI,ACT,ATA CTA TTA,H,HTA,ATA TTA,HTA,,AT,W,WTA,IL,LI,ATA CTA TTA,ATA TTA,W,WTA,AT,IL -GTA,V,1,GAE,ACG,GAA GCA GGA,V,GVA,GAA GCA GGA,GVA,,ACG,V,GVA,EAG,AGE,GAA GCA GGA,GAA GCA GGA,V,GVA,ACG,EAG -GTA,V,2,,,,A,GTA,,GTT,,,,,,V,GTT,,,,, -GTC,V,0,LFI,ACT,ATC CTC TTC,H,HTC,ATC CTC TTC,HTC,,ACT,H,HTC,ILF,FLI,ATC CTC TTC,ATC CTC TTC,H,HTC,ACT,ILF -GTC,V,1,ADG,ACG,GAC GCC GGC,V,GVC,GAC GCC GGC,GVC,,ACG,V,GVC,DAG,ADG,GAC GCC GGC,GAC GCC GGC,V,GVC,ACG,DAG -GTC,V,2,,,,C,GTC,,GTT,,,,,,V,GTT,,,,, -GTG,V,0,LM,ACT,ATG CTG TTG,H,HTG,ATG TTG,HTG,,AT,W,WTG,ML,ML,ATG CTG TTG,ATG TTG,W,WTG,AT,ML -GTG,V,1,GAE,ACG,GAG GCG GGG,V,GVG,GAG GCG GGG,GVG,,ACG,V,GVG,EAG,AGE,GAG GCG GGG,GAG GCG GGG,V,GVG,ACG,EAG -GTG,V,2,,,,G,GTG,,GTT,,,,,,V,GTT,,,,, -GTT,V,0,LFI,ACT,ATT CTT TTT,H,HTT,ATT CTT TTT,HTT,,ACT,H,HTT,ILF,FLI,ATT CTT TTT,ATT CTT TTT,H,HTT,ACT,ILF -GTT,V,1,ADG,ACG,GAT GCT GGT,V,GVT,GAT GCT GGT,GVT,,ACG,V,GVT,DAG,ADG,GAT GCT GGT,GAT GCT GGT,V,GVT,ACG,DAG -GTT,V,2,,,,T,GTT,,GTA,,,,,,V,GTA,,,,, -TAA,*,0,QEK,ACG,AAA CAA GAA,V,VAA,AAA CAA GAA,VAA,,ACG,V,VAA,KQE,KQE,AAA CAA GAA,AAA CAA GAA,V,VAA,ACG,KQE -TAA,*,1,LS,CT,TCA TTA,Y,TYA,TCA TTA,TYA,,CT,Y,TYA,SL,LS,TCA TTA,TCA TTA,Y,TYA,CT,SL -TAA,*,2,Y,CT,TAC TAT,Y,TAY,TAT,TAK,TAG,T,T,TAT,Y,*Y,TAG TAT,TAT,T,TAT,T,Y -TAC,Y,0,NHD,ACG,AAC CAC GAC,V,VAC,AAC CAC GAC,VAC,,ACG,V,VAC,NHD,NDH,AAC CAC GAC,AAC CAC GAC,V,VAC,ACG,NHD -TAC,Y,1,SFC,CGT,TCC TGC TTC,B,TBC,TCC TGC TTC,TBC,,CGT,B,TBC,SCF,CSF,TCC TGC TTC,TCC TGC TTC,B,TBC,CGT,SCF -TAC,Y,2,*,AG,TAA TAG,R,TAR,TAA,TAW,TAT,A,A,TAA,*,*Y,TAA TAT,,,,, -TAG,*,0,QEK,ACG,AAG CAG GAG,V,VAG,AAG CAG GAG,VAG,,ACG,V,VAG,KQE,KQE,AAG CAG GAG,AAG CAG 
GAG,V,VAG,ACG,KQE -TAG,*,1,LSW,CGT,TCG TGG TTG,B,TBG,TCG TGG TTG,TBG,,CGT,B,TBG,SWL,WLS,TCG TGG TTG,TCG TGG TTG,B,TBG,CGT,SWL -TAG,*,2,Y,CT,TAC TAT,Y,TAY,TAT,TAW,TAA,T,T,TAT,Y,*Y,TAA TAT,TAT,T,TAT,T,Y -TAT,Y,0,NHD,ACG,AAT CAT GAT,V,VAT,AAT CAT GAT,VAT,,ACG,V,VAT,NHD,NDH,AAT CAT GAT,AAT CAT GAT,V,VAT,ACG,NHD -TAT,Y,1,SFC,CGT,TCT TGT TTT,B,TBT,TCT TGT TTT,TBT,,CGT,B,TBT,SCF,CSF,TCT TGT TTT,TCT TGT TTT,B,TBT,CGT,SCF -TAT,Y,2,*,AG,TAA TAG,R,TAR,TAA,TAM,TAC,A,A,TAA,*,*Y,TAA TAC,,,,, -TCA,S,0,TAP,ACG,ACA CCA GCA,V,VCA,ACA CCA GCA,VCA,,ACG,V,VCA,TPA,ATP,ACA CCA GCA,ACA CCA GCA,V,VCA,ACG,TPA -TCA,S,1,L*,AGT,TAA TGA TTA,D,TDA,TAA TTA,TDA,,AT,W,TWA,*L,*L,TAA TGA TTA,TTA,T,TTA,T,L -TCA,S,2,,,,A,TCA,,TCT,,,,,,S,TCT,,,,, -TCC,S,0,TAP,ACG,ACC CCC GCC,V,VCC,ACC CCC GCC,VCC,,ACG,V,VCC,TPA,ATP,ACC CCC GCC,ACC CCC GCC,V,VCC,ACG,TPA -TCC,S,1,FCY,AGT,TAC TGC TTC,D,TDC,TAC TGC TTC,TDC,,AGT,D,TDC,YCF,CYF,TAC TGC TTC,TAC TGC TTC,D,TDC,AGT,YCF -TCC,S,2,,,,C,TCC,,TCT,,,,,,S,TCT,,,,, -TCG,S,0,TAP,ACG,ACG CCG GCG,V,VCG,ACG CCG GCG,VCG,,ACG,V,VCG,TPA,ATP,ACG CCG GCG,ACG CCG GCG,V,VCG,ACG,TPA -TCG,S,1,L*W,AGT,TAG TGG TTG,D,TDG,TAG TGG TTG,TDG,,AGT,D,TDG,*WL,W*L,TAG TGG TTG,TGG TTG,K,TKG,GT,WL -TCG,S,2,,,,G,TCG,,TCT,,,,,,S,TCT,,,,, -TCT,S,0,TAP,ACG,ACT CCT GCT,V,VCT,ACT CCT GCT,VCT,,ACG,V,VCT,TPA,ATP,ACT CCT GCT,ACT CCT GCT,V,VCT,ACG,TPA -TCT,S,1,FCY,AGT,TAT TGT TTT,D,TDT,TAT TGT TTT,TDT,,AGT,D,TDT,YCF,CYF,TAT TGT TTT,TAT TGT TTT,D,TDT,AGT,YCF -TCT,S,2,,,,T,TCT,,TCA,,,,,,S,TCA,,,,, -TGA,*,0,RG,ACG,AGA CGA GGA,V,VGA,AGA GGA,VGA,,AG,R,RGA,RG,RG,AGA CGA GGA,AGA GGA,R,RGA,AG,RG -TGA,*,1,LS,CT,TCA TTA,Y,TYA,TCA TTA,THA,TAA,CT,Y,TYA,SL,L*S,TAA TCA TTA,TCA TTA,Y,TYA,CT,SL -TGA,*,2,CW,CGT,TGC TGG TGT,B,TGB,TGT TGG,TGB,,TG,K,TGK,CW,WC,TGC TGG TGT,TGT TGG,K,TGK,TG,CW -TGC,C,0,RSG,ACG,AGC CGC GGC,V,VGC,CGC GGC,VGC,,CG,S,SGC,RG,RSG,AGC CGC GGC,CGC GGC,S,SGC,CG,RG -TGC,C,1,SFY,ACT,TAC TCC TTC,H,THC,TAC TCC TTC,THC,,ACT,H,THC,YSF,FSY,TAC TCC TTC,TAC TCC TTC,H,THC,ACT,YSF -TGC,C,2,*W,AG,TGA TGG,R,TGR,TGA 
TGG,TGD,TGT,AG,R,TGR,*W,W*C,TGA TGG TGT,TGG,G,TGG,G,W -TGG,W,0,RG,ACG,AGG CGG GGG,V,VGG,AGG GGG,VGG,,AG,R,RGG,RG,RG,AGG CGG GGG,AGG GGG,R,RGG,AG,RG -TGG,W,1,L*S,ACT,TAG TCG TTG,H,THG,TAG TCG TTG,THG,,ACT,H,THG,*SL,L*S,TAG TCG TTG,TCG TTG,Y,TYG,CT,SL -TGG,W,2,*C,ACT,TGA TGC TGT,H,TGH,TGA TGC,TGV,TGG,AC,M,TGM,*C,W*C,TGA TGC TGG,TGC,C,TGC,C,C -TGT,C,0,RSG,ACG,AGT CGT GGT,V,VGT,CGT GGT,VGT,,CG,S,SGT,RG,RSG,AGT CGT GGT,CGT GGT,S,SGT,CG,RG -TGT,C,1,SFY,ACT,TAT TCT TTT,H,THT,TAT TCT TTT,THT,,ACT,H,THT,YSF,FSY,TAT TCT TTT,TAT TCT TTT,H,THT,ACT,YSF -TGT,C,2,*W,AG,TGA TGG,R,TGR,TGA TGG,TGV,TGC,AG,R,TGR,*W,W*C,TGA TGC TGG,TGG,G,TGG,G,W -TTA,L,0,IV,AG,ATA GTA,R,RTA,ATA GTA,RTA,,AG,R,RTA,IV,VI,ATA GTA,ATA GTA,R,RTA,AG,IV -TTA,L,1,*S,ACG,TAA TCA TGA,V,TVA,TAA TCA,TVA,,AC,M,TMA,*S,*S,TAA TCA TGA,TCA,C,TCA,C,S -TTA,L,2,F,CT,TTC TTT,Y,TTY,TTT,TTK,TTG,T,T,TTT,F,FL,TTG TTT,TTT,T,TTT,T,F -TTC,F,0,LIV,ACG,ATC CTC GTC,V,VTC,ATC GTC,VTC,,AG,R,RTC,IV,VLI,ATC CTC GTC,ATC GTC,R,RTC,AG,IV -TTC,F,1,SCY,ACG,TAC TCC TGC,V,TVC,TAC TCC TGC,TVC,,ACG,V,TVC,YSC,CSY,TAC TCC TGC,TAC TCC TGC,V,TVC,ACG,YSC -TTC,F,2,L,AG,TTA TTG,R,TTR,TTG,TTK,TTT,G,G,TTG,L,FL,TTG TTT,TTG,G,TTG,G,L -TTG,L,0,VM,AG,ATG GTG,R,RTG,ATG GTG,RTG,,AG,R,RTG,MV,MV,ATG GTG,ATG GTG,R,RTG,AG,MV -TTG,L,1,*SW,ACG,TAG TCG TGG,V,TVG,TAG TCG TGG,TVG,,ACG,V,TVG,*SW,W*S,TAG TCG TGG,TCG TGG,S,TSG,CG,SW -TTG,L,2,F,CT,TTC TTT,Y,TTY,TTT,TTW,TTA,T,T,TTT,F,FL,TTA TTT,TTT,T,TTT,T,F -TTT,F,0,LIV,ACG,ATT CTT GTT,V,VTT,ATT GTT,VTT,,AG,R,RTT,IV,VLI,ATT CTT GTT,ATT GTT,R,RTT,AG,IV -TTT,F,1,SCY,ACG,TAT TCT TGT,V,TVT,TAT TCT TGT,TVT,,ACG,V,TVT,YSC,CSY,TAT TCT TGT,TAT TCT TGT,V,TVT,ACG,YSC -TTT,F,2,L,AG,TTA TTG,R,TTR,TTG,TTS,TTC,G,G,TTG,L,FL,TTC TTG,TTG,G,TTG,G,L diff --git a/handcrafted-codon-table.csv b/handcrafted-codon-table.csv deleted file mode 100644 index 2a38f80..0000000 --- a/handcrafted-codon-table.csv +++ /dev/null @@ -1,193 +0,0 @@ -codon,aa,position,missense_codons,missense_aa,iupac_codon,sele_codons,syn_iupac_codon,syn_bool -AAA,K,0,CAA GAA 
TAA,*QE,BAA,CAA GAA TAA,BAA,FALSE -AAA,K,1,ACA AGA ATA,TRI,ABA,ACA AGA ATA,ABA,FALSE -AAA,K,2,AAC AAT,N,AAY,AAT,AAK,TRUE -AAC,N,0,CAC GAC TAC,HDY,BAC,CAC GAC TAC,BAC,FALSE -AAC,N,1,ACC AGC ATC,TSI,ABC,ACC AGC ATC,ABC,FALSE -AAC,N,2,AAA AAG,K,AAR,AAA,AAW,TRUE -AAG,K,0,CAG GAG TAG,*QE,BAG,CAG GAG TAG,BAG,FALSE -AAG,K,1,ACG AGG ATG,TRM,ABG,ACG AGG ATG,ABG,FALSE -AAG,K,2,AAC AAT,N,AAY,AAT,AAW,TRUE -AAT,N,0,CAT GAT TAT,HDY,BAT,CAT GAT TAT,BAT,FALSE -AAT,N,1,ACT AGT ATT,TSI,ABT,ACT AGT ATT,ABT,FALSE -AAT,N,2,AAA AAG,K,AAR,AAA,AAM,TRUE -ACA,T,0,CCA GCA TCA,SAP,BCA,CCA GCA TCA,BCA,FALSE -ACA,T,1,AAA AGA ATA,RIK,ADA,AAA AGA ATA,ADA,FALSE -ACA,T,2,,,ACA,,ACT,TRUE -ACC,T,0,CCC GCC TCC,SAP,BCC,CCC GCC TCC,BCC,FALSE -ACC,T,1,AAC AGC ATC,NSI,ADC,AAC ATC,ADC,FALSE -ACC,T,2,,,ACC,,ACT,TRUE -ACG,T,0,CCG GCG TCG,SAP,BCG,CCG GCG TCG,BCG,FALSE -ACG,T,1,AAG AGG ATG,RMK,ADG,AAG AGG ATG,ADG,FALSE -ACG,T,2,,,ACG,,ACT,TRUE -ACT,T,0,CCT GCT TCT,SAP,BCT,CCT GCT TCT,BCT,FALSE -ACT,T,1,AAT AGT ATT,NSI,ADT,AAT ATT,ADT,FALSE -ACT,T,2,,,ACT,,ACA,TRUE -AGA,R,0,GGA TGA,*G,KGA,GGA TGA,KGA,FALSE -AGA,R,1,AAA ACA ATA,TIK,AHA,AAA ACA ATA,AHA,FALSE -AGA,R,2,AGC AGT,S,AGY,AGT,AGK,TRUE -AGC,S,0,CGC GGC TGC,RCG,BGC,GGC TGC,BGC,FALSE -AGC,S,1,AAC ACC ATC,NTI,AHC,AAC ACC ATC,AHC,FALSE -AGC,S,2,AGA AGG,R,AGR,AGA,AGW,TRUE -AGG,R,0,GGG TGG,WG,KGG,GGG TGG,KGG,FALSE -AGG,R,1,AAG ACG ATG,TMK,AHG,AAG ACG ATG,AHG,FALSE -AGG,R,2,AGC AGT,S,AGY,AGT,AGW,TRUE -AGT,S,0,CGT GGT TGT,RCG,BGT,GGT TGT,BGT,FALSE -AGT,S,1,AAT ACT ATT,NTI,AHT,AAT ACT ATT,AHT,FALSE -AGT,S,2,AGA AGG,R,AGR,AGA,AGM,TRUE -ATA,I,0,CTA GTA TTA,LV,BTA,TTA GTA,BTA,FALSE -ATA,I,1,AAA ACA AGA,TRK,AVA,AAA ACA AGA,AVA,FALSE -ATA,I,2,ATG,M,ATG,ATG,ATK,TRUE -ATC,I,0,CTC GTC TTC,LFV,BTC,CTC GTC TTC,BTC,FALSE -ATC,I,1,AAC ACC AGC,NST,AVC,AAC ACC AGC,AVC,FALSE -ATC,I,2,ATG,M,ATG,ATG,ATK,TRUE -ATG,M,0,CTG GTG TTG,LV,BTG,TTA GTA,BTG,FALSE -ATG,M,1,AAG ACG AGG,TRK,AVG,AAG ACG AGG,AVG,FALSE -ATG,M,2,ATA ATC ATT,I,ATH,ATT,ATK,TRUE -ATT,I,0,CTT GTT TTT,LFV,BTT,CTT GTT 
TTT,BTT,FALSE -ATT,I,1,AAT ACT AGT,NST,AVT,AAT ACT AGT,AVT,FALSE -ATT,I,2,ATG,M,ATG,ATG,ATR,TRUE -CAA,Q,0,AAA GAA TAA,*EK,DAA,AAA GAA TAA,DAA,FALSE -CAA,Q,1,CCA CGA CTA,RLP,CBA,CCA CGA CTA,CBA,FALSE -CAA,Q,2,CAC CAT,H,CAY,CAT,CAK,TRUE -CAC,H,0,AAC GAC TAC,NDY,DAC,AAC GAC TAC,DAC,FALSE -CAC,H,1,CCC CGC CTC,RLP,CBC,CCC CGC CTC,CBC,FALSE -CAC,H,2,CAA CAG,Q,CAR,CAA,CAW,TRUE -CAG,Q,0,AAG GAG TAG,*EK,DAG,AAG GAG TAG,DAG,FALSE -CAG,Q,1,CCG CGG CTG,RLP,CBG,CCG CGG CTG,CBG,FALSE -CAG,Q,2,CAC CAT,H,CAY,CAT,CAW,TRUE -CAT,H,0,AAT GAT TAT,NDY,DAT,AAT GAT TAT,DAT,FALSE -CAT,H,1,CCT CGT CTT,RLP,CBT,CCT CGT CTT,CBT,FALSE -CAT,H,2,CAA CAG,Q,CAR,CAA,CAM,TRUE -CCA,P,0,ACA GCA TCA,TSA,DCA,ACA GCA TCA,DCA,FALSE -CCA,P,1,CAA CGA CTA,RQL,CDA,CAA CGA CTA,CDA,FALSE -CCA,P,2,,,CCA,,CCT,TRUE -CCC,P,0,ACC GCC TCC,TSA,DCC,ACC GCC TCC,DCC,FALSE -CCC,P,1,CAC CGC CTC,RHL,CDC,CAC CGC CTC,CDC,FALSE -CCC,P,2,,,CCC,,CCA,TRUE -CCG,P,0,ACG GCG TCG,TSA,DCG,ACG GCG TCG,DCG,FALSE -CCG,P,1,CAG CGG CTG,RQL,CDG,CAG CGG CTG,CDG,FALSE -CCG,P,2,,,CCG,,CCA,TRUE -CCT,P,0,ACT GCT TCT,TSA,DCT,ACT GCT TCT,DCT,FALSE -CCT,P,1,CAT CGT CTT,RHL,CDT,CAT CGT CTT,CDT,FALSE -CCT,P,2,,,CCT,,CCA,TRUE -CGA,R,0,GGA TGA,*G,KGA,GGA TGA,DGA,TRUE -CGA,R,1,CAA CCA CTA,LQP,CHA,CAA CCA CTA,CHA,FALSE -CGA,R,2,,,CGA,,CGA,FALSE -CGC,R,0,AGC GGC TGC,SCG,DGC,AGC GGC TGC,DGC,FALSE -CGC,R,1,CAC CCC CTC,LHP,CHC,CAC CCC CTC,CHC,FALSE -CGC,R,2,,,CGC,,CGT,TRUE -CGG,R,0,GGG TGG,WG,KGG,GGG TGG,DGG,TRUE -CGG,R,1,CAG CCG CTG,LQP,CHG,CAG CCG CTG,CHG,FALSE -CGG,R,2,,,CGG,,CGG,FALSE -CGT,R,0,AGT GGT TGT,SCG,DGT,AGT GGT TGT,DGT,FALSE -CGT,R,1,CAT CCT CTT,LHP,CHT,CAT CCT CTT,CHT,FALSE -CGT,R,2,,,CGT,,CGA,TRUE -CTA,L,0,ATA GTA,IV,RTA,ATA GTA,DTA,TRUE -CTA,L,1,CAA CCA CGA,RQP,CVA,CAA CCA CGA,CVA,FALSE -CTA,L,2,,,CTA,,CTA,FALSE -CTC,L,0,ATC GTC TTC,FIV,DTC,ATC GTC TTC,DTC,FALSE -CTC,L,1,CAC CCC CGC,RHP,CVC,CAC CCC CGC,CVC,FALSE -CTC,L,2,,,CTC,,CTA,TRUE -CTG,L,0,ATG GTG,VM,RTG,ATG GTG,DTG,TRUE -CTG,L,1,CAG CCG CGG,RQP,CVG,CAG CCG CGG,CVG,FALSE 
-CTG,L,2,,,CTG,,CTG,FALSE -CTT,L,0,ATT GTT TTT,FIV,DTT,ATT GTT TTT,DTT,FALSE -CTT,L,1,CAT CCT CGT,RHP,CVT,CAT CCT CGT,CVT,FALSE -CTT,L,2,,,CTT,,CTA,TRUE -GAA,E,0,AAA CAA TAA,*QK,HAA,AAA CAA TAA,HAA,FALSE -GAA,E,1,GCA GGA GTA,VAG,GBA,GCA GGA GTA,GBA,FALSE -GAA,E,2,GAC GAT,D,GAY,GAT,GAK,TRUE -GAC,D,0,AAC CAC TAC,NHY,HAC,AAC CAC TAC,HAC,FALSE -GAC,D,1,GCC GGC GTC,VAG,GBC,GCC GGC GTC,GBC,FALSE -GAC,D,2,GAA GAG,E,GAR,GAA,GAW,TRUE -GAG,E,0,AAG CAG TAG,*QK,HAG,AAG CAG TAG,HAG,FALSE -GAG,E,1,GCG GGG GTG,VAG,GBG,GCG GGG GTG,GBG,FALSE -GAG,E,2,GAC GAT,D,GAY,GAT,GAW,TRUE -GAT,D,0,AAT CAT TAT,NHY,HAT,AAT CAT TAT,HAT,FALSE -GAT,D,1,GCT GGT GTT,VAG,GBT,GCT GGT GTT,GBT,FALSE -GAT,D,2,GAA GAG,E,GAR,GAA,GAM,TRUE -GCA,A,0,ACA CCA TCA,TSP,HCA,ACA CCA TCA,HCA,FALSE -GCA,A,1,GAA GGA GTA,GVE,GDA,GAA GGA GTA,GDA,FALSE -GCA,A,2,,,GCA,,GCT,TRUE -GCC,A,0,ACC CCC TCC,TSP,HCC,ACC CCC TCC,HCC,FALSE -GCC,A,1,GAC GGC GTC,VDG,GDC,GAC GGC GTC,GDC,FALSE -GCC,A,2,,,GCC,,GCT,TRUE -GCG,A,0,ACG CCG TCG,TSP,HCG,ACG CCG TCG,HCG,FALSE -GCG,A,1,GAG GGG GTG,GVE,GDG,GAG GGG GTG,GDG,FALSE -GCG,A,2,,,GCG,,GCT,TRUE -GCT,A,0,ACT CCT TCT,TSP,HCT,ACT CCT TCT,HCT,FALSE -GCT,A,1,GAT GGT GTT,VDG,GDT,GAT GGT GTT,GDT,FALSE -GCT,A,2,,,GCT,,GCA,TRUE -GGA,G,0,AGA CGA TGA,R*,HGA,AGA TGA,HGA,FALSE -GGA,G,1,GAA GCA GTA,VAE,GHA,GAA GCA GTA,GHA,FALSE -GGA,G,2,,,GGA,,GGT,TRUE -GGC,G,0,AGC CGC TGC,RSC,HGC,AGC CGC TGC,HGC,FALSE -GGC,G,1,GAC GCC GTC,ADV,GHC,GAC GCC GTC,GHC,FALSE -GGC,G,2,,,GGC,,GGT,TRUE -GGG,G,0,AGG CGG TGG,RW,HGG,AGG TGG,HGG,FALSE -GGG,G,1,GAG GCG GTG,VAE,GHG,GAG GCG GTG,GHG,FALSE -GGG,G,2,,,GGG,,GGT,TRUE -GGT,G,0,AGT CGT TGT,RSC,HGT,AGT CGT TGT,HGT,FALSE -GGT,G,1,GAT GCT GTT,ADV,GHT,GAT GCT GTT,GHT,FALSE -GGT,G,2,,,GGT,,GGA,TRUE -GTA,V,0,ATA CTA TTA,LI,HTA,ATA TTA,HTA,FALSE -GTA,V,1,GAA GCA GGA,GAE,GVA,GAA GCA GGA,GVA,FALSE -GTA,V,2,,,GTA,,GTT,TRUE -GTC,V,0,ATC CTC TTC,LFI,HTC,ATC CTC TTC,HTC,FALSE -GTC,V,1,GAC GCC GGC,ADG,GVC,GAC GCC GGC,GVC,FALSE -GTC,V,2,,,GTC,,GTT,TRUE -GTG,V,0,ATG CTG TTG,LM,HTG,ATG 
TTG,HTG,FALSE -GTG,V,1,GAG GCG GGG,GAE,GVG,GAG GCG GGG,GVG,FALSE -GTG,V,2,,,GTG,,GTT,TRUE -GTT,V,0,ATT CTT TTT,LFI,HTT,ATT CTT TTT,HTT,FALSE -GTT,V,1,GAT GCT GGT,ADG,GVT,GAT GCT GGT,GVT,FALSE -GTT,V,2,,,GTT,,GTA,TRUE -TAA,*,0,AAA CAA GAA,QEK,VAA,AAA CAA GAA,VAA,FALSE -TAA,*,1,TCA TTA,LS,TYA,TCA TTA,TYA,FALSE -TAA,*,2,TAC TAT,Y,TAY,TAT,TAK,TRUE -TAC,Y,0,AAC CAC GAC,NHD,VAC,AAC CAC GAC,VAC,FALSE -TAC,Y,1,TCC TGC TTC,SFC,TBC,TCC TGC TTC,TBC,FALSE -TAC,Y,2,TAA TAG,*,TAR,TAA,TAW,TRUE -TAG,*,0,AAG CAG GAG,QEK,VAG,AAG CAG GAG,VAG,FALSE -TAG,*,1,TCG TGG TTG,LSW,TBG,TCG TGG TTG,TBG,FALSE -TAG,*,2,TAC TAT,Y,TAY,TAT,TAW,TRUE -TAT,Y,0,AAT CAT GAT,NHD,VAT,AAT CAT GAT,VAT,FALSE -TAT,Y,1,TCT TGT TTT,SFC,TBT,TCT TGT TTT,TBT,FALSE -TAT,Y,2,TAA TAG,*,TAR,TAA,TAM,TRUE -TCA,S,0,ACA CCA GCA,TAP,VCA,ACA CCA GCA,VCA,FALSE -TCA,S,1,TAA TGA TTA,L*,TDA,TAA TTA,TDA,FALSE -TCA,S,2,,,TCA,,TCT,TRUE -TCC,S,0,ACC CCC GCC,TAP,VCC,ACC CCC GCC,VCC,FALSE -TCC,S,1,TAC TGC TTC,FCY,TDC,TAC TGC TTC,TDC,FALSE -TCC,S,2,,,TCC,,TCT,TRUE -TCG,S,0,ACG CCG GCG,TAP,VCG,ACG CCG GCG,VCG,FALSE -TCG,S,1,TAG TGG TTG,L*W,TDG,TAG TGG TTG,TDG,FALSE -TCG,S,2,,,TCG,,TCT,TRUE -TCT,S,0,ACT CCT GCT,TAP,VCT,ACT CCT GCT,VCT,FALSE -TCT,S,1,TAT TGT TTT,FCY,TDT,TAT TGT TTT,TDT,FALSE -TCT,S,2,,,TCT,,TCA,TRUE -TGA,*,0,AGA CGA GGA,RG,VGA,AGA GGA,VGA,FALSE -TGA,*,1,TCA TTA,LS,TYA,TCA TTA,THA,TRUE -TGA,*,2,TGC TGG TGT,CW,TGB,TGT TGG,TGB,FALSE -TGC,C,0,AGC CGC GGC,RSG,VGC,CGC GGC,VGC,FALSE -TGC,C,1,TAC TCC TTC,SFY,THC,TAC TCC TTC,THC,FALSE -TGC,C,2,TGA TGG,*W,TGR,TGA TGG,TGD,TRUE -TGG,W,0,AGG CGG GGG,RG,VGG,AGG GGG,VGG,FALSE -TGG,W,1,TAG TCG TTG,L*S,THG,TAG TCG TTG,THG,FALSE -TGG,W,2,TGA TGC TGT,*C,TGH,TGC TGT,TGB,TRUE -TGT,C,0,AGT CGT GGT,RSG,VGT,CGT GGT,VGT,FALSE -TGT,C,1,TAT TCT TTT,SFY,THT,TAT TCT TTT,THT,FALSE -TGT,C,2,TGA TGG,*W,TGR,TGA TGG,TGV,TRUE -TTA,L,0,ATA GTA,IV,RTA,ATA GTA,RTA,FALSE -TTA,L,1,TAA TCA TGA,*S,TVA,TAA TCA,TVA,FALSE -TTA,L,2,TTC TTT,F,TTY,TTT,TTK,TRUE -TTC,F,0,ATC CTC GTC,LIV,VTC,ATC GTC,VTC,FALSE -TTC,F,1,TAC 
TCC TGC,SCY,TVC,TAC TCC TGC,TVC,FALSE -TTC,F,2,TTA TTG,L,TTR,TTG,TTK,TRUE -TTG,L,0,ATG GTG,VM,RTG,ATG GTG,RTG,FALSE -TTG,L,1,TAG TCG TGG,*SW,TVG,TAG TCG TGG,TVG,FALSE -TTG,L,2,TTC TTT,F,TTY,TTT,TTW,TRUE -TTT,F,0,ATT CTT GTT,LIV,VTT,ATT GTT,VTT,FALSE -TTT,F,1,TAT TCT TGT,SCY,TVT,TAT TCT TGT,TVT,FALSE -TTT,F,2,TTA TTG,L,TTR,TTG,TTS,TRUE \ No newline at end of file diff --git a/k3l_test.fa b/k3l_test.fa index 8b83754..1ac70b6 100644 --- a/k3l_test.fa +++ b/k3l_test.fa @@ -1,194 +1,194 @@ >window_1-1_GCG11HCG -TTTGTTATTCGTTGCCCAATHCGGGWGAYGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATHCGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GCG11GDG -TTTGTTATTCGTTGCCCAATGDGGGWGAYGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATGDGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GGT12HGT -TTTGTTATTCGTTGCCCAATGCWHGTGAYGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwHGTgaygtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GGT12GHT -TTTGTTATTCGTTGCCCAATGCWGHTGAYGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwGHTgaygtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GAT13HAT -TTTGTTATTCGTTGCCCAATGCWGGWHATGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GAT13GBT -TTTGTTATTCGTTGCCCAATGCWGGWGBTGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwGBTgtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GAT13GAA -TTTGTTATTCGTTGCCCAATGCWGGWGAAGTWATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwGAAgtwatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GTA14WTA -TTTGTTATTCGTTGCCCAATGCWGGWGAYWTAATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgayWTAatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_GTA14GVA -TTTGTTATTCGTTGCCCAATGCWGGWGAYGVAATWAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgayGVAatwaarGGCAGAGTATACGAGAAGGATT >window_1-1_ATA15KTA -TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWKTAAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgaygtwKTAaarGGCAGAGTATACGAGAAGGATT >window_1-1_ATA15AVA 
-TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWAVAAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgaygtwAVAaarGGCAGAGTATACGAGAAGGATT >window_1-1_ATA15ATG -TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWATGAARGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgaygtwATGaarGGCAGAGTATACGAGAAGGATT >window_1-1_AAG16BAG -TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWATWBAGGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwBAGGGCAGAGTATACGAGAAGGATT >window_1-1_AAG16ABG -TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWATWABGGGCAGAGTATACGAGAAGGATT ->window_1-1_AAG16AAT -TTTGTTATTCGTTGCCCAATGCWGGWGAYGTWATWAATGGCAGAGTATACGAGAAGGATT +TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwABGGGCAGAGTATACGAGAAGGATT +>window_1-1_AAG16AAW +TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwAAWGGCAGAGTATACGAGAAGGATT >window_1-2_GGC17HGC -ATGCGGGTGATGTAATAAAGHGCAGRGTATACGAGAAGGATTATGCTCTATAT +ATGCGGGTGATGTAATAAAGHGCagrGTATACGAGAAGGATTATGCTCTATAT >window_1-2_GGC17GHC -ATGCGGGTGATGTAATAAAGGHCAGRGTATACGAGAAGGATTATGCTCTATAT +ATGCGGGTGATGTAATAAAGGHCagrGTATACGAGAAGGATTATGCTCTATAT >window_1-2_AGA18KGA -ATGCGGGTGATGTAATAAAGGGWKGAGTATACGAGAAGGATTATGCTCTATAT +ATGCGGGTGATGTAATAAAGggwKGAGTATACGAGAAGGATTATGCTCTATAT >window_1-2_AGA18AHA -ATGCGGGTGATGTAATAAAGGGWAHAGTATACGAGAAGGATTATGCTCTATAT +ATGCGGGTGATGTAATAAAGggwAHAGTATACGAGAAGGATTATGCTCTATAT >window_1-2_AGA18AGT -ATGCGGGTGATGTAATAAAGGGWAGTGTATACGAGAAGGATTATGCTCTATAT +ATGCGGGTGATGTAATAAAGggwAGTGTATACGAGAAGGATTATGCTCTATAT >window_2-1_AGT43KGT -TTGAAGCTATCTTGGCAGAGKGTGTWAARATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGKGTgtwaaratgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_AGT43AHT -TTGAAGCTATCTTGGCAGAGAHTGTWAARATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGAHTgtwaaratgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_AGT43AGA -TTGAAGCTATCTTGGCAGAGAGAGTWAARATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGAGAgtwaaratgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_GTT44HTT -TTGAAGCTATCTTGGCAGAGTCWHTTAARATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwHTTaaratgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_GTT44GVT 
-TTGAAGCTATCTTGGCAGAGTCWGVTAARATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwGVTaaratgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_AAG45BAG -TTGAAGCTATCTTGGCAGAGTCWGTWBAGATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwBAGatgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_AAG45ABG -TTGAAGCTATCTTGGCAGAGTCWGTWABGATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwABGatgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_AAG45AAT -TTGAAGCTATCTTGGCAGAGTCWGTWAATATGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwAATatgCATATGGATAGATATGTTGAATATAGGGA >window_2-1_ATG46KTG -TTGAAGCTATCTTGGCAGAGTCWGTWAARKTGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwaarKTGCATATGGATAGATATGTTGAATATAGGGA >window_2-1_ATG46AVG -TTGAAGCTATCTTGGCAGAGTCWGTWAARAVGCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwaarAVGCATATGGATAGATATGTTGAATATAGGGA >window_2-1_ATG46ATT -TTGAAGCTATCTTGGCAGAGTCWGTWAARATTCATATGGATAGATATGTTGAATATAGGGA +TTGAAGCTATCTTGGCAGAGtcwgtwaarATTCATATGGATAGATATGTTGAATATAGGGA >window_2-2_CAT47DAT -TGGCAGAGAGTGTTAAGATGDATATGGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGDATatggayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_CAT47CBT -TGGCAGAGAGTGTTAAGATGCBTATGGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGCBTatggayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_CAT47CAA -TGGCAGAGAGTGTTAAGATGCAAATGGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGCAAatggayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_ATG48KTG -TGGCAGAGAGTGTTAAGATGCAYKTGGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayKTGgayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_ATG48AVG -TGGCAGAGAGTGTTAAGATGCAYAVGGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayAVGgayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_ATG48ATT -TGGCAGAGAGTGTTAAGATGCAYATTGAYAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayATTgayagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_GAT49HAT -TGGCAGAGAGTGTTAAGATGCAYATGHATAGRTAYGTTGAATATAGGGATAAACTGGTAG 
+TGGCAGAGAGTGTTAAGATGcayatgHATagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_GAT49GBT -TGGCAGAGAGTGTTAAGATGCAYATGGBTAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatgGBTagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_GAT49GAA -TGGCAGAGAGTGTTAAGATGCAYATGGAAAGRTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatgGAAagrtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_AGA50KGA -TGGCAGAGAGTGTTAAGATGCAYATGGAYKGATAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayKGAtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_AGA50AHA -TGGCAGAGAGTGTTAAGATGCAYATGGAYAHATAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayAHAtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_AGA50AGT -TGGCAGAGAGTGTTAAGATGCAYATGGAYAGTTAYGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayAGTtayGTTGAATATAGGGATAAACTGGTAG >window_2-2_TAT51VAT -TGGCAGAGAGTGTTAAGATGCAYATGGAYAGRVATGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayagrVATGTTGAATATAGGGATAAACTGGTAG >window_2-2_TAT51TBT -TGGCAGAGAGTGTTAAGATGCAYATGGAYAGRTBTGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayagrTBTGTTGAATATAGGGATAAACTGGTAG >window_2-2_TAT51TAA -TGGCAGAGAGTGTTAAGATGCAYATGGAYAGRTAAGTTGAATATAGGGATAAACTGGTAG +TGGCAGAGAGTGTTAAGATGcayatggayagrTAAGTTGAATATAGGGATAAACTGGTAG >window_3-1_GAT71HAT -AAGTTAAAGTGATTAGAGTTHATTAYACWAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTHATtayacwaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_GAT71GBT -AAGTTAAAGTGATTAGAGTTGBTTAYACWAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTGBTtayacwaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_GAT71GAA -AAGTTAAAGTGATTAGAGTTGAATAYACWAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTGAAtayacwaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_TAT72VAT -AAGTTAAAGTGATTAGAGTTGAYVATACWAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgayVATacwaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_TAT72TBT -AAGTTAAAGTGATTAGAGTTGAYTBTACWAARGGATATATAGATGTCAATTACAAAAGGATG ->window_3-1_TAT72TAA -AAGTTAAAGTGATTAGAGTTGAYTAAACWAARGGATATATAGATGTCAATTACAAAAGGATG 
+AAGTTAAAGTGATTAGAGTTgayTBTacwaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_ACA73BCA -AAGTTAAAGTGATTAGAGTTGAYTAYBCAAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgaytayBCAaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_ACA73ADA -AAGTTAAAGTGATTAGAGTTGAYTAYADAAARGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgaytayADAaarGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_AAA74BAA -AAGTTAAAGTGATTAGAGTTGAYTAYACWBAAGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgaytayacwBAAGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_AAA74ABA -AAGTTAAAGTGATTAGAGTTGAYTAYACWABAGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgaytayacwABAGGATATATAGATGTCAATTACAAAAGGATG >window_3-1_AAA74AAT -AAGTTAAAGTGATTAGAGTTGAYTAYACWAATGGATATATAGATGTCAATTACAAAAGGATG +AAGTTAAAGTGATTAGAGTTgaytayacwAATGGATATATAGATGTCAATTACAAAAGGATG >window_3-2_GGA75WGA -TTAGAGTTGATTATACAAAAWGATAYATWGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAWGAtayatwgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_GGA75GHA -TTAGAGTTGATTATACAAAAGHATAYATWGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAGHAtayatwgaygtwAATTACAAAAGGATGTGTAGACATC +>window_3-2_GGA75GGT +TTAGAGTTGATTATACAAAAGGTtayatwgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_TAT76VAT -TTAGAGTTGATTATACAAAAGGWVATATWGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwVATatwgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_TAT76TBT -TTAGAGTTGATTATACAAAAGGWTBTATWGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwTBTatwgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_TAT76TAA -TTAGAGTTGATTATACAAAAGGWTAAATWGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwTAAatwgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_ATA77KTA -TTAGAGTTGATTATACAAAAGGWTAYKTAGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayKTAgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_ATA77AVA -TTAGAGTTGATTATACAAAAGGWTAYAVAGAYGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayAVAgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_ATA77ATG -TTAGAGTTGATTATACAAAAGGWTAYATGGAYGTWAATTACAAAAGGATGTGTAGACATC 
+TTAGAGTTGATTATACAAAAggwtayATGgaygtwAATTACAAAAGGATGTGTAGACATC >window_3-2_GAT78HAT -TTAGAGTTGATTATACAAAAGGWTAYATWHATGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayatwHATgtwAATTACAAAAGGATGTGTAGACATC >window_3-2_GAT78GBT -TTAGAGTTGATTATACAAAAGGWTAYATWGBTGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayatwGBTgtwAATTACAAAAGGATGTGTAGACATC >window_3-2_GAT78GAA -TTAGAGTTGATTATACAAAAGGWTAYATWGAAGTWAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayatwGAAgtwAATTACAAAAGGATGTGTAGACATC >window_3-2_GTC79HTC -TTAGAGTTGATTATACAAAAGGWTAYATWGAYHTCAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayatwgayHTCAATTACAAAAGGATGTGTAGACATC >window_3-2_GTC79GVC -TTAGAGTTGATTATACAAAAGGWTAYATWGAYGVCAATTACAAAAGGATGTGTAGACATC +TTAGAGTTGATTATACAAAAggwtayatwgayGVCAATTACAAAAGGATGTGTAGACATC >window_3-3_AAT80BAT -CAAAAGGATATATAGATGTCBATTAYAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCBATtayaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AAT80ABT -CAAAAGGATATATAGATGTCABTTAYAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCABTtayaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AAT80AAA -CAAAAGGATATATAGATGTCAAATAYAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCAAAtayaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_TAC81VAC -CAAAAGGATATATAGATGTCAAYVACAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaayVACaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_TAC81TBC -CAAAAGGATATATAGATGTCAAYTBCAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaayTBCaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_TAC81TAA -CAAAAGGATATATAGATGTCAAYTAAAARAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaayTAAaaragratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AAA82BAA -CAAAAGGATATATAGATGTCAAYTAYBAAAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayBAAagratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AAA82ABA -CAAAAGGATATATAGATGTCAAYTAYABAAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayABAagratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AAA82AAT 
-CAAAAGGATATATAGATGTCAAYTAYAATAGRATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayAATagratgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AGG83KGG -CAAAAGGATATATAGATGTCAAYTAYAARKGGATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaarKGGatgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_AGG83AHG -CAAAAGGATATATAGATGTCAAYTAYAARAHGATGTGYAGACATCAATAGTAGCTGTCGA ->window_3-3_AGG83AGT -CAAAAGGATATATAGATGTCAAYTAYAARAGTATGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaarAHGatgtgyAGACATCAATAGTAGCTGTCGA +>window_3-3_AGG83AGW +CAAAAGGATATATAGATGTCaaytayaarAGWatgtgyAGACATCAATAGTAGCTGTCGA >window_3-3_ATG84KTG -CAAAAGGATATATAGATGTCAAYTAYAARAGRKTGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragrKTGtgyAGACATCAATAGTAGCTGTCGA >window_3-3_ATG84AVG -CAAAAGGATATATAGATGTCAAYTAYAARAGRAVGTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragrAVGtgyAGACATCAATAGTAGCTGTCGA >window_3-3_ATG84ATT -CAAAAGGATATATAGATGTCAAYTAYAARAGRATTTGYAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragrATTtgyAGACATCAATAGTAGCTGTCGA >window_3-3_TGT85SGT -CAAAAGGATATATAGATGTCAAYTAYAARAGRATGSGTAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragratgSGTAGACATCAATAGTAGCTGTCGA >window_3-3_TGT85THT -CAAAAGGATATATAGATGTCAAYTAYAARAGRATGTHTAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragratgTHTAGACATCAATAGTAGCTGTCGA >window_3-3_TGT85TGR -CAAAAGGATATATAGATGTCAAYTAYAARAGRATGTGRAGACATCAATAGTAGCTGTCGA +CAAAAGGATATATAGATGTCaaytayaaragratgTGRAGACATCAATAGTAGCTGTCGA >window_3-4_AGA86KGA -TCAATTACAAAAGGATGTGTKGACAYCARTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTKGAcaycarTAGTAGCTGTCGAGTCGC >window_3-4_AGA86AHA -TCAATTACAAAAGGATGTGTAHACAYCARTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTAHAcaycarTAGTAGCTGTCGAGTCGC >window_3-4_AGA86AGT -TCAATTACAAAAGGATGTGTAGTCAYCARTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTAGTcaycarTAGTAGCTGTCGAGTCGC >window_3-4_CAT87DAT -TCAATTACAAAAGGATGTGTAGRDATCARTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTagrDATcarTAGTAGCTGTCGAGTCGC >window_3-4_CAT87CBT -TCAATTACAAAAGGATGTGTAGRCBTCARTAGTAGCTGTCGAGTCGC 
+TCAATTACAAAAGGATGTGTagrCBTcarTAGTAGCTGTCGAGTCGC >window_3-4_CAT87CAA -TCAATTACAAAAGGATGTGTAGRCAACARTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTagrCAAcarTAGTAGCTGTCGAGTCGC >window_3-4_CAA88DAA -TCAATTACAAAAGGATGTGTAGRCAYDAATAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTagrcayDAATAGTAGCTGTCGAGTCGC >window_3-4_CAA88CBA -TCAATTACAAAAGGATGTGTAGRCAYCBATAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTagrcayCBATAGTAGCTGTCGAGTCGC >window_3-4_CAA88CAT -TCAATTACAAAAGGATGTGTAGRCAYCATTAGTAGCTGTCGAGTCGC +TCAATTACAAAAGGATGTGTagrcayCATTAGTAGCTGTCGAGTCGC diff --git a/k3l_test.tsv b/k3l_test.tsv new file mode 100644 index 0000000..0513709 --- /dev/null +++ b/k3l_test.tsv @@ -0,0 +1,98 @@ +name sub_window_name wt position iupac codon_sub iupac_aa synonymous_codons no_stop_codons primer homology_arm sub_window forward_primer forward_primer_tm forward_primer_gc forward_primer_len reverse_primer_name reverse_primer reverse_primer_tm reverse_primer_gc reverse_primer_len +window_1-1_GCG11HCG window_1-1 GCG 11 HCG GCG11HCG TPS 0.0 0.0 TTTGTTATTCGTTGCCCAATHCGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT HCGggwgaygtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GCG11GDG window_1-1 GCG 11 GDG GCG11GDG EGV 0.0 0.0 TTTGTTATTCGTTGCCCAATGDGggwgaygtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT GDGggwgaygtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GGT12HGT window_1-1 GGT 12 HGT GGT12HGT SRC 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwHGTgaygtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwHGTgaygtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GGT12GHT window_1-1 GGT 12 GHT GGT12GHT DAV 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwGHTgaygtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwGHTgaygtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GAT13HAT window_1-1 GAT 
13 HAT GAT13HAT NHY 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwHATgtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwHATgtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GAT13GBT window_1-1 GAT 13 GBT GAT13GBT AGV 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwGBTgtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwGBTgtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GAT13GAA window_1-1 GAT 13 GAA GAT13GAA E 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwGAAgtwatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwGAAgtwatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GTA14WTA window_1-1 GTA 14 WTA GTA14WTA IL 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgayWTAatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgayWTAatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_GTA14GVA window_1-1 GTA 14 GVA GTA14GVA EAG 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgayGVAatwaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgayGVAatwaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_ATA15KTA window_1-1 ATA 15 KTA ATA15KTA VL 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwKTAaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwKTAaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_ATA15AVA window_1-1 ATA 15 AVA ATA15AVA KTR 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwAVAaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwAVAaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_ATA15ATG window_1-1 ATA 15 ATG ATA15ATG M 0.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwATGaarGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwATGaar GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 
+window_1-1_AAG16BAG window_1-1 AAG 16 BAG AAG16BAG QE* 1.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwBAGGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwatwBAG GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_AAG16ABG window_1-1 AAG 16 ABG AAG16ABG TRM 1.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwABGGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwatwABG GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-1_AAG16AAW window_1-1 AAG 16 AAW AAG16AAW KN 1.0 0.0 TTTGTTATTCGTTGCCCAATgcwggwgaygtwatwAAWGGCAGAGTATACGAGAAGGATT TTTGTTATTCGTTGCCCAAT gcwggwgaygtwatwAAW GGCAGAGTATACGAGAAGGATT 51.3 45.5 22 rev_window_1-1 ATTGGGCAACGAATAACAAAATGCA 55.1 36.0 25 +window_1-2_GGC17HGC window_1-2 GGC 17 HGC GGC17HGC SRC 0.0 0.0 ATGCGGGTGATGTAATAAAGHGCagrGTATACGAGAAGGATTATGCTCTATAT ATGCGGGTGATGTAATAAAG HGCagr GTATACGAGAAGGATTATGCTCTATAT 50.1 33.3 27 rev_window_1-2 CTTTATTACATCACCCGCATTGGGC 56.8 48.0 25 +window_1-2_GGC17GHC window_1-2 GGC 17 GHC GGC17GHC DAV 0.0 0.0 ATGCGGGTGATGTAATAAAGGHCagrGTATACGAGAAGGATTATGCTCTATAT ATGCGGGTGATGTAATAAAG GHCagr GTATACGAGAAGGATTATGCTCTATAT 50.1 33.3 27 rev_window_1-2 CTTTATTACATCACCCGCATTGGGC 56.8 48.0 25 +window_1-2_AGA18KGA window_1-2 AGA 18 KGA AGA18KGA G* 0.0 0.0 ATGCGGGTGATGTAATAAAGggwKGAGTATACGAGAAGGATTATGCTCTATAT ATGCGGGTGATGTAATAAAG ggwKGA GTATACGAGAAGGATTATGCTCTATAT 50.1 33.3 27 rev_window_1-2 CTTTATTACATCACCCGCATTGGGC 56.8 48.0 25 +window_1-2_AGA18AHA window_1-2 AGA 18 AHA AGA18AHA KTI 0.0 0.0 ATGCGGGTGATGTAATAAAGggwAHAGTATACGAGAAGGATTATGCTCTATAT ATGCGGGTGATGTAATAAAG ggwAHA GTATACGAGAAGGATTATGCTCTATAT 50.1 33.3 27 rev_window_1-2 CTTTATTACATCACCCGCATTGGGC 56.8 48.0 25 +window_1-2_AGA18AGT window_1-2 AGA 18 AGT AGA18AGT S 0.0 0.0 ATGCGGGTGATGTAATAAAGggwAGTGTATACGAGAAGGATTATGCTCTATAT ATGCGGGTGATGTAATAAAG ggwAGT GTATACGAGAAGGATTATGCTCTATAT 50.1 33.3 27 rev_window_1-2 CTTTATTACATCACCCGCATTGGGC 56.8 48.0 25 +window_2-1_AGT43KGT window_2-1 
AGT 43 KGT AGT43KGT GC 0.0 0.0 TTGAAGCTATCTTGGCAGAGKGTgtwaaratgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG KGTgtwaaratg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_AGT43AHT window_2-1 AGT 43 AHT AGT43AHT NTI 0.0 0.0 TTGAAGCTATCTTGGCAGAGAHTgtwaaratgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG AHTgtwaaratg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_AGT43AGA window_2-1 AGT 43 AGA AGT43AGA R 0.0 0.0 TTGAAGCTATCTTGGCAGAGAGAgtwaaratgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG AGAgtwaaratg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_GTT44HTT window_2-1 GTT 44 HTT GTT44HTT ILF 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwHTTaaratgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwHTTaaratg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_GTT44GVT window_2-1 GTT 44 GVT GTT44GVT DAG 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwGVTaaratgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwGVTaaratg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_AAG45BAG window_2-1 AAG 45 BAG AAG45BAG QE* 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwBAGatgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwBAGatg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_AAG45ABG window_2-1 AAG 45 ABG AAG45ABG TRM 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwABGatgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwABGatg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_AAG45AAT window_2-1 AAG 45 AAT AAG45AAT N 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwAATatgCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwAATatg CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 
CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_ATG46KTG window_2-1 ATG 46 KTG ATG46KTG VL 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwaarKTGCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwaarKTG CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_ATG46AVG window_2-1 ATG 46 AVG ATG46AVG KTR 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwaarAVGCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwaarAVG CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-1_ATG46ATT window_2-1 ATG 46 ATT ATG46ATT I 0.0 0.0 TTGAAGCTATCTTGGCAGAGtcwgtwaarATTCATATGGATAGATATGTTGAATATAGGGA TTGAAGCTATCTTGGCAGAG tcwgtwaarATT CATATGGATAGATATGTTGAATATAGGGA 50.5 31.0 29 rev_window_2-1 CTCTGCCAAGATAGCTTCAAAGTGA 55.1 44.0 25 +window_2-2_CAT47DAT window_2-2 CAT 47 DAT CAT47DAT NDY 0.0 0.0 TGGCAGAGAGTGTTAAGATGDATatggayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG DATatggayagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_CAT47CBT window_2-2 CAT 47 CBT CAT47CBT PRL 0.0 0.0 TGGCAGAGAGTGTTAAGATGCBTatggayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG CBTatggayagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_CAT47CAA window_2-2 CAT 47 CAA CAT47CAA Q 0.0 0.0 TGGCAGAGAGTGTTAAGATGCAAatggayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG CAAatggayagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_ATG48KTG window_2-2 ATG 48 KTG ATG48KTG VL 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayKTGgayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayKTGgayagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_ATG48AVG window_2-2 ATG 48 AVG ATG48AVG KTR 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayAVGgayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayAVGgayagrtay 
GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_ATG48ATT window_2-2 ATG 48 ATT ATG48ATT I 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayATTgayagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayATTgayagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_GAT49HAT window_2-2 GAT 49 HAT GAT49HAT NHY 0.0 1.0 TGGCAGAGAGTGTTAAGATGcayatgHATagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatgHATagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_GAT49GBT window_2-2 GAT 49 GBT GAT49GBT AGV 0.0 1.0 TGGCAGAGAGTGTTAAGATGcayatgGBTagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatgGBTagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_GAT49GAA window_2-2 GAT 49 GAA GAT49GAA E 0.0 1.0 TGGCAGAGAGTGTTAAGATGcayatgGAAagrtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatgGAAagrtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_AGA50KGA window_2-2 AGA 50 KGA AGA50KGA G* 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayatggayKGAtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayKGAtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_AGA50AHA window_2-2 AGA 50 AHA AGA50AHA KTI 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayatggayAHAtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayAHAtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_AGA50AGT window_2-2 AGA 50 AGT AGA50AGT S 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayatggayAGTtayGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayAGTtay GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_TAT51VAT window_2-2 TAT 51 VAT TAT51VAT NHD 0.0 0.0 
TGGCAGAGAGTGTTAAGATGcayatggayagrVATGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayagrVAT GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_TAT51TBT window_2-2 TAT 51 TBT TAT51TBT SCF 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayatggayagrTBTGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayagrTBT GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_2-2_TAT51TAA window_2-2 TAT 51 TAA TAT51TAA * 0.0 0.0 TGGCAGAGAGTGTTAAGATGcayatggayagrTAAGTTGAATATAGGGATAAACTGGTAG TGGCAGAGAGTGTTAAGATG cayatggayagrTAA GTTGAATATAGGGATAAACTGGTAG 48.9 36.0 25 rev_window_2-2 CATCTTAACACTCTCTGCCAAGATAGC 55.7 44.4 27 +window_3-1_GAT71HAT window_3-1 GAT 71 HAT GAT71HAT NHY 0.0 0.0 AAGTTAAAGTGATTAGAGTTHATtayacwaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT HATtayacwaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_GAT71GBT window_3-1 GAT 71 GBT GAT71GBT AGV 0.0 0.0 AAGTTAAAGTGATTAGAGTTGBTtayacwaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT GBTtayacwaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_GAT71GAA window_3-1 GAT 71 GAA GAT71GAA E 0.0 0.0 AAGTTAAAGTGATTAGAGTTGAAtayacwaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT GAAtayacwaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_TAT72VAT window_3-1 TAT 72 VAT TAT72VAT NHD 0.0 1.0 AAGTTAAAGTGATTAGAGTTgayVATacwaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gayVATacwaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_TAT72TBT window_3-1 TAT 72 TBT TAT72TBT SCF 0.0 1.0 AAGTTAAAGTGATTAGAGTTgayTBTacwaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gayTBTacwaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 
AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_ACA73BCA window_3-1 ACA 73 BCA ACA73BCA PAS 0.0 0.0 AAGTTAAAGTGATTAGAGTTgaytayBCAaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gaytayBCAaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_ACA73ADA window_3-1 ACA 73 ADA ACA73ADA KRI 0.0 0.0 AAGTTAAAGTGATTAGAGTTgaytayADAaarGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gaytayADAaar GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_AAA74BAA window_3-1 AAA 74 BAA AAA74BAA QE* 0.0 0.0 AAGTTAAAGTGATTAGAGTTgaytayacwBAAGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gaytayacwBAA GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_AAA74ABA window_3-1 AAA 74 ABA AAA74ABA TRI 0.0 0.0 AAGTTAAAGTGATTAGAGTTgaytayacwABAGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gaytayacwABA GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-1_AAA74AAT window_3-1 AAA 74 AAT AAA74AAT N 0.0 0.0 AAGTTAAAGTGATTAGAGTTgaytayacwAATGGATATATAGATGTCAATTACAAAAGGATG AAGTTAAAGTGATTAGAGTT gaytayacwAAT GGATATATAGATGTCAATTACAAAAGGATG 51.1 30.0 30 rev_window_3-1 AACTCTAATCACTTTAACTTTTACAGTTTTCCC 55.1 30.3 33 +window_3-2_GGA75WGA window_3-2 GGA 75 WGA GGA75WGA R* 1.0 0.0 TTAGAGTTGATTATACAAAAWGAtayatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA WGAtayatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GGA75GHA window_3-2 GGA 75 GHA GGA75GHA EAV 1.0 0.0 TTAGAGTTGATTATACAAAAGHAtayatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA GHAtayatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GGA75GGT window_3-2 GGA 75 GGT GGA75GGT G 1.0 0.0 
TTAGAGTTGATTATACAAAAGGTtayatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA GGTtayatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_TAT76VAT window_3-2 TAT 76 VAT TAT76VAT NHD 0.0 0.0 TTAGAGTTGATTATACAAAAggwVATatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwVATatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_TAT76TBT window_3-2 TAT 76 TBT TAT76TBT SCF 0.0 0.0 TTAGAGTTGATTATACAAAAggwTBTatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwTBTatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_TAT76TAA window_3-2 TAT 76 TAA TAT76TAA * 0.0 0.0 TTAGAGTTGATTATACAAAAggwTAAatwgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwTAAatwgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_ATA77KTA window_3-2 ATA 77 KTA ATA77KTA VL 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayKTAgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayKTAgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_ATA77AVA window_3-2 ATA 77 AVA ATA77AVA KTR 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayAVAgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayAVAgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_ATA77ATG window_3-2 ATA 77 ATG ATA77ATG M 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayATGgaygtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayATGgaygtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GAT78HAT window_3-2 GAT 78 HAT GAT78HAT NHY 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayatwHATgtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayatwHATgtw 
AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GAT78GBT window_3-2 GAT 78 GBT GAT78GBT AGV 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayatwGBTgtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayatwGBTgtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GAT78GAA window_3-2 GAT 78 GAA GAT78GAA E 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayatwGAAgtwAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayatwGAAgtw AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GTC79HTC window_3-2 GTC 79 HTC GTC79HTC ILF 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayatwgayHTCAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayatwgayHTC AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-2_GTC79GVC window_3-2 GTC 79 GVC GTC79GVC DAG 0.0 0.0 TTAGAGTTGATTATACAAAAggwtayatwgayGVCAATTACAAAAGGATGTGTAGACATC TTAGAGTTGATTATACAAAA ggwtayatwgayGVC AATTACAAAAGGATGTGTAGACATC 49.7 32.0 25 rev_window_3-2 TTTTGTATAATCAACTCTAATCACTTTAACTTTTACAGT 55.2 23.1 39 +window_3-3_AAT80BAT window_3-3 AAT 80 BAT AAT80BAT HDY 0.0 0.0 CAAAAGGATATATAGATGTCBATtayaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC BATtayaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AAT80ABT window_3-3 AAT 80 ABT AAT80ABT TSI 0.0 0.0 CAAAAGGATATATAGATGTCABTtayaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC ABTtayaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AAT80AAA window_3-3 AAT 80 AAA AAT80AAA K 0.0 0.0 CAAAAGGATATATAGATGTCAAAtayaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC AAAtayaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 
+window_3-3_TAC81VAC window_3-3 TAC 81 VAC TAC81VAC NHD 0.0 0.0 CAAAAGGATATATAGATGTCaayVACaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aayVACaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_TAC81TBC window_3-3 TAC 81 TBC TAC81TBC SCF 0.0 0.0 CAAAAGGATATATAGATGTCaayTBCaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aayTBCaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_TAC81TAA window_3-3 TAC 81 TAA TAC81TAA * 0.0 0.0 CAAAAGGATATATAGATGTCaayTAAaaragratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aayTAAaaragratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AAA82BAA window_3-3 AAA 82 BAA AAA82BAA QE* 0.0 0.0 CAAAAGGATATATAGATGTCaaytayBAAagratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayBAAagratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AAA82ABA window_3-3 AAA 82 ABA AAA82ABA TRI 0.0 0.0 CAAAAGGATATATAGATGTCaaytayABAagratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayABAagratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AAA82AAT window_3-3 AAA 82 AAT AAA82AAT N 0.0 0.0 CAAAAGGATATATAGATGTCaaytayAATagratgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayAATagratgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AGG83KGG window_3-3 AGG 83 KGG AGG83KGG GW 1.0 0.0 CAAAAGGATATATAGATGTCaaytayaarKGGatgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaarKGGatgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AGG83AHG window_3-3 AGG 83 AHG AGG83AHG KTM 1.0 0.0 
CAAAAGGATATATAGATGTCaaytayaarAHGatgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaarAHGatgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_AGG83AGW window_3-3 AGG 83 AGW AGG83AGW RS 1.0 0.0 CAAAAGGATATATAGATGTCaaytayaarAGWatgtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaarAGWatgtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_ATG84KTG window_3-3 ATG 84 KTG ATG84KTG VL 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragrKTGtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragrKTGtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_ATG84AVG window_3-3 ATG 84 AVG ATG84AVG KTR 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragrAVGtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragrAVGtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_ATG84ATT window_3-3 ATG 84 ATT ATG84ATT I 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragrATTtgyAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragrATTtgy AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_TGT85SGT window_3-3 TGT 85 SGT TGT85SGT RG 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragratgSGTAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragratgSGT AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_TGT85THT window_3-3 TGT 85 THT TGT85THT YSF 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragratgTHTAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragratgTHT AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-3_TGT85TGR window_3-3 TGT 85 TGR TGT85TGR *W 0.0 0.0 CAAAAGGATATATAGATGTCaaytayaaragratgTGRAGACATCAATAGTAGCTGTCGA CAAAAGGATATATAGATGTC aaytayaaragratgTGR 
AGACATCAATAGTAGCTGTCGA 50.6 40.9 22 rev_window_3-3 GACATCTATATATCCTTTTGTATAATCAACTCTAATCACT 55.5 27.5 40 +window_3-4_AGA86KGA window_3-4 AGA 86 KGA AGA86KGA G* 0.0 0.0 TCAATTACAAAAGGATGTGTKGAcaycarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT KGAcaycar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_AGA86AHA window_3-4 AGA 86 AHA AGA86AHA KTI 0.0 0.0 TCAATTACAAAAGGATGTGTAHAcaycarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT AHAcaycar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_AGA86AGT window_3-4 AGA 86 AGT AGA86AGT S 0.0 0.0 TCAATTACAAAAGGATGTGTAGTcaycarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT AGTcaycar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_CAT87DAT window_3-4 CAT 87 DAT CAT87DAT NDY 0.0 0.0 TCAATTACAAAAGGATGTGTagrDATcarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrDATcar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_CAT87CBT window_3-4 CAT 87 CBT CAT87CBT PRL 0.0 0.0 TCAATTACAAAAGGATGTGTagrCBTcarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrCBTcar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_CAT87CAA window_3-4 CAT 87 CAA CAT87CAA Q 0.0 0.0 TCAATTACAAAAGGATGTGTagrCAAcarTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrCAAcar TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_CAA88DAA window_3-4 CAA 88 DAA CAA88DAA KE* 0.0 0.0 TCAATTACAAAAGGATGTGTagrcayDAATAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrcayDAA TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 +window_3-4_CAA88CBA window_3-4 CAA 88 CBA CAA88CBA PRL 0.0 0.0 TCAATTACAAAAGGATGTGTagrcayCBATAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrcayCBA TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 
29.4 34 +window_3-4_CAA88CAT window_3-4 CAA 88 CAT CAA88CAT H 0.0 0.0 TCAATTACAAAAGGATGTGTagrcayCATTAGTAGCTGTCGAGTCGC TCAATTACAAAAGGATGTGT agrcayCAT TAGTAGCTGTCGAGTCGC 50.3 55.6 18 rev_window_3-4 ACACATCCTTTTGTAATTGACATCTATATATCCT 55.0 29.4 34 diff --git a/main-script.ipynb b/main-script.ipynb new file mode 100644 index 0000000..f3f711b --- /dev/null +++ b/main-script.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "08ba2028-5b81-4d9b-93fc-37efa999a608", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "import argparse\n", + "from Bio.SeqUtils import GC\n", + "from Bio.SeqUtils import MeltingTemp as mt\n", + "from Bio.Seq import Seq\n", + "from Bio import SeqIO\n", + "import math\n", + "import numpy as np\n", + "import pandas as pd\n", + "import main_package # my package\n", + "\n", + "# parse arguments\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument(\"wt\", help=\"Genbank file path containing wild type (WT) sequence\", type=str)\n", + "parser.add_argument(\"o\", help=\"Output prefix\", type=str)\n", + "parser.add_argument(\"--vector\", help=\"Genbank file path containing vector sequence\", type=str, default=False)\n", + "parser.add_argument(\"--codon_table\", help=\"Specify codon table to use\", type=str, default='Standard')\n", + "parser.add_argument(\"--homo_len\", help=\"Length of homology arm in fwd primer\", type=int, default=20)\n", + "parser.add_argument(\"--oligo_len\", help=\"Ideal max total length of oligo\", type=int, default=60)\n", + "parser.add_argument(\"--melt_temp\", help=\"Melting temp of fwd primer\", type=int, default=50)\n", + "parser.add_argument(\"--rev_melt_temp\", help=\"Melting temp of rev primer\", type=int, default=55)\n", + "parser.add_argument(\"--syn_snp_rate\", help=\"Percentage of synonymous SNPs 0-1\", type=float, default=.05)\n", + "parser.add_argument(\"--stop_rate\", help=\"Percentage of stop codon SNPs, default = keep 10% of stop SNPs\", 
type=float, default=.10)\n", + "parser.add_argument(\"--rng_seed\", help=\"Set seed for repoducibly selecting synonymous codon sites\", type=int, default=42)\n", + "parser.add_argument(\"--out_dir\", help='Local output directory e.g. \"data\"', type=str)\n", + "args = parser.parse_args()\n", + "\n", + "# parse genbank files\n", + "wt_file = SeqIO.read(args.wt, 'genbank')\n", + "\n", + "# check for vector file\n", + "if not args.vector:\n", + " args.vector = args.wt\n", + "vector_file = SeqIO.read(vector_input, 'genbank')\n", + " \n", + "wt_seq = str(wt_file.seq.upper())\n", + "vector_seq = str(vector_file.seq.upper())\n", + "\n", + "# ERROR CHECKS\n", + "if len(wt_seq) != len(vector_seq):\n", + " print('ERROR: WildType and Vector GenBank sequences are not of equal length')\n", + " return\n", + "# check for -20 bp homology\n", + "# check that the strand is going forward\n", + "\n", + "# get start and stop of gene for codon positions\n", + "for feature in wt_file.features:\n", + " if feature.type == 'gene':\n", + " gene_start = feature.location.start.position\n", + " gene_end = feature.location.end.position\n", + "\n", + "# setup seq_data\n", + "seq_data = {}\n", + "seq_data['wt_seq'] = wt_seq\n", + "seq_data['vector_seq'] = vector_seq\n", + "seq_data['gene_start'] = gene_start\n", + "seq_data['gene_end'] = gene_end\n", + "seq_data['fasta_file'] = []\n", + "seq_data['df'] = pd.DataFrame()\n", + "seq_data['rng'] = np.random.RandomState(42)\n", + "\n", + "# this needs to be fixed (user input? 
yaml?)\n", + "targ_windows = ['window_1', 'window_2', 'window_3']\n", + "\n", + "for feature in wt_file.features:\n", + " if feature.type not in targ_windows:\n", + " continue\n", + " \n", + " start_index = feature.location.start.position\n", + " window_end = feature.location.end.position\n", + " \n", + " # loop for each sub_window\n", + " sub_window_n = 1\n", + " while start_index < window_end: # this could be an issue to toggle\n", + " data_dict = {}\n", + " data_dict['start_index'] = start_index\n", + " data_dict['window_end'] = window_end\n", + " data_dict['sub_window_name'] = {str(feature.type)}-{sub_window_n}\n", + " \n", + " # 1. homology arm\n", + " data_dict = main_package.primer_design.homology_arm(seq_data, data_dict, args)\n", + " \n", + " # 2. reverse primer\n", + " data_dict = main_package.primer_design.reverse_primer(seq_data, data_dict, args)\n", + " \n", + " # 3. forward primer\n", + " data_dict = forward_primer(seq_data, data_dict, args)\n", + " \n", + " # 4. variant window\n", + " seq_data, data_dict = main_package.primer_design.sub_window(seq_data, data_dict, args)\n", + " \n", + " # reset the start index for the next mini-window\n", + " start_index = primer_start\n", + " sub_window_n += 1 \n", + "\n", + "# setup .fa output, truncate if file exists\n", + "file = open(f\"{output_prefix}.fa\",'w+')\n", + "file.writelines(seq_data['fasta_file'])\n", + "file.close()\n", + "\n", + "# polish dataframe\n", + "df = seq_data['df']\n", + "df['position'] = df['position'].astype(int)\n", + "\n", + "df['forward_primer_tm'] = df['forward_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['forward_primer_gc'] = df['forward_primer'].apply(GC).round(1)\n", + "df['forward_primer_len'] = df['forward_primer'].str.len()\n", + "\n", + "df['reverse_primer_tm'] = df['reverse_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['reverse_primer_gc'] = df['reverse_primer'].apply(GC).round(1)\n", + "df['reverse_primer_len'] = df['reverse_primer'].str.len()\n", + 
"\n", + "cols = ['name','sub_window_name','wt','position','iupac','codon_sub','synonymous_codons','no_stop_codons','primer','homology_arm','sub_window','forward_primer','forward_primer_tm','forward_primer_gc','forward_primer_len','reverse_primer','reverse_primer_name','reverse_primer_tm','reverse_primer_gc','reverse_primer_len']\n", + "df = df[cols]\n", + "\n", + "# save dataframe as .tsv\n", + "df.to_csv(f'{output_prefix}.tsv', index=False, sep='\\t')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "alignparse-environment", + "language": "python", + "name": "alignparse-environment" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.ipynb b/main.ipynb index 41bf33e..09e0254 100644 --- a/main.ipynb +++ b/main.ipynb @@ -5,7 +5,8 @@ "id": "84354767-b851-46dc-814a-3c03b0b0c48d", "metadata": {}, "source": [ - "TODO:\n", + "TODO\n", + "- need to duplicate into script\n", "- need to locate the codon dictionaries from the main_package directory\n", "- stop codon variable" ] @@ -27,7 +28,7 @@ "import numpy as np\n", "import pandas as pd\n", "#import random\n", - "import main_package # my package\n" + "import main_package # my package" ] }, { @@ -91,76 +92,20 @@ { "cell_type": "code", "execution_count": 2, - "id": "b4c392be", - "metadata": {}, - "outputs": [], - "source": [ - "# generate missense codon dictionary\n", - "#missense_dict = main_package.codon_table.iupac_missense_codon_dict(codon_table=args.codon_table)\n", - "#missense_dict = main_package.codon_table.iupac_missense_codon_dict(codon_table='Standard')\n", - "missense_dict = main_package.codon_table.selected_iupac_codons_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "22e96932", - "metadata": {}, - 
"outputs": [], - "source": [ - "# generate synonymous missense codon dictionary\n", - "#main_package.codon_table.iupac_synonymous_codon_dict(codon_table=args.codon_table)\n", - "#synonymous_dict = main_package.codon_table.iupac_synonymous_codon_dict(codon_table='Standard')\n", - "synonymous_missense_dict = main_package.codon_table.synonymous_iupac_codons_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7dca8b90-84b9-4dd7-9b85-9a3f31c1a3c7", + "id": "dac45e16-75b2-4207-9f31-2a015d090d24", "metadata": {}, "outputs": [], "source": [ + "# generate iupac codon dictionaries to generate doped primers\n", + "missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict = main_package.codon_table.iupac_codon_dicts()\n", + "\n", "# generate yeast synonymous codon dictionary (no missense variants)\n", "yeast_synonymous_dict = main_package.codon_table.synonymous_yeast_codons_dict()" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "d5f1bb72", - "metadata": {}, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "expected an indented block (2364269451.py, line 15)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"/var/folders/fq/q1cflf795cgbnmbhdyn8d9nntyw5jt/T/ipykernel_9275/2364269451.py\"\u001b[0;36m, line \u001b[0;32m15\u001b[0m\n\u001b[0;31m # drop-in missense sub-window\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n" - ] - } - ], - "source": [ - "# parse .gb file, loop for each matching feature \"window\"\n", - "for window in file:\n", - " # check that window is divisible by 3, codons\n", - " # check for upstream homology and downstream primer space (20bp, 40bp)\n", - "\n", - " # assign sub-window start index value\n", - " # define which codons will contain synonymous controls at 5% frequency (based on wt)\n", - " # define synonymous codons (based on vector)\n", - " # (create all the codon variants immediately, then 
define subwindows based on primers)\n", - " \n", - " # begin sub-window loop:\n", - " # homology arm\n", - " # primer design\n", - " # redefine sub-window start index\n", - " # drop-in missense sub-window" - ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "a7ed0c7d", "metadata": {}, "outputs": [], @@ -181,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "8294e447-8d42-45d7-991f-4ae9b1eaf188", "metadata": {}, "outputs": [ @@ -199,25 +144,12 @@ "for feature in wt_file.features:\n", " if feature.type == 'gene':\n", " gene_start = feature.location.start.position\n", - " gene_end = feature.location.end.position\n", - " print(gene_start)\n", - " print(gene_end)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8ed1b7bd-4781-4276-80e3-36a7742ec491", - "metadata": {}, - "outputs": [], - "source": [ - "# create dataframe\n", - "df = pd.DataFrame()" + " gene_end = feature.location.end.position" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "c22b4eed", "metadata": {}, "outputs": [], @@ -236,46 +168,42 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "74cc27ed-1256-44c2-8b54-4eabcee29f3f", "metadata": {}, "outputs": [], "source": [ - "\n", + "# hardcode test variables\n", "homo_len = 20 ### args.homo_len\n", "rev_melt_temp = 55 ### args.rev_melt_temp\n", "primer_len = 60 ### args.primer_len\n", "melt_temp = 50 ### args.melt_temp\n", "syn_snp_rate = .05 ### args.syn_snp_rate\n", - "stop_rate = .1 ### args.stop_rate" + "stop_rate = .1 ### args.stop_rate\n", + "output_prefix = 'k3l_test' # maybe have a prefix for output" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "9d651f93", + "execution_count": 10, + "id": "57df82ff-a49c-4075-8691-c8284c2528b1", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"\"\\n'primer':,\\n'primer_tm':,\\n'primer_gc':,\\n'primer_len':,\\n'rev_temp':,\\n'rev_gc':,\\n'rev_primer_len':, \\n\"" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "### loop through the window\n", "\n", - "# setup RNG in the loop, maybe this needs to be moved\n", + "# create empty dataframe\n", + "df = pd.DataFrame()\n", + "\n", + "# set RNG with seed \n", "rng = np.random.RandomState(42)\n", "\n", "# this needs to be fixed (user input? yaml?)\n", "targ_windows = ['window_1', 'window_2', 'window_3']\n", "\n", + "# setup .fa output, truncate if file exists\n", + "file = open(f\"{output_prefix}.fa\",'w+')\n", + "\n", "for feature in wt_file.features:\n", " if feature.type not in targ_windows:\n", " continue\n", @@ -294,26 +222,34 @@ " start_index = feature.location.start.position\n", " window_end = feature.location.end.position\n", " \n", - " # loop for each mini_window\n", - " mini_win = 1\n", + " # loop for each sub_window\n", + " sub_window_n = 1\n", " while start_index < window_end: # this could be an issue to toggle\n", " data_dict = {}\n", + " data_dict['start_index'] = start_index\n", + " data_dict['sub_window_name'] = {str(feature.type)}-{sub_window_n}\n", " \n", " # 1. homology arm\n", + " # INPUT: data dictionary, args (homo_len)\n", + " # OUTPUT: data dictionary\n", " homology_arm = vector_seq[start_index - homo_len:start_index] ### args.homo_len\n", " data_dict['homology_arm'] = homology_arm\n", " \n", " # 2. 
reverse primer\n", + " # INPUT: data dictionary\n", + " # OUTPUT: data dictionary\n", " reverse_seq = str(Seq(vector_seq[:start_index]).reverse_complement())\n", " reverse_primer = reverse_seq[:15]\n", " while mt.Tm_NN(reverse_primer) < rev_melt_temp: ### args.rev_melt_temp\n", " reverse_primer = reverse_seq[:len(reverse_primer)+1] ### args.rev_melt_temp\n", " data_dict['reverse_primer'] = reverse_primer\n", " \n", - " reverse_primer_name = f'rev_{str(feature.type)}-{mini_win}'\n", + " reverse_primer_name = f'rev_{str(feature.type)}-{sub_window_n}'\n", " data_dict['reverse_primer_name'] = reverse_primer_name\n", " \n", " # 3. forward primer\n", + " # INPUT: data dictionary, start index\n", + " # OUTPUT: \n", " primer_end = start_index + (primer_len - homo_len) ### args.primer_len homo_len\n", " if primer_end > window_end:\n", " primer_end == window_end\n", @@ -360,6 +296,8 @@ " data_dict['forward_primer'] = forward_primer\n", " \n", " # 4. variant window\n", + " # INPUT: data dictionary, .tsv output, .fa output\n", + " # OUTPUT: dataframe, .fa file lines\n", " mut_len = (primer_start) - start_index\n", " mut_end = start_index + mut_len\n", "\n", @@ -371,479 +309,87 @@ " vect_list = codons_list(vector_seq[start_index:mut_end])\n", "\n", " # generate synonymous vector codon list (top 2 codons for yeast)\n", - " synonymous_win = [yeast_synonymous_dict[i] for i in vect_list]\n", + " synonymous_win = [yeast_synonymous_dict[i].lower() for i in vect_list]\n", " \n", " # generate iupac missense codons list (with synonymous codons) \n", " doped_codons = []\n", " for i, wt_codon in enumerate(wt_list): \n", " syn_bool = rng.choice([True, False], p=[syn_snp_rate, 1-syn_snp_rate]) ### args.syn_snp_rate\n", - " data_dict['synonymous_included'] = syn_bool\n", - " if syn_bool: \n", - " doped_codons.append(synonymous_missense_dict[wt_codon])\n", + " data_dict['synonymous_codons'] = syn_bool\n", + " \n", + " no_stop_bool = rng.choice([True, False], p=[stop_rate, 1-stop_rate]) ### 
args.stop_rate\n", + " data_dict['no_stop_codons'] = no_stop_bool\n", + " # missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict\n", + " if syn_bool and no_stop_bool:\n", + " # use no_stop_syn_dictionary\n", + " doped_codons.append(no_stop_syn_dict[wt_codon])\n", + " elif syn_bool and not no_stop_bool:\n", + " # use synonymous_dictionary\n", + " doped_codons.append(synonymous_dict[wt_codon])\n", + " elif no_stop_bool and not syn_bool: \n", + " # use no_stop_dict\n", + " doped_codons.append(no_stop_dict[wt_codon])\n", " else:\n", + " # use missense dict\n", " doped_codons.append(missense_dict[wt_codon])\n", - "\n", + " \n", " # generate the mut primer and all info\n", " for i, iupac_list in enumerate(doped_codons):\n", - " res_pos = int((((start_index-gene_start)/3)+1)+i)\n", + " aa_position = int((((start_index-gene_start)/3)+1)+i)\n", " for iupac_codon in iupac_list:\n", "\n", - " codon_sub = wt_list[i] + str(res_pos) + iupac_codon\n", + " codon_sub = wt_list[i] + str(aa_position) + iupac_codon\n", " variant_win = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:])\n", - " primer_name = f'{str(feature.type)}-{mini_win}_{codon_sub}'\n", + " primer_name = f'{str(feature.type)}-{sub_window_n}_{codon_sub}'\n", " primer = homology_arm + variant_win + forward_primer\n", " \n", - " dict_keys = ['name','codon_sub','wt','pos','iupac']\n", - " dict_values = [primer_name, codon_sub, wt_list[i], res_pos, iupac_codon]\n", + " # drop iupac_codon into sub_window\n", + " sub_window = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:])\n", + " full_primer = homology_arm + sub_window + forward_primer\n", + " \n", + " dict_keys = ['name','codon_sub','wt','position','iupac', 'sub_window', 'primer']\n", + " dict_values = [primer_name, codon_sub, wt_list[i], aa_position, iupac_codon, sub_window, full_primer]\n", " for (key,value) in zip(dict_keys,dict_values):\n", " data_dict[key] = value\n", "\n", " # append values to dataframe\n", " df = 
df.append(data_dict, ignore_index=True)\n", + " \n", + " # write out to .fa\n", + " file.writelines([f\">{primer_name}\\n\", f\"{full_primer}\\n\"])\n", " \n", " # reset the start index for the next mini-window\n", " start_index = primer_start\n", - " mini_win += 1 \n", - "\n", - "dict_keys = ['name','codon_sub','wt','pos','iupac','syn_bool','homo','variant_win','primer','rev_primer_name','rev_primer'] \n", + " sub_window_n += 1 \n", " \n", - "# organize data into .tsv \n", + "file.close()\n", "\n", - "# to add\n", - "\"\"\"\n", - "'primer':,\n", - "'primer_tm':,\n", - "'primer_gc':,\n", - "'primer_len':,\n", - "'rev_temp':,\n", - "'rev_gc':,\n", - "'rev_primer_len':, \n", - "\"\"\"\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "063f6e05-1bb0-4007-8584-28081a8ee6a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq('X')" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "52608f32-3365-43e9-b60a-2543e4455067", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
homology_armreverse_primerreverse_primer_nameforward_primersynonymous_includednamecodon_subwtposiupac
0TTTGTTATTCGTTGCCCAATATTGGGCAACGAATAACAAAATGCArev_window_1-1GGCAGAGTATACGAGAAGGATT0.0window_1-1_GCG11HCGGCG11HCGGCG11.0HCG
1TTTGTTATTCGTTGCCCAATATTGGGCAACGAATAACAAAATGCArev_window_1-1GGCAGAGTATACGAGAAGGATT0.0window_1-1_GCG11GDGGCG11GDGGCG11.0GDG
2TTTGTTATTCGTTGCCCAATATTGGGCAACGAATAACAAAATGCArev_window_1-1GGCAGAGTATACGAGAAGGATT0.0window_1-1_GGT12HGTGGT12HGTGGT12.0HGT
3TTTGTTATTCGTTGCCCAATATTGGGCAACGAATAACAAAATGCArev_window_1-1GGCAGAGTATACGAGAAGGATT0.0window_1-1_GGT12GHTGGT12GHTGGT12.0GHT
4TTTGTTATTCGTTGCCCAATATTGGGCAACGAATAACAAAATGCArev_window_1-1GGCAGAGTATACGAGAAGGATT0.0window_1-1_GAT13HATGAT13HATGAT13.0HAT
.................................
92TCAATTACAAAAGGATGTGTACACATCCTTTTGTAATTGACATCTATATATCCTrev_window_3-4TAGTAGCTGTCGAGTCGC0.0window_3-4_CAT87CBTCAT87CBTCAT87.0CBT
93TCAATTACAAAAGGATGTGTACACATCCTTTTGTAATTGACATCTATATATCCTrev_window_3-4TAGTAGCTGTCGAGTCGC0.0window_3-4_CAT87CAACAT87CAACAT87.0CAA
94TCAATTACAAAAGGATGTGTACACATCCTTTTGTAATTGACATCTATATATCCTrev_window_3-4TAGTAGCTGTCGAGTCGC0.0window_3-4_CAA88DAACAA88DAACAA88.0DAA
95TCAATTACAAAAGGATGTGTACACATCCTTTTGTAATTGACATCTATATATCCTrev_window_3-4TAGTAGCTGTCGAGTCGC0.0window_3-4_CAA88CBACAA88CBACAA88.0CBA
96TCAATTACAAAAGGATGTGTACACATCCTTTTGTAATTGACATCTATATATCCTrev_window_3-4TAGTAGCTGTCGAGTCGC0.0window_3-4_CAA88CATCAA88CATCAA88.0CAT
\n", - "

97 rows × 10 columns

\n", - "
" - ], - "text/plain": [ - " homology_arm reverse_primer \\\n", - "0 TTTGTTATTCGTTGCCCAAT ATTGGGCAACGAATAACAAAATGCA \n", - "1 TTTGTTATTCGTTGCCCAAT ATTGGGCAACGAATAACAAAATGCA \n", - "2 TTTGTTATTCGTTGCCCAAT ATTGGGCAACGAATAACAAAATGCA \n", - "3 TTTGTTATTCGTTGCCCAAT ATTGGGCAACGAATAACAAAATGCA \n", - "4 TTTGTTATTCGTTGCCCAAT ATTGGGCAACGAATAACAAAATGCA \n", - ".. ... ... \n", - "92 TCAATTACAAAAGGATGTGT ACACATCCTTTTGTAATTGACATCTATATATCCT \n", - "93 TCAATTACAAAAGGATGTGT ACACATCCTTTTGTAATTGACATCTATATATCCT \n", - "94 TCAATTACAAAAGGATGTGT ACACATCCTTTTGTAATTGACATCTATATATCCT \n", - "95 TCAATTACAAAAGGATGTGT ACACATCCTTTTGTAATTGACATCTATATATCCT \n", - "96 TCAATTACAAAAGGATGTGT ACACATCCTTTTGTAATTGACATCTATATATCCT \n", - "\n", - " reverse_primer_name forward_primer synonymous_included \\\n", - "0 rev_window_1-1 GGCAGAGTATACGAGAAGGATT 0.0 \n", - "1 rev_window_1-1 GGCAGAGTATACGAGAAGGATT 0.0 \n", - "2 rev_window_1-1 GGCAGAGTATACGAGAAGGATT 0.0 \n", - "3 rev_window_1-1 GGCAGAGTATACGAGAAGGATT 0.0 \n", - "4 rev_window_1-1 GGCAGAGTATACGAGAAGGATT 0.0 \n", - ".. ... ... ... \n", - "92 rev_window_3-4 TAGTAGCTGTCGAGTCGC 0.0 \n", - "93 rev_window_3-4 TAGTAGCTGTCGAGTCGC 0.0 \n", - "94 rev_window_3-4 TAGTAGCTGTCGAGTCGC 0.0 \n", - "95 rev_window_3-4 TAGTAGCTGTCGAGTCGC 0.0 \n", - "96 rev_window_3-4 TAGTAGCTGTCGAGTCGC 0.0 \n", - "\n", - " name codon_sub wt pos iupac \n", - "0 window_1-1_GCG11HCG GCG11HCG GCG 11.0 HCG \n", - "1 window_1-1_GCG11GDG GCG11GDG GCG 11.0 GDG \n", - "2 window_1-1_GGT12HGT GGT12HGT GGT 12.0 HGT \n", - "3 window_1-1_GGT12GHT GGT12GHT GGT 12.0 GHT \n", - "4 window_1-1_GAT13HAT GAT13HAT GAT 13.0 HAT \n", - ".. ... ... ... ... ... 
\n", - "92 window_3-4_CAT87CBT CAT87CBT CAT 87.0 CBT \n", - "93 window_3-4_CAT87CAA CAT87CAA CAT 87.0 CAA \n", - "94 window_3-4_CAA88DAA CAA88DAA CAA 88.0 DAA \n", - "95 window_3-4_CAA88CBA CAA88CBA CAA 88.0 CBA \n", - "96 window_3-4_CAA88CAT CAA88CAT CAA 88.0 CAT \n", - "\n", - "[97 rows x 10 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "f35f500a-fefc-4cfb-8a70-0587857aeaaf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1920" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "window_end" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "3ba47ee2-86b4-4405-9acc-663a50d78d98", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "70.0" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(start_index-gene_start)/3\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1661f4a3-8d06-4f18-9f81-40fa7f79b0ed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq('TATTCGTTGCCCAAT')" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Seq('ATTGGGCAACGAATA').reverse_complement()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "a09b5daa-bcc6-4d8a-903e-846785ec0b53", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"'CGAACGCCAGCAAGACGTAGCCCAGCGCGTCGGCCGCCATGCCGGCGATAATGGCCTGCTTCTCGCCGAAACGTTTGGTGGCGGGACCAGTGACGAAGGCTTGAGCGAGGGCGTGCAAGATTCCGAATACCGCAAGCGACAGGCCGATCATCGTCGCGCTCCAGCGAAAGCGGTCCTCGCCGAAAATGACCCAGAGCGCTGCCGGCACCTGTCCTACGAGTTGCATGATAAAGAAGACAGTCATAAGTGCGGCGACGATAGTCATGCCCCGCGCCCACCGGAAGGAGCTGACTGGGTTGAAGGCTCTCAAGGGCATCGGTCGACGCTCTCCCTTATGCGACTCCTGCATTAGGAAGCAGCCCAGTAGTAGGTTGAGGCCGTTGAGCACCGCCGCCGCAAGGAATGGTGCATGCAAGGAGATGGCGCCCAACAGTCCCCCGGCCACGGGGCCTGCCACCATACCCACGCCGAAACAAGCGCTCATGAGCCCGAAGTGGCGAGCCCGATCTTCCCCATCGGTGATGTCGGCGATATAGGCGCCAGCAACCGCACCTGTGGCGCCGGTGATGCCGGCCACGATGCGTCCGGCGTAGAGGATCCACAGGACGGGTGTGGTCGCCATGATCGCGTAGTCGATAGTGGCTCCAAGTAGCGAAGCGAGCAGGACTGGGCGGCGGCCAAAGCGGTCGGACAGTGCTCCGAGAACGGGTGCGCATAGAAATTGCATCAACGCATATAGCGCTAGCAGCACGCCATAGTGACTGGCGATGCTGTCGGAATGGACGATATCCCGCAAGAGGCCCGGCAGTACCGGCATAACCAAGCCTATGCCTACAGCATCCAGGGTGACGGTGCCGAGGATGACGATGAGCGCATTGTTAGATTTCATACACGGTGCCTGACTGCGTTAGCAATTTAACTGTGATAAACTACCGCATTAAAGCTTATCGATGAGCTCCTTATGCGGATCTGTAGCAGCTGTCATTATCAATACTGCCATTTCAAAGAATACGTAAATAATTAATAGTAGTGATTTTCCTAACTTTATTTAGTCAAAAAATTAGCCTTTTAATTCTGCTGTAACCCGTACATGCCCAAAATAGGGGGCGGGTTACACAGAATATATAACATCGTAGGTGTCTGGGTGAACAGTTTATTCCTGGCATCCACTAAATATAATGGAGCCCGCTTTTTAAGCTGGCATCCAGAAAAAAAAAGAATCCCAGCACCAAAATATTGTTTTCTTCACCAACCATCAGTTCATAGGTCCATTCTCTTAGCGCAACTACAGAGAACAGGGGCACAAACAGGCAAAAAACGGGCACAACCTCAATGGAGTGATGCAACCTGCCTGGAGTAAATGATGACACAAGGCAATTGACCCACGCATGTATCTATCTCATTTTCTTACACCTTCTATTACCTTCTGCTCTCTCTGATTTGGAAAAAGCTGAAAAAAAAGGTTGAAACCAGTTCCCTGAAATTATTCCCCTACTTGACTAATAAGTATATAAAGACGGTAGGTATTGATTGTAATTCTGTAAATCTATTTCTTAAACTTCTTAAATTCTACTTTTATAGTTAGTCTTTTTTTTAGTTTTAAAACACCAAGAACTTAGTTTCGAATAAACACACATAAACAAACAAAACGCGTCCATGCTTGCATTTTGTTATTCGTTGCCCAATGCGGGTGATGTAATAAAGGGCAGAGTATACGAGAAGGATTATGCTCTATATATTTATCTTTTTGACTATCCTCACTTTGAAGCTATCTTGGCAGAGAGTGTTAAGATGCATATGGATAGATATGTTGAATATAGGGATAAACTGGTAGGGAAAACTGTAAAAGTTAAAGTGATTAGAGTTGATTATACAAAAGGATATATAGATGTCAATTACAAAAGGATGTGTAGACATCAATAGTAGCTGTCGAGTCGCAGCTCTATATAAACTCATTTACTTATGTAGGAATAAAGAGTATCATCTTTCAAAGTTAGC
CGAGCATACAGATGGGTCTGTCNNNNNAANNNNNAANNNNNTTNNNNNGCGGCCGCTGATTGTTGTACAGAAACTTGGTCAC'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vector_seq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ee773f3-46b3-4cef-8441-a284f450fc40", - "metadata": {}, - "outputs": [], - "source": [ - "win_list\n", - "homo_list\n", - "rev_primer_list\n", - "fwd_primer_list\n", - "mut_windows_list\n", - "mut_name_list\n", - "full_window_list\n", - "full_name_list\n", - "full_primer_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e123580f", - "metadata": {}, - "outputs": [], - "source": [ - "# need to double-check the conditional logic on the indices" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "cf375e02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'GATTATACAAAAGGATATATAGATGTCAATTACAAAAGGATGTGTAGACATCAA'" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wt_seq[start_index:window_end]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "add9a201", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1923" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "feature.location.start.position" + "# polish dataframe\n", + "df['position'] = df['position'].astype(int)\n", + "df.drop(columns=['start_index'], inplace=True)\n", + "\n", + "df['forward_primer_tm'] = df['forward_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['forward_primer_gc'] = df['forward_primer'].apply(GC).round(1)\n", + "df['forward_primer_len'] = df['forward_primer'].str.len()\n", + "\n", + "df['reverse_primer_tm'] = df['reverse_primer'].apply(lambda x: mt.Tm_NN(x)).round(1)\n", + "df['reverse_primer_gc'] = df['reverse_primer'].apply(GC).round(1)\n", + "df['reverse_primer_len'] = 
df['reverse_primer'].str.len()\n", + "\n", + "cols = ['name','sub_window_name','wt','position','iupac','codon_sub','synonymous_codons','no_stop_codons','primer','homology_arm','sub_window','forward_primer','forward_primer_tm','forward_primer_gc','forward_primer_len','reverse_primer','reverse_primer_name','reverse_primer_tm','reverse_primer_gc','reverse_primer_len']\n", + "df = df[cols]\n", + "\n", + "# save dataframe as .tsv\n", + "df.to_csv(f'{output_prefix}.tsv', index=False, sep='\\t')" ] }, { "cell_type": "code", "execution_count": null, - "id": "3096168c", + "id": "d1ca397e-cfc5-4c72-bf0a-de554217481d", "metadata": {}, "outputs": [], - "source": [ - "# SKETCH: aggregate into dataframe" - ] + "source": [] } ], "metadata": { diff --git a/main_package/.DS_Store b/main_package/.DS_Store new file mode 100644 index 0000000..ee0e370 Binary files /dev/null and b/main_package/.DS_Store differ diff --git a/main_package/.ipynb_checkpoints/__init__-checkpoint.py b/main_package/.ipynb_checkpoints/__init__-checkpoint.py index 0231f89..fb2990e 100644 --- a/main_package/.ipynb_checkpoints/__init__-checkpoint.py +++ b/main_package/.ipynb_checkpoints/__init__-checkpoint.py @@ -1 +1,3 @@ -import main_package.codon_table \ No newline at end of file +# __init__.py +import main_package.codon_table +import main_package.primer_design \ No newline at end of file diff --git a/main_package/.ipynb_checkpoints/codon_table-checkpoint.py b/main_package/.ipynb_checkpoints/codon_table-checkpoint.py index 49dd003..931fc96 100644 --- a/main_package/.ipynb_checkpoints/codon_table-checkpoint.py +++ b/main_package/.ipynb_checkpoints/codon_table-checkpoint.py @@ -1,10 +1,20 @@ #!/usr/bin/env python3 - +import pkg_resources from itertools import product from Bio.Seq import Seq import pandas as pd import itertools +iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'} +rev_iupac_dict = {value:key for 
key,value in iupac_dict.items()} + +def iupac_to_aa(iupac_codon): + """Return string of AAs encoded by input iupac missense codon""" + nuc_lists = [list(rev_iupac_dict[n]) for n in iupac_codon] + codon_list = [''.join(i) for i in list(itertools.product(*nuc_lists))] + aa_list = [str(Seq(codon).translate()) for codon in codon_list] + return ''.join(aa_list) + def iupac_missense_codon_df(codon_table='Standard'): nucleotides = 'ACGT' iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'} @@ -22,7 +32,7 @@ def iupac_missense_codon_df(codon_table='Standard'): aa = Seq(codon).translate(table=codon_table)[0] # loop through each position in codon - for position in range(3): + for position in range(3): new_aas = [] iupac_n = [] new_codons = [] @@ -62,8 +72,8 @@ def iupac_missense_codon_df(codon_table='Standard'): # create dictionary codon_level_dict = { - 'codon':codon_list, - 'aa':aa_list, + 'codon':codon_list, + 'aa':aa_list, 'position':position_list, 'missense_nucleotides':nucleotides_list, 'missense_codons':missense_codons_list, @@ -97,7 +107,7 @@ def iupac_synonymous_codon_df(codon_table='Standard'): aa = Seq(codon).translate(table='Standard')[0] # loop through each position in codon - for position in range(3): + for position in range(3): new_aas = [] iupac_n = [] new_codons = [] @@ -137,8 +147,8 @@ def iupac_synonymous_codon_df(codon_table='Standard'): # create dictionary codon_level_dict = { - 'codon':codon_list, - 'aa':aa_list, + 'codon':codon_list, + 'aa':aa_list, 'position':position_list, 'synonymous_nucleotides':nucleotides_list, 'synonymous_codons':synonymous_codons_list, @@ -161,6 +171,7 @@ def iupac_synonymous_codon_dict(codon_table='Standard'): #### I need something that gets the absolute path of the .csv file to be imported +# SCRATCH def selected_iupac_codons_dict(): """return codon table of selected missense codons PROBLEM: codon table must be in directory 
where function is being called""" @@ -171,6 +182,7 @@ def selected_iupac_codons_dict(): sele_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) return sele_dict +# SCRATCH def synonymous_iupac_codons_dict(): """return codon table including synonymous codons PROBLEM: codon table must be in directory where function is being called""" @@ -181,9 +193,36 @@ def synonymous_iupac_codons_dict(): syn_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) return syn_dict + +def iupac_codon_dicts(): + """Returns four mapping dictionaries to generate missense variants + RETURNS: + - missense_dict + - synonymous_dict + - no_stop_dict + - no_stop_syn_dict + """ + stream = pkg_resources.resource_stream(__name__, 'data/final_codon_table.csv') + df = pd.read_csv(stream) + df.fillna('', inplace=True) + col_list = ['sele_iupac_codon', 'syn_iupac_codon', 'no_stop_iupac_codon', 'no_stop_syn_iupac_codon'] + + # check that column names exist + for col in col_list: + if col not in df.columns.tolist(): + print(f"ERROR: Column '{col}' not contained in file '{codon_table}'") + return + dict_list = [] + for col in col_list: + temp_dict = df.query(f'{col} != ""').groupby('codon')[col].apply(list).to_dict() + for key,value in temp_dict.items(): + temp_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) + dict_list.append(temp_dict) + return dict_list + def synonymous_yeast_codons_dict(): - """return iupac codon dictionary for synonymous codons most frequently used in yeast - PROBLEM: codon table must be in directory where function is being called""" - df = pd.read_csv('yeast_synonymous_codon_table.csv') - temp_df = dict(zip(df.codon, df.iupac)) - return temp_df \ No newline at end of file + """Return mapping dictionary of doped synonymous codons optimized for yeast""" + stream = pkg_resources.resource_stream(__name__, 'data/yeast_synonymous_codon_table.csv') + df = pd.read_csv(stream) + 
syn_dict = dict(zip(df.codon, df.iupac)) + return syn_dict diff --git a/main_package/.ipynb_checkpoints/primer_design-checkpoint.py b/main_package/.ipynb_checkpoints/primer_design-checkpoint.py new file mode 100644 index 0000000..6f4ecf1 --- /dev/null +++ b/main_package/.ipynb_checkpoints/primer_design-checkpoint.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +from . import codon_table +from Bio.Seq import Seq +from Bio.SeqUtils import MeltingTemp as mt + +def homology_arm(seq_data, data_dict, args): + start_index = data_dict['start_index'] + vector_seq = seq_data['vector_seq'] + + homology_arm = vector_seq[start_index - args.homo_len:start_index] ### args.homo_len + data_dict['homology_arm'] = homology_arm + + return data_dict + +def reverse_primer(seq_data, data_dict, args): + sub_window_name = data_dict['sub_window_name'] + start_index = data_dict['start_index'] + vector_seq = seq_data['vector_seq'] + + reverse_seq = str(Seq(vector_seq[:start_index]).reverse_complement()) + reverse_primer = reverse_seq[:15] + while mt.Tm_NN(reverse_primer) < args.rev_melt_temp: ### args.rev_melt_temp + reverse_primer = reverse_seq[:len(reverse_primer)+1] + data_dict['reverse_primer'] = reverse_primer + + reverse_primer_name = f'rev_{sub_window_name}' + data_dict['reverse_primer_name'] = reverse_primer_name + + return data_dict + +def forward_primer(seq_data, data_dict, args): + start_index = data_dict['start_index'] + window_end = data_dict['window_end'] + vector_seq = seq_data['vector_seq'] + + primer_end = start_index + (args.oligo_len - args.homo_len) + if primer_end > window_end: + primer_end == window_end + + primer_start = primer_end - 15 + forward_primer = vector_seq[primer_start:primer_end] + + while mt.Tm_NN(forward_primer) < args.melt_temp: + primer_start -= 1 + forward_primer = vector_seq[primer_start:primer_end] + + # check if the primer is the max oligo length + if len(forward_primer) > (args.oligo_len - args.homo_len - 12): # 12 is a minimum window size of 4 codons + 
# fix mut window to 12, make a long primer + primer_start = start_index + 12 + primer_end = primer_start + 15 + forward_primer = vector_seq[primer_start:primer_end] + while True: + forward_primer = vector_seq[primer_start:primer_end] + if mt.Tm_NN(forward_primer) > args.melt_temp and forward_primer.upper().count('G') + forward_primer.upper().count('C') > 8: + break + else: + primer_end += 1 + + # even-out the primer length to accomodate codons + else: + # add or subtract a bp from the fwd primer to get mut_window in frame + if (primer_start - start_index)%3 == 2: + primer_start += 1 + forward_primer = vector_seq[primer_start:primer_end] + + elif (primer_start - start_index)%3 == 1: + primer_start -= 1 + forward_primer = vector_seq[primer_start:primer_end] + + # making the last primer in a window + if primer_start > window_end: + primer_start = window_end + primer_end = primer_start+15 + forward_primer = vector_seq[primer_start:primer_end] + while mt.Tm_NN(forward_primer) < args.melt_temp: + primer_end += 1 + forward_primer = vector_seq[primer_start:primer_end] + + data_dict['primer_start'] = primer_start + data_dict['forward_primer'] = forward_primer + + return data_dict + +def sub_window(seq_data, data_dict, args): + primer_start = data_dict['primer_start'] + start_index = data_dict['start_index'] + window_end = data_dict['window_end'] + sub_window_name = data_dict['sub_window_name'] + wt_seq = seq_data['wt_seq'] + vector_seq = seq_data['vector_seq'] + gene_start = seq_data['gene_start'] + rng = seq_data['rng'] + + # this may not work + missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict = codon_table.iupac_codon_dicts() + yeast_synonymous_dict = codon_table.synonymous_yeast_codons_dict() + + sub_window_len = (primer_start) - start_index + sub_window_end = start_index + sub_window_len + + def codons_list(seq): + return [seq[i:i+3] for i in range(0, len(seq), 3)] + + # removing mis_list and syn_list + wt_list = 
codons_list(wt_seq[start_index:sub_window_end]) + vect_list = codons_list(vector_seq[start_index:sub_window_end]) + + # generate synonymous vector codon list (top 2 codons for yeast) + synonymous_win = [yeast_synonymous_dict[i].lower() for i in vect_list] + + # generate iupac missense codons list (with synonymous codons) + iupac_codons = [] + syn_bool_list = [] + no_stop_list = [] + for i, wt_codon in enumerate(wt_list): + syn_bool = rng.choice([True, False], p=[args.syn_snp_rate, 1-args.syn_snp_rate]) ### args.syn_snp_rate + syn_bool_list.append(syn_bool) + + no_stop_bool = rng.choice([True, False], p=[args.stop_rate, 1-args.stop_rate]) ### args.stop_rate + no_stop_list.append(no_stop_bool) + + # missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict + if syn_bool and no_stop_bool: + # use no_stop_syn_dictionary + iupac_codons.append(no_stop_syn_dict[wt_codon]) + elif syn_bool and not no_stop_bool: + # use synonymous_dictionary + iupac_codons.append(synonymous_dict[wt_codon]) + elif no_stop_bool and not syn_bool: + # use no_stop_dict + iupac_codons.append(no_stop_dict[wt_codon]) + else: + # use missense dict + iupac_codons.append(missense_dict[wt_codon]) + + # make full-length oligo (homology arm, sub-window, primer), generate dataframe + for i, iupac_list in enumerate(iupac_codons): + aa_position = int((((start_index-gene_start)/3)+1)+i) + # could enumerate this out to get the aas + for iupac_codon in iupac_list: + # get AAs encoded by iupac codon + iupac_aa = iupac_to_aa(iupac_codon) + + # place iupac_codon into sub_window + sub_window = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:]) + + codon_sub = wt_list[i] + str(aa_position) + iupac_codon + forward_primer_name = f'{sub_window_name}_{codon_sub}' + full_forward_primer = data_dict['homology_arm'] + sub_window + data_dict['forward_primer'] + + # add values to data_dict + dict_keys = ['name','codon_sub','wt','position','iupac', 'iupac_aa','sub_window', 'primer', 'synonymous_codons', 
'no_stop_codons'] + dict_values = [forward_primer_name, codon_sub, wt_list[i], aa_position, iupac_codon, iupac_aa, sub_window, full_forward_primer, syn_bool_list[i], no_stop_list[i]] + for (key,value) in zip(dict_keys,dict_values): + data_dict[key] = value + + # append data_dict to dataframe + seq_data['df'] = seq_data['df'].append(data_dict, ignore_index=True) + + # write primers to .fasta file + seq_data['fasta_file'] = seq_data['fasta_file'] + [f">{forward_primer_name}\n", f"{full_forward_primer}\n"] + + return seq_data, data_dict diff --git a/main_package/__init__.py b/main_package/__init__.py index 0231f89..fb2990e 100644 --- a/main_package/__init__.py +++ b/main_package/__init__.py @@ -1 +1,3 @@ -import main_package.codon_table \ No newline at end of file +# __init__.py +import main_package.codon_table +import main_package.primer_design \ No newline at end of file diff --git a/main_package/__pycache__/__init__.cpython-39.pyc b/main_package/__pycache__/__init__.cpython-39.pyc index e1f815b..3359c4f 100644 Binary files a/main_package/__pycache__/__init__.cpython-39.pyc and b/main_package/__pycache__/__init__.cpython-39.pyc differ diff --git a/main_package/__pycache__/codon_table.cpython-39.pyc b/main_package/__pycache__/codon_table.cpython-39.pyc index 4546ca5..bba7c0b 100644 Binary files a/main_package/__pycache__/codon_table.cpython-39.pyc and b/main_package/__pycache__/codon_table.cpython-39.pyc differ diff --git a/main_package/__pycache__/primer_design.cpython-39.pyc b/main_package/__pycache__/primer_design.cpython-39.pyc new file mode 100644 index 0000000..0b632f9 Binary files /dev/null and b/main_package/__pycache__/primer_design.cpython-39.pyc differ diff --git a/main_package/codon_table.py b/main_package/codon_table.py index 49dd003..931fc96 100644 --- a/main_package/codon_table.py +++ b/main_package/codon_table.py @@ -1,10 +1,20 @@ #!/usr/bin/env python3 - +import pkg_resources from itertools import product from Bio.Seq import Seq import pandas as pd 
import itertools +iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'} +rev_iupac_dict = {value:key for key,value in iupac_dict.items()} + +def iupac_to_aa(iupac_codon): + """Return string of AAs encoded by input iupac missense codon""" + nuc_lists = [list(rev_iupac_dict[n]) for n in iupac_codon] + codon_list = [''.join(i) for i in list(itertools.product(*nuc_lists))] + aa_list = [str(Seq(codon).translate()) for codon in codon_list] + return ''.join(aa_list) + def iupac_missense_codon_df(codon_table='Standard'): nucleotides = 'ACGT' iupac_dict = {'A':'A','C':'C','G':'G','T':'T','AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K','ACG':'V','ACT':'H','AGT':'D','CGT':'B','ACGT':'N'} @@ -22,7 +32,7 @@ def iupac_missense_codon_df(codon_table='Standard'): aa = Seq(codon).translate(table=codon_table)[0] # loop through each position in codon - for position in range(3): + for position in range(3): new_aas = [] iupac_n = [] new_codons = [] @@ -62,8 +72,8 @@ def iupac_missense_codon_df(codon_table='Standard'): # create dictionary codon_level_dict = { - 'codon':codon_list, - 'aa':aa_list, + 'codon':codon_list, + 'aa':aa_list, 'position':position_list, 'missense_nucleotides':nucleotides_list, 'missense_codons':missense_codons_list, @@ -97,7 +107,7 @@ def iupac_synonymous_codon_df(codon_table='Standard'): aa = Seq(codon).translate(table='Standard')[0] # loop through each position in codon - for position in range(3): + for position in range(3): new_aas = [] iupac_n = [] new_codons = [] @@ -137,8 +147,8 @@ def iupac_synonymous_codon_df(codon_table='Standard'): # create dictionary codon_level_dict = { - 'codon':codon_list, - 'aa':aa_list, + 'codon':codon_list, + 'aa':aa_list, 'position':position_list, 'synonymous_nucleotides':nucleotides_list, 'synonymous_codons':synonymous_codons_list, @@ -161,6 +171,7 @@ def iupac_synonymous_codon_dict(codon_table='Standard'): #### I need 
something that gets the absolute path of the .csv file to be imported +# SCRATCH def selected_iupac_codons_dict(): """return codon table of selected missense codons PROBLEM: codon table must be in directory where function is being called""" @@ -171,6 +182,7 @@ def selected_iupac_codons_dict(): sele_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) return sele_dict +# SCRATCH def synonymous_iupac_codons_dict(): """return codon table including synonymous codons PROBLEM: codon table must be in directory where function is being called""" @@ -181,9 +193,36 @@ def synonymous_iupac_codons_dict(): syn_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) return syn_dict + +def iupac_codon_dicts(): + """Returns four mapping dictionaries to generate missense variants + RETURNS: + - missense_dict + - synonymous_dict + - no_stop_dict + - no_stop_syn_dict + """ + stream = pkg_resources.resource_stream(__name__, 'data/final_codon_table.csv') + df = pd.read_csv(stream) + df.fillna('', inplace=True) + col_list = ['sele_iupac_codon', 'syn_iupac_codon', 'no_stop_iupac_codon', 'no_stop_syn_iupac_codon'] + + # check that column names exist + for col in col_list: + if col not in df.columns.tolist(): + print(f"ERROR: Column '{col}' not contained in file '{codon_table}'") + return + dict_list = [] + for col in col_list: + temp_dict = df.query(f'{col} != ""').groupby('codon')[col].apply(list).to_dict() + for key,value in temp_dict.items(): + temp_dict[key] = list(itertools.chain.from_iterable([codon.split(' ') for codon in value])) + dict_list.append(temp_dict) + return dict_list + def synonymous_yeast_codons_dict(): - """return iupac codon dictionary for synonymous codons most frequently used in yeast - PROBLEM: codon table must be in directory where function is being called""" - df = pd.read_csv('yeast_synonymous_codon_table.csv') - temp_df = dict(zip(df.codon, df.iupac)) - return temp_df \ No newline at end of file + 
"""Return mapping dictionary of doped synonymous codons optimized for yeast""" + stream = pkg_resources.resource_stream(__name__, 'data/yeast_synonymous_codon_table.csv') + df = pd.read_csv(stream) + syn_dict = dict(zip(df.codon, df.iupac)) + return syn_dict diff --git a/main_package/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv b/main_package/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv new file mode 100644 index 0000000..c183f47 --- /dev/null +++ b/main_package/data/.ipynb_checkpoints/final_codon_table-checkpoint.csv @@ -0,0 +1,193 @@ +codon,aa,position,missense_nuc,missense_codons,missense_aa,missense_iupac,missense_iupac_codon,sele_codons,sele_aa,sele_iupac_codon,syn_bool,syn_codons,syn_aa,syn_iupac_codon,no_stop_codons,no_stop_aa,no_stop_iupac_codon,no_stop_syn_codons,no_stop_syn_aa,no_stop_syn_iupac_codon +AAA,K,0,CGT,CAA GAA TAA,*QE,B,BAA,CAA GAA TAA,QE*,BAA,False,,,BAA,CAA GAA,QE,SAA,CAA GAA,QE,SAA +AAA,K,1,CGT,ACA AGA ATA,TRI,B,ABA,ACA AGA ATA,TRI,ABA,False,,,ABA,ACA AGA ATA,TRI,ABA,ACA AGA ATA,TRI,ABA +AAA,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAG AAT,KN,AAK,AAT,N,AAT,AAG AAT,KN,AAK +AAC,N,0,CGT,CAC GAC TAC,HDY,B,BAC,CAC GAC TAC,HDY,BAC,False,,,BAC,CAC GAC TAC,HDY,BAC,CAC GAC TAC,HDY,BAC +AAC,N,1,CGT,ACC AGC ATC,TSI,B,ABC,ACC AGC ATC,TSI,ABC,False,,,ABC,ACC AGC ATC,TSI,ABC,ACC AGC ATC,TSI,ABC +AAC,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAT,KN,AAW,AAA,K,AAA,AAA AAT,KN,AAW +AAG,K,0,CGT,CAG GAG TAG,*QE,B,BAG,CAG GAG TAG,QE*,BAG,False,,,BAG,CAG GAG,QE,SAG,CAG GAG,QE,SAG +AAG,K,1,CGT,ACG AGG ATG,TRM,B,ABG,ACG AGG ATG,TRM,ABG,False,,,ABG,ACG AGG ATG,TRM,ABG,ACG AGG ATG,TRM,ABG +AAG,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAA AAT,KN,AAW,AAT,N,AAT,AAA AAT,KN,AAW +AAT,N,0,CGT,CAT GAT TAT,HDY,B,BAT,CAT GAT TAT,HDY,BAT,False,,,BAT,CAT GAT TAT,HDY,BAT,CAT GAT TAT,HDY,BAT +AAT,N,1,CGT,ACT AGT ATT,TSI,B,ABT,ACT AGT ATT,TSI,ABT,False,,,ABT,ACT AGT ATT,TSI,ABT,ACT AGT ATT,TSI,ABT +AAT,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA 
AAC,KN,AAM,AAA,K,AAA,AAA AAC,KN,AAM +ACA,T,0,CGT,CCA GCA TCA,SAP,B,BCA,CCA GCA TCA,PAS,BCA,False,,,BCA,CCA GCA TCA,PAS,BCA,CCA GCA TCA,PAS,BCA +ACA,T,1,AGT,AAA AGA ATA,RIK,D,ADA,AAA AGA ATA,KRI,ADA,False,,,ADA,AAA AGA ATA,KRI,ADA,AAA AGA ATA,KRI,ADA +ACA,T,2,,,,A,ACA,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACC,T,0,CGT,CCC GCC TCC,SAP,B,BCC,CCC GCC TCC,PAS,BCC,False,,,BCC,CCC GCC TCC,PAS,BCC,CCC GCC TCC,PAS,BCC +ACC,T,1,AGT,AAC AGC ATC,NSI,D,ADC,AAC ATC,NI,AWC,False,,,AWC,AAC ATC,NI,AWC,AAC ATC,NI,AWC +ACC,T,2,,,,C,ACC,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACG,T,0,CGT,CCG GCG TCG,SAP,B,BCG,CCG GCG TCG,PAS,BCG,False,,,BCG,CCG GCG TCG,PAS,BCG,CCG GCG TCG,PAS,BCG +ACG,T,1,AGT,AAG AGG ATG,RMK,D,ADG,AAG AGG ATG,KRM,ADG,False,,,ADG,AAG AGG ATG,KRM,ADG,AAG AGG ATG,KRM,ADG +ACG,T,2,,,,G,ACG,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACT,T,0,CGT,CCT GCT TCT,SAP,B,BCT,CCT GCT TCT,PAS,BCT,False,,,BCT,CCT GCT TCT,PAS,BCT,CCT GCT TCT,PAS,BCT +ACT,T,1,AGT,AAT AGT ATT,NSI,D,ADT,AAT ATT,NI,AWT,False,,,AWT,AAT ATT,NI,AWT,AAT ATT,NI,AWT +ACT,T,2,,,,T,ACT,,,,True,ACA,T,ACA,,,,ACA,T,ACA +AGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,False,,,KGA,GGA,G,GGA,GGA,G,GGA +AGA,R,1,ACT,AAA ACA ATA,TIK,H,AHA,AAA ACA ATA,KTI,AHA,False,,,AHA,AAA ACA ATA,KTI,AHA,AAA ACA ATA,KTI,AHA +AGA,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGG AGT,RS,AGK,AGT,S,AGT,AGG AGT,RS,AGK +AGC,S,0,CGT,CGC GGC TGC,RCG,B,BGC,GGC TGC,GC,KGC,False,,,KGC,GGC TGC,GC,KGC,GGC TGC,GC,KGC +AGC,S,1,ACT,AAC ACC ATC,NTI,H,AHC,AAC ACC ATC,NTI,AHC,False,,,AHC,AAC ACC ATC,NTI,AHC,AAC ACC ATC,NTI,AHC +AGC,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGT,RS,AGW,AGA,R,AGA,AGA AGT,RS,AGW +AGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,False,,,KGG,GGG TGG,GW,KGG,GGG TGG,GW,KGG +AGG,R,1,ACT,AAG ACG ATG,TMK,H,AHG,AAG ACG ATG,KTM,AHG,False,,,AHG,AAG ACG ATG,KTM,AHG,AAG ACG ATG,KTM,AHG +AGG,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGA AGT,RS,AGW,AGT,S,AGT,AGA AGT,RS,AGW +AGT,S,0,CGT,CGT GGT TGT,RCG,B,BGT,GGT TGT,GC,KGT,False,,,KGT,GGT TGT,GC,KGT,GGT TGT,GC,KGT +AGT,S,1,ACT,AAT ACT 
ATT,NTI,H,AHT,AAT ACT ATT,NTI,AHT,False,,,AHT,AAT ACT ATT,NTI,AHT,AAT ACT ATT,NTI,AHT +AGT,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGC,RS,AGM,AGA,R,AGA,AGA AGC,RS,AGM +ATA,I,0,CGT,CTA GTA TTA,LV,B,BTA,TTA GTA,LV,KTA,False,,,KTA,TTA GTA,LV,KTA,TTA GTA,LV,KTA +ATA,I,1,ACG,AAA ACA AGA,TRK,V,AVA,AAA ACA AGA,KTR,AVA,False,,,AVA,AAA ACA AGA,KTR,AVA,AAA ACA AGA,KTR,AVA +ATA,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATC,I,0,CGT,CTC GTC TTC,LFV,B,BTC,CTC GTC TTC,LVF,BTC,False,,,BTC,CTC GTC TTC,LVF,BTC,CTC GTC TTC,LVF,BTC +ATC,I,1,ACG,AAC ACC AGC,NST,V,AVC,AAC ACC AGC,NTS,AVC,False,,,AVC,AAC ACC AGC,NTS,AVC,AAC ACC AGC,NTS,AVC +ATC,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATG,M,0,CGT,CTG GTG TTG,LV,B,BTG,TTA GTA,LV,KTG,False,,,KTG,TTA GTA,LV,KTG,TTA GTA,LV,KTG +ATG,M,1,ACG,AAG ACG AGG,TRK,V,AVG,AAG ACG AGG,KTR,AVG,False,,,AVG,AAG ACG AGG,KTR,AVG,AAG ACG AGG,KTR,AVG +ATG,M,2,ACT,ATA ATC ATT,I,H,ATH,ATT,I,ATT,True,ATG ATT,MI,ATK,ATT,I,ATT,ATG ATT,MI,ATK +ATT,I,0,CGT,CTT GTT TTT,LFV,B,BTT,CTT GTT TTT,LVF,BTT,False,,,BTT,CTT GTT TTT,LVF,BTT,CTT GTT TTT,LVF,BTT +ATT,I,1,ACG,AAT ACT AGT,NST,V,AVT,AAT ACT AGT,NTS,AVT,False,,,AVT,AAT ACT AGT,NTS,AVT,AAT ACT AGT,NTS,AVT +ATT,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATA ATG,IM,ATR,ATG,M,ATG,ATA ATG,IM,ATR +CAA,Q,0,AGT,AAA GAA TAA,*EK,D,DAA,AAA GAA TAA,KE*,DAA,False,,,DAA,AAA GAA,KE,RAA,AAA GAA,KE,RAA +CAA,Q,1,CGT,CCA CGA CTA,RLP,B,CBA,CCA CGA CTA,PRL,CBA,False,,,CBA,CCA CGA CTA,PRL,CBA,CCA CGA CTA,PRL,CBA +CAA,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAG CAT,QH,CAK,CAT,H,CAT,CAG CAT,QH,CAK +CAC,H,0,AGT,AAC GAC TAC,NDY,D,DAC,AAC GAC TAC,NDY,DAC,False,,,DAC,AAC GAC TAC,NDY,DAC,AAC GAC TAC,NDY,DAC +CAC,H,1,CGT,CCC CGC CTC,RLP,B,CBC,CCC CGC CTC,PRL,CBC,False,,,CBC,CCC CGC CTC,PRL,CBC,CCC CGC CTC,PRL,CBC +CAC,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAT,QH,CAW,CAA,Q,CAA,CAA CAT,QH,CAW +CAG,Q,0,AGT,AAG GAG TAG,*EK,D,DAG,AAG GAG TAG,KE*,DAG,False,,,DAG,AAG GAG,KE,RAG,AAG GAG,KE,RAG 
+CAG,Q,1,CGT,CCG CGG CTG,RLP,B,CBG,CCG CGG CTG,PRL,CBG,False,,,CBG,CCG CGG CTG,PRL,CBG,CCG CGG CTG,PRL,CBG +CAG,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAA CAT,QH,CAW,CAT,H,CAT,CAA CAT,QH,CAW +CAT,H,0,AGT,AAT GAT TAT,NDY,D,DAT,AAT GAT TAT,NDY,DAT,False,,,DAT,AAT GAT TAT,NDY,DAT,AAT GAT TAT,NDY,DAT +CAT,H,1,CGT,CCT CGT CTT,RLP,B,CBT,CCT CGT CTT,PRL,CBT,False,,,CBT,CCT CGT CTT,PRL,CBT,CCT CGT CTT,PRL,CBT +CAT,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAC,QH,CAM,CAA,Q,CAA,CAA CAC,QH,CAM +CCA,P,0,AGT,ACA GCA TCA,TSA,D,DCA,ACA GCA TCA,TAS,DCA,False,,,DCA,ACA GCA TCA,TAS,DCA,ACA GCA TCA,TAS,DCA +CCA,P,1,AGT,CAA CGA CTA,RQL,D,CDA,CAA CGA CTA,QRL,CDA,False,,,CDA,CAA CGA CTA,QRL,CDA,CAA CGA CTA,QRL,CDA +CCA,P,2,,,,A,CCA,,,,True,CCT,P,CCT,,,,CCT,P,CCT +CCC,P,0,AGT,ACC GCC TCC,TSA,D,DCC,ACC GCC TCC,TAS,DCC,False,,,DCC,ACC GCC TCC,TAS,DCC,ACC GCC TCC,TAS,DCC +CCC,P,1,AGT,CAC CGC CTC,RHL,D,CDC,CAC CGC CTC,HRL,CDC,False,,,CDC,CAC CGC CTC,HRL,CDC,CAC CGC CTC,HRL,CDC +CCC,P,2,,,,C,CCC,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCG,P,0,AGT,ACG GCG TCG,TSA,D,DCG,ACG GCG TCG,TAS,DCG,False,,,DCG,ACG GCG TCG,TAS,DCG,ACG GCG TCG,TAS,DCG +CCG,P,1,AGT,CAG CGG CTG,RQL,D,CDG,CAG CGG CTG,QRL,CDG,False,,,CDG,CAG CGG CTG,QRL,CDG,CAG CGG CTG,QRL,CDG +CCG,P,2,,,,G,CCG,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCT,P,0,AGT,ACT GCT TCT,TSA,D,DCT,ACT GCT TCT,TAS,DCT,False,,,DCT,ACT GCT TCT,TAS,DCT,ACT GCT TCT,TAS,DCT +CCT,P,1,AGT,CAT CGT CTT,RHL,D,CDT,CAT CGT CTT,HRL,CDT,False,,,CDT,CAT CGT CTT,HRL,CDT,CAT CGT CTT,HRL,CDT +CCT,P,2,,,,T,CCT,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,True,AGA GGA TGA,RG*,DGA,GGA,G,GGA,AGA GGA,RG,RGA +CGA,R,1,ACT,CAA CCA CTA,LQP,H,CHA,CAA CCA CTA,QPL,CHA,False,,,CHA,CAA CCA CTA,QPL,CHA,CAA CCA CTA,QPL,CHA +CGA,R,2,,,,A,CGA,,,,False,,,,,,,,, +CGC,R,0,AGT,AGC GGC TGC,SCG,D,DGC,AGC GGC TGC,SGC,DGC,False,,,DGC,AGC GGC TGC,SGC,DGC,AGC GGC TGC,SGC,DGC +CGC,R,1,ACT,CAC CCC CTC,LHP,H,CHC,CAC CCC CTC,HPL,CHC,False,,,CHC,CAC CCC CTC,HPL,CHC,CAC CCC CTC,HPL,CHC 
+CGC,R,2,,,,C,CGC,,,,True,CGT,R,CGT,,,,CGT,R,CGT +CGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,True,AGG GGG TGG,RGW,DGG,GGG TGG,GW,KGG,AGG GGG TGG,RGW,DGG +CGG,R,1,ACT,CAG CCG CTG,LQP,H,CHG,CAG CCG CTG,QPL,CHG,False,,,CHG,CAG CCG CTG,QPL,CHG,CAG CCG CTG,QPL,CHG +CGG,R,2,,,,G,CGG,,,,False,,,,,,,,, +CGT,R,0,AGT,AGT GGT TGT,SCG,D,DGT,AGT GGT TGT,SGC,DGT,False,,,DGT,AGT GGT TGT,SGC,DGT,AGT GGT TGT,SGC,DGT +CGT,R,1,ACT,CAT CCT CTT,LHP,H,CHT,CAT CCT CTT,HPL,CHT,False,,,CHT,CAT CCT CTT,HPL,CHT,CAT CCT CTT,HPL,CHT +CGT,R,2,,,,T,CGT,,,,True,CGA,R,CGA,,,,CGA,R,CGA +CTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,True,ATA GTA TTA,IVL,DTA,ATA GTA,IV,RTA,ATA GTA TTA,IVL,DTA +CTA,L,1,ACG,CAA CCA CGA,RQP,V,CVA,CAA CCA CGA,QPR,CVA,False,,,CVA,CAA CCA CGA,QPR,CVA,CAA CCA CGA,QPR,CVA +CTA,L,2,,,,A,CTA,,,,False,,,,,,,,, +CTC,L,0,AGT,ATC GTC TTC,FIV,D,DTC,ATC GTC TTC,IVF,DTC,False,,,DTC,ATC GTC TTC,IVF,DTC,ATC GTC TTC,IVF,DTC +CTC,L,1,ACG,CAC CCC CGC,RHP,V,CVC,CAC CCC CGC,HPR,CVC,False,,,CVC,CAC CCC CGC,HPR,CVC,CAC CCC CGC,HPR,CVC +CTC,L,2,,,,C,CTC,,,,True,CTA,L,CTA,,,,CTA,L,CTA +CTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,True,ATG GTG TTG,MVL,DTG,ATG GTG,MV,RTG,ATG GTG TTG,MVL,DTG +CTG,L,1,ACG,CAG CCG CGG,RQP,V,CVG,CAG CCG CGG,QPR,CVG,False,,,CVG,CAG CCG CGG,QPR,CVG,CAG CCG CGG,QPR,CVG +CTG,L,2,,,,G,CTG,,,,False,,,,,,,,, +CTT,L,0,AGT,ATT GTT TTT,FIV,D,DTT,ATT GTT TTT,IVF,DTT,False,,,DTT,ATT GTT TTT,IVF,DTT,ATT GTT TTT,IVF,DTT +CTT,L,1,ACG,CAT CCT CGT,RHP,V,CVT,CAT CCT CGT,HPR,CVT,False,,,CVT,CAT CCT CGT,HPR,CVT,CAT CCT CGT,HPR,CVT +CTT,L,2,,,,T,CTT,,,,True,CTA,L,CTA,,,,CTA,L,CTA +GAA,E,0,ACT,AAA CAA TAA,*QK,H,HAA,AAA CAA TAA,KQ*,HAA,False,,,HAA,AAA CAA,KQ,MAA,AAA CAA,KQ,MAA +GAA,E,1,CGT,GCA GGA GTA,VAG,B,GBA,GCA GGA GTA,AGV,GBA,False,,,GBA,GCA GGA GTA,AGV,GBA,GCA GGA GTA,AGV,GBA +GAA,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAG GAT,ED,GAK,GAT,D,GAT,GAG GAT,ED,GAK +GAC,D,0,ACT,AAC CAC TAC,NHY,H,HAC,AAC CAC TAC,NHY,HAC,False,,,HAC,AAC CAC TAC,NHY,HAC,AAC CAC TAC,NHY,HAC +GAC,D,1,CGT,GCC GGC 
GTC,VAG,B,GBC,GCC GGC GTC,AGV,GBC,False,,,GBC,GCC GGC GTC,AGV,GBC,GCC GGC GTC,AGV,GBC +GAC,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAT,ED,GAW,GAA,E,GAA,GAA GAT,ED,GAW +GAG,E,0,ACT,AAG CAG TAG,*QK,H,HAG,AAG CAG TAG,KQ*,HAG,False,,,HAG,AAG CAG,KQ,MAG,AAG CAG,KQ,MAG +GAG,E,1,CGT,GCG GGG GTG,VAG,B,GBG,GCG GGG GTG,AGV,GBG,False,,,GBG,GCG GGG GTG,AGV,GBG,GCG GGG GTG,AGV,GBG +GAG,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAA GAT,ED,GAW,GAT,D,GAT,GAA GAT,ED,GAW +GAT,D,0,ACT,AAT CAT TAT,NHY,H,HAT,AAT CAT TAT,NHY,HAT,False,,,HAT,AAT CAT TAT,NHY,HAT,AAT CAT TAT,NHY,HAT +GAT,D,1,CGT,GCT GGT GTT,VAG,B,GBT,GCT GGT GTT,AGV,GBT,False,,,GBT,GCT GGT GTT,AGV,GBT,GCT GGT GTT,AGV,GBT +GAT,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAC,ED,GAM,GAA,E,GAA,GAA GAC,ED,GAM +GCA,A,0,ACT,ACA CCA TCA,TSP,H,HCA,ACA CCA TCA,TPS,HCA,False,,,HCA,ACA CCA TCA,TPS,HCA,ACA CCA TCA,TPS,HCA +GCA,A,1,AGT,GAA GGA GTA,GVE,D,GDA,GAA GGA GTA,EGV,GDA,False,,,GDA,GAA GGA GTA,EGV,GDA,GAA GGA GTA,EGV,GDA +GCA,A,2,,,,A,GCA,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCC,A,0,ACT,ACC CCC TCC,TSP,H,HCC,ACC CCC TCC,TPS,HCC,False,,,HCC,ACC CCC TCC,TPS,HCC,ACC CCC TCC,TPS,HCC +GCC,A,1,AGT,GAC GGC GTC,VDG,D,GDC,GAC GGC GTC,DGV,GDC,False,,,GDC,GAC GGC GTC,DGV,GDC,GAC GGC GTC,DGV,GDC +GCC,A,2,,,,C,GCC,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCG,A,0,ACT,ACG CCG TCG,TSP,H,HCG,ACG CCG TCG,TPS,HCG,False,,,HCG,ACG CCG TCG,TPS,HCG,ACG CCG TCG,TPS,HCG +GCG,A,1,AGT,GAG GGG GTG,GVE,D,GDG,GAG GGG GTG,EGV,GDG,False,,,GDG,GAG GGG GTG,EGV,GDG,GAG GGG GTG,EGV,GDG +GCG,A,2,,,,G,GCG,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCT,A,0,ACT,ACT CCT TCT,TSP,H,HCT,ACT CCT TCT,TPS,HCT,False,,,HCT,ACT CCT TCT,TPS,HCT,ACT CCT TCT,TPS,HCT +GCT,A,1,AGT,GAT GGT GTT,VDG,D,GDT,GAT GGT GTT,DGV,GDT,False,,,GDT,GAT GGT GTT,DGV,GDT,GAT GGT GTT,DGV,GDT +GCT,A,2,,,,T,GCT,,,,True,GCA,A,GCA,,,,GCA,A,GCA +GGA,G,0,ACT,AGA CGA TGA,R*,H,HGA,AGA TGA,R*,WGA,False,,,WGA,AGA,R,AGA,AGA,R,AGA +GGA,G,1,ACT,GAA GCA GTA,VAE,H,GHA,GAA GCA GTA,EAV,GHA,False,,,GHA,GAA GCA GTA,EAV,GHA,GAA GCA GTA,EAV,GHA 
+GGA,G,2,,,,A,GGA,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGC,G,0,ACT,AGC CGC TGC,RSC,H,HGC,AGC CGC TGC,SRC,HGC,False,,,HGC,AGC CGC TGC,SRC,HGC,AGC CGC TGC,SRC,HGC +GGC,G,1,ACT,GAC GCC GTC,ADV,H,GHC,GAC GCC GTC,DAV,GHC,False,,,GHC,GAC GCC GTC,DAV,GHC,GAC GCC GTC,DAV,GHC +GGC,G,2,,,,C,GGC,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGG,G,0,ACT,AGG CGG TGG,RW,H,HGG,AGG TGG,RW,WGG,False,,,WGG,AGG TGG,RW,WGG,AGG TGG,RW,WGG +GGG,G,1,ACT,GAG GCG GTG,VAE,H,GHG,GAG GCG GTG,EAV,GHG,False,,,GHG,GAG GCG GTG,EAV,GHG,GAG GCG GTG,EAV,GHG +GGG,G,2,,,,G,GGG,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGT,G,0,ACT,AGT CGT TGT,RSC,H,HGT,AGT CGT TGT,SRC,HGT,False,,,HGT,AGT CGT TGT,SRC,HGT,AGT CGT TGT,SRC,HGT +GGT,G,1,ACT,GAT GCT GTT,ADV,H,GHT,GAT GCT GTT,DAV,GHT,False,,,GHT,GAT GCT GTT,DAV,GHT,GAT GCT GTT,DAV,GHT +GGT,G,2,,,,T,GGT,,,,True,GGA,G,GGA,,,,GGA,G,GGA +GTA,V,0,ACT,ATA CTA TTA,LI,H,HTA,ATA TTA,IL,WTA,False,,,WTA,ATA TTA,IL,WTA,ATA TTA,IL,WTA +GTA,V,1,ACG,GAA GCA GGA,GAE,V,GVA,GAA GCA GGA,EAG,GVA,False,,,GVA,GAA GCA GGA,EAG,GVA,GAA GCA GGA,EAG,GVA +GTA,V,2,,,,A,GTA,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTC,V,0,ACT,ATC CTC TTC,LFI,H,HTC,ATC CTC TTC,ILF,HTC,False,,,HTC,ATC CTC TTC,ILF,HTC,ATC CTC TTC,ILF,HTC +GTC,V,1,ACG,GAC GCC GGC,ADG,V,GVC,GAC GCC GGC,DAG,GVC,False,,,GVC,GAC GCC GGC,DAG,GVC,GAC GCC GGC,DAG,GVC +GTC,V,2,,,,C,GTC,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTG,V,0,ACT,ATG CTG TTG,LM,H,HTG,ATG TTG,ML,WTG,False,,,WTG,ATG TTG,ML,WTG,ATG TTG,ML,WTG +GTG,V,1,ACG,GAG GCG GGG,GAE,V,GVG,GAG GCG GGG,EAG,GVG,False,,,GVG,GAG GCG GGG,EAG,GVG,GAG GCG GGG,EAG,GVG +GTG,V,2,,,,G,GTG,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTT,V,0,ACT,ATT CTT TTT,LFI,H,HTT,ATT CTT TTT,ILF,HTT,False,,,HTT,ATT CTT TTT,ILF,HTT,ATT CTT TTT,ILF,HTT +GTT,V,1,ACG,GAT GCT GGT,ADG,V,GVT,GAT GCT GGT,DAG,GVT,False,,,GVT,GAT GCT GGT,DAG,GVT,GAT GCT GGT,DAG,GVT +GTT,V,2,,,,T,GTT,,,,True,GTA,V,GTA,,,,GTA,V,GTA +TAA,*,0,ACG,AAA CAA GAA,QEK,V,VAA,AAA CAA GAA,KQE,VAA,False,,,VAA,AAA CAA GAA,KQE,VAA,AAA CAA GAA,KQE,VAA +TAA,*,1,CT,TCA TTA,LS,Y,TYA,TCA 
TTA,SL,TYA,False,,,TYA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TAA,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAG TAT,*Y,TAK,TAT,Y,TAT,TAT,Y,TAT +TAC,Y,0,ACG,AAC CAC GAC,NHD,V,VAC,AAC CAC GAC,NHD,VAC,False,,,VAC,AAC CAC GAC,NHD,VAC,AAC CAC GAC,NHD,VAC +TAC,Y,1,CGT,TCC TGC TTC,SFC,B,TBC,TCC TGC TTC,SCF,TBC,False,,,TBC,TCC TGC TTC,SCF,TBC,TCC TGC TTC,SCF,TBC +TAC,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAT,*Y,TAW,,,,TAT,Y,TAT +TAG,*,0,ACG,AAG CAG GAG,QEK,V,VAG,AAG CAG GAG,KQE,VAG,False,,,VAG,AAG CAG GAG,KQE,VAG,AAG CAG GAG,KQE,VAG +TAG,*,1,CGT,TCG TGG TTG,LSW,B,TBG,TCG TGG TTG,SWL,TBG,False,,,TBG,TCG TGG TTG,SWL,TBG,TCG TGG TTG,SWL,TBG +TAG,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAA TAT,*Y,TAW,TAT,Y,TAT,TAT,Y,TAT +TAT,Y,0,ACG,AAT CAT GAT,NHD,V,VAT,AAT CAT GAT,NHD,VAT,False,,,VAT,AAT CAT GAT,NHD,VAT,AAT CAT GAT,NHD,VAT +TAT,Y,1,CGT,TCT TGT TTT,SFC,B,TBT,TCT TGT TTT,SCF,TBT,False,,,TBT,TCT TGT TTT,SCF,TBT,TCT TGT TTT,SCF,TBT +TAT,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAC,*Y,TAM,,,,TAC,Y,TAC +TCA,S,0,ACG,ACA CCA GCA,TAP,V,VCA,ACA CCA GCA,TPA,VCA,False,,,VCA,ACA CCA GCA,TPA,VCA,ACA CCA GCA,TPA,VCA +TCA,S,1,AGT,TAA TGA TTA,L*,D,TDA,TAA TTA,*L,TWA,False,,,TWA,TTA,L,TTA,TTA,L,TTA +TCA,S,2,,,,A,TCA,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCC,S,0,ACG,ACC CCC GCC,TAP,V,VCC,ACC CCC GCC,TPA,VCC,False,,,VCC,ACC CCC GCC,TPA,VCC,ACC CCC GCC,TPA,VCC +TCC,S,1,AGT,TAC TGC TTC,FCY,D,TDC,TAC TGC TTC,YCF,TDC,False,,,TDC,TAC TGC TTC,YCF,TDC,TAC TGC TTC,YCF,TDC +TCC,S,2,,,,C,TCC,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCG,S,0,ACG,ACG CCG GCG,TAP,V,VCG,ACG CCG GCG,TPA,VCG,False,,,VCG,ACG CCG GCG,TPA,VCG,ACG CCG GCG,TPA,VCG +TCG,S,1,AGT,TAG TGG TTG,L*W,D,TDG,TAG TGG TTG,*WL,TDG,False,,,TDG,TGG TTG,WL,TKG,TGG TTG,WL,TKG +TCG,S,2,,,,G,TCG,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCT,S,0,ACG,ACT CCT GCT,TAP,V,VCT,ACT CCT GCT,TPA,VCT,False,,,VCT,ACT CCT GCT,TPA,VCT,ACT CCT GCT,TPA,VCT +TCT,S,1,AGT,TAT TGT TTT,FCY,D,TDT,TAT TGT TTT,YCF,TDT,False,,,TDT,TAT TGT TTT,YCF,TDT,TAT TGT TTT,YCF,TDT 
+TCT,S,2,,,,T,TCT,,,,True,TCA,S,TCA,,,,TCA,S,TCA +TGA,*,0,ACG,AGA CGA GGA,RG,V,VGA,AGA GGA,RG,RGA,False,,,RGA,AGA GGA,RG,RGA,AGA GGA,RG,RGA +TGA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,True,TAA TCA TTA,*SL,THA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TGA,*,2,CGT,TGC TGG TGT,CW,B,TGB,TGT TGG,CW,TGK,False,,,TGK,TGT TGG,CW,TGK,TGT TGG,CW,TGK +TGC,C,0,ACG,AGC CGC GGC,RSG,V,VGC,CGC GGC,RG,SGC,False,,,SGC,CGC GGC,RG,SGC,CGC GGC,RG,SGC +TGC,C,1,ACT,TAC TCC TTC,SFY,H,THC,TAC TCC TTC,YSF,THC,False,,,THC,TAC TCC TTC,YSF,THC,TAC TCC TTC,YSF,THC +TGC,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGG TGT,*WC,TGD,TGG,W,TGG,TGG TGT,WC,TGK +TGG,W,0,ACG,AGG CGG GGG,RG,V,VGG,AGG GGG,RG,RGG,False,,,RGG,AGG GGG,RG,RGG,AGG GGG,RG,RGG +TGG,W,1,ACT,TAG TCG TTG,L*S,H,THG,TAG TCG TTG,*SL,THG,False,,,THG,TCG TTG,SL,TYG,TCG TTG,SL,TYG +TGG,W,2,ACT,TGA TGC TGT,*C,H,TGH,TGA TGC,*C,TGM,True,TGC TGG TGT,CWC,TGB,TGC,C,TGC,TGC TGG TGT,CWC,TGB +TGT,C,0,ACG,AGT CGT GGT,RSG,V,VGT,CGT GGT,RG,SGT,False,,,SGT,CGT GGT,RG,SGT,CGT GGT,RG,SGT +TGT,C,1,ACT,TAT TCT TTT,SFY,H,THT,TAT TCT TTT,YSF,THT,False,,,THT,TAT TCT TTT,YSF,THT,TAT TCT TTT,YSF,THT +TGT,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGC TGG,*CW,TGV,TGG,W,TGG,TGC TGG,CW,TGS +TTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,False,,,RTA,ATA GTA,IV,RTA,ATA GTA,IV,RTA +TTA,L,1,ACG,TAA TCA TGA,*S,V,TVA,TAA TCA,*S,TMA,False,,,TMA,TCA,S,TCA,TCA,S,TCA +TTA,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTG TTT,LF,TTK,TTT,F,TTT,TTG TTT,LF,TTK +TTC,F,0,ACG,ATC CTC GTC,LIV,V,VTC,ATC GTC,IV,RTC,False,,,RTC,ATC GTC,IV,RTC,ATC GTC,IV,RTC +TTC,F,1,ACG,TAC TCC TGC,SCY,V,TVC,TAC TCC TGC,YSC,TVC,False,,,TVC,TAC TCC TGC,YSC,TVC,TAC TCC TGC,YSC,TVC +TTC,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTG TTT,LF,TTK,TTG,L,TTG,TTG TTT,LF,TTK +TTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,False,,,RTG,ATG GTG,MV,RTG,ATG GTG,MV,RTG +TTG,L,1,ACG,TAG TCG TGG,*SW,V,TVG,TAG TCG TGG,*SW,TVG,False,,,TVG,TCG TGG,SW,TSG,TCG TGG,SW,TSG +TTG,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTA TTT,LF,TTW,TTT,F,TTT,TTA 
TTT,LF,TTW +TTT,F,0,ACG,ATT CTT GTT,LIV,V,VTT,ATT GTT,IV,RTT,False,,,RTT,ATT GTT,IV,RTT,ATT GTT,IV,RTT +TTT,F,1,ACG,TAT TCT TGT,SCY,V,TVT,TAT TCT TGT,YSC,TVT,False,,,TVT,TAT TCT TGT,YSC,TVT,TAT TCT TGT,YSC,TVT +TTT,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTC TTG,FL,TTS,TTG,L,TTG,TTC TTG,FL,TTS diff --git a/main_package/data/bespoke_codon_table.csv b/main_package/data/bespoke_codon_table.csv new file mode 100644 index 0000000..365c3d0 --- /dev/null +++ b/main_package/data/bespoke_codon_table.csv @@ -0,0 +1,193 @@ +codon,aa,position,missense_nuc,missense_codons,missense_aa,missense_iupac,missense_iupac_codon,sele_codons,sele_notes,syn_bool,syn_iupac_codon,syn_notes +AAA,K,0,CGT,CAA GAA TAA,*QE,B,BAA,CAA GAA TAA,y,,, +AAA,K,1,CGT,ACA AGA ATA,TRI,B,ABA,ACA AGA ATA,y,,, +AAA,K,2,CT,AAC AAT,N,Y,AAY,AAT,pick one,TRUE,AAK, +AAC,N,0,CGT,CAC GAC TAC,HDY,B,BAC,CAC GAC TAC,y,,, +AAC,N,1,CGT,ACC AGC ATC,TSI,B,ABC,ACC AGC ATC,y,,, +AAC,N,2,AG,AAA AAG,K,R,AAR,AAA,pick one,TRUE,AAW, +AAG,K,0,CGT,CAG GAG TAG,*QE,B,BAG,CAG GAG TAG,y,,, +AAG,K,1,CGT,ACG AGG ATG,TRM,B,ABG,ACG AGG ATG,y,,, +AAG,K,2,CT,AAC AAT,N,Y,AAY,AAT,pick one,TRUE,AAW, +AAT,N,0,CGT,CAT GAT TAT,HDY,B,BAT,CAT GAT TAT,y,,, +AAT,N,1,CGT,ACT AGT ATT,TSI,B,ABT,ACT AGT ATT,y,,, +AAT,N,2,AG,AAA AAG,K,R,AAR,AAA,pick one,TRUE,AAM, +ACA,T,0,CGT,CCA GCA TCA,SAP,B,BCA,CCA GCA TCA,y,,, +ACA,T,1,AGT,AAA AGA ATA,RIK,D,ADA,AAA AGA ATA,y,,, +ACA,T,2,,,,A,ACA,,,TRUE,ACT, +ACC,T,0,CGT,CCC GCC TCC,SAP,B,BCC,CCC GCC TCC,pick S,,,not the best S codon +ACC,T,1,AGT,AAC AGC ATC,NSI,D,ADC,AAC ATC,pick S,,, +ACC,T,2,,,,C,ACC,,,TRUE,ACT, +ACG,T,0,CGT,CCG GCG TCG,SAP,B,BCG,CCG GCG TCG,y,,, +ACG,T,1,AGT,AAG AGG ATG,RMK,D,ADG,AAG AGG ATG,y,,, +ACG,T,2,,,,G,ACG,,,TRUE,ACT, +ACT,T,0,CGT,CCT GCT TCT,SAP,B,BCT,CCT GCT TCT,pick S,,, +ACT,T,1,AGT,AAT AGT ATT,NSI,D,ADT,AAT ATT,pick S,,, +ACT,T,2,,,,T,ACT,,,TRUE,ACA, +AGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,y,,, +AGA,R,1,ACT,AAA ACA ATA,TIK,H,AHA,AAA ACA ATA,y,,, +AGA,R,2,CT,AGC AGT,S,Y,AGY,AGT,pick 
one,TRUE,AGK,not the best S codon +AGC,S,0,CGT,CGC GGC TGC,RCG,B,BGC,GGC TGC,pick R,,, +AGC,S,1,ACT,AAC ACC ATC,NTI,H,AHC,AAC ACC ATC,y,,, +AGC,S,2,AG,AGA AGG,R,R,AGR,AGA,CUT (unless optimum codon),TRUE,AGW, +AGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,y,,, +AGG,R,1,ACT,AAG ACG ATG,TMK,H,AHG,AAG ACG ATG,y,,, +AGG,R,2,CT,AGC AGT,S,Y,AGY,AGT,pick one,TRUE,AGW,not the best S codon +AGT,S,0,CGT,CGT GGT TGT,RCG,B,BGT,GGT TGT,CUT R,,, +AGT,S,1,ACT,AAT ACT ATT,NTI,H,AHT,AAT ACT ATT,y,,, +AGT,S,2,AG,AGA AGG,R,R,AGR,AGA,pick one,TRUE,AGM, +ATA,I,0,CGT,CTA GTA TTA,LV,B,BTA,TTA GTA,pick one L,,, +ATA,I,1,ACG,AAA ACA AGA,TRK,V,AVA,AAA ACA AGA,y,,, +ATA,I,2,G,ATG,M,G,ATG,ATG,y,TRUE,ATK, +ATC,I,0,CGT,CTC GTC TTC,LFV,B,BTC,CTC GTC TTC,y,,, +ATC,I,1,ACG,AAC ACC AGC,NST,V,AVC,AAC ACC AGC,y,,, +ATC,I,2,G,ATG,M,G,ATG,ATG,y,TRUE,ATK, +ATG,M,0,CGT,CTG GTG TTG,LV,B,BTG,TTA GTA,SLIM,,, +ATG,M,1,ACG,AAG ACG AGG,TRK,V,AVG,AAG ACG AGG,y,,, +ATG,M,2,ACT,ATA ATC ATT,I,H,ATH,ATT,pick one I,TRUE,ATK,"no synonymous changes for M, but would be good to see variants surrounding?" 
+ATT,I,0,CGT,CTT GTT TTT,LFV,B,BTT,CTT GTT TTT,y,,, +ATT,I,1,ACG,AAT ACT AGT,NST,V,AVT,AAT ACT AGT,y,,, +ATT,I,2,G,ATG,M,G,ATG,ATG,y,TRUE,ATR, +CAA,Q,0,AGT,AAA GAA TAA,*EK,D,DAA,AAA GAA TAA,y,,, +CAA,Q,1,CGT,CCA CGA CTA,RLP,B,CBA,CCA CGA CTA,y,,, +CAA,Q,2,CT,CAC CAT,H,Y,CAY,CAT,pick one,TRUE,CAK, +CAC,H,0,AGT,AAC GAC TAC,NDY,D,DAC,AAC GAC TAC,y,,, +CAC,H,1,CGT,CCC CGC CTC,RLP,B,CBC,CCC CGC CTC,y,,, +CAC,H,2,AG,CAA CAG,Q,R,CAR,CAA,pick one,TRUE,CAW, +CAG,Q,0,AGT,AAG GAG TAG,*EK,D,DAG,AAG GAG TAG,y,,, +CAG,Q,1,CGT,CCG CGG CTG,RLP,B,CBG,CCG CGG CTG,y,,, +CAG,Q,2,CT,CAC CAT,H,Y,CAY,CAT,pick one,TRUE,CAW, +CAT,H,0,AGT,AAT GAT TAT,NDY,D,DAT,AAT GAT TAT,y,,, +CAT,H,1,CGT,CCT CGT CTT,RLP,B,CBT,CCT CGT CTT,y,,, +CAT,H,2,AG,CAA CAG,Q,R,CAR,CAA,pick one,TRUE,CAM, +CCA,P,0,AGT,ACA GCA TCA,TSA,D,DCA,ACA GCA TCA,y,,, +CCA,P,1,AGT,CAA CGA CTA,RQL,D,CDA,CAA CGA CTA,y,,, +CCA,P,2,,,,A,CCA,,,TRUE,CCT, +CCC,P,0,AGT,ACC GCC TCC,TSA,D,DCC,ACC GCC TCC,y,,, +CCC,P,1,AGT,CAC CGC CTC,RHL,D,CDC,CAC CGC CTC,y,,, +CCC,P,2,,,,C,CCC,,,TRUE,CCA, +CCG,P,0,AGT,ACG GCG TCG,TSA,D,DCG,ACG GCG TCG,y,,, +CCG,P,1,AGT,CAG CGG CTG,RQL,D,CDG,CAG CGG CTG,y,,, +CCG,P,2,,,,G,CCG,,,TRUE,CCA, +CCT,P,0,AGT,ACT GCT TCT,TSA,D,DCT,ACT GCT TCT,y,,, +CCT,P,1,AGT,CAT CGT CTT,RHL,D,CDT,CAT CGT CTT,y,,, +CCT,P,2,,,,T,CCT,,,TRUE,CCA, +CGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,y,TRUE,DGA,specially designed +CGA,R,1,ACT,CAA CCA CTA,LQP,H,CHA,CAA CCA CTA,y,,, +CGA,R,2,,,,A,CGA,,,,, +CGC,R,0,AGT,AGC GGC TGC,SCG,D,DGC,AGC GGC TGC,y,,, +CGC,R,1,ACT,CAC CCC CTC,LHP,H,CHC,CAC CCC CTC,y,,, +CGC,R,2,,,,C,CGC,,,TRUE,CGT,not best codon frequency +CGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,y,TRUE,DGG,specially designed +CGG,R,1,ACT,CAG CCG CTG,LQP,H,CHG,CAG CCG CTG,y,,, +CGG,R,2,,,,G,CGG,,,,, +CGT,R,0,AGT,AGT GGT TGT,SCG,D,DGT,AGT GGT TGT,y,,, +CGT,R,1,ACT,CAT CCT CTT,LHP,H,CHT,CAT CCT CTT,y,,, +CGT,R,2,,,,T,CGT,,,TRUE,CGA,poor codon frequency +CTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,y,TRUE,DTA,specially designed +CTA,L,1,ACG,CAA CCA 
CGA,RQP,V,CVA,CAA CCA CGA,y,,, +CTA,L,2,,,,A,CTA,,,,, +CTC,L,0,AGT,ATC GTC TTC,FIV,D,DTC,ATC GTC TTC,y,,, +CTC,L,1,ACG,CAC CCC CGC,RHP,V,CVC,CAC CCC CGC,y,,, +CTC,L,2,,,,C,CTC,,,TRUE,CTA, +CTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,y,TRUE,DTG,specially designed +CTG,L,1,ACG,CAG CCG CGG,RQP,V,CVG,CAG CCG CGG,y,,, +CTG,L,2,,,,G,CTG,,,,, +CTT,L,0,AGT,ATT GTT TTT,FIV,D,DTT,ATT GTT TTT,y,,, +CTT,L,1,ACG,CAT CCT CGT,RHP,V,CVT,CAT CCT CGT,y,,, +CTT,L,2,,,,T,CTT,,,TRUE,CTA, +GAA,E,0,ACT,AAA CAA TAA,*QK,H,HAA,AAA CAA TAA,y,,, +GAA,E,1,CGT,GCA GGA GTA,VAG,B,GBA,GCA GGA GTA,y,,, +GAA,E,2,CT,GAC GAT,D,Y,GAY,GAT,pick one,TRUE,GAK, +GAC,D,0,ACT,AAC CAC TAC,NHY,H,HAC,AAC CAC TAC,y,,, +GAC,D,1,CGT,GCC GGC GTC,VAG,B,GBC,GCC GGC GTC,y,,, +GAC,D,2,AG,GAA GAG,E,R,GAR,GAA,pick one,TRUE,GAW, +GAG,E,0,ACT,AAG CAG TAG,*QK,H,HAG,AAG CAG TAG,y,,, +GAG,E,1,CGT,GCG GGG GTG,VAG,B,GBG,GCG GGG GTG,y,,, +GAG,E,2,CT,GAC GAT,D,Y,GAY,GAT,pick one,TRUE,GAW, +GAT,D,0,ACT,AAT CAT TAT,NHY,H,HAT,AAT CAT TAT,y,,, +GAT,D,1,CGT,GCT GGT GTT,VAG,B,GBT,GCT GGT GTT,y,,, +GAT,D,2,AG,GAA GAG,E,R,GAR,GAA,pick one,TRUE,GAM, +GCA,A,0,ACT,ACA CCA TCA,TSP,H,HCA,ACA CCA TCA,y,,, +GCA,A,1,AGT,GAA GGA GTA,GVE,D,GDA,GAA GGA GTA,y,,, +GCA,A,2,,,,A,GCA,,,TRUE,GCT, +GCC,A,0,ACT,ACC CCC TCC,TSP,H,HCC,ACC CCC TCC,y,,, +GCC,A,1,AGT,GAC GGC GTC,VDG,D,GDC,GAC GGC GTC,y,,, +GCC,A,2,,,,C,GCC,,,TRUE,GCT, +GCG,A,0,ACT,ACG CCG TCG,TSP,H,HCG,ACG CCG TCG,y,,, +GCG,A,1,AGT,GAG GGG GTG,GVE,D,GDG,GAG GGG GTG,y,,, +GCG,A,2,,,,G,GCG,,,TRUE,GCT, +GCT,A,0,ACT,ACT CCT TCT,TSP,H,HCT,ACT CCT TCT,y,,, +GCT,A,1,AGT,GAT GGT GTT,VDG,D,GDT,GAT GGT GTT,y,,, +GCT,A,2,,,,T,GCT,,,TRUE,GCA, +GGA,G,0,ACT,AGA CGA TGA,R*,H,HGA,AGA TGA,SLIM,,, +GGA,G,1,ACT,GAA GCA GTA,VAE,H,GHA,GAA GCA GTA,y,,, +GGA,G,2,,,,A,GGA,,,TRUE,GGT, +GGC,G,0,ACT,AGC CGC TGC,RSC,H,HGC,AGC CGC TGC,y,,, +GGC,G,1,ACT,GAC GCC GTC,ADV,H,GHC,GAC GCC GTC,y,,, +GGC,G,2,,,,C,GGC,,,TRUE,GGT, +GGG,G,0,ACT,AGG CGG TGG,RW,H,HGG,AGG TGG,SLIM,,, +GGG,G,1,ACT,GAG GCG GTG,VAE,H,GHG,GAG GCG GTG,y,,, 
+GGG,G,2,,,,G,GGG,,,TRUE,GGT, +GGT,G,0,ACT,AGT CGT TGT,RSC,H,HGT,AGT CGT TGT,y,,, +GGT,G,1,ACT,GAT GCT GTT,ADV,H,GHT,GAT GCT GTT,y,,, +GGT,G,2,,,,T,GGT,,,TRUE,GGA, +GTA,V,0,ACT,ATA CTA TTA,LI,H,HTA,ATA TTA,SLIM,,, +GTA,V,1,ACG,GAA GCA GGA,GAE,V,GVA,GAA GCA GGA,y,,, +GTA,V,2,,,,A,GTA,,,TRUE,GTT, +GTC,V,0,ACT,ATC CTC TTC,LFI,H,HTC,ATC CTC TTC,y,,, +GTC,V,1,ACG,GAC GCC GGC,ADG,V,GVC,GAC GCC GGC,y,,, +GTC,V,2,,,,C,GTC,,,TRUE,GTT, +GTG,V,0,ACT,ATG CTG TTG,LM,H,HTG,ATG TTG,SLIM,,, +GTG,V,1,ACG,GAG GCG GGG,GAE,V,GVG,GAG GCG GGG,y,,, +GTG,V,2,,,,G,GTG,,,TRUE,GTT, +GTT,V,0,ACT,ATT CTT TTT,LFI,H,HTT,ATT CTT TTT,y,,, +GTT,V,1,ACG,GAT GCT GGT,ADG,V,GVT,GAT GCT GGT,y,,, +GTT,V,2,,,,T,GTT,,,TRUE,GTA, +TAA,*,0,ACG,AAA CAA GAA,QEK,V,VAA,AAA CAA GAA,y,,, +TAA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,y,,, +TAA,*,2,CT,TAC TAT,Y,Y,TAY,TAT,pick one,TRUE,TAK, +TAC,Y,0,ACG,AAC CAC GAC,NHD,V,VAC,AAC CAC GAC,y,,, +TAC,Y,1,CGT,TCC TGC TTC,SFC,B,TBC,TCC TGC TTC,y,,, +TAC,Y,2,AG,TAA TAG,*,R,TAR,TAA,pick one,TRUE,TAW, +TAG,*,0,ACG,AAG CAG GAG,QEK,V,VAG,AAG CAG GAG,y,,, +TAG,*,1,CGT,TCG TGG TTG,LSW,B,TBG,TCG TGG TTG,y,,, +TAG,*,2,CT,TAC TAT,Y,Y,TAY,TAT,pick one,TRUE,TAW, +TAT,Y,0,ACG,AAT CAT GAT,NHD,V,VAT,AAT CAT GAT,y,,, +TAT,Y,1,CGT,TCT TGT TTT,SFC,B,TBT,TCT TGT TTT,y,,, +TAT,Y,2,AG,TAA TAG,*,R,TAR,TAA,pick one,TRUE,TAM, +TCA,S,0,ACG,ACA CCA GCA,TAP,V,VCA,ACA CCA GCA,y,,, +TCA,S,1,AGT,TAA TGA TTA,L*,D,TDA,TAA TTA,SLIM,,, +TCA,S,2,,,,A,TCA,,,TRUE,TCT, +TCC,S,0,ACG,ACC CCC GCC,TAP,V,VCC,ACC CCC GCC,y,,, +TCC,S,1,AGT,TAC TGC TTC,FCY,D,TDC,TAC TGC TTC,y,,, +TCC,S,2,,,,C,TCC,,,TRUE,TCT, +TCG,S,0,ACG,ACG CCG GCG,TAP,V,VCG,ACG CCG GCG,y,,, +TCG,S,1,AGT,TAG TGG TTG,L*W,D,TDG,TAG TGG TTG,y,,, +TCG,S,2,,,,G,TCG,,,TRUE,TCT, +TCT,S,0,ACG,ACT CCT GCT,TAP,V,VCT,ACT CCT GCT,y,,, +TCT,S,1,AGT,TAT TGT TTT,FCY,D,TDT,TAT TGT TTT,y,,, +TCT,S,2,,,,T,TCT,,,TRUE,TCA, +TGA,*,0,ACG,AGA CGA GGA,RG,V,VGA,AGA GGA,SLIM,,, +TGA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,y,TRUE,THA,"Oddball, second position synonymous" +TGA,*,2,CGT,TGC TGG 
TGT,CW,B,TGB,TGT TGG,SLIM,,, +TGC,C,0,ACG,AGC CGC GGC,RSG,V,VGC,CGC GGC,CUT S,,, +TGC,C,1,ACT,TAC TCC TTC,SFY,H,THC,TAC TCC TTC,y,,, +TGC,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,y,TRUE,TGD, +TGG,W,0,ACG,AGG CGG GGG,RG,V,VGG,AGG GGG,SLIM,,, +TGG,W,1,ACT,TAG TCG TTG,L*S,H,THG,TAG TCG TTG,y,,, +TGG,W,2,ACT,TGA TGC TGT,*C,H,TGH,TGA TGC,cut *,TRUE,TGB,Trp has no synonymous codons +TGT,C,0,ACG,AGT CGT GGT,RSG,V,VGT,CGT GGT,CUT S,,, +TGT,C,1,ACT,TAT TCT TTT,SFY,H,THT,TAT TCT TTT,y,,, +TGT,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,y,TRUE,TGV, +TTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,y,,, +TTA,L,1,ACG,TAA TCA TGA,*S,V,TVA,TAA TCA,SLIM *,,, +TTA,L,2,CT,TTC TTT,F,Y,TTY,TTT,SLIM,TRUE,TTK, +TTC,F,0,ACG,ATC CTC GTC,LIV,V,VTC,ATC GTC,remove L,,, +TTC,F,1,ACG,TAC TCC TGC,SCY,V,TVC,TAC TCC TGC,y,,, +TTC,F,2,AG,TTA TTG,L,R,TTR,TTG,SLIM L,TRUE,TTK, +TTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,y,,, +TTG,L,1,ACG,TAG TCG TGG,*SW,V,TVG,TAG TCG TGG,y,,, +TTG,L,2,CT,TTC TTT,F,Y,TTY,TTT,SLIM,TRUE,TTW, +TTT,F,0,ACG,ATT CTT GTT,LIV,V,VTT,ATT GTT,Remove L,,, +TTT,F,1,ACG,TAT TCT TGT,SCY,V,TVT,TAT TCT TGT,y,,, +TTT,F,2,AG,TTA TTG,L,R,TTR,TTG,SLIM L,TRUE,TTS, \ No newline at end of file diff --git a/main_package/data/final_codon_table.csv b/main_package/data/final_codon_table.csv new file mode 100644 index 0000000..c183f47 --- /dev/null +++ b/main_package/data/final_codon_table.csv @@ -0,0 +1,193 @@ +codon,aa,position,missense_nuc,missense_codons,missense_aa,missense_iupac,missense_iupac_codon,sele_codons,sele_aa,sele_iupac_codon,syn_bool,syn_codons,syn_aa,syn_iupac_codon,no_stop_codons,no_stop_aa,no_stop_iupac_codon,no_stop_syn_codons,no_stop_syn_aa,no_stop_syn_iupac_codon +AAA,K,0,CGT,CAA GAA TAA,*QE,B,BAA,CAA GAA TAA,QE*,BAA,False,,,BAA,CAA GAA,QE,SAA,CAA GAA,QE,SAA +AAA,K,1,CGT,ACA AGA ATA,TRI,B,ABA,ACA AGA ATA,TRI,ABA,False,,,ABA,ACA AGA ATA,TRI,ABA,ACA AGA ATA,TRI,ABA +AAA,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAG AAT,KN,AAK,AAT,N,AAT,AAG AAT,KN,AAK +AAC,N,0,CGT,CAC GAC TAC,HDY,B,BAC,CAC GAC TAC,HDY,BAC,False,,,BAC,CAC 
GAC TAC,HDY,BAC,CAC GAC TAC,HDY,BAC +AAC,N,1,CGT,ACC AGC ATC,TSI,B,ABC,ACC AGC ATC,TSI,ABC,False,,,ABC,ACC AGC ATC,TSI,ABC,ACC AGC ATC,TSI,ABC +AAC,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAT,KN,AAW,AAA,K,AAA,AAA AAT,KN,AAW +AAG,K,0,CGT,CAG GAG TAG,*QE,B,BAG,CAG GAG TAG,QE*,BAG,False,,,BAG,CAG GAG,QE,SAG,CAG GAG,QE,SAG +AAG,K,1,CGT,ACG AGG ATG,TRM,B,ABG,ACG AGG ATG,TRM,ABG,False,,,ABG,ACG AGG ATG,TRM,ABG,ACG AGG ATG,TRM,ABG +AAG,K,2,CT,AAC AAT,N,Y,AAY,AAT,N,AAT,True,AAA AAT,KN,AAW,AAT,N,AAT,AAA AAT,KN,AAW +AAT,N,0,CGT,CAT GAT TAT,HDY,B,BAT,CAT GAT TAT,HDY,BAT,False,,,BAT,CAT GAT TAT,HDY,BAT,CAT GAT TAT,HDY,BAT +AAT,N,1,CGT,ACT AGT ATT,TSI,B,ABT,ACT AGT ATT,TSI,ABT,False,,,ABT,ACT AGT ATT,TSI,ABT,ACT AGT ATT,TSI,ABT +AAT,N,2,AG,AAA AAG,K,R,AAR,AAA,K,AAA,True,AAA AAC,KN,AAM,AAA,K,AAA,AAA AAC,KN,AAM +ACA,T,0,CGT,CCA GCA TCA,SAP,B,BCA,CCA GCA TCA,PAS,BCA,False,,,BCA,CCA GCA TCA,PAS,BCA,CCA GCA TCA,PAS,BCA +ACA,T,1,AGT,AAA AGA ATA,RIK,D,ADA,AAA AGA ATA,KRI,ADA,False,,,ADA,AAA AGA ATA,KRI,ADA,AAA AGA ATA,KRI,ADA +ACA,T,2,,,,A,ACA,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACC,T,0,CGT,CCC GCC TCC,SAP,B,BCC,CCC GCC TCC,PAS,BCC,False,,,BCC,CCC GCC TCC,PAS,BCC,CCC GCC TCC,PAS,BCC +ACC,T,1,AGT,AAC AGC ATC,NSI,D,ADC,AAC ATC,NI,AWC,False,,,AWC,AAC ATC,NI,AWC,AAC ATC,NI,AWC +ACC,T,2,,,,C,ACC,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACG,T,0,CGT,CCG GCG TCG,SAP,B,BCG,CCG GCG TCG,PAS,BCG,False,,,BCG,CCG GCG TCG,PAS,BCG,CCG GCG TCG,PAS,BCG +ACG,T,1,AGT,AAG AGG ATG,RMK,D,ADG,AAG AGG ATG,KRM,ADG,False,,,ADG,AAG AGG ATG,KRM,ADG,AAG AGG ATG,KRM,ADG +ACG,T,2,,,,G,ACG,,,,True,ACT,T,ACT,,,,ACT,T,ACT +ACT,T,0,CGT,CCT GCT TCT,SAP,B,BCT,CCT GCT TCT,PAS,BCT,False,,,BCT,CCT GCT TCT,PAS,BCT,CCT GCT TCT,PAS,BCT +ACT,T,1,AGT,AAT AGT ATT,NSI,D,ADT,AAT ATT,NI,AWT,False,,,AWT,AAT ATT,NI,AWT,AAT ATT,NI,AWT +ACT,T,2,,,,T,ACT,,,,True,ACA,T,ACA,,,,ACA,T,ACA +AGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,False,,,KGA,GGA,G,GGA,GGA,G,GGA +AGA,R,1,ACT,AAA ACA ATA,TIK,H,AHA,AAA ACA ATA,KTI,AHA,False,,,AHA,AAA ACA ATA,KTI,AHA,AAA 
ACA ATA,KTI,AHA +AGA,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGG AGT,RS,AGK,AGT,S,AGT,AGG AGT,RS,AGK +AGC,S,0,CGT,CGC GGC TGC,RCG,B,BGC,GGC TGC,GC,KGC,False,,,KGC,GGC TGC,GC,KGC,GGC TGC,GC,KGC +AGC,S,1,ACT,AAC ACC ATC,NTI,H,AHC,AAC ACC ATC,NTI,AHC,False,,,AHC,AAC ACC ATC,NTI,AHC,AAC ACC ATC,NTI,AHC +AGC,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGT,RS,AGW,AGA,R,AGA,AGA AGT,RS,AGW +AGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,False,,,KGG,GGG TGG,GW,KGG,GGG TGG,GW,KGG +AGG,R,1,ACT,AAG ACG ATG,TMK,H,AHG,AAG ACG ATG,KTM,AHG,False,,,AHG,AAG ACG ATG,KTM,AHG,AAG ACG ATG,KTM,AHG +AGG,R,2,CT,AGC AGT,S,Y,AGY,AGT,S,AGT,True,AGA AGT,RS,AGW,AGT,S,AGT,AGA AGT,RS,AGW +AGT,S,0,CGT,CGT GGT TGT,RCG,B,BGT,GGT TGT,GC,KGT,False,,,KGT,GGT TGT,GC,KGT,GGT TGT,GC,KGT +AGT,S,1,ACT,AAT ACT ATT,NTI,H,AHT,AAT ACT ATT,NTI,AHT,False,,,AHT,AAT ACT ATT,NTI,AHT,AAT ACT ATT,NTI,AHT +AGT,S,2,AG,AGA AGG,R,R,AGR,AGA,R,AGA,True,AGA AGC,RS,AGM,AGA,R,AGA,AGA AGC,RS,AGM +ATA,I,0,CGT,CTA GTA TTA,LV,B,BTA,TTA GTA,LV,KTA,False,,,KTA,TTA GTA,LV,KTA,TTA GTA,LV,KTA +ATA,I,1,ACG,AAA ACA AGA,TRK,V,AVA,AAA ACA AGA,KTR,AVA,False,,,AVA,AAA ACA AGA,KTR,AVA,AAA ACA AGA,KTR,AVA +ATA,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATC,I,0,CGT,CTC GTC TTC,LFV,B,BTC,CTC GTC TTC,LVF,BTC,False,,,BTC,CTC GTC TTC,LVF,BTC,CTC GTC TTC,LVF,BTC +ATC,I,1,ACG,AAC ACC AGC,NST,V,AVC,AAC ACC AGC,NTS,AVC,False,,,AVC,AAC ACC AGC,NTS,AVC,AAC ACC AGC,NTS,AVC +ATC,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATG ATT,MI,ATK,ATG,M,ATG,ATG ATT,MI,ATK +ATG,M,0,CGT,CTG GTG TTG,LV,B,BTG,TTA GTA,LV,KTG,False,,,KTG,TTA GTA,LV,KTG,TTA GTA,LV,KTG +ATG,M,1,ACG,AAG ACG AGG,TRK,V,AVG,AAG ACG AGG,KTR,AVG,False,,,AVG,AAG ACG AGG,KTR,AVG,AAG ACG AGG,KTR,AVG +ATG,M,2,ACT,ATA ATC ATT,I,H,ATH,ATT,I,ATT,True,ATG ATT,MI,ATK,ATT,I,ATT,ATG ATT,MI,ATK +ATT,I,0,CGT,CTT GTT TTT,LFV,B,BTT,CTT GTT TTT,LVF,BTT,False,,,BTT,CTT GTT TTT,LVF,BTT,CTT GTT TTT,LVF,BTT +ATT,I,1,ACG,AAT ACT AGT,NST,V,AVT,AAT ACT AGT,NTS,AVT,False,,,AVT,AAT ACT AGT,NTS,AVT,AAT ACT 
AGT,NTS,AVT +ATT,I,2,G,ATG,M,G,ATG,ATG,M,ATG,True,ATA ATG,IM,ATR,ATG,M,ATG,ATA ATG,IM,ATR +CAA,Q,0,AGT,AAA GAA TAA,*EK,D,DAA,AAA GAA TAA,KE*,DAA,False,,,DAA,AAA GAA,KE,RAA,AAA GAA,KE,RAA +CAA,Q,1,CGT,CCA CGA CTA,RLP,B,CBA,CCA CGA CTA,PRL,CBA,False,,,CBA,CCA CGA CTA,PRL,CBA,CCA CGA CTA,PRL,CBA +CAA,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAG CAT,QH,CAK,CAT,H,CAT,CAG CAT,QH,CAK +CAC,H,0,AGT,AAC GAC TAC,NDY,D,DAC,AAC GAC TAC,NDY,DAC,False,,,DAC,AAC GAC TAC,NDY,DAC,AAC GAC TAC,NDY,DAC +CAC,H,1,CGT,CCC CGC CTC,RLP,B,CBC,CCC CGC CTC,PRL,CBC,False,,,CBC,CCC CGC CTC,PRL,CBC,CCC CGC CTC,PRL,CBC +CAC,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAT,QH,CAW,CAA,Q,CAA,CAA CAT,QH,CAW +CAG,Q,0,AGT,AAG GAG TAG,*EK,D,DAG,AAG GAG TAG,KE*,DAG,False,,,DAG,AAG GAG,KE,RAG,AAG GAG,KE,RAG +CAG,Q,1,CGT,CCG CGG CTG,RLP,B,CBG,CCG CGG CTG,PRL,CBG,False,,,CBG,CCG CGG CTG,PRL,CBG,CCG CGG CTG,PRL,CBG +CAG,Q,2,CT,CAC CAT,H,Y,CAY,CAT,H,CAT,True,CAA CAT,QH,CAW,CAT,H,CAT,CAA CAT,QH,CAW +CAT,H,0,AGT,AAT GAT TAT,NDY,D,DAT,AAT GAT TAT,NDY,DAT,False,,,DAT,AAT GAT TAT,NDY,DAT,AAT GAT TAT,NDY,DAT +CAT,H,1,CGT,CCT CGT CTT,RLP,B,CBT,CCT CGT CTT,PRL,CBT,False,,,CBT,CCT CGT CTT,PRL,CBT,CCT CGT CTT,PRL,CBT +CAT,H,2,AG,CAA CAG,Q,R,CAR,CAA,Q,CAA,True,CAA CAC,QH,CAM,CAA,Q,CAA,CAA CAC,QH,CAM +CCA,P,0,AGT,ACA GCA TCA,TSA,D,DCA,ACA GCA TCA,TAS,DCA,False,,,DCA,ACA GCA TCA,TAS,DCA,ACA GCA TCA,TAS,DCA +CCA,P,1,AGT,CAA CGA CTA,RQL,D,CDA,CAA CGA CTA,QRL,CDA,False,,,CDA,CAA CGA CTA,QRL,CDA,CAA CGA CTA,QRL,CDA +CCA,P,2,,,,A,CCA,,,,True,CCT,P,CCT,,,,CCT,P,CCT +CCC,P,0,AGT,ACC GCC TCC,TSA,D,DCC,ACC GCC TCC,TAS,DCC,False,,,DCC,ACC GCC TCC,TAS,DCC,ACC GCC TCC,TAS,DCC +CCC,P,1,AGT,CAC CGC CTC,RHL,D,CDC,CAC CGC CTC,HRL,CDC,False,,,CDC,CAC CGC CTC,HRL,CDC,CAC CGC CTC,HRL,CDC +CCC,P,2,,,,C,CCC,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCG,P,0,AGT,ACG GCG TCG,TSA,D,DCG,ACG GCG TCG,TAS,DCG,False,,,DCG,ACG GCG TCG,TAS,DCG,ACG GCG TCG,TAS,DCG +CCG,P,1,AGT,CAG CGG CTG,RQL,D,CDG,CAG CGG CTG,QRL,CDG,False,,,CDG,CAG CGG CTG,QRL,CDG,CAG CGG CTG,QRL,CDG 
+CCG,P,2,,,,G,CCG,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CCT,P,0,AGT,ACT GCT TCT,TSA,D,DCT,ACT GCT TCT,TAS,DCT,False,,,DCT,ACT GCT TCT,TAS,DCT,ACT GCT TCT,TAS,DCT +CCT,P,1,AGT,CAT CGT CTT,RHL,D,CDT,CAT CGT CTT,HRL,CDT,False,,,CDT,CAT CGT CTT,HRL,CDT,CAT CGT CTT,HRL,CDT +CCT,P,2,,,,T,CCT,,,,True,CCA,P,CCA,,,,CCA,P,CCA +CGA,R,0,GT,GGA TGA,*G,K,KGA,GGA TGA,G*,KGA,True,AGA GGA TGA,RG*,DGA,GGA,G,GGA,AGA GGA,RG,RGA +CGA,R,1,ACT,CAA CCA CTA,LQP,H,CHA,CAA CCA CTA,QPL,CHA,False,,,CHA,CAA CCA CTA,QPL,CHA,CAA CCA CTA,QPL,CHA +CGA,R,2,,,,A,CGA,,,,False,,,,,,,,, +CGC,R,0,AGT,AGC GGC TGC,SCG,D,DGC,AGC GGC TGC,SGC,DGC,False,,,DGC,AGC GGC TGC,SGC,DGC,AGC GGC TGC,SGC,DGC +CGC,R,1,ACT,CAC CCC CTC,LHP,H,CHC,CAC CCC CTC,HPL,CHC,False,,,CHC,CAC CCC CTC,HPL,CHC,CAC CCC CTC,HPL,CHC +CGC,R,2,,,,C,CGC,,,,True,CGT,R,CGT,,,,CGT,R,CGT +CGG,R,0,GT,GGG TGG,WG,K,KGG,GGG TGG,GW,KGG,True,AGG GGG TGG,RGW,DGG,GGG TGG,GW,KGG,AGG GGG TGG,RGW,DGG +CGG,R,1,ACT,CAG CCG CTG,LQP,H,CHG,CAG CCG CTG,QPL,CHG,False,,,CHG,CAG CCG CTG,QPL,CHG,CAG CCG CTG,QPL,CHG +CGG,R,2,,,,G,CGG,,,,False,,,,,,,,, +CGT,R,0,AGT,AGT GGT TGT,SCG,D,DGT,AGT GGT TGT,SGC,DGT,False,,,DGT,AGT GGT TGT,SGC,DGT,AGT GGT TGT,SGC,DGT +CGT,R,1,ACT,CAT CCT CTT,LHP,H,CHT,CAT CCT CTT,HPL,CHT,False,,,CHT,CAT CCT CTT,HPL,CHT,CAT CCT CTT,HPL,CHT +CGT,R,2,,,,T,CGT,,,,True,CGA,R,CGA,,,,CGA,R,CGA +CTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,True,ATA GTA TTA,IVL,DTA,ATA GTA,IV,RTA,ATA GTA TTA,IVL,DTA +CTA,L,1,ACG,CAA CCA CGA,RQP,V,CVA,CAA CCA CGA,QPR,CVA,False,,,CVA,CAA CCA CGA,QPR,CVA,CAA CCA CGA,QPR,CVA +CTA,L,2,,,,A,CTA,,,,False,,,,,,,,, +CTC,L,0,AGT,ATC GTC TTC,FIV,D,DTC,ATC GTC TTC,IVF,DTC,False,,,DTC,ATC GTC TTC,IVF,DTC,ATC GTC TTC,IVF,DTC +CTC,L,1,ACG,CAC CCC CGC,RHP,V,CVC,CAC CCC CGC,HPR,CVC,False,,,CVC,CAC CCC CGC,HPR,CVC,CAC CCC CGC,HPR,CVC +CTC,L,2,,,,C,CTC,,,,True,CTA,L,CTA,,,,CTA,L,CTA +CTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,True,ATG GTG TTG,MVL,DTG,ATG GTG,MV,RTG,ATG GTG TTG,MVL,DTG +CTG,L,1,ACG,CAG CCG CGG,RQP,V,CVG,CAG CCG 
CGG,QPR,CVG,False,,,CVG,CAG CCG CGG,QPR,CVG,CAG CCG CGG,QPR,CVG +CTG,L,2,,,,G,CTG,,,,False,,,,,,,,, +CTT,L,0,AGT,ATT GTT TTT,FIV,D,DTT,ATT GTT TTT,IVF,DTT,False,,,DTT,ATT GTT TTT,IVF,DTT,ATT GTT TTT,IVF,DTT +CTT,L,1,ACG,CAT CCT CGT,RHP,V,CVT,CAT CCT CGT,HPR,CVT,False,,,CVT,CAT CCT CGT,HPR,CVT,CAT CCT CGT,HPR,CVT +CTT,L,2,,,,T,CTT,,,,True,CTA,L,CTA,,,,CTA,L,CTA +GAA,E,0,ACT,AAA CAA TAA,*QK,H,HAA,AAA CAA TAA,KQ*,HAA,False,,,HAA,AAA CAA,KQ,MAA,AAA CAA,KQ,MAA +GAA,E,1,CGT,GCA GGA GTA,VAG,B,GBA,GCA GGA GTA,AGV,GBA,False,,,GBA,GCA GGA GTA,AGV,GBA,GCA GGA GTA,AGV,GBA +GAA,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAG GAT,ED,GAK,GAT,D,GAT,GAG GAT,ED,GAK +GAC,D,0,ACT,AAC CAC TAC,NHY,H,HAC,AAC CAC TAC,NHY,HAC,False,,,HAC,AAC CAC TAC,NHY,HAC,AAC CAC TAC,NHY,HAC +GAC,D,1,CGT,GCC GGC GTC,VAG,B,GBC,GCC GGC GTC,AGV,GBC,False,,,GBC,GCC GGC GTC,AGV,GBC,GCC GGC GTC,AGV,GBC +GAC,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAT,ED,GAW,GAA,E,GAA,GAA GAT,ED,GAW +GAG,E,0,ACT,AAG CAG TAG,*QK,H,HAG,AAG CAG TAG,KQ*,HAG,False,,,HAG,AAG CAG,KQ,MAG,AAG CAG,KQ,MAG +GAG,E,1,CGT,GCG GGG GTG,VAG,B,GBG,GCG GGG GTG,AGV,GBG,False,,,GBG,GCG GGG GTG,AGV,GBG,GCG GGG GTG,AGV,GBG +GAG,E,2,CT,GAC GAT,D,Y,GAY,GAT,D,GAT,True,GAA GAT,ED,GAW,GAT,D,GAT,GAA GAT,ED,GAW +GAT,D,0,ACT,AAT CAT TAT,NHY,H,HAT,AAT CAT TAT,NHY,HAT,False,,,HAT,AAT CAT TAT,NHY,HAT,AAT CAT TAT,NHY,HAT +GAT,D,1,CGT,GCT GGT GTT,VAG,B,GBT,GCT GGT GTT,AGV,GBT,False,,,GBT,GCT GGT GTT,AGV,GBT,GCT GGT GTT,AGV,GBT +GAT,D,2,AG,GAA GAG,E,R,GAR,GAA,E,GAA,True,GAA GAC,ED,GAM,GAA,E,GAA,GAA GAC,ED,GAM +GCA,A,0,ACT,ACA CCA TCA,TSP,H,HCA,ACA CCA TCA,TPS,HCA,False,,,HCA,ACA CCA TCA,TPS,HCA,ACA CCA TCA,TPS,HCA +GCA,A,1,AGT,GAA GGA GTA,GVE,D,GDA,GAA GGA GTA,EGV,GDA,False,,,GDA,GAA GGA GTA,EGV,GDA,GAA GGA GTA,EGV,GDA +GCA,A,2,,,,A,GCA,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCC,A,0,ACT,ACC CCC TCC,TSP,H,HCC,ACC CCC TCC,TPS,HCC,False,,,HCC,ACC CCC TCC,TPS,HCC,ACC CCC TCC,TPS,HCC +GCC,A,1,AGT,GAC GGC GTC,VDG,D,GDC,GAC GGC GTC,DGV,GDC,False,,,GDC,GAC GGC GTC,DGV,GDC,GAC GGC 
GTC,DGV,GDC +GCC,A,2,,,,C,GCC,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCG,A,0,ACT,ACG CCG TCG,TSP,H,HCG,ACG CCG TCG,TPS,HCG,False,,,HCG,ACG CCG TCG,TPS,HCG,ACG CCG TCG,TPS,HCG +GCG,A,1,AGT,GAG GGG GTG,GVE,D,GDG,GAG GGG GTG,EGV,GDG,False,,,GDG,GAG GGG GTG,EGV,GDG,GAG GGG GTG,EGV,GDG +GCG,A,2,,,,G,GCG,,,,True,GCT,A,GCT,,,,GCT,A,GCT +GCT,A,0,ACT,ACT CCT TCT,TSP,H,HCT,ACT CCT TCT,TPS,HCT,False,,,HCT,ACT CCT TCT,TPS,HCT,ACT CCT TCT,TPS,HCT +GCT,A,1,AGT,GAT GGT GTT,VDG,D,GDT,GAT GGT GTT,DGV,GDT,False,,,GDT,GAT GGT GTT,DGV,GDT,GAT GGT GTT,DGV,GDT +GCT,A,2,,,,T,GCT,,,,True,GCA,A,GCA,,,,GCA,A,GCA +GGA,G,0,ACT,AGA CGA TGA,R*,H,HGA,AGA TGA,R*,WGA,False,,,WGA,AGA,R,AGA,AGA,R,AGA +GGA,G,1,ACT,GAA GCA GTA,VAE,H,GHA,GAA GCA GTA,EAV,GHA,False,,,GHA,GAA GCA GTA,EAV,GHA,GAA GCA GTA,EAV,GHA +GGA,G,2,,,,A,GGA,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGC,G,0,ACT,AGC CGC TGC,RSC,H,HGC,AGC CGC TGC,SRC,HGC,False,,,HGC,AGC CGC TGC,SRC,HGC,AGC CGC TGC,SRC,HGC +GGC,G,1,ACT,GAC GCC GTC,ADV,H,GHC,GAC GCC GTC,DAV,GHC,False,,,GHC,GAC GCC GTC,DAV,GHC,GAC GCC GTC,DAV,GHC +GGC,G,2,,,,C,GGC,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGG,G,0,ACT,AGG CGG TGG,RW,H,HGG,AGG TGG,RW,WGG,False,,,WGG,AGG TGG,RW,WGG,AGG TGG,RW,WGG +GGG,G,1,ACT,GAG GCG GTG,VAE,H,GHG,GAG GCG GTG,EAV,GHG,False,,,GHG,GAG GCG GTG,EAV,GHG,GAG GCG GTG,EAV,GHG +GGG,G,2,,,,G,GGG,,,,True,GGT,G,GGT,,,,GGT,G,GGT +GGT,G,0,ACT,AGT CGT TGT,RSC,H,HGT,AGT CGT TGT,SRC,HGT,False,,,HGT,AGT CGT TGT,SRC,HGT,AGT CGT TGT,SRC,HGT +GGT,G,1,ACT,GAT GCT GTT,ADV,H,GHT,GAT GCT GTT,DAV,GHT,False,,,GHT,GAT GCT GTT,DAV,GHT,GAT GCT GTT,DAV,GHT +GGT,G,2,,,,T,GGT,,,,True,GGA,G,GGA,,,,GGA,G,GGA +GTA,V,0,ACT,ATA CTA TTA,LI,H,HTA,ATA TTA,IL,WTA,False,,,WTA,ATA TTA,IL,WTA,ATA TTA,IL,WTA +GTA,V,1,ACG,GAA GCA GGA,GAE,V,GVA,GAA GCA GGA,EAG,GVA,False,,,GVA,GAA GCA GGA,EAG,GVA,GAA GCA GGA,EAG,GVA +GTA,V,2,,,,A,GTA,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTC,V,0,ACT,ATC CTC TTC,LFI,H,HTC,ATC CTC TTC,ILF,HTC,False,,,HTC,ATC CTC TTC,ILF,HTC,ATC CTC TTC,ILF,HTC +GTC,V,1,ACG,GAC GCC GGC,ADG,V,GVC,GAC GCC 
GGC,DAG,GVC,False,,,GVC,GAC GCC GGC,DAG,GVC,GAC GCC GGC,DAG,GVC +GTC,V,2,,,,C,GTC,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTG,V,0,ACT,ATG CTG TTG,LM,H,HTG,ATG TTG,ML,WTG,False,,,WTG,ATG TTG,ML,WTG,ATG TTG,ML,WTG +GTG,V,1,ACG,GAG GCG GGG,GAE,V,GVG,GAG GCG GGG,EAG,GVG,False,,,GVG,GAG GCG GGG,EAG,GVG,GAG GCG GGG,EAG,GVG +GTG,V,2,,,,G,GTG,,,,True,GTT,V,GTT,,,,GTT,V,GTT +GTT,V,0,ACT,ATT CTT TTT,LFI,H,HTT,ATT CTT TTT,ILF,HTT,False,,,HTT,ATT CTT TTT,ILF,HTT,ATT CTT TTT,ILF,HTT +GTT,V,1,ACG,GAT GCT GGT,ADG,V,GVT,GAT GCT GGT,DAG,GVT,False,,,GVT,GAT GCT GGT,DAG,GVT,GAT GCT GGT,DAG,GVT +GTT,V,2,,,,T,GTT,,,,True,GTA,V,GTA,,,,GTA,V,GTA +TAA,*,0,ACG,AAA CAA GAA,QEK,V,VAA,AAA CAA GAA,KQE,VAA,False,,,VAA,AAA CAA GAA,KQE,VAA,AAA CAA GAA,KQE,VAA +TAA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,False,,,TYA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TAA,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAG TAT,*Y,TAK,TAT,Y,TAT,TAT,Y,TAT +TAC,Y,0,ACG,AAC CAC GAC,NHD,V,VAC,AAC CAC GAC,NHD,VAC,False,,,VAC,AAC CAC GAC,NHD,VAC,AAC CAC GAC,NHD,VAC +TAC,Y,1,CGT,TCC TGC TTC,SFC,B,TBC,TCC TGC TTC,SCF,TBC,False,,,TBC,TCC TGC TTC,SCF,TBC,TCC TGC TTC,SCF,TBC +TAC,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAT,*Y,TAW,,,,TAT,Y,TAT +TAG,*,0,ACG,AAG CAG GAG,QEK,V,VAG,AAG CAG GAG,KQE,VAG,False,,,VAG,AAG CAG GAG,KQE,VAG,AAG CAG GAG,KQE,VAG +TAG,*,1,CGT,TCG TGG TTG,LSW,B,TBG,TCG TGG TTG,SWL,TBG,False,,,TBG,TCG TGG TTG,SWL,TBG,TCG TGG TTG,SWL,TBG +TAG,*,2,CT,TAC TAT,Y,Y,TAY,TAT,Y,TAT,True,TAA TAT,*Y,TAW,TAT,Y,TAT,TAT,Y,TAT +TAT,Y,0,ACG,AAT CAT GAT,NHD,V,VAT,AAT CAT GAT,NHD,VAT,False,,,VAT,AAT CAT GAT,NHD,VAT,AAT CAT GAT,NHD,VAT +TAT,Y,1,CGT,TCT TGT TTT,SFC,B,TBT,TCT TGT TTT,SCF,TBT,False,,,TBT,TCT TGT TTT,SCF,TBT,TCT TGT TTT,SCF,TBT +TAT,Y,2,AG,TAA TAG,*,R,TAR,TAA,*,TAA,True,TAA TAC,*Y,TAM,,,,TAC,Y,TAC +TCA,S,0,ACG,ACA CCA GCA,TAP,V,VCA,ACA CCA GCA,TPA,VCA,False,,,VCA,ACA CCA GCA,TPA,VCA,ACA CCA GCA,TPA,VCA +TCA,S,1,AGT,TAA TGA TTA,L*,D,TDA,TAA TTA,*L,TWA,False,,,TWA,TTA,L,TTA,TTA,L,TTA +TCA,S,2,,,,A,TCA,,,,True,TCT,S,TCT,,,,TCT,S,TCT 
+TCC,S,0,ACG,ACC CCC GCC,TAP,V,VCC,ACC CCC GCC,TPA,VCC,False,,,VCC,ACC CCC GCC,TPA,VCC,ACC CCC GCC,TPA,VCC +TCC,S,1,AGT,TAC TGC TTC,FCY,D,TDC,TAC TGC TTC,YCF,TDC,False,,,TDC,TAC TGC TTC,YCF,TDC,TAC TGC TTC,YCF,TDC +TCC,S,2,,,,C,TCC,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCG,S,0,ACG,ACG CCG GCG,TAP,V,VCG,ACG CCG GCG,TPA,VCG,False,,,VCG,ACG CCG GCG,TPA,VCG,ACG CCG GCG,TPA,VCG +TCG,S,1,AGT,TAG TGG TTG,L*W,D,TDG,TAG TGG TTG,*WL,TDG,False,,,TDG,TGG TTG,WL,TKG,TGG TTG,WL,TKG +TCG,S,2,,,,G,TCG,,,,True,TCT,S,TCT,,,,TCT,S,TCT +TCT,S,0,ACG,ACT CCT GCT,TAP,V,VCT,ACT CCT GCT,TPA,VCT,False,,,VCT,ACT CCT GCT,TPA,VCT,ACT CCT GCT,TPA,VCT +TCT,S,1,AGT,TAT TGT TTT,FCY,D,TDT,TAT TGT TTT,YCF,TDT,False,,,TDT,TAT TGT TTT,YCF,TDT,TAT TGT TTT,YCF,TDT +TCT,S,2,,,,T,TCT,,,,True,TCA,S,TCA,,,,TCA,S,TCA +TGA,*,0,ACG,AGA CGA GGA,RG,V,VGA,AGA GGA,RG,RGA,False,,,RGA,AGA GGA,RG,RGA,AGA GGA,RG,RGA +TGA,*,1,CT,TCA TTA,LS,Y,TYA,TCA TTA,SL,TYA,True,TAA TCA TTA,*SL,THA,TCA TTA,SL,TYA,TCA TTA,SL,TYA +TGA,*,2,CGT,TGC TGG TGT,CW,B,TGB,TGT TGG,CW,TGK,False,,,TGK,TGT TGG,CW,TGK,TGT TGG,CW,TGK +TGC,C,0,ACG,AGC CGC GGC,RSG,V,VGC,CGC GGC,RG,SGC,False,,,SGC,CGC GGC,RG,SGC,CGC GGC,RG,SGC +TGC,C,1,ACT,TAC TCC TTC,SFY,H,THC,TAC TCC TTC,YSF,THC,False,,,THC,TAC TCC TTC,YSF,THC,TAC TCC TTC,YSF,THC +TGC,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGG TGT,*WC,TGD,TGG,W,TGG,TGG TGT,WC,TGK +TGG,W,0,ACG,AGG CGG GGG,RG,V,VGG,AGG GGG,RG,RGG,False,,,RGG,AGG GGG,RG,RGG,AGG GGG,RG,RGG +TGG,W,1,ACT,TAG TCG TTG,L*S,H,THG,TAG TCG TTG,*SL,THG,False,,,THG,TCG TTG,SL,TYG,TCG TTG,SL,TYG +TGG,W,2,ACT,TGA TGC TGT,*C,H,TGH,TGA TGC,*C,TGM,True,TGC TGG TGT,CWC,TGB,TGC,C,TGC,TGC TGG TGT,CWC,TGB +TGT,C,0,ACG,AGT CGT GGT,RSG,V,VGT,CGT GGT,RG,SGT,False,,,SGT,CGT GGT,RG,SGT,CGT GGT,RG,SGT +TGT,C,1,ACT,TAT TCT TTT,SFY,H,THT,TAT TCT TTT,YSF,THT,False,,,THT,TAT TCT TTT,YSF,THT,TAT TCT TTT,YSF,THT +TGT,C,2,AG,TGA TGG,*W,R,TGR,TGA TGG,*W,TGR,True,TGA TGC TGG,*CW,TGV,TGG,W,TGG,TGC TGG,CW,TGS +TTA,L,0,AG,ATA GTA,IV,R,RTA,ATA GTA,IV,RTA,False,,,RTA,ATA 
GTA,IV,RTA,ATA GTA,IV,RTA +TTA,L,1,ACG,TAA TCA TGA,*S,V,TVA,TAA TCA,*S,TMA,False,,,TMA,TCA,S,TCA,TCA,S,TCA +TTA,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTG TTT,LF,TTK,TTT,F,TTT,TTG TTT,LF,TTK +TTC,F,0,ACG,ATC CTC GTC,LIV,V,VTC,ATC GTC,IV,RTC,False,,,RTC,ATC GTC,IV,RTC,ATC GTC,IV,RTC +TTC,F,1,ACG,TAC TCC TGC,SCY,V,TVC,TAC TCC TGC,YSC,TVC,False,,,TVC,TAC TCC TGC,YSC,TVC,TAC TCC TGC,YSC,TVC +TTC,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTG TTT,LF,TTK,TTG,L,TTG,TTG TTT,LF,TTK +TTG,L,0,AG,ATG GTG,VM,R,RTG,ATG GTG,MV,RTG,False,,,RTG,ATG GTG,MV,RTG,ATG GTG,MV,RTG +TTG,L,1,ACG,TAG TCG TGG,*SW,V,TVG,TAG TCG TGG,*SW,TVG,False,,,TVG,TCG TGG,SW,TSG,TCG TGG,SW,TSG +TTG,L,2,CT,TTC TTT,F,Y,TTY,TTT,F,TTT,True,TTA TTT,LF,TTW,TTT,F,TTT,TTA TTT,LF,TTW +TTT,F,0,ACG,ATT CTT GTT,LIV,V,VTT,ATT GTT,IV,RTT,False,,,RTT,ATT GTT,IV,RTT,ATT GTT,IV,RTT +TTT,F,1,ACG,TAT TCT TGT,SCY,V,TVT,TAT TCT TGT,YSC,TVT,False,,,TVT,TAT TCT TGT,YSC,TVT,TAT TCT TGT,YSC,TVT +TTT,F,2,AG,TTA TTG,L,R,TTR,TTG,L,TTG,True,TTC TTG,FL,TTS,TTG,L,TTG,TTC TTG,FL,TTS diff --git a/main_package/data/yeast_synonymous_codon_table.csv b/main_package/data/yeast_synonymous_codon_table.csv new file mode 100644 index 0000000..0918663 --- /dev/null +++ b/main_package/data/yeast_synonymous_codon_table.csv @@ -0,0 +1,65 @@ +codon,iupac +TTT,TTY +TTC,TTY +TTA,TTR +TTG,TTR +CTT,TTR +CTC,TTR +CTA,TTR +CTG,TTR +ATT,ATW +ATC,ATW +ATA,ATW +ATG,ATG +GTT,GTW +GTC,GTW +GTA,GTW +GTG,GTW +TCT,TCW +TCC,TCW +TCA,TCW +TCG,TCW +CCT,CCW +CCC,CCW +CCA,CCW +CCG,CCW +ACT,ACW +ACC,ACW +ACA,ACW +ACG,ACW +GCT,GCW +GCC,GCW +GCA,GCW +GCG,GCW +TAT,TAY +TAC,TAY +TAA,TRA +TAG,TRA +CAT,CAY +CAC,CAY +CAA,CAR +CAG,CAR +AAT,AAY +AAC,AAY +AAA,AAR +AAG,AAR +GAT,GAY +GAC,GAY +GAA,GAR +GAG,GAR +TGT,TGY +TGC,TGY +TGA,TRA +TGG,TGG +CGT,AGR +CGC,AGR +CGA,AGR +CGG,AGR +AGT,TCW +AGC,TCW +AGA,AGR +AGG,AGR +GGT,GGW +GGC,GGW +GGA,GGW +GGG,GGW \ No newline at end of file diff --git a/main_package/primer_design.ipynb b/main_package/primer_design.ipynb index 17ae998..55a9ecc 100644 
--- a/main_package/primer_design.ipynb
+++ b/main_package/primer_design.ipynb
@@ -12,13 +12,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 6,
    "id": "341a01a4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid syntax (816116968.py, line 3)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m File \u001b[0;32m\"/var/folders/fq/q1cflf795cgbnmbhdyn8d9nntyw5jt/T/ipykernel_42058/816116968.py\"\u001b[0;36m, line \u001b[0;32m3\u001b[0m\n\u001b[0;31m from . import main_package.codon_table\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
+     ]
+    }
+   ],
    "source": [
     "from Bio import SeqIO\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "from . import main_package.codon_table"
    ]
   },
   {
@@ -246,7 +256,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,
diff --git a/main_package/primer_design.py b/main_package/primer_design.py
new file mode 100644
index 0000000..5341531
--- /dev/null
+++ b/main_package/primer_design.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Primer design steps for a single mutagenesis sub-window.
+
+Every function reads its inputs from ``seq_data`` (sequence-wide state) and
+``data_dict`` (state of the current sub-window) and writes its results back
+into those dictionaries.
+"""
+from . import codon_table
+import pandas as pd
+from Bio.Seq import Seq
+from Bio.SeqUtils import MeltingTemp as mt
+
+def homology_arm(seq_data, data_dict, args):
+    """Store the ``args.homo_len`` bases directly upstream of ``start_index``.
+
+    Reads ``data_dict['start_index']`` and ``seq_data['vector_seq']``, sets
+    ``data_dict['homology_arm']`` and returns ``data_dict``.
+    """
+    start_index = data_dict['start_index']
+    vector_seq = seq_data['vector_seq']
+
+    # NOTE(review): if start_index < args.homo_len the slice start is negative
+    # and Python counts it from the end of the string -- confirm that windows
+    # never start that close to the beginning of the vector
+    data_dict['homology_arm'] = vector_seq[start_index - args.homo_len:start_index]
+
+    return data_dict
+
+def reverse_primer(seq_data, data_dict, args):
+    """Build the reverse primer that ends directly upstream of the sub-window.
+
+    The primer is a prefix of the reverse complement of
+    ``vector_seq[:start_index]``. It starts at 15 nt and grows by one base
+    until ``mt.Tm_NN`` reaches ``args.rev_melt_temp`` or the template is used
+    up. Sets ``data_dict['reverse_primer']`` and
+    ``data_dict['reverse_primer_name']`` and returns ``data_dict``.
+    """
+    sub_window_name = data_dict['sub_window_name']
+    start_index = data_dict['start_index']
+    vector_seq = seq_data['vector_seq']
+
+    reverse_seq = str(Seq(vector_seq[:start_index]).reverse_complement())
+    rev_primer = reverse_seq[:15]
+    # the length check ends the loop once the template is exhausted; without
+    # it the slice stops growing and the loop would never terminate
+    while mt.Tm_NN(rev_primer) < args.rev_melt_temp and len(rev_primer) < len(reverse_seq):
+        rev_primer = reverse_seq[:len(rev_primer) + 1]
+    data_dict['reverse_primer'] = rev_primer
+    data_dict['reverse_primer_name'] = f'rev_{sub_window_name}'
+
+    return data_dict
+
+def forward_primer(seq_data, data_dict, args):
+    """Choose the annealing part of the forward primer for the sub-window.
+
+    Sets ``data_dict['primer_start']`` (index where the annealing part begins)
+    and ``data_dict['forward_primer']`` and returns ``data_dict``.
+    """
+    start_index = data_dict['start_index']
+    window_end = data_dict['window_end']
+    vector_seq = seq_data['vector_seq']
+    seq_len = len(vector_seq)
+
+    # primer_end is not clamped to window_end. The original code carried a
+    # no-op comparison here (`primer_end == window_end`); it is dropped without
+    # changing behaviour. NOTE(review): confirm clamping is not wanted -- it
+    # would change how the last sub-window of a window is handled below.
+    primer_end = start_index + (args.oligo_len - args.homo_len)
+    primer_start = primer_end - 15
+    fwd_primer = vector_seq[primer_start:primer_end]
+
+    # extend towards the 5' end until the melting temperature is reached;
+    # stop at index 0 so the slice start can never become negative
+    while mt.Tm_NN(fwd_primer) < args.melt_temp and primer_start > 0:
+        primer_start -= 1
+        fwd_primer = vector_seq[primer_start:primer_end]
+
+    # check if the primer is the max oligo length
+    if len(fwd_primer) > (args.oligo_len - args.homo_len - 12): # 12 is a minimum window size of 4 codons
+        # fix mut window to 12, make a long primer
+        primer_start = start_index + 12
+        primer_end = primer_start + 15
+        while True:
+            fwd_primer = vector_seq[primer_start:primer_end]
+            gc_count = fwd_primer.upper().count('G') + fwd_primer.upper().count('C')
+            if mt.Tm_NN(fwd_primer) > args.melt_temp and gc_count > 8:
+                break
+            if primer_end >= seq_len:
+                # no template left to extend into: keep the longest primer
+                break
+            primer_end += 1
+
+    # even-out the primer length to accommodate codons
+    else:
+        # add or subtract a bp from the fwd primer to get mut_window in frame
+        if (primer_start - start_index) % 3 == 2:
+            primer_start += 1
+            fwd_primer = vector_seq[primer_start:primer_end]
+
+        elif (primer_start - start_index) % 3 == 1:
+            primer_start -= 1
+            fwd_primer = vector_seq[primer_start:primer_end]
+
+    # making the last primer in a window
+    if primer_start > window_end:
+        primer_start = window_end
+        primer_end = primer_start + 15
+        fwd_primer = vector_seq[primer_start:primer_end]
+        while mt.Tm_NN(fwd_primer) < args.melt_temp and primer_end < seq_len:
+            primer_end += 1
+            fwd_primer = vector_seq[primer_start:primer_end]
+
+    data_dict['primer_start'] = primer_start
+    data_dict['forward_primer'] = fwd_primer
+
+    return data_dict
+
+def sub_window(seq_data, data_dict, args):
+    """Enumerate the IUPAC codon substitutions for the current sub-window.
+
+    For every wild-type codon between ``start_index`` and ``primer_start``,
+    one oligo (homology arm + mutated sub-window + forward primer) is built
+    per IUPAC codon looked up in ``codon_table``. Each oligo is appended as a
+    row to ``seq_data['df']`` and as a FASTA record to
+    ``seq_data['fasta_file']``.
+
+    Returns ``(seq_data, data_dict)``; ``data_dict`` keeps the values of the
+    last oligo that was generated.
+    """
+    primer_start = data_dict['primer_start']
+    start_index = data_dict['start_index']
+    sub_window_name = data_dict['sub_window_name']
+    wt_seq = seq_data['wt_seq']
+    vector_seq = seq_data['vector_seq']
+    gene_start = seq_data['gene_start']
+    rng = seq_data['rng']
+
+    # this may not work
+    missense_dict, synonymous_dict, no_stop_dict, no_stop_syn_dict = codon_table.iupac_codon_dicts()
+    yeast_synonymous_dict = codon_table.synonymous_yeast_codons_dict()
+
+    # the mutated region ends where the forward primer starts
+    sub_window_end = primer_start
+
+    def codons_list(seq):
+        return [seq[i:i+3] for i in range(0, len(seq), 3)]
+
+    # removing mis_list and syn_list
+    wt_list = codons_list(wt_seq[start_index:sub_window_end])
+    vect_list = codons_list(vector_seq[start_index:sub_window_end])
+
+    # generate synonymous vector codon list (top 2 codons for yeast)
+    synonymous_win = [yeast_synonymous_dict[codon].lower() for codon in vect_list]
+
+    # generate iupac missense codons list (with synonymous codons)
+    iupac_codons = []
+    syn_bool_list = []
+    no_stop_list = []
+    for wt_codon in wt_list:
+        # two draws per codon, always in this order, so that a fixed seed
+        # reproduces the same selection
+        syn_bool = rng.choice([True, False], p=[args.syn_snp_rate, 1-args.syn_snp_rate]) ### args.syn_snp_rate
+        syn_bool_list.append(syn_bool)
+
+        no_stop_bool = rng.choice([True, False], p=[args.stop_rate, 1-args.stop_rate]) ### args.stop_rate
+        no_stop_list.append(no_stop_bool)
+
+        # need a sub-step to check if codon contains stop codon
+        # need a function to check if the wt codon encodes a missense stop codon
+        if syn_bool and no_stop_bool:
+            lookup = no_stop_syn_dict
+        elif syn_bool:
+            lookup = synonymous_dict
+        elif no_stop_bool:
+            lookup = no_stop_dict
+        else:
+            lookup = missense_dict
+        iupac_codons.append(lookup[wt_codon])
+
+    # make full-length oligo (homology arm, sub-window, primer), generate dataframe
+    for i, iupac_list in enumerate(iupac_codons):
+        aa_position = int((((start_index-gene_start)/3)+1)+i)
+        # could enumerate this out to get the aas
+        for iupac_codon in iupac_list:
+            # get AAs encoded by iupac codon
+            iupac_aa = codon_table.iupac_to_aa(iupac_codon)
+
+            # place iupac_codon into sub_window
+            mutated_window = ''.join(synonymous_win[:i] + [iupac_codon] + synonymous_win[i+1:])
+
+            codon_sub = wt_list[i] + str(aa_position) + iupac_codon
+            forward_primer_name = f'{sub_window_name}_{codon_sub}'
+            full_forward_primer = data_dict['homology_arm'] + mutated_window + data_dict['forward_primer']
+
+            # add values to data_dict
+            data_dict.update({
+                'name': forward_primer_name,
+                'codon_sub': codon_sub,
+                'wt': wt_list[i],
+                'position': aa_position,
+                'iupac': iupac_codon,
+                'iupac_aa': iupac_aa,
+                'sub_window': mutated_window,
+                'primer': full_forward_primer,
+                'synonymous_codons': syn_bool_list[i],
+                'no_stop_codons': no_stop_list[i],
+            })
+
+            # append data_dict to dataframe; DataFrame.append was removed in
+            # pandas 2.0, so a one-row frame is concatenated instead
+            seq_data['df'] = pd.concat([seq_data['df'], pd.DataFrame([data_dict])], ignore_index=True)
+
+            # write primers to .fasta file
+            seq_data['fasta_file'] = seq_data['fasta_file'] + [f">{forward_primer_name}\n", f"{full_forward_primer}\n"]
+
+    return seq_data, data_dict
diff --git a/nb/codon_table.ipynb b/nb/codon_table.ipynb
index b204b58..cdc855c 100644
--- a/nb/codon_table.ipynb
+++ b/nb/codon_table.ipynb
@@ -182,7 +182,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,
diff --git a/readme.md b/readme.md
index 3e86a7f..abc9043 100644
--- a/readme.md
+++ b/readme.md
@@ -1,11 +1,15 @@
 # DMS Missense Variants v2
-
+This is really two things:
+- package of python functions
+- command line tool
 Update on designing missense variant primers across windows of interest
 
 **STATUS:**
-- use bool logic for synonymous variants and remove stop codons in mut windows
+- need to place the main script into the package as a function
+- should be using classes to pass data/info around
+- use bool logic to reduce synonymous variants and stop codons
 - need to check the final codon table
-- need a relative path setup for the dms_codon_table_v2
+
 
 ## Input:
 - Wildtype .gb file
diff --git a/script.py b/script.py
new file mode 100644
index 0000000..dff6d31
--- /dev/null
+++ b/script.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+
+# imports
+import argparse
+from Bio.SeqUtils import GC
+from Bio.SeqUtils import MeltingTemp as mt
+from Bio.Seq import Seq
+from Bio import SeqIO
+import math
+import numpy as np
+import pandas as pd
+import main_package # my package
+
+# parse arguments
+parser = argparse.ArgumentParser()
+parser.add_argument("wt", help="Genbank file path containing wild type (WT) sequence", type=str)
+parser.add_argument("o", help="Output prefix", type=str)
+parser.add_argument("--vector", help="Genbank file path containing vector sequence", type=str, default=False)
+parser.add_argument("--codon_table", help="Specify codon table to use", type=str, default='Standard')
+parser.add_argument("--homo_len", help="Length of homology arm in fwd primer", type=int, default=20)
+parser.add_argument("--oligo_len", help="Ideal max total length of oligo", type=int, default=60)
+parser.add_argument("--melt_temp", help="Melting temp of fwd primer", type=int, default=50)
+parser.add_argument("--rev_melt_temp", help="Melting temp of rev primer", type=int, default=55)
+parser.add_argument("--syn_snp_rate", help="Percentage of synonymous SNPs 0-1", type=float, default=.05)
+parser.add_argument("--stop_rate", help="Percentage of stop codon SNPs to remove, default = 90%% of stop SNPs", type=float, default=.90)
+parser.add_argument("--rng_seed", help="Set seed for repoducibly selecting synonymous codon sites", type=int, default=42)
+parser.add_argument("--out_dir", help='Local output directory e.g. "data"', type=str)
+args = parser.parse_args()
+
+# parse genbank files
+wt_file = SeqIO.read(args.wt, 'genbank')
+
+# check for vector file; without --vector the wild-type record doubles as the vector
+if not args.vector:
+    args.vector = args.wt
+vector_file = SeqIO.read(args.vector, 'genbank')
+
+wt_seq = str(wt_file.seq.upper())
+vector_seq = str(vector_file.seq.upper())
+
+# ERROR CHECKS
+if len(wt_seq) != len(vector_seq):
+    # SystemExit prints the message to stderr and exits with status 1
+    raise SystemExit('ERROR: WildType and Vector GenBank sequences are not of equal length')
+# check for -20 bp homology
+# check that the strand is going forward
+
+# get start and stop of gene for codon positions (int() works on Biopython position objects)
+for feature in wt_file.features:
+    if feature.type == 'gene':
+        gene_start = int(feature.location.start)
+        gene_end = int(feature.location.end)
+
+# setup seq_data
+seq_data = {}
+seq_data['wt_seq'] = wt_seq
+seq_data['vector_seq'] = vector_seq
+seq_data['gene_start'] = gene_start
+seq_data['gene_end'] = gene_end
+seq_data['fasta_file'] = []
+seq_data['df'] = pd.DataFrame()
+seq_data['rng'] = np.random.RandomState(args.rng_seed)  # honour --rng_seed (default 42)
+
+# this needs to be fixed (user input? yaml?)
+targ_windows = ['window_1', 'window_2', 'window_3']
+
+for feature in wt_file.features:
+    if feature.type not in targ_windows:
+        continue
+
+    start_index = int(feature.location.start)
+    window_end = int(feature.location.end)
+
+    # loop for each sub_window
+    sub_window_n = 1
+    while start_index < window_end: # this could be an issue to toggle
+        data_dict = {}
+        data_dict['start_index'] = start_index
+        data_dict['window_end'] = window_end
+        data_dict['sub_window_name'] = f"{str(feature.type)}-{sub_window_n}"
+
+        # 1. homology arm
+        data_dict = main_package.primer_design.homology_arm(seq_data, data_dict, args)
+
+        # 2. reverse primer
+        data_dict = main_package.primer_design.reverse_primer(seq_data, data_dict, args)
+
+        # 3. forward primer
+        data_dict = main_package.primer_design.forward_primer(seq_data, data_dict, args)
+
+        # 4. variant window
+        seq_data, data_dict = main_package.primer_design.sub_window(seq_data, data_dict, args)
+
+        # reset the start index for the next mini-window
+        start_index = data_dict['primer_start']
+        sub_window_n += 1
+
+# write the .fa output, truncating the file if it exists; the context
+# manager closes the handle even when the write fails
+with open(f"{args.o}.fa", 'w') as fasta_out:
+    fasta_out.writelines(seq_data['fasta_file'])
+
+# polish dataframe
+df = seq_data['df']
+df['position'] = df['position'].astype(int)
+
+df['forward_primer_tm'] = df['forward_primer'].apply(mt.Tm_NN).round(1)
+df['forward_primer_gc'] = df['forward_primer'].apply(GC).round(1)
+df['forward_primer_len'] = df['forward_primer'].str.len()
+
+df['reverse_primer_tm'] = df['reverse_primer'].apply(mt.Tm_NN).round(1)
+df['reverse_primer_gc'] = df['reverse_primer'].apply(GC).round(1)
+df['reverse_primer_len'] = df['reverse_primer'].str.len()
+
+cols = ['name','sub_window_name','wt','position','iupac','codon_sub','iupac_aa','synonymous_codons','no_stop_codons','primer','homology_arm','sub_window','forward_primer','forward_primer_tm','forward_primer_gc','forward_primer_len','reverse_primer_name','reverse_primer','reverse_primer_tm','reverse_primer_gc','reverse_primer_len']
+df = df[cols]
+
+# save dataframe as .tsv
+df.to_csv(f'{args.o}.tsv', index=False, sep='\t')