-
Notifications
You must be signed in to change notification settings - Fork 1
/
cdr3_dna_search.py
97 lines (85 loc) · 3.69 KB
/
cdr3_dna_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/python
import os
__author__ = 'Csaba Kiss'
__email__ = 'csakis[at]lanl[dot]gov'
"""This scripts takes the fasta output of mothur and removes the identifiers,
reverse complements the DNA sequence and outputs the file with .rev extension.
It also finds CDR3s based on the cdr3_pattern regular expression pattern and translates
the found DNA sequunces into peptide sequences and saves the list as .cdr file."""
import sys
import re
from string import maketrans
def protein_translate(aa_dict, seq):
cdr3 = '' #empty protein sequence
codon_list = re.findall('...', seq) #separates the sequence by triplets
for codon in codon_list:
cdr3 = cdr3 + aa_dict[codon] #use the triplet list to translat to aa
return cdr3
def stars():
print '\n', '*' * 80
return
def reverse_complement(line):
"""
This function reverse-complements a DNA sequence that contains a \n character at the end
"""
revcomp_key = maketrans('ACGTRYMKBDHVacgtrymkbdhv', 'TGCAYRKMVHDBtgcayrkmvhdn') # the DNA complementation key
line = line[:-1] # remove new line character from end
line = line[::-1] #reverse sequence
rev_comp_line = line.translate(revcomp_key) + '\n' #complement sequence and add new line
return rev_comp_line
#build the AA translation dictionary
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
if not os.path.isfile(os.path.join(__location__, 'aa_table.csv')):
stars()
print 'You need to download the aa_table.csv file from the ionpython website!'
print 'http://sourceforge.net/ionpython/'
stars()
sys.exit(
'Please run the program after you copied the aa_table.csv file into the directory where all the other python scripts are located!')
aa_file = open(os.path.join(__location__, 'aa_table.csv')) # the csv file containing the translation table
aa_dict = {}
for line in aa_file.readlines():
aa = line.split(',')
aa_dict[aa[1][:-1]] = aa[0]
#AA dictionary ready
"""Searching for CDR3s."""
if len(sys.argv) < 2:
fasta_in = raw_input('Please input the sample file name (i.e.: sd6.trim.rev) ')
else:
fasta_in = sys.argv[1]
cdr_out = fasta_in + '.dna.cdr3'
cdr_file_name = fasta_in + 'dna.cdr3' #The file name that will contain the HCDR3 sequences.
# *** Here is the regular expression pattern. ***
cdr3_pattern = r'TA[CT](TT[CT]|TA[TC]|CA[TC]|GT[AGCT])TG[TC][GA][AGCT]([ACGT]{3}){5,32}[AGCT]TGGG[GCT][GCT]' #CDR3 search pattern
stars()
print 'Now we are looking for CDR3 sequences'
seq_count = 0
cdr3_count = 0
fasta_file = open(fasta_in, 'rU')
fasta_list = fasta_file.readlines()
cdr_file_out = open(cdr_out, 'w')
#CDR3 search loop
for DNA_seq in fasta_list: #Go through each line of the trimmed fasta file
seq_count += 1
if re.match('^[ACGTN]', DNA_seq): #check if the line contains seq ID or DNA seq
match = re.search(cdr3_pattern, reverse_complement(DNA_seq)) #check the reverse complement first
if match:
cdr3_count += 1
line_out = '%s, %s\n' % (match.group()[2:-1], protein_translate(aa_dict, match.group())[2:-1])
cdr_file_out.write(line_out) #write the CDR3 output file
else:
rev_match = re.search(cdr3_pattern, DNA_seq) #check if the seq contains a CDR3
if rev_match:
cdr3_count += 1
line_out = '%s, %s\n' % (rev_match.group()[2:-1], protein_translate(aa_dict, rev_match.group())[2:-1])
cdr_file_out.write(line_out) #write the CDR3 output file
if (seq_count % 250000) == 0:
print 'So far %d sequences have been checked.' % seq_count
#End CDR3 search loop
stars()
print '%d CRD3s were found.' % cdr3_count
fasta_file.close() #close trimmed fasta file
cdr_file_out.close() #close cdr file
print 'The %s file has been created and contains all the CDR3s.' % cdr_file_name
stars()
# End of CDR3 search