-
Notifications
You must be signed in to change notification settings - Fork 0
/
ProtCheck.py
107 lines (93 loc) · 3.65 KB
/
ProtCheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#Procedural program containing methods of the XML_Parser_WMRGL script to experiment
#with protein extraction
#Program to import a specified LRG file and export corresponding fasta
import sys
import xml.etree.ElementTree as etree
import os
# Command-line contract (sys.argv):
#   [0] - program name
#   [1] - input XML file name (required, must end in ".xml")
#   [2] - padding length (intronic sequence surrounding exons), optional
#   [3] - "-g" for genomic sequence, "-p" for protein, optional
#
# The argument count is validated before sys.argv is indexed so that a
# missing file name produces a usage message rather than an IndexError
# traceback.  Explicit checks are used instead of assert because asserts
# are stripped when Python runs with -O.
if len(sys.argv) < 2:
    sys.exit("Usage: " + sys.argv[0] + " <input>.xml [padding] [-g|-p]")
if len(sys.argv) > 4:
    sys.exit("Too many arguments!")

# Input file name; only the extension is checked here, the file itself is
# opened later.
fileName = sys.argv[1]
if not fileName.endswith('.xml'):
    sys.exit('You have the wrong input file')

# The sequence-type option is optional: when it is not supplied, fall back
# to genomic only ("-g").
option = sys.argv[3] if len(sys.argv) > 3 else '-g'
# Parse the input file and pull out the parts of the document that the rest
# of the script refers to.  Only the parse call is guarded, so an unrelated
# failure further down is never misreported as a missing file.
try:
    tree = etree.parse(fileName)
except IOError:
    # Report the name we tried to open; the exception's own filename
    # attribute is not guaranteed to be populated.
    print("The specified file cannot be located: " + fileName)
    sys.exit(1)  # non-zero status so calling scripts can detect the failure
except etree.ParseError as badXml:
    # The file exists but is not well-formed XML.
    print("The specified file could not be parsed as XML: " + str(badXml))
    sys.exit(1)

root = tree.getroot()

# Per the original note: keeping a handle on fixed_annotation ensures only
# exons from the fixed annotation will be taken.
fixannot = root.find('fixed_annotation')

# find() returns None when an element is absent, so guard before reading
# .text; a missing element leaves the corresponding name as None instead of
# raising AttributeError.
locusElement = root.find('updatable_annotation/annotation_set/lrg_locus')
genename = locusElement.text if locusElement is not None else None
sourceElement = root.find('fixed_annotation/sequence_source')
refseqname = sourceElement.text if sourceElement is not None else None
# Padding option only needs to be specified for genomic sequence.
# Choice should be possible for genomic AND protein (ref sequences), hence
# the combined "-gp" / "-pg" options.
# pad is initialised up front so that it is always defined, whichever
# option was selected.
pad = 0
if option in ('-g', '-gp', '-pg'):
    try:
        pad = int(sys.argv[2])
        if pad < 0:
            # A negative padding length is meaningless; handle it the same
            # way as any other invalid value.
            raise ValueError("negative padding")
    except (IndexError, ValueError):
        # IndexError: no padding argument; ValueError: not a usable integer.
        pad = 0
        print("Invalid/No padding provided: Padding defaulting to zero")
    # Upper bound taken from the original check; its explanatory note is
    # truncated in the source ("LRG files have 2000 additional genomic
    # sequence on ...").  An explicit test is used because assert is
    # stripped when Python runs with -O.
    if pad > 2000:
        sys.exit("Padding too large, please use a value below 2000 bases")
# Check that the file being read uses the schema version this script was
# written for.  A mismatch only produces a warning; processing continues.
# .get() is used so that a root element without a schema_version attribute
# triggers the same warning instead of a KeyError.
schemaVersion = root.attrib.get('schema_version')
if schemaVersion != '1.8':
    print('This LRG file is not the correct version for this script')
    print('This is designed for v.1.8')
    print('This file is v.' + (schemaVersion or 'unknown'))
# Grab the text of every element at the given path in the XML file and
# print it as the DNA sequence.
path = 'fixed_annotation/sequence'
result = None
for item in root.findall(path):
    if item.text:
        result = item.text
        # Two spaces after the colon reproduce the output of the original
        # Python 2 statement `print "DNA: ", result`.
        print("DNA:  " + result)
# findall() returns an empty list rather than raising when nothing matches,
# so the "not found" case has to be detected explicitly.
if result is None:
    print("No sequence was identified")
# Added 03/12/2014 at request of WMRGL.
# For every translation element under the fixed annotation, print its
# protein sequence (or a notice when there is none), followed by the value
# of the element's "name" attribute.
translationPath = 'fixed_annotation/transcript/coding_region/translation'
for item in root.findall(translationPath):
    # find() returns None when there is no <sequence> child; test for that
    # directly instead of relying on a catch-all except.
    prot_block = item.find("sequence")
    protein_seq = prot_block.text if prot_block is not None else None
    if protein_seq:
        print(protein_seq)
    else:
        print("No protein sequence was found")
    transcript = item.attrib['name']
    print(transcript)
##Disabled draft code (incomplete, kept for reference): if/elif options for which sequences need to be grabbed
# if option == '-g':
# x = grab_element(, root)
# td = get_exoncoords(fixannot,pad,x)
# elif option == '-p':
# dnaSeq = grab_element('fixed_annotation/sequence', root)
# td = get_exoncoords(fixannot,pad,x)
# pd = get_proteincoords(
# for y in td.keys():
# outputfile = fileName.split('.')[0]+'_'+y+"_"+str(pad)+'_Out.fasta'
# outputFilePath = os.path.join('outputFiles',outputfile)
# existingFiles = os.listdir('outputFiles')
# if outputfile in existingFiles:
#tests whether file already exists
# print 'The output file already exists in the present directory'
# print 'Would you like to overwrite the file? y/n'
# c = 0
# while c == 0:
# userChoice = raw_input('> ')
# if userChoice == 'n':
# print "Program exited without creating file"
# exit() # can change later to offer alternate filename
# elif userChoice == 'y':
# c += 1
# else:
# print "Invalid selection please type y or n"
# out = open(outputFilePath, "w")
# print_exons(td[y],y,genename,refseqname,out)