forked from NSLS-II/lsdc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sanitize_sheet.py
88 lines (82 loc) · 3 KB
/
sanitize_sheet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
import math
# Maximum number of characters allowed in a sample name or sequence file stem.
valid_length = 25
# Pattern source: zero to valid_length letters, digits, dashes or underscores.
valid_filename_chars = '[0-9a-zA-Z-_]{0,%s}' % valid_length
valid_chars_re = re.compile(valid_filename_chars)
# Sequence file name: the same stem rule, optionally followed by a literal "."
# and an extension of up to three letters/digits.  The dot is escaped and
# grouped with the extension so that it no longer matches arbitrary
# characters (e.g. "/" or " ") and cannot be used to lengthen the stem.
valid_seq_file_re = re.compile(
    r'[0-9a-zA-Z-_]{0,%s}(?:\.[0-9a-zA-Z]{0,3})?' % valid_length
)
def print_all_errors(all_errors):
    """Raise a single Exception whose message lists every given error.

    Args:
        all_errors: iterable of error descriptions; each one is rendered
            with ``%s`` formatting on its own line.

    Raises:
        Exception: always; the message is a header line followed by one
            line per entry of ``all_errors``.
    """
    header = "the following errors were found:\n"
    # str has no append(); build the report with join instead.  The
    # one-element tuple keeps tuple-valued errors from being unpacked by %.
    details = ''.join('%s\n' % (error,) for error in all_errors)
    raise Exception(header + details)
def check_sampleNames(sampleNames):
    """Check that every sample name fully matches ``valid_chars_re``.

    Args:
        sampleNames: iterable of sample-name strings.

    Returns:
        True when every name passes (also for an empty iterable).

    Raises:
        Exception: on the first name that contains a character other than
            letters, digits, "-" or "_", or that is longer than
            ``valid_length`` characters.
    """
    for sampleName in sampleNames:
        # fullmatch enforces both the allowed character set and the length
        # limit, since the quantifier in the pattern is bounded.
        if not valid_chars_re.fullmatch(sampleName):
            raise Exception('invalid characters or bad length of samplename "%s". only up to %s numbers, letters, dash ("-"), and underscore ("_") are allowed' % (sampleName, valid_length))
    return True
def create_containers():
    """Placeholder with no behavior yet; takes no arguments, returns None."""
    return None
def add_samples():
    """Placeholder with no behavior yet; takes no arguments, returns None."""
    return None
def check_for_sequence(sequence_entry):
    """Check that sequence entries are file names, not raw sequences.

    An entry made up entirely of the twenty standard one-letter amino-acid
    codes is treated as a sequence typed directly into the sheet and is
    rejected.  Every other entry must fully match ``valid_seq_file_re``.
    Entries for which ``math.isnan`` is true are skipped.

    Args:
        sequence_entry: iterable of entries.  A single ``str`` is treated
            as one entry rather than being iterated character by character.

    Returns:
        True when every entry passes (also for an empty iterable).

    Raises:
        Exception: if an entry is a raw amino-acid sequence, or is not a
            valid sequence file name.
    """
    # The whole entry must consist of residue codes to count as a sequence.
    # Testing only the first character would reject legitimate file names
    # that merely start with such a letter (e.g. "Protein.seq").
    amino_acid_re = re.compile('[ACDEFGHIKLMNPQRSTVWY]+')
    if isinstance(sequence_entry, str):
        sequence_entry = [sequence_entry]
    for sequence in sequence_entry:
        try:
            if math.isnan(sequence):
                continue
        except TypeError:
            # math.isnan rejects non-numeric values such as str; those are
            # validated by the pattern checks below.
            pass
        if amino_acid_re.fullmatch(sequence):
            raise Exception('sequence should not be directly entered into filename entry!')
        if not valid_seq_file_re.fullmatch(sequence):
            raise Exception('invalid filename for sequence "%s".' % (sequence))
    return True
def check_proposalNum(proposalNums):
    """Check that all proposal numbers are integer-like and identical.

    Args:
        proposalNums: iterable of proposal numbers (e.g. strings of digits).

    Returns:
        True when every value converts with ``int()`` and at most one
        distinct value is present (also for an empty iterable).

    Raises:
        Exception: if a value cannot be converted with ``int()``, or if
            more than one distinct value is present.
    """
    for proposalNum in proposalNums:
        try:
            int(proposalNum)
        except (TypeError, ValueError) as err:
            # TypeError covers values such as None, which int() rejects
            # without raising ValueError.
            raise Exception('proposal number "%s" must be a number! cannot contain letters' % (proposalNum,)) from err
    proposals = set(proposalNums)
    if len(proposals) > 1:
        raise Exception('there cannot be multiple proposal numbers in spreadsheet:' + str(proposals))
    return True
def check_for_duplicate_samples(sampleNames):
    """Ensure no sample name occurs more than once.

    Returns True when all names are distinct; raises Exception naming the
    first repeated value encountered in iteration order.
    """
    seen = set()
    for name in sampleNames:
        size_before = len(seen)
        seen.add(name)
        # The set did not grow, so this name was already present.
        if len(seen) == size_before:
            raise Exception("duplicate sampleName: sampleName: %s" % (name))
    return True
if __name__ == '__main__':
    # Ad-hoc smoke checks, executed only when the module is run directly.
    info = []
    info.append('abcdef')
    check_sampleNames(info)
    info.append('abcdefghijklmnopqrstuvwxy1234')
    try:
        check_sampleNames(info)
    except Exception:
        pass  # expected failure: second name is longer than valid_length
    # The checker iterates over its argument, so pass a list of entries.
    check_for_sequence(['filename.seq'])
    try:
        check_for_sequence(['ACDEFGHIIH'])
    except Exception:
        pass  # expected failure: raw sequence instead of a file name
    proposalNums = ['123456', 'su123456']
    try:
        # check_proposalNum takes exactly one argument; an extra one would
        # fail with an arity TypeError before any validation ran.
        check_proposalNum(proposalNums)
    except Exception:
        pass  # expected failure with second item