From 816cbc8c089af78480fff8210edf1752ece17f07 Mon Sep 17 00:00:00 2001 From: Aroon Chande Date: Fri, 5 Jan 2018 10:43:31 -0500 Subject: [PATCH] Version 0.5 This update brings a new output format to stringMLST. Notably, prior versions would return no output (columns omitted) for alleles which had 0 kmer hits. stringMLST will now return "NA" values for alleles with no hits. This should hopefully make the output more obvious in cases such as #35 --- stringMLST.py | 68 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 24 deletions(-) mode change 100644 => 100755 stringMLST.py diff --git a/stringMLST.py b/stringMLST.py old mode 100644 new mode 100755 index 7213634..0d0e0ca --- a/stringMLST.py +++ b/stringMLST.py @@ -15,7 +15,7 @@ except ImportError: from urllib import urlopen, urlretrieve import argparse -version = """ stringMLST v0.4.2 (updated : September 13, 2017) """ +version = """ stringMLST v0.5 (updated : January 5, 2018) """ """ stringMLST free for academic users and requires permission before any commercial @@ -515,6 +515,7 @@ def goodReads(read, k, non_overlapping_window): s = str(line[n:n+k]) if s in kmerDict[k]: for probLoc in kmerDict[k][s]: + # print(probLoc) if probLoc not in alleleCount: alleleCount[probLoc] = {} a = kmerDict[k][s][probLoc] @@ -558,6 +559,8 @@ def getMaxCount(alleleCount, fileName): maxSupport = {} secondSupport = {} finalProfileCount = {} + for locus in alleleNames: + finalProfileCount[locus] = {} num = '' for loc in alleleCount: n = 0 @@ -567,32 +570,42 @@ def getMaxCount(alleleCount, fileName): m = n n = alleleCount[loc][num] if n-m < fuzzy: - alleleCount[loc][num] = str(alleleCount[loc][num])+'*' - max_n[loc] = str(n)+'*' + try: + alleleCount[loc][num] + except: + pass + else: + alleleCount[loc][num] = str(alleleCount[loc][num])+'*' + max_n[loc] = str(n)+'*' else: max_n[loc] = n secondMax[loc] = m for loc in alleleCount: - maxSupport[loc] = {} - secondSupport[loc] = {} - num_max = [] - num_max2 = 
[] - compare = float(re.sub("\*$", "", str(max_n[loc]))) - for num in alleleCount[loc]: - if alleleCount[loc][num] == compare: - if "\*" in str(max_n[loc]): - insert = num + '*' - num_max.append(insert) - else: - num_max.append(num) - maxSupport[loc][num] = max_n[loc] - if alleleCount[loc][num] == secondMax[loc]: - num_max2.append(num) - secondSupport[loc][num] = secondMax[loc] try: - finalProfileCount[loc] = num_max[0] - except LookupError: - finalProfileCount[loc] = '0' + max_n[loc] + except: + pass + else: + maxSupport[loc] = {} + secondSupport[loc] = {} + num_max = [] + num_max2 = [] + compare = float(re.sub("\*$", "", str(max_n[loc]))) + for num in alleleCount[loc]: + if alleleCount[loc][num] == compare: + if "\*" in str(max_n[loc]): + insert = num + '*' + num_max.append(insert) + else: + num_max.append(num) + maxSupport[loc][num] = max_n[loc] + if alleleCount[loc][num] == secondMax[loc]: + num_max2.append(num) + secondSupport[loc][num] = secondMax[loc] + try: + finalProfileCount[loc] = num_max[0] + except LookupError: + finalProfileCount[loc] = '0' msgs = "Max Support :" + fileName + " : " + str(maxSupport) logging.debug(msgs) msgs = "Second Max Support :" + fileName + " : " + str(secondSupport) @@ -630,7 +643,8 @@ def findST(finalProfile, stProfile): exit(0) transformedFinalProfile = {} for gene, allele in finalProfile.items(): - allele = re.sub("\*", "", allele) + if allele: + allele = re.sub("\*", "", allele) transformedFinalProfile[finalGeneToSTGene[gene]] = allele # Check to see if the dictionary is empty, if so then means no allele were found at all if bool(transformedFinalProfile) is False: @@ -698,10 +712,13 @@ def loadKmerDict(dbFile): kmerTableDict = {} with open(dbFile, 'r') as kmerTableFile: lines = kmerTableFile.readlines() + global alleleNames + alleleNames = set() for line in lines: array = line.rstrip().rsplit('\t') kmerTableDict[array[0]] = {} kmerTableDict[array[0]][array[1]] = array[2][1:-1].rsplit(',') + alleleNames.add(array[1]) return 
kmerTableDict ############################################################# # Function : loadWeightDict @@ -820,7 +837,10 @@ def printResults(results, output_filename, overwrite, timeDisp): for l in sorted(results[s]): if l == 'ST' or l == 't': continue - sample += '\t'+results[s][l] + if results[s][l]: + sample += '\t'+results[s][l] + else: + sample += '\tNA' if timeDisp is True: sample += '\t' + str(results[s]['ST']) + '\t%.2f ' %results[s]['t'] else: