-
Notifications
You must be signed in to change notification settings - Fork 24
Test 3 Question 3
Meg Staton edited this page Dec 11, 2020
·
1 revision
Python code:
1,1 Top
from Bio import SeqIO
import sys
counts_file = sys.argv[1]
transcripts_file = "Ppersica_298_v2.1.transcript_primaryTranscriptOnly.ALT.fa"
### Get transcript lengths
transcript2len = {}
for seq_record in SeqIO.parse(transcripts_file, "fasta"):
transcript2len[seq_record.id] = len(str(seq_record.seq))
#print(seq_record.id + "\t" + str(len(str(seq_record.seq))))
### RPK - reads per gene per kb of gene length
id2rpk = {}
cntFH = open(counts_file, "r")
for line in cntFH:
fields = line.split()
id = fields[0]
count = int(fields[1])
rpk = count/(transcript2len[id]/1000)
id2rpk[id] = rpk
#print(id + "\t" + str(count) + "\t" + str(transcript2len[id]) + "\t" + str(rpk))
cntFH.close()
## Scaling factor
sfactor = 0
for id in id2rpk:
sfactor += id2rpk[id]
#print(str(sfactor))
sfactor = sfactor/1000000
print(str(sfactor))
## Final TPMs and output
outputFH = open(counts_file + ".tpm", "w")
cntFH = open(counts_file, "r")
for line in cntFH:
fields = line.split()
id = fields[0]
count = int(fields[1])
outputFH.write(id + "\t")
outputFH.write( str(count) + "\t")
outputFH.write( str(transcript2len[id]) + "\t")
rpk = round(id2rpk[id], 2)
outputFH.write( str(rpk) + "\t")
tpm = round(rpk/sfactor, 2)
outputFH.write( str(tpm) + "\n")
outputFH.close()
cntFH.close()