Skip to content

Test 3 Question 3

Meg Staton edited this page Dec 11, 2020 · 1 revision

Python code:

                                                                                                                                  1,1           Top
from Bio import SeqIO
import sys

counts_file = sys.argv[1]
transcripts_file = "Ppersica_298_v2.1.transcript_primaryTranscriptOnly.ALT.fa"

### Get transcript lengths

transcript2len = {}

for seq_record in SeqIO.parse(transcripts_file, "fasta"):
    transcript2len[seq_record.id] = len(str(seq_record.seq))
    #print(seq_record.id + "\t" + str(len(str(seq_record.seq))))


### RPK - reads per gene per kb of gene length
id2rpk = {}

cntFH = open(counts_file, "r")
for line in cntFH:
    fields = line.split()
    id = fields[0]
    count = int(fields[1])
    rpk = count/(transcript2len[id]/1000)
    id2rpk[id] = rpk
    #print(id + "\t" + str(count) + "\t" +  str(transcript2len[id]) + "\t" + str(rpk))

cntFH.close()

## Scaling factor

sfactor = 0
for id in id2rpk:
    sfactor += id2rpk[id]

#print(str(sfactor))
sfactor = sfactor/1000000
print(str(sfactor))

## Final TPMs and output

outputFH = open(counts_file + ".tpm", "w")
cntFH = open(counts_file, "r")

for line in cntFH:
    fields = line.split()
    id = fields[0]
    count = int(fields[1])
    outputFH.write(id + "\t")
    outputFH.write( str(count) + "\t")
    outputFH.write( str(transcript2len[id]) + "\t")
    rpk = round(id2rpk[id], 2)
    outputFH.write( str(rpk) + "\t")
    tpm = round(rpk/sfactor, 2)
    outputFH.write( str(tpm) + "\n")

outputFH.close()
cntFH.close()