-
Notifications
You must be signed in to change notification settings - Fork 3
/
BasicInfoPaser.py
87 lines (77 loc) · 2.52 KB
/
BasicInfoPaser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
__author__ = 'Chong Chu'
#!/usr/bin/env python
import sys
import os
import shutil
import glob
from subprocess import *
from Utility import OUTPUT_FOLDER
from Utility import get_output_folder
from Utility import print_command
def calc_total_coverage(bpaired,sfleft_reads,sfright_reads,sfsingle_reads,READ_LENGTH,GENOME_LENGTH):
print "Calculating average coverage...."
OUTPUT_FOLDER=get_output_folder()
sfcoverage=OUTPUT_FOLDER+"reads_coverage.txt"
f=2.0
if os.path.exists(sfcoverage)!=True:
#print "Test: ", sfleft_reads###########################################################################################################3
if bpaired==True:
f=calc_coverage(sfleft_reads,READ_LENGTH, GENOME_LENGTH) + calc_coverage(sfright_reads,READ_LENGTH, GENOME_LENGTH)
else:
f=calc_coverage(sfsingle_reads,READ_LENGTH, GENOME_LENGTH)
fout_cov=open(sfcoverage,"wt")
fout_cov.write(str(f))
fout_cov.close()
else:
#read in file
fin_cov=open(sfcoverage,"rt")
f=float(fin_cov.readline())
fin_cov.close()
stest_output="Read coverage is: {0}".format(f)
print stest_output
return f
#calculate coverage
def calc_coverage(fpath, read_length, genome_length):
cnt_lines=0
cmd=""
if fpath.lower().endswith(('.fq', '.fastq')):
cmd="echo $(wc -l {0})".format(fpath)
elif fpath.lower().endswith(('.fastq.gz', 'fq.gz', 'gz')):
cmd="echo $(zcat {0} | wc -l)".format(fpath)
else:
print "Something wrong with the raw reads files format:", fpath
print_command("Running command: "+ cmd +"...")
tp=tuple(Popen(cmd, shell = True, stdout = PIPE).communicate())
lcnt=str(tp[0]).split()
print_command("The number of reads in file {0} is {1}".format(fpath,tp))
cnt_lines=int(lcnt[0])
cnt_reads=int(cnt_lines)/4
cov=float(cnt_reads*read_length)/float(genome_length)
return cov
'''
Description:
Read in a fasta file
Parameters:
sffa: path of fa file
brstrip: whether strip '\n' or others can be striped by rstrip()
'''
def read_contig_fa(sffa,brstrip):
dcontigs={}
name=""
seq=""
last_name=""
ffa=open(sffa)
for line in ffa:
if brstrip==True:
line=line.rstrip()
if line[0]=='>':
name=line[1:]
if last_name!="":
dcontigs[last_name]=seq
seq=""
last_name=name
else:
seq=seq+line
dcontigs[last_name]=seq
ffa.close()
return dcontigs