Skip to content

Commit

Permalink
PC-2017.3.3 <luigi@VAIO Merge branch 'master'
Browse files Browse the repository at this point in the history
  • Loading branch information
lfaino committed Feb 26, 2018
2 parents 767aa73 + cac40df commit efacaf1
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 39 deletions.
12 changes: 7 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
FROM ubuntu:16.04
FROM ubuntu:xenial

RUN apt-get clean all && apt-get update && apt-get install -y build-essential apt-utils git wget perl \
python3.5 python2.7 python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
RUN apt-get clean all && apt-get update && apt-get install -y -q build-essential git wget perl \
python3.5 python2.7 software-properties-common python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
libdbd-mysql-perl libdbi-perl libboost-all-dev libncurses5-dev bowtie default-jre parallel nano bowtie2 exonerate \
bzip2 liblzma-dev libbz2-dev
bzip2 liblzma-dev libbz2-dev software-properties-common libboost-iostreams-dev libboost-system-dev libboost-filesystem-dev \
zlibc gcc-multilib apt-utils zlib1g-dev cmake tcsh g++ iputils-ping


RUN rm /bin/sh && ln -s /bin/bash /bin/sh

Expand All @@ -13,7 +15,7 @@ RUN echo "mysql-server mysql-server/root_password_again password lorean" | debco

RUN apt-get install -y mysql-server mysql-client mysql-common bowtie bioperl apache2 libcairo2-dev libpango1.0-dev

RUN pip3 install biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
RUN pip3 install numpy biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
psutil memory_profiler pathlib colorama

WORKDIR /opt/
Expand Down
44 changes: 32 additions & 12 deletions code/collectOnly.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import os
import sys

from Bio import SeqIO

count_sequences = 0
Expand Down Expand Up @@ -107,18 +106,35 @@ def cat_assembled(wd):
"""
sys.stdout.write('\t###GENERATE FASTA FILE FROM CONTIGS###\n')
wd_tmp = wd
fileName = wd_tmp + 'assembly.fasta'
fileName = wd_tmp + 'assembly.fasta_tmp1'
testFasta = open(fileName, 'w')

for root, dirs, files in os.walk(wd_tmp):
for name in files:
wd_fasta = os.path.join(root, name)
if 'assembled.fasta' in wd_fasta:
t_file = open(wd_fasta, 'r')
for line in t_file:
testFasta.write(line)
t_file.close()
if wd_fasta.endswith('_assembled.fasta'):
input_file = open(wd_fasta)
fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
evm = [key for key in fasta_dict if "evm" in key]
above = [key for key in fasta_dict if "above" in fasta_dict[key].description]
if len(fasta_dict) > 1:
if len(evm) > 0 and len(above) > 0:
if evm[0] in above:
SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
else:
fasta_dict[above[0]].id = fasta_dict[evm[0]].id
SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
elif len(evm) > 1:
SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
elif len(above) > 1:
SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")

elif len(evm) > 0:
SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
elif len(above) > 0:
SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")


testFasta.close()
return fileName


Expand All @@ -145,16 +161,18 @@ def cat_assembled_all(wd):
return fileName


def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
def add_EVM(gffread_fasta_file, tmp_assembly, merged_fasta_filename):

"""
This function looks for genes that were not used in the consensus stage; these are usually gene models without
long-read support.
"""
sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
'''Adds the EVM records that are not present in the final contig evidence'''
whole_fasta = open(whole_fasta_name, 'r')
out_fasta_file = open(output_filename, 'r')
outputMerged = open(output_merged_fasta_name, 'w')
whole_fasta = open(gffread_fasta_file, 'r')
out_fasta_file = open(tmp_assembly, 'r')
outputMerged = open(merged_fasta_filename, 'w')

wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
count = 0
dictOut = {}
Expand All @@ -180,3 +198,5 @@ def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
outFasta.close()
outputMerged.close()

if __name__ == '__main__':
cat_assembled(*sys.argv[1:])
11 changes: 7 additions & 4 deletions code/consensusIAssembler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
#!/usr/bin/env python3

import os
import progressbar
import re
import subprocess
import sys
import tempfile
import time
from multiprocessing import Pool, Manager

import progressbar
from Bio import SeqIO
from multiprocessing import Pool, Manager

#==========================================================================================================
# COMMANDS LIST
Expand Down Expand Up @@ -68,7 +67,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
sys.stdout.write("\t ###CLUSTERING IN\033[32m STRANDED MODE\033[0m###\n")

else:
btmerge1 = BEDTOOLS_MERGE_ST % (str(dist))
btmerge1 = BEDTOOLS_MERGE % (str(dist))
sys.stdout.write("\t###CLUSTERING IN\033[32m NON-STRANDED MODE\033[0m ###\n")

btsort2 = BEDTOOLS_SORT
Expand All @@ -88,6 +87,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
if verbose:
sys.stderr.write('Executing: %s\n\n' % btsort2)
outputBT = btsort2_call.communicate()[0]

final_output = outputBT.splitlines()
return final_output

Expand Down Expand Up @@ -208,3 +208,6 @@ def iAssembler(new_commands):
return False
log.close()
return outputDir

if __name__ == '__main__':
cluster_pipeline(*sys.argv[1:])
2 changes: 1 addition & 1 deletion code/createUser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def create_user():
create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
create_user_call.communicate()

com = "chmod -R 775 /opt/LoReAn"
com = "chmod -R 775 /home/%s" % (name_user)
create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
create_user_call.communicate()

Expand Down
10 changes: 4 additions & 6 deletions code/getRightStrand.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
#!/usr/bin/env python3

import gffutils
import gffutils.gffwriter as gffwriter
import progressbar
import re
import shutil
import subprocess
import sys
import tempfile
import time
import warnings
from multiprocessing import Pool, Manager

import gffutils
import gffutils.gffwriter as gffwriter
import progressbar
from Bio import Seq
from Bio import SeqIO
from multiprocessing import Pool, Manager

#======================================================================================================================

Expand Down Expand Up @@ -140,7 +139,6 @@ def appendID(gff):


def longest(gff_file, fasta, proc, wd, verbose):
outputFilename = wd + 'finalAnnotation.strand.gff3'
outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)

Expand Down
12 changes: 7 additions & 5 deletions code/lorean.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def main():
bam_file = short_sorted_bam.split("/")
short_bam = star_out + "/" + bam_file[-1]
if not os.path.exists(ref):
os.symlink(short_sorted_bam, short_bam)
os.link(short_sorted_bam, short_bam)

else:
short_sorted_bam = False
Expand Down Expand Up @@ -368,14 +368,16 @@ def main():
final_files.append(evm_gff3)
final_files.append(gff3_stat_file)

round_n = 1

if not args.short_reads and not args.long_reads:
last_gff3 = grs.newNames(evm_gff3)
#score_gff3 = score.score(last_gff3, evm_inputs)
now = datetime.datetime.now().strftime(fmtdate)
sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")

round_n = 1
if args.short_reads and not args.long_reads:
else:
#if args.short_reads and not args.long_reads:
now = datetime.datetime.now().strftime(fmtdate)
sys.stdout.write(('\n###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
round_n += 1
Expand All @@ -385,8 +387,8 @@ def main():
updatedGff3 = grs.newNames(final_update)
#score_gff3 = score.score(updatedGff3, evm_inputs)
final_files.append(updatedGff3)
else:
updatedGff3 = evm_gff3
#else:
#updatedGff3 = evm_gff3


#score_gff3 = score.score(evm_gff3, evm_inputs)
Expand Down
8 changes: 4 additions & 4 deletions code/manipulateSeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import os
import subprocess
import sys
from Bio import SeqIO
from Bio.Seq import reverse_complement
from multiprocessing import Pool

import ssw_lib
from Bio import SeqIO
from Bio.Seq import reverse_complement


def to_int(seq, lEle, dEle2Int):
Expand Down Expand Up @@ -171,7 +171,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
for adpter in list_seq_adap:
list_command.append([record_dict[key], adpter])
with Pool(processes=int(threads), maxtasksperchild=1000) as p:
align_resul = p.map(align_call, list_command)
align_resul = p.map(align_call, list_command, chunksize=1)
for aling_res in align_resul:
if len(aling_res) == 0:
next
Expand Down Expand Up @@ -201,7 +201,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
for adpter in list_seq_adap:
list_command.append([record_dict[key], adpter])
with Pool(processes=int(threads), maxtasksperchild=1000) as p:
align_resul = p.map(align_call, list_command)
align_resul = p.map(align_call, list_command, chunksize=1)
for aling_res in align_resul:
if len(aling_res) == 0:
next
Expand Down
5 changes: 3 additions & 2 deletions code/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import collectOnly as collect
import consensusIAssembler as consensus
import dirsAndFiles as logistic
import evmPipeline
import getRightStrand as grs
import manipulateSeq as mseq
import mapping
Expand Down Expand Up @@ -61,7 +62,7 @@ def upgrade():
ref_orig = os.path.abspath(args.reference)
ref = os.path.join(wd, args.reference)
if not os.path.exists(ref):
os.symlink(ref_orig, ref)
os.link(ref_orig, ref)

max_threads = multiprocessing.cpu_count()
if int(args.threads) > max_threads:
Expand Down Expand Up @@ -119,7 +120,7 @@ def upgrade():
bam_file = short_sorted_bam.split("/")
short_bam = star_out + "/" + bam_file[-1]
if not os.path.exists(ref):
os.symlink(short_sorted_bam, short_bam)
os.link(short_sorted_bam, short_bam)
else:
short_sorted_bam = False
sys.stdout.write('No short reads file')
Expand Down

0 comments on commit efacaf1

Please sign in to comment.