From a0e835ccf81e498849bbf98aa69ca77a00a8843d Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 18 Feb 2018 08:31:17 +0100
Subject: [PATCH 1/3] v1.1

---
 code/consensusIAssembler.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/code/consensusIAssembler.py b/code/consensusIAssembler.py
index fb76dd3a..9b627c79 100644
--- a/code/consensusIAssembler.py
+++ b/code/consensusIAssembler.py
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
 
 import os
+import progressbar
 import re
 import subprocess
 import sys
 import tempfile
 import time
-from multiprocessing import Pool, Manager
-
-import progressbar
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 #==========================================================================================================
 # COMMANDS LIST
@@ -68,7 +67,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
         sys.stdout.write("\t ###CLUSTERING IN\033[32m STRANDED MODE\033[0m###\n")
     else:
-        btmerge1 = BEDTOOLS_MERGE_ST % (str(dist))
+        btmerge1 = BEDTOOLS_MERGE % (str(dist))
         sys.stdout.write("\t###CLUSTERING IN\033[32m NON-STRANDED MODE\033[0m ###\n")
 
     btsort2 = BEDTOOLS_SORT

From e93a4005050f5c78d3fc254bd6c41e5e80a39916 Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 18 Feb 2018 08:39:40 +0100
Subject: [PATCH 2/3] v1.1

---
 Dockerfile                  | 12 +++++-----
 code/collectOnly.py         | 44 +++++++++++++++++++++++++++----------
 code/consensusIAssembler.py |  4 ++++
 code/createUser.py          |  2 +-
 code/getRightStrand.py      | 10 ++++-----
 code/lorean.py              |  2 +-
 code/manipulateSeq.py       |  8 +++----
 code/update.py              |  5 +++--
 8 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 26c3237a..fe3ace1d 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,11 @@
-FROM ubuntu:16.04
+FROM ubuntu:xenial
 
-RUN apt-get clean all && apt-get update && apt-get install -y build-essential apt-utils git wget perl \
-    python3.5 python2.7 python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
+RUN apt-get clean all && apt-get update && apt-get install -y -q build-essential git wget perl \
+    python3.5 python2.7 software-properties-common python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
     libdbd-mysql-perl libdbi-perl libboost-all-dev libncurses5-dev bowtie default-jre parallel nano bowtie2 exonerate \
-    bzip2 liblzma-dev libbz2-dev
+    bzip2 liblzma-dev libbz2-dev software-properties-common libboost-iostreams-dev libboost-system-dev libboost-filesystem-dev \
+    zlibc gcc-multilib apt-utils zlib1g-dev cmake tcsh g++ iputils-ping
+
 
 RUN rm /bin/sh && ln -s /bin/bash /bin/sh
@@ -13,7 +15,7 @@ RUN echo "mysql-server mysql-server/root_password_again password lorean" | debco
 
 RUN apt-get install -y mysql-server mysql-client mysql-common bowtie bioperl apache2 libcairo2-dev libpango1.0-dev
 
-RUN pip3 install biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
+RUN pip3 install numpy biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
     psutil memory_profiler pathlib colorama
 
 WORKDIR /opt/

diff --git a/code/collectOnly.py b/code/collectOnly.py
index 4e887511..d5afb9d3 100644
--- a/code/collectOnly.py
+++ b/code/collectOnly.py
@@ -2,7 +2,6 @@
 
 import os
 import sys
-
 from Bio import SeqIO
 
 count_sequences = 0
@@ -107,18 +106,35 @@ def cat_assembled(wd):
     """
     sys.stdout.write('\t###GENERATE FASTA FILE FROM CONTIGS###\n')
     wd_tmp = wd
-    fileName = wd_tmp + 'assembly.fasta'
+    fileName = wd_tmp + 'assembly.fasta_tmp1'
     testFasta = open(fileName, 'w')
+
     for root, dirs, files in os.walk(wd_tmp):
         for name in files:
             wd_fasta = os.path.join(root, name)
-            if 'assembled.fasta' in wd_fasta:
-                t_file = open(wd_fasta, 'r')
-                for line in t_file:
-                    testFasta.write(line)
-                t_file.close()
+            if wd_fasta.endswith('_assembled.fasta'):
+                input_file = open(wd_fasta)
+                fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
+                evm = [key for key in fasta_dict if "evm" in key]
+                above = [key for key in fasta_dict if "above" in fasta_dict[key].description]
+                if len(fasta_dict) > 1:
+                    if len(evm) > 0 and len(above) > 0:
+                        if evm[0] in above:
+                            SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                        else:
+                            fasta_dict[above[0]].id = fasta_dict[evm[0]].id
+                            SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+                    elif len(evm) > 1:
+                        SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                    elif len(above) > 1:
+                        SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
+                elif len(evm) > 0:
+                    SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                elif len(above) > 0:
+                    SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
-    testFasta.close()
     return fileName
@@ -145,16 +161,18 @@ def cat_assembled_all(wd):
     return fileName
 
 
-def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
+def add_EVM(gffread_fasta_file, tmp_assembly, merged_fasta_filename):
+
     """ this module
     looks for genes that were not used in the consensus stage. usually are gene models without long reads
     support
     """
     sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
     '''Adds the EVM records that are not present in the final contig evidence'''
-    whole_fasta = open(whole_fasta_name, 'r')
-    out_fasta_file = open(output_filename, 'r')
-    outputMerged = open(output_merged_fasta_name, 'w')
+    whole_fasta = open(gffread_fasta_file, 'r')
+    out_fasta_file = open(tmp_assembly, 'r')
+    outputMerged = open(merged_fasta_filename, 'w')
+
     wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
     count = 0
     dictOut = {}
@@ -180,3 +198,5 @@ def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
 
     outFasta.close()
     outputMerged.close()
+if __name__ == '__main__':
+    cat_assembled(*sys.argv[1:])
\ No newline at end of file

diff --git a/code/consensusIAssembler.py b/code/consensusIAssembler.py
index 9b627c79..8ff63e08 100644
--- a/code/consensusIAssembler.py
+++ b/code/consensusIAssembler.py
@@ -87,6 +87,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
     if verbose:
         sys.stderr.write('Executing: %s\n\n' % btsort2)
     outputBT = btsort2_call.communicate()[0]
+
     final_output = outputBT.splitlines()
     return final_output
@@ -207,3 +208,6 @@ def iAssembler(new_commands):
         return False
     log.close()
     return outputDir
+
+if __name__ == '__main__':
+    cluster_pipeline(*sys.argv[1:])
\ No newline at end of file

diff --git a/code/createUser.py b/code/createUser.py
index b0e20e18..f8ab6e44 100644
--- a/code/createUser.py
+++ b/code/createUser.py
@@ -70,7 +70,7 @@ def create_user():
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()
 
-    com = "chmod -R 775 /opt/LoReAn"
+    com = "chmod -R 775 /home/%s" % (name_user)
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()

diff --git a/code/getRightStrand.py b/code/getRightStrand.py
index d506c088..b824dc9e 100644
--- a/code/getRightStrand.py
+++ b/code/getRightStrand.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
+import gffutils
+import gffutils.gffwriter as gffwriter
+import progressbar
 import re
 import shutil
 import subprocess
@@ -7,13 +10,9 @@
 import tempfile
 import time
 import warnings
-from multiprocessing import Pool, Manager
-
-import gffutils
-import gffutils.gffwriter as gffwriter
-import progressbar
 from Bio import Seq
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 
 #======================================================================================================================
@@ -140,7 +139,6 @@ def appendID(gff):
 
 
 def longest(gff_file, fasta, proc, wd, verbose):
-    outputFilename = wd + 'finalAnnotation.strand.gff3'
     outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
     gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)

diff --git a/code/lorean.py b/code/lorean.py
index a32acc56..dd98ba93 100755
--- a/code/lorean.py
+++ b/code/lorean.py
@@ -173,7 +173,7 @@ def main():
             bam_file = short_sorted_bam.split("/")
             short_bam = star_out + "/" + bam_file[-1]
             if not os.path.exists(ref):
-                os.symlink(short_sorted_bam, short_bam)
+                os.link(short_sorted_bam, short_bam)
         else:
             short_sorted_bam = False

diff --git a/code/manipulateSeq.py b/code/manipulateSeq.py
index e0854eb0..b3d090df 100755
--- a/code/manipulateSeq.py
+++ b/code/manipulateSeq.py
@@ -4,11 +4,11 @@
 import os
 import subprocess
 import sys
+from Bio import SeqIO
+from Bio.Seq import reverse_complement
 from multiprocessing import Pool
 
 import ssw_lib
-from Bio import SeqIO
-from Bio.Seq import reverse_complement
 
 
 def to_int(seq, lEle, dEle2Int):
@@ -171,7 +171,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next
@@ -201,7 +201,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next

diff --git a/code/update.py b/code/update.py
index 72b49bb8..9bb6e79e 100644
--- a/code/update.py
+++ b/code/update.py
@@ -12,6 +12,7 @@
 import collectOnly as collect
 import consensusIAssembler as consensus
 import dirsAndFiles as logistic
+import evmPipeline
 import getRightStrand as grs
 import manipulateSeq as mseq
 import mapping
@@ -61,7 +62,7 @@ def upgrade():
     ref_orig = os.path.abspath(args.reference)
     ref = os.path.join(wd, args.reference)
     if not os.path.exists(ref):
-        os.symlink(ref_orig, ref)
+        os.link(ref_orig, ref)
 
     max_threads = multiprocessing.cpu_count()
     if int(args.threads) > max_threads:
@@ -119,7 +120,7 @@ def upgrade():
             bam_file = short_sorted_bam.split("/")
             short_bam = star_out + "/" + bam_file[-1]
             if not os.path.exists(ref):
-                os.symlink(short_sorted_bam, short_bam)
+                os.link(short_sorted_bam, short_bam)
         else:
             short_sorted_bam = False
             sys.stdout.write('No short reads file')

From cac40df11cc77096ba57bf3e57d592084e557acc Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 25 Feb 2018 15:59:09 +0100
Subject: [PATCH 3/3] v1.1

---
 code/lorean.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/code/lorean.py b/code/lorean.py
index dd98ba93..461ff8f9 100755
--- a/code/lorean.py
+++ b/code/lorean.py
@@ -368,14 +368,16 @@ def main():
         final_files.append(evm_gff3)
         final_files.append(gff3_stat_file)
 
+    round_n = 1
+
     if not args.short_reads and not args.long_reads:
         last_gff3 = grs.newNames(evm_gff3)
         #score_gff3 = score.score(last_gff3, evm_inputs)
         now = datetime.datetime.now().strftime(fmtdate)
         sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")
 
-    round_n = 1
-    if args.short_reads and not args.long_reads:
+    else:
+        #if args.short_reads and not args.long_reads:
         now = datetime.datetime.now().strftime(fmtdate)
         sys.stdout.write(('\n###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
         round_n += 1
@@ -385,8 +387,8 @@ def main():
         updatedGff3 = grs.newNames(final_update)
         #score_gff3 = score.score(updatedGff3, evm_inputs)
         final_files.append(updatedGff3)
-    else:
-        updatedGff3 = evm_gff3
+    #else:
+        #updatedGff3 = evm_gff3
         #score_gff3 = score.score(evm_gff3, evm_inputs)
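
A note on the os.symlink -> os.link substitution that recurs in lorean.py and update.py above: both calls take the same (source, destination) arguments but create different kinds of links. The sketch below is illustrative only and is not part of the patches; the temporary directory and FASTA file names are invented for the example.

#!/usr/bin/env python3
# Illustrative sketch (not from the patch series): contrasts os.symlink with
# os.link, the substitution applied in lorean.py and update.py above.
# The temporary directory and file names are hypothetical.
import os
import tempfile

with tempfile.TemporaryDirectory() as wd:
    target = os.path.join(wd, "reference.fasta")
    with open(target, "w") as fh:
        fh.write(">seq1\nACGT\n")

    # A symbolic link is a separate path that refers to the target by name;
    # it dangles as soon as the target is renamed or deleted.
    sym = os.path.join(wd, "reference.symlink.fasta")
    os.symlink(target, sym)

    # A hard link is a second directory entry for the same inode, so the data
    # stays reachable even after the original path is removed. Source and
    # destination must be on the same filesystem.
    hard = os.path.join(wd, "reference.hardlink.fasta")
    os.link(target, hard)

    os.remove(target)
    print(os.path.exists(sym))   # False: the symlink now points at nothing
    print(os.path.exists(hard))  # True: the content is still reachable

If the working directory and the input files end up on different filesystems (for example across a bind-mounted volume in the Docker image built above), os.link raises OSError (EXDEV), in which case a symlink or a plain copy would be the fallback.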