PC-2017.3.3 <luigi@VAIO Merge branch 'master'

eburgueno · Feb 26, 2018 · efacaf1 · efacaf1
2 parents 767aa73 + cac40df
commit efacaf1
Show file tree

Hide file tree

Showing 8 changed files with 65 additions and 39 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,11 @@
-FROM ubuntu:16.04
+FROM ubuntu:xenial
 
-RUN apt-get clean all && apt-get update && apt-get install -y build-essential apt-utils git wget perl \
-    python3.5 python2.7 python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
+RUN apt-get clean all && apt-get update && apt-get install -y -q build-essential git wget perl \
+    python3.5 python2.7 software-properties-common python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
     libdbd-mysql-perl libdbi-perl libboost-all-dev libncurses5-dev bowtie default-jre parallel nano bowtie2 exonerate \
-    bzip2 liblzma-dev libbz2-dev
+    bzip2 liblzma-dev libbz2-dev software-properties-common libboost-iostreams-dev libboost-system-dev libboost-filesystem-dev \
+    zlibc gcc-multilib apt-utils zlib1g-dev cmake tcsh g++ iputils-ping
+
 
 RUN rm /bin/sh && ln -s /bin/bash /bin/sh
 
@@ -13,7 +15,7 @@ RUN echo "mysql-server mysql-server/root_password_again password lorean" | debco
 
 RUN apt-get install -y mysql-server mysql-client mysql-common bowtie bioperl apache2 libcairo2-dev libpango1.0-dev 
 
-RUN pip3 install biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
+RUN pip3 install numpy biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
     psutil memory_profiler pathlib colorama
 
 WORKDIR /opt/

diff --git a/code/collectOnly.py b/code/collectOnly.py
@@ -2,7 +2,6 @@
 
 import os
 import sys
-
 from Bio import SeqIO
 
 count_sequences = 0
@@ -107,18 +106,35 @@ def cat_assembled(wd):
     """
     sys.stdout.write('\t###GENERATE FASTA FILE FROM CONTIGS###\n')
     wd_tmp = wd
-    fileName = wd_tmp + 'assembly.fasta'
+    fileName = wd_tmp + 'assembly.fasta_tmp1'
     testFasta = open(fileName, 'w')
+
     for root, dirs, files in os.walk(wd_tmp):
         for name in files:
             wd_fasta = os.path.join(root, name)
-            if 'assembled.fasta' in wd_fasta:
-                t_file = open(wd_fasta, 'r')
-                for line in t_file:
-                    testFasta.write(line)
-                t_file.close()
+            if wd_fasta.endswith('_assembled.fasta'):
+                input_file = open(wd_fasta)
+                fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
+                evm = [key for key in fasta_dict if "evm" in key]
+                above = [key for key in fasta_dict if "above" in fasta_dict[key].description]
+                if len(fasta_dict) > 1:
+                    if len(evm) > 0 and len(above) > 0:
+                        if evm[0] in above:
+                            SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                        else:
+                            fasta_dict[above[0]].id = fasta_dict[evm[0]].id
+                            SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+                    elif len(evm) > 1:
+                        SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                    elif len(above) > 1:
+                        SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
+                elif len(evm) > 0:
+                    SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                elif len(above) > 0:
+                    SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
 
-    testFasta.close()
     return fileName
 
 
@@ -145,16 +161,18 @@ def cat_assembled_all(wd):
     return fileName
 
 
-def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
+def add_EVM(gffread_fasta_file, tmp_assembly, merged_fasta_filename):
+
     """
     this module looks for genes that were not used in the consensus stage. usually are gene models without long reads
     support
     """
     sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
     '''Adds the EVM records that are not present in the final contig evidence'''
-    whole_fasta = open(whole_fasta_name, 'r')
-    out_fasta_file = open(output_filename, 'r')
-    outputMerged = open(output_merged_fasta_name, 'w')
+    whole_fasta = open(gffread_fasta_file, 'r')
+    out_fasta_file = open(tmp_assembly, 'r')
+    outputMerged = open(merged_fasta_filename, 'w')
+
     wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
     count = 0
     dictOut = {}
@@ -180,3 +198,5 @@ def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
     outFasta.close()
     outputMerged.close()
 
+if __name__ == '__main__':
+    cat_assembled(*sys.argv[1:])
diff --git a/code/consensusIAssembler.py b/code/consensusIAssembler.py
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
 
 import os
+import progressbar
 import re
 import subprocess
 import sys
 import tempfile
 import time
-from multiprocessing import Pool, Manager
-
-import progressbar
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 #==========================================================================================================
 # COMMANDS LIST
@@ -68,7 +67,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
         sys.stdout.write("\t ###CLUSTERING IN\033[32m STRANDED MODE\033[0m###\n")
 
     else:
-        btmerge1 = BEDTOOLS_MERGE_ST % (str(dist))
+        btmerge1 = BEDTOOLS_MERGE % (str(dist))
         sys.stdout.write("\t###CLUSTERING IN\033[32m NON-STRANDED MODE\033[0m ###\n")
 
     btsort2 = BEDTOOLS_SORT
@@ -88,6 +87,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
     if verbose:
         sys.stderr.write('Executing: %s\n\n' % btsort2)
     outputBT = btsort2_call.communicate()[0]
+
     final_output = outputBT.splitlines()
     return final_output
 
@@ -208,3 +208,6 @@ def iAssembler(new_commands):
         return False
     log.close()
     return outputDir
+
+if __name__ == '__main__':
+    cluster_pipeline(*sys.argv[1:])
diff --git a/code/createUser.py b/code/createUser.py
@@ -70,7 +70,7 @@ def create_user():
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()
 
-    com = "chmod -R 775 /opt/LoReAn"
+    com = "chmod -R 775 /home/%s" % (name_user)
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()
 

diff --git a/code/getRightStrand.py b/code/getRightStrand.py
@@ -1,19 +1,18 @@
 #!/usr/bin/env python3
 
+import gffutils
+import gffutils.gffwriter as gffwriter
+import progressbar
 import re
 import shutil
 import subprocess
 import sys
 import tempfile
 import time
 import warnings
-from multiprocessing import Pool, Manager
-
-import gffutils
-import gffutils.gffwriter as gffwriter
-import progressbar
 from Bio import Seq
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 #======================================================================================================================
 
@@ -140,7 +139,6 @@ def appendID(gff):
 
 
 def longest(gff_file, fasta, proc, wd, verbose):
-    outputFilename = wd + 'finalAnnotation.strand.gff3'
     outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
     gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)
 

diff --git a/code/lorean.py b/code/lorean.py
@@ -173,7 +173,7 @@ def main():
                 bam_file = short_sorted_bam.split("/")
                 short_bam = star_out + "/" + bam_file[-1]
                 if not os.path.exists(ref):
-                    os.symlink(short_sorted_bam, short_bam)
+                    os.link(short_sorted_bam, short_bam)
 
             else:
                 short_sorted_bam = False
@@ -368,14 +368,16 @@ def main():
         final_files.append(evm_gff3)
         final_files.append(gff3_stat_file)
 
+        round_n = 1
+
         if not args.short_reads and not args.long_reads:
             last_gff3 = grs.newNames(evm_gff3)
             #score_gff3 = score.score(last_gff3, evm_inputs)
             now = datetime.datetime.now().strftime(fmtdate)
             sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")
 
-        round_n = 1
-        if args.short_reads and not args.long_reads:
+        else:
+        #if args.short_reads and not args.long_reads:
             now = datetime.datetime.now().strftime(fmtdate)
             sys.stdout.write(('\n###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
             round_n += 1
@@ -385,8 +387,8 @@ def main():
             updatedGff3 = grs.newNames(final_update)
             #score_gff3 = score.score(updatedGff3, evm_inputs)
             final_files.append(updatedGff3)
-        else:
-            updatedGff3 = evm_gff3
+        #else:
+            #updatedGff3 = evm_gff3
 
 
         #score_gff3 = score.score(evm_gff3, evm_inputs)

diff --git a/code/manipulateSeq.py b/code/manipulateSeq.py
@@ -4,11 +4,11 @@
 import os
 import subprocess
 import sys
+from Bio import SeqIO
+from Bio.Seq import reverse_complement
 from multiprocessing import Pool
 
 import ssw_lib
-from Bio import SeqIO
-from Bio.Seq import reverse_complement
 
 
 def to_int(seq, lEle, dEle2Int):
@@ -171,7 +171,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next
@@ -201,7 +201,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next

diff --git a/code/update.py b/code/update.py
@@ -12,6 +12,7 @@
 import collectOnly as collect
 import consensusIAssembler as consensus
 import dirsAndFiles as logistic
+import evmPipeline
 import getRightStrand as grs
 import manipulateSeq as mseq
 import mapping
@@ -61,7 +62,7 @@ def upgrade():
     ref_orig = os.path.abspath(args.reference)
     ref = os.path.join(wd, args.reference)
     if not os.path.exists(ref):
-        os.symlink(ref_orig, ref)
+        os.link(ref_orig, ref)
 
     max_threads = multiprocessing.cpu_count()
     if int(args.threads) > max_threads:
@@ -119,7 +120,7 @@ def upgrade():
             bam_file = short_sorted_bam.split("/")
             short_bam = star_out + "/" + bam_file[-1]
             if not os.path.exists(ref):
-                os.symlink(short_sorted_bam, short_bam)
+                os.link(short_sorted_bam, short_bam)
         else:
             short_sorted_bam = False
             sys.stdout.write('No short reads file')