From a0e835ccf81e498849bbf98aa69ca77a00a8843d Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 18 Feb 2018 08:31:17 +0100
Subject: [PATCH 1/3] v1.1

---
 code/consensusIAssembler.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/code/consensusIAssembler.py b/code/consensusIAssembler.py
index fb76dd3a..9b627c79 100644
--- a/code/consensusIAssembler.py
+++ b/code/consensusIAssembler.py
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
 
 import os
+import progressbar
 import re
 import subprocess
 import sys
 import tempfile
 import time
-from multiprocessing import Pool, Manager
-
-import progressbar
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 #==========================================================================================================
 # COMMANDS LIST
@@ -68,7 +67,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
         sys.stdout.write("\t ###CLUSTERING IN\033[32m STRANDED MODE\033[0m###\n")
     else:
-        btmerge1 = BEDTOOLS_MERGE_ST % (str(dist))
+        btmerge1 = BEDTOOLS_MERGE % (str(dist))
         sys.stdout.write("\t###CLUSTERING IN\033[32m NON-STRANDED MODE\033[0m ###\n")
 
     btsort2 = BEDTOOLS_SORT

From e93a4005050f5c78d3fc254bd6c41e5e80a39916 Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 18 Feb 2018 08:39:40 +0100
Subject: [PATCH 2/3] v1.1

---
 Dockerfile                  | 12 +++++-----
 code/collectOnly.py         | 44 +++++++++++++++++++++++++++----------
 code/consensusIAssembler.py |  4 ++++
 code/createUser.py          |  2 +-
 code/getRightStrand.py      | 10 ++++-----
 code/lorean.py              |  2 +-
 code/manipulateSeq.py       |  8 +++----
 code/update.py              |  5 +++--
 8 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 26c3237a..fe3ace1d 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,11 @@
-FROM ubuntu:16.04
+FROM ubuntu:xenial
 
-RUN apt-get clean all && apt-get update && apt-get install -y build-essential apt-utils git wget perl \
-    python3.5 python2.7 python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
+RUN apt-get clean all && apt-get update && apt-get install -y -q build-essential git wget perl \
+    python3.5 python2.7 software-properties-common python3-pip python-pip debconf-utils sudo python-numpy cmake samtools bedtools zlib1g-dev libc6 aptitude \
     libdbd-mysql-perl libdbi-perl libboost-all-dev libncurses5-dev bowtie default-jre parallel nano bowtie2 exonerate \
-    bzip2 liblzma-dev libbz2-dev
+    bzip2 liblzma-dev libbz2-dev software-properties-common libboost-iostreams-dev libboost-system-dev libboost-filesystem-dev \
+    zlibc gcc-multilib apt-utils zlib1g-dev cmake tcsh g++ iputils-ping
+
 
 RUN rm /bin/sh && ln -s /bin/bash /bin/sh
@@ -13,7 +15,7 @@ RUN echo "mysql-server mysql-server/root_password_again password lorean" | debco
 
 RUN apt-get install -y mysql-server mysql-client mysql-common bowtie bioperl apache2 libcairo2-dev libpango1.0-dev
 
-RUN pip3 install biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
+RUN pip3 install numpy biopython==1.68 bcbio-gff==0.6.4 pandas==0.19.1 pybedtools==0.7.8 gffutils regex pysam matplotlib progressbar2 \
     psutil memory_profiler pathlib colorama
 
 WORKDIR /opt/

diff --git a/code/collectOnly.py b/code/collectOnly.py
index 4e887511..d5afb9d3 100644
--- a/code/collectOnly.py
+++ b/code/collectOnly.py
@@ -2,7 +2,6 @@
 
 import os
 import sys
-
 from Bio import SeqIO
 
 count_sequences = 0
@@ -107,18 +106,35 @@ def cat_assembled(wd):
     """
     sys.stdout.write('\t###GENERATE FASTA FILE FROM CONTIGS###\n')
     wd_tmp = wd
-    fileName = wd_tmp + 'assembly.fasta'
+    fileName = wd_tmp + 'assembly.fasta_tmp1'
     testFasta = open(fileName, 'w')
+
     for root, dirs, files in os.walk(wd_tmp):
         for name in files:
             wd_fasta = os.path.join(root, name)
-            if 'assembled.fasta' in wd_fasta:
-                t_file = open(wd_fasta, 'r')
-                for line in t_file:
-                    testFasta.write(line)
-                t_file.close()
+            if wd_fasta.endswith('_assembled.fasta'):
+                input_file = open(wd_fasta)
+                fasta_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
+                evm = [key for key in fasta_dict if "evm" in key]
+                above = [key for key in fasta_dict if "above" in fasta_dict[key].description]
+                if len(fasta_dict) > 1:
+                    if len(evm) > 0 and len(above) > 0:
+                        if evm[0] in above:
+                            SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                        else:
+                            fasta_dict[above[0]].id = fasta_dict[evm[0]].id
+                            SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+                    elif len(evm) > 1:
+                        SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                    elif len(above) > 1:
+                        SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
+                elif len(evm) > 0:
+                    SeqIO.write(fasta_dict[evm[0]], testFasta, "fasta")
+                elif len(above) > 0:
+                    SeqIO.write(fasta_dict[above[0]], testFasta, "fasta")
+
-    testFasta.close()
     return fileName
@@ -145,16 +161,18 @@ def cat_assembled_all(wd):
     return fileName
 
 
-def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
+def add_EVM(gffread_fasta_file, tmp_assembly, merged_fasta_filename):
+
     """ this module
     looks for genes that were not used in the consensus stage. usually are gene models without long reads
     support
     """
     sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
     '''Adds the EVM records that are not present in the final contig evidence'''
-    whole_fasta = open(whole_fasta_name, 'r')
-    out_fasta_file = open(output_filename, 'r')
-    outputMerged = open(output_merged_fasta_name, 'w')
+    whole_fasta = open(gffread_fasta_file, 'r')
+    out_fasta_file = open(tmp_assembly, 'r')
+    outputMerged = open(merged_fasta_filename, 'w')
+
     wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
     count = 0
     dictOut = {}
@@ -180,3 +198,5 @@ def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
 
     outFasta.close()
     outputMerged.close()
+if __name__ == '__main__':
+    cat_assembled(*sys.argv[1:])
\ No newline at end of file

diff --git a/code/consensusIAssembler.py b/code/consensusIAssembler.py
index 9b627c79..8ff63e08 100644
--- a/code/consensusIAssembler.py
+++ b/code/consensusIAssembler.py
@@ -87,6 +87,7 @@ def cluster_pipeline(gff3_file, merge_distance, strand, verbose):
     if verbose:
         sys.stderr.write('Executing: %s\n\n' % btsort2)
     outputBT = btsort2_call.communicate()[0]
+
     final_output = outputBT.splitlines()
     return final_output
@@ -207,3 +208,6 @@ def iAssembler(new_commands):
         return False
     log.close()
     return outputDir
+
+if __name__ == '__main__':
+    cluster_pipeline(*sys.argv[1:])
\ No newline at end of file

diff --git a/code/createUser.py b/code/createUser.py
index b0e20e18..f8ab6e44 100644
--- a/code/createUser.py
+++ b/code/createUser.py
@@ -70,7 +70,7 @@ def create_user():
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()
 
-    com = "chmod -R 775 /opt/LoReAn"
+    com = "chmod -R 775 /home/%s" % (name_user)
     create_user_call = subprocess.Popen(com, stdout=log, stderr=err, shell=True)
     create_user_call.communicate()

diff --git a/code/getRightStrand.py b/code/getRightStrand.py
index d506c088..b824dc9e 100644
--- a/code/getRightStrand.py
+++ b/code/getRightStrand.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
+import gffutils
+import gffutils.gffwriter as gffwriter
+import progressbar
 import re
 import shutil
 import subprocess
@@ -7,13 +10,9 @@
 import tempfile
 import time
 import warnings
-from multiprocessing import Pool, Manager
-
-import gffutils
-import gffutils.gffwriter as gffwriter
-import progressbar
 from Bio import Seq
 from Bio import SeqIO
+from multiprocessing import Pool, Manager
 
 
 #======================================================================================================================
@@ -140,7 +139,6 @@ def appendID(gff):
 
 
 def longest(gff_file, fasta, proc, wd, verbose):
-    outputFilename = wd + 'finalAnnotation.strand.gff3'
     outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
     gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)

diff --git a/code/lorean.py b/code/lorean.py
index a32acc56..dd98ba93 100755
--- a/code/lorean.py
+++ b/code/lorean.py
@@ -173,7 +173,7 @@ def main():
             bam_file = short_sorted_bam.split("/")
             short_bam = star_out + "/" + bam_file[-1]
             if not os.path.exists(ref):
-                os.symlink(short_sorted_bam, short_bam)
+                os.link(short_sorted_bam, short_bam)
         else:
             short_sorted_bam = False

diff --git a/code/manipulateSeq.py b/code/manipulateSeq.py
index e0854eb0..b3d090df 100755
--- a/code/manipulateSeq.py
+++ b/code/manipulateSeq.py
@@ -4,11 +4,11 @@
 import os
 import subprocess
 import sys
+from Bio import SeqIO
+from Bio.Seq import reverse_complement
 from multiprocessing import Pool
 
 import ssw_lib
-from Bio import SeqIO
-from Bio.Seq import reverse_complement
 
 
 def to_int(seq, lEle, dEle2Int):
@@ -171,7 +171,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next
@@ -201,7 +201,7 @@ def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads
             for adpter in list_seq_adap:
                 list_command.append([record_dict[key], adpter])
         with Pool(processes=int(threads), maxtasksperchild=1000) as p:
-            align_resul = p.map(align_call, list_command)
+            align_resul = p.map(align_call, list_command, chunksize=1)
         for aling_res in align_resul:
             if len(aling_res) == 0:
                 next

diff --git a/code/update.py b/code/update.py
index 72b49bb8..9bb6e79e 100644
--- a/code/update.py
+++ b/code/update.py
@@ -12,6 +12,7 @@
 import collectOnly as collect
 import consensusIAssembler as consensus
 import dirsAndFiles as logistic
+import evmPipeline
 import getRightStrand as grs
 import manipulateSeq as mseq
 import mapping
@@ -61,7 +62,7 @@ def upgrade():
     ref_orig = os.path.abspath(args.reference)
     ref = os.path.join(wd, args.reference)
     if not os.path.exists(ref):
-        os.symlink(ref_orig, ref)
+        os.link(ref_orig, ref)
 
     max_threads = multiprocessing.cpu_count()
     if int(args.threads) > max_threads:
@@ -119,7 +120,7 @@ def upgrade():
             bam_file = short_sorted_bam.split("/")
             short_bam = star_out + "/" + bam_file[-1]
             if not os.path.exists(ref):
-                os.symlink(short_sorted_bam, short_bam)
+                os.link(short_sorted_bam, short_bam)
         else:
             short_sorted_bam = False
             sys.stdout.write('No short reads file')

From cac40df11cc77096ba57bf3e57d592084e557acc Mon Sep 17 00:00:00 2001
From: lfaino
Date: Sun, 25 Feb 2018 15:59:09 +0100
Subject: [PATCH 3/3] v1.1

---
 code/lorean.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/code/lorean.py b/code/lorean.py
index dd98ba93..461ff8f9 100755
--- a/code/lorean.py
+++ b/code/lorean.py
@@ -368,14 +368,16 @@ def main():
         final_files.append(evm_gff3)
         final_files.append(gff3_stat_file)
 
+    round_n = 1
+
     if not args.short_reads and not args.long_reads:
         last_gff3 = grs.newNames(evm_gff3)
         #score_gff3 = score.score(last_gff3, evm_inputs)
         now = datetime.datetime.now().strftime(fmtdate)
         sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")
 
-    round_n = 1
-    if args.short_reads and not args.long_reads:
+    else:
+        #if args.short_reads and not args.long_reads:
         now = datetime.datetime.now().strftime(fmtdate)
         sys.stdout.write(('\n###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
         round_n += 1
@@ -385,8 +387,8 @@ def main():
         updatedGff3 = grs.newNames(final_update)
         #score_gff3 = score.score(updatedGff3, evm_inputs)
         final_files.append(updatedGff3)
-    else:
-        updatedGff3 = evm_gff3
+    #else:
+        #updatedGff3 = evm_gff3
         #score_gff3 = score.score(evm_gff3, evm_inputs)
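
A note on the os.symlink -> os.link substitution that recurs in lorean.py and update.py above: both calls take the same (source, destination) arguments but create different kinds of links. The sketch below is illustrative only and is not part of the patches; the temporary directory and FASTA file names are invented for the example.

#!/usr/bin/env python3
# Illustrative sketch (not from the patch series): contrasts os.symlink with
# os.link, the substitution applied in lorean.py and update.py above.
# The temporary directory and file names are hypothetical.
import os
import tempfile

with tempfile.TemporaryDirectory() as wd:
    target = os.path.join(wd, "reference.fasta")
    with open(target, "w") as fh:
        fh.write(">seq1\nACGT\n")

    # A symbolic link is a separate path that refers to the target by name;
    # it dangles as soon as the target is renamed or deleted.
    sym = os.path.join(wd, "reference.symlink.fasta")
    os.symlink(target, sym)

    # A hard link is a second directory entry for the same inode, so the data
    # stays reachable even after the original path is removed. Source and
    # destination must be on the same filesystem.
    hard = os.path.join(wd, "reference.hardlink.fasta")
    os.link(target, hard)

    os.remove(target)
    print(os.path.exists(sym))   # False: the symlink now points at nothing
    print(os.path.exists(hard))  # True: the content is still reachable

If the working directory and the input files end up on different filesystems (for example across a bind-mounted volume in the Docker image built above), os.link raises OSError (EXDEV), in which case a symlink or a plain copy would be the fallback.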