Commit v4.2.1
Change MLST QA/QC:
 * Samples with a known MLST scheme for which the scheme could not be determined will now raise a warning instead of failing

Save the total number of reads and bp sequenced

Update mlst version
 * Change the base image of INNUca's Docker image (to perl:5.30, a Debian 9 image) for Perl and older kernel version compatibility
 * Install and update mlst dependencies
 * Install a headless Java JDK but provide extra font-loading capabilities (mainly for FastQC, but also Trimmomatic and Pilon)

Minor changes:
 * Check if the mlst novel alleles file exists before trying to clean it
 * Catch the subprocess error raised when the program to run is not installed
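The last bullet can be sketched as follows (a minimal illustration only, not INNUca's actual `utils.runCommandPopenCommunicate` implementation; the function name and return shape here are assumptions):

```python
import subprocess


def run_external_tool(command):
    """Run an external program and return (success, stdout, stderr).

    Hypothetical sketch: if the program is not installed, Popen raises
    OSError (FileNotFoundError on Python 3), which is caught and reported
    instead of crashing the whole pipeline.
    """
    try:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        return proc.returncode == 0, stdout.decode(), stderr.decode()
    except OSError as error:
        # Program not found (or not executable): report instead of raising
        return False, '', 'Could not run {0}: {1}'.format(command[0], error)
```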
miguelpmachado authored Nov 27, 2019
1 parent b2608a8 commit cc2b6f5
Showing 8 changed files with 175 additions and 104 deletions.
33 changes: 20 additions & 13 deletions Docker/Dockerfile
@@ -1,16 +1,22 @@
FROM ubuntu:16.04
FROM perl:5.30-slim-stretch
MAINTAINER Miguel Machado <[email protected]>
LABEL version="4.2.0-03"
LABEL version="4.2.1-01"

WORKDIR /NGStools/

RUN apt-get update

# -- General Dependencies ---
RUN apt-get install -y git wget make g++
RUN apt-get install -y git wget g++

# -- INNUca General Dependencies ---
RUN apt-get install -y python-dev python-pip default-jre python3 python3-pip
RUN apt-get install -y python-dev python-pip python3 python3-pip procps libfontconfig1
# - Java -
RUN wget https://download.java.net/openjdk/jdk8u40/ri/openjdk-8u40-b25-linux-x64-10_feb_2015.tar.gz && \
tar xf openjdk-8u40-b25-linux-x64-10_feb_2015.tar.gz && \
rm openjdk-8u40-b25-linux-x64-10_feb_2015.tar.gz
ENV PATH="/NGStools/java-se-8u40-ri/bin:${PATH}"

RUN pip install plotly

# --- kraken2 --
@@ -27,12 +33,15 @@ ENV PATH="/NGStools/kraken2-2.0.7-beta:${PATH}"

# -- mlst Dependencies --
# Blast
RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.8.1/ncbi-blast-2.8.1+-x64-linux.tar.gz && \
tar -xf ncbi-blast-2.8.1+-x64-linux.tar.gz && \
rm ncbi-blast-2.8.1+-x64-linux.tar.gz
ENV PATH="/NGStools/ncbi-blast-2.8.1+/bin:${PATH}"
RUN wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.9.0/ncbi-blast-2.9.0+-x64-linux.tar.gz && \
tar -xf ncbi-blast-2.9.0+-x64-linux.tar.gz && \
rm ncbi-blast-2.9.0+-x64-linux.tar.gz
# any2fasta
RUN git clone https://github.com/tseemann/any2fasta.git
# Perl libs
RUN apt-get install -y libmoo-perl liblist-moreutils-perl libjson-perl libfile-slurp-perl
RUN cpan Moo List::MoreUtils JSON

ENV PATH="/NGStools/ncbi-blast-2.9.0+/bin:/NGStools/any2fasta:${PATH}"

# --- mlst ----
RUN git clone https://github.com/tseemann/mlst.git
@@ -45,22 +54,20 @@ ENV PATH="/NGStools/mlst/bin:${PATH}"
# pip3 install setuptools && \
# python3 setup.py install && \
# cd /NGStools
RUN git clone https://github.com/B-UMMI/ReMatCh.git && \
cd ReMatCh
RUN git clone https://github.com/B-UMMI/ReMatCh.git
ENV PATH="/NGStools/ReMatCh/ReMatCh/src/samtools-1.3.1/bin:/NGStools/ReMatCh/ReMatCh/src/bcftools-1.3.1/bin:/NGStools/ReMatCh/ReMatCh/src/bowtie2-2.2.9:/NGStools/ReMatCh/ReMatCh:${PATH}"

# --- INNUca ---
RUN git clone https://github.com/B-UMMI/INNUca.git && \
pip install setuptools
# TODO: remove checkout
ENV PATH="/NGStools/INNUca/src/fastqc_v0.11.5:/NGStools/INNUca/src/pilon_v1.23:/NGStools/INNUca/src/SPAdes-3.13.0-Linux/bin:/NGStools/INNUca/src/Trimmomatic-0.38:/NGStools/INNUca:${PATH}"

# fixing permissions for MLST update
RUN chmod +x /NGStools/INNUca/Docker/update_mlst_db.sh && chmod o+wr /NGStools/mlst/scripts/ && chmod -R o+wr /NGStools/mlst/db/ && sed -i "s#OUTDIR=pubmlst#OUTDIR=/NGStools/mlst/scripts/pubmlst#1" /NGStools/mlst/scripts/mlst-download_pub_mlst

# Clean
RUN pip3 uninstall setuptools && \
apt-get remove -y make g++ python-pip python3-pip && \
apt-get remove -y g++ python-pip python3-pip && \
apt-get autoclean -y

WORKDIR /data/
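As a quick post-build sanity check, the tools installed by the Dockerfile above can be verified from Python inside the image (a sketch; the tool list mirrors the Dockerfile and is an assumption, not part of INNUca):

```python
import shutil


def missing_tools(tools):
    """Return the subset of expected command-line tools not found on PATH."""
    return [tool for tool in tools if shutil.which(tool) is None]


# Tool names taken from the Dockerfile above (assumed spellings):
EXPECTED = ('blastn', 'any2fasta', 'mlst', 'perl', 'java', 'kraken2')
```

Inside the built image, `missing_tools(EXPECTED)` should return an empty list; any names it returns point at an installation step that did not land on `PATH`.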
29 changes: 15 additions & 14 deletions Docker/README.md
@@ -10,15 +10,16 @@ INNUca - Reads Control and Assembly
This is a dockerfile for using INNUca, with all dependencies already installed.

Within this container you can find:
- ubuntu:16.04
- git v2.7.4
- Debian Stretch (9)
- Perl v5.30
- git v2.11.0
- Python v2.7
- Java-JRE v1.8.0_171
- [Blast+](https://blast.ncbi.nlm.nih.gov/Blast.cgi) v2.6.0
- [mlst](https://github.com/tseemann/mlst) v2.15.1
- [ReMatCh](https://github.com/B-UMMI/ReMatCh) v4.0
- Java-JDK v1.8.0_40 headless
- [Blast+](https://blast.ncbi.nlm.nih.gov/Blast.cgi) v2.9.0
- [mlst](https://github.com/tseemann/mlst) v2.18.0
- [ReMatCh](https://github.com/B-UMMI/ReMatCh) v4.1.0
- [Kraken](https://ccb.jhu.edu/software/kraken/) v2.0.7
- [INNUca](https://github.com/B-UMMI/INNUca) v4.1.0
- [INNUca](https://github.com/B-UMMI/INNUca) v4.2.1



@@ -29,37 +30,37 @@ Within [play-with-docker](http://labs.play-with-docker.com/) webpage click on **
will open with a big counter on the upper left corner. Click on **+ add new instance** and a terminal like instance should be generated on the right. On
this terminal you can load this docker image as follows:

`docker pull ummidock/innuca:4.2.0-01`
`docker pull ummidock/innuca:4.2.1-01`

#### Build this docker on your local machine

For this, docker needs to be installed on your machine. Instructions for this can be found [here](https://docs.docker.com/engine/installation/).

##### Using DockerHub (automated build image)

`docker pull ummidock/innuca:4.2.0-01`
`docker pull ummidock/innuca:4.2.1-01`

##### Using GitHub (build docker image)

1) `git clone https://github.com/B-UMMI/INNUca.git`
2) `docker build -t ummidock/innuca:4.2.0-01 ./INNUca/Docker/`
2) `docker build -t ummidock/innuca:4.2.1-01 ./INNUca/Docker/`

### Run (using automated build image)
docker run --rm -u $(id -u):$(id -g) -it -v /local/folder/fastq_data:/data/ ummidock/innuca:4.2.0-01 INNUca.py --speciesExpected "Streptococcus agalactiae" --genomeSizeExpectedMb 2.1 --inputDirectory /data/ --outdir /data/innuca_output/ --threads 8 --maxNumberContigs 100
docker run --rm -u $(id -u):$(id -g) -it -v /local/folder/fastq_data:/data/ ummidock/innuca:4.2.1-01 INNUca.py --speciesExpected "Streptococcus agalactiae" --genomeSizeExpectedMb 2.1 --inputDirectory /data/ --outdir /data/innuca_output/ --threads 8 --maxNumberContigs 100

### udocker

> "A basic user tool to execute simple docker containers in user space without requiring root privileges.". From [here](https://github.com/indigo-dc/udocker).
```bash
# Get Docker image
udocker pull ummidock/innuca:4.2.0-01
udocker pull ummidock/innuca:4.2.1-01

# Create container (only needed to be done once)
udocker create --name=innuca_4-2-0_01 ummidock/innuca:4.2.0-01
udocker create --name=innuca_4-2-1_01 ummidock/innuca:4.2.1-01

# Run INNUca
udocker run --user $(id -u):$(id -g) -v /local/folder/fastq_data:/data/ innuca_4-2-0_01 INNUca.py --speciesExpected "Streptococcus agalactiae" --genomeSizeExpectedMb 2.1 --inputDirectory /data/ --outdir /data/innuca_output/ --threads 8 --maxNumberContigs 100
udocker run --user $(id -u):$(id -g) -v /local/folder/fastq_data:/data/ innuca_4-2-1_01 INNUca.py --speciesExpected "Streptococcus agalactiae" --genomeSizeExpectedMb 2.1 --inputDirectory /data/ --outdir /data/innuca_output/ --threads 8 --maxNumberContigs 100
```
More examples on how to use **udocker** can be found on the **udocker** [GitHub page](https://github.com/indigo-dc/udocker)

4 changes: 2 additions & 2 deletions INNUca.py
@@ -11,7 +11,7 @@
Copyright (C) 2018 Miguel Machado <[email protected]>
Last modified: December 28, 2018
Last modified: November 25, 2019
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -91,7 +91,7 @@ def include_rematch_dependencies_path(do_not_use_provided_software):


def main():
version = '4.2.0'
version = '4.2.1'
args = utils.parseArguments(version)

general_start_time = time.time()
9 changes: 6 additions & 3 deletions modules/combine_reports.py
@@ -59,6 +59,7 @@ def combine_reports(innucaOut, outdir, json, time_str, number_samples):
sys.exit('No samples found')

fields = ['#samples',
'number_reads_sequenced', 'number_bp_sequenced',
'min_reads_length', 'max_reads_length',
'reads_kraken_number_taxon_found', 'reads_kraken_percentage_unknown_fragments',
'reads_kraken_most_abundant_taxon', 'reads_kraken_percentage_most_abundant_taxon',
@@ -94,14 +95,16 @@ def combine_reports(innucaOut, outdir, json, time_str, number_samples):
name_file_found = file_found
file_found = os.path.join(directory, file_found)

if name_file_found == 'reads_length_report.tab':
if name_file_found == 'reads_num_length_num_bp_report.tab':
with open(file_found, 'rt') as reader:
for line in reader:
if len(line) > 0:
if not line.startswith('#'):
line = line.splitlines()[0].split('\t')
results[sample]['min_reads_length'] = line[0]
results[sample]['max_reads_length'] = line[1]
results[sample]['number_reads_sequenced'] = line[0]
results[sample]['number_bp_sequenced'] = line[1]
results[sample]['min_reads_length'] = line[2]
results[sample]['max_reads_length'] = line[3]
elif name_file_found.startswith('kraken_results.evaluation.') and name_file_found.endswith('fastq.tab'):
with open(file_found, 'rt') as reader:
for line in reader:
42 changes: 28 additions & 14 deletions modules/fastQintegrity.py
@@ -13,20 +13,21 @@ def fastQintegrity(fastq, outdir):

compression_type = utils.compressionType(fastq)

encoding, min_reads_length, max_reads_length = None, None, None
encoding, min_reads_length, max_reads_length, num_reads, num_bp = None, None, None, None, None

if compression_type is not None:
command = [compression_type[1], '--stdout', '--keep', fastq, '>', temporary_output_file]
run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False)

if run_successfully:
encoding, min_reads_length, max_reads_length = run_guess_encoding_single_thread(temporary_output_file, None,
outdir)
encoding, min_reads_length, max_reads_length, num_reads, num_bp = \
run_guess_encoding_single_thread(temporary_output_file, None, outdir)

if os.path.isfile(temporary_output_file):
os.remove(temporary_output_file)

utils.saveVariableToPickle([run_successfully, encoding, min_reads_length, max_reads_length], outdir, os.path.basename(fastq))
utils.saveVariableToPickle([run_successfully, encoding, min_reads_length, max_reads_length, num_reads, num_bp],
outdir, os.path.basename(fastq))


def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, outdir):
@@ -39,17 +40,22 @@ def run_guess_encoding_single_thread(fastq_file, number_reads_access_None_all, o
final_enconding = guess_encoding.get_final_encoding(encoding_data)

min_reads_length, max_reads_length, _, _ = guess_encoding.determine_min_max_reads_length(encoding_data)
num_reads, num_bp = guess_encoding.get_num_reads_bp(encoding_data)

utils.removeDirectory(outdir_guess_encoding)
return final_enconding, min_reads_length, max_reads_length
return final_enconding, min_reads_length, max_reads_length, num_reads, num_bp


def report_reads_length(min_reads_length_each_fastq, max_reads_length_each_fastq, outdir):
def report_reads_num_length_num_bp(num_reads, num_bp, min_reads_length_each_fastq, max_reads_length_each_fastq, outdir):
"""
Writes reads length report
Writes the total number of reads and bp sequenced and min and max reads length
Parameters
----------
num_reads : int
Total number of reads sequenced
num_bp : int
Total number of bp sequenced
min_reads_length_each_fastq : list
Minimum reads length found for each fastq file
max_reads_length_each_fastq : list
@@ -62,9 +68,10 @@ def report_reads_length(min_reads_length_each_fastq, max_reads_length_each_fastq
"""

with open(os.path.join(outdir, 'reads_length_report.tab'), 'wt') as writer:
writer.write('#' + '\t'.join(['min', 'max']) + '\n')
writer.write('\t'.join([';'.join(map(str, set(min_reads_length_each_fastq))),
with open(os.path.join(outdir, 'reads_num_length_num_bp_report.tab'), 'wt') as writer:
writer.write('#' + '\t'.join(['num_reads', 'num_bp', 'min_reads_length', 'max_reads_length']) + '\n')
writer.write('\t'.join(list(map(str, [num_reads, num_bp])) +
[';'.join(map(str, set(min_reads_length_each_fastq))),
';'.join(map(str, set(max_reads_length_each_fastq)))]) + '\n')


@@ -89,12 +96,18 @@ def runFastQintegrity(fastq_files, threads, outdir):
pool.join()

encoding = {}
num_reads, num_bp = 0, 0
files = [f for f in os.listdir(fastQintegrity_folder) if not f.startswith('.') and os.path.isfile(os.path.join(fastQintegrity_folder, f))]
for file_found in files:
if file_found.endswith('.pkl'):
file_run_successfully, file_encoding, min_reads_length, max_reads_length = utils.extractVariableFromPickle(os.path.join(fastQintegrity_folder, file_found))
file_run_successfully, file_encoding, min_reads_length, max_reads_length, num_reads_fastq, num_bp_fastq = \
utils.extractVariableFromPickle(os.path.join(fastQintegrity_folder, file_found))
if file_run_successfully:
encoding[file_found] = {'file_encoding': file_encoding, 'min_reads_length': min_reads_length, 'max_reads_length': max_reads_length}
encoding[file_found] = {'file_encoding': file_encoding,
'min_reads_length': min_reads_length,
'max_reads_length': max_reads_length}
num_reads += num_reads_fastq if num_reads_fastq is not None else 0
num_bp += num_bp_fastq if num_bp_fastq is not None else 0
else:
failing[os.path.splitext(file_found)[0]] = ['The file is possibly corrupt']
print(os.path.splitext(file_found)[0] + ': the file is possibly corrupt')
@@ -105,15 +118,16 @@ def runFastQintegrity(fastq_files, threads, outdir):
not_corruption_found = False
pass_qc = False

min_reads_length_found, max_reads_length_found = None, None
min_reads_length_found, max_reads_length_found, num_reads, num_bp = None, None, None, None

if len(encoding) == 0:
encoding = None
print('It was no possible to determine the FASTQ encodings')
else:
min_reads_length_found, max_reads_length_found, min_reads_length_each_fastq, max_reads_length_each_fastq = \
guess_encoding.determine_min_max_reads_length(encoding)
report_reads_length(min_reads_length_each_fastq, max_reads_length_each_fastq, outdir)
report_reads_num_length_num_bp(num_reads, num_bp, min_reads_length_each_fastq, max_reads_length_each_fastq,
outdir)

if len(set([x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None])) == 1:
encoding = [x['file_encoding'][0] for x in encoding.values() if x['file_encoding'] is not None][0]
50 changes: 45 additions & 5 deletions modules/guess_encoding.py
@@ -68,7 +68,8 @@ def guess_encoding(fastq, number_reads_access_None_all, outdir):

utils.saveVariableToPickle([fastq, valid_encodings,
min(reads_length) if len(reads_length) > 0 else None,
max(reads_length) if len(reads_length) > 0 else None],
max(reads_length) if len(reads_length) > 0 else None,
len(reads_length), sum(reads_length)],
outdir, 'encoding' + '.' + os.path.splitext(os.path.basename(fastq))[0])
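The `reads_length` list saved above (its min, max, count, and sum) can be gathered from a FASTQ stream roughly as follows (hypothetical helper for illustration; the real `guess_encoding` also inspects quality lines):

```python
def fastq_read_lengths(lines):
    """Collect the length of every read in a 4-line-per-record FASTQ stream."""
    return [len(line.strip()) for index, line in enumerate(lines)
            if index % 4 == 1]  # the sequence line of each record
```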


@@ -80,8 +81,13 @@ def gather_data_together(data_directory):
if file_found.startswith('encoding.') and file_found.endswith('.pkl'):
file_path = os.path.join(data_directory, file_found)

fastq, valid_encodings, min_reads_length, max_reads_length = utils.extractVariableFromPickle(file_path)
data[fastq] = {'valid_encodings': valid_encodings, 'min_reads_length': min_reads_length, 'max_reads_length': max_reads_length}
fastq, valid_encodings, min_reads_length, max_reads_length, num_reads, num_bp = \
utils.extractVariableFromPickle(file_path)
data[fastq] = {'valid_encodings': valid_encodings,
'min_reads_length': min_reads_length,
'max_reads_length': max_reads_length,
'num_reads': num_reads,
'num_bp': num_bp}

os.remove(file_path)

@@ -121,7 +127,9 @@ def determine_min_max_reads_length(encoding_data):
Dictionary with encondig data, and reads length for each fastq. Something like
data[fastq] = {'valid_encodings': valid_encodings,
'min_reads_length': min_reads_length,
'max_reads_length': max_reads_length}
'max_reads_length': max_reads_length,
'num_reads': num_reads,
'num_bp': num_bp}
Returns
-------
@@ -146,6 +154,36 @@
max_length_each_fastq


def get_num_reads_bp(encoding_data):
"""
Returns the total number of reads and bp sequenced
Parameters
----------
encoding_data : dict
Dictionary with encondig data, and reads length for each fastq. Something like
data[fastq] = {'valid_encodings': valid_encodings,
'min_reads_length': min_reads_length,
'max_reads_length': max_reads_length,
'num_reads': num_reads,
'num_bp': num_bp}
Returns
-------
num_reads : int
Total number of reads sequenced
num_bp : int
Total number of bp sequenced
"""
num_reads = [encoding_data[fastq]['num_reads'] for fastq in encoding_data if
encoding_data[fastq]['num_reads'] is not None]

num_bp = [encoding_data[fastq]['num_bp'] for fastq in encoding_data if
encoding_data[fastq]['num_bp'] is not None]

return sum(num_reads) if len(num_reads) > 0 else None, sum(num_bp) if len(num_bp) > 0 else None
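The aggregation in `get_num_reads_bp` can be exercised on a toy `encoding_data` dict shaped like the docstring above (standalone sketch with a hypothetical name, mirroring the None-skipping logic in the diff):

```python
def total_reads_bp(encoding_data):
    """Sum per-fastq read and bp counts, skipping entries that are None."""
    num_reads = [entry['num_reads'] for entry in encoding_data.values()
                 if entry['num_reads'] is not None]
    num_bp = [entry['num_bp'] for entry in encoding_data.values()
              if entry['num_bp'] is not None]
    # An empty list (no usable counts) yields None, as in the diff
    return (sum(num_reads) if len(num_reads) > 0 else None,
            sum(num_bp) if len(num_bp) > 0 else None)
```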


def fastq_files_enconding(fastq_files_list, number_reads_access_None_all, outdir, threads):
pool = multiprocessing.Pool(processes=threads)
for fastq in fastq_files_list:
@@ -159,4 +197,6 @@ def fastq_files_enconding(fastq_files_list, number_reads_access_None_all, outdir

min_reads_length, max_reads_length, _, _ = determine_min_max_reads_length(encoding_data)

return final_encoding, min_reads_length, max_reads_length
num_reads, num_bp = get_num_reads_bp(encoding_data)

return final_encoding, min_reads_length, max_reads_length, num_reads, num_bp
