diff --git a/INNUca.py b/INNUca.py index e4574ea..b5295d4 100755 --- a/INNUca.py +++ b/INNUca.py @@ -9,7 +9,7 @@ Copyright (C) 2016 Miguel Machado -Last modified: August 31, 2016 +Last modified: September 30, 2016 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,13 +33,14 @@ import modules.spades as spades import modules.pilon as pilon import modules.mlst as mlst +import modules.assembly_mapping as assembly_mapping import time import os import sys def main(): - version = '1.7' + version = '1.8' args = utils.parseArguments(version) general_start_time = time.time() @@ -73,11 +74,8 @@ def main(): print '\n' + 'VERSION INNUca.py:' utils.scriptVersionGit(version, os.getcwd(), script_path) - # Save CPU information - with open(os.path.join(outdir, 'cpu_information.' + time_str + '.cpu.txt'), 'wt') as reader: - command = ['cat', '/proc/cpuinfo'] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) - reader.write(stdout) + # Get CPU information + utils.get_cpu_information(outdir, time_str) # Set and print PATH variable utils.setPATHvariable(args.doNotUseProvidedSoftware, script_path) @@ -106,8 +104,13 @@ def main(): sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms)) # .jar paths - jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3] - jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3] + jar_path_trimmomatic = None + if not args.skipTrimmomatic: + jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3] + + jar_path_pilon = None + if not args.skipPilon and not args.skipSPAdes: + jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3] # Check if input directory exists with fastq files and store samples name that have fastq files inputDirectory = os.path.abspath(os.path.join(args.inputDirectory, '')) @@ -133,13 +136,14 @@ def main(): # Determine SPAdes maximum memory spadesMaxMemory = None if not args.skipSPAdes: + print '' spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads) # Run comparisons for each sample for sample in samples: sample_start_time = time.time() - print '\n' + 'Sample: ' + sample + print '\n' + 'Sample: ' + sample + '\n' # Create sample outdir sample_outdir = os.path.abspath(os.path.join(outdir, sample, '')) @@ -154,7 +158,7 @@ def main(): continue print 'The following files will be used:' - print str(fastq_files) + print str(fastq_files) + '\n' # Run INNUca.py analysis run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon) @@ -218,7 +222,7 @@ def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spade if os.path.isfile(report_file): os.remove(report_file) # Run getEstimatedCoverage - runs['first_Coverage'] = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir) + runs['first_Coverage'] = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads) else: print '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis' runs['first_Coverage'] = skipped @@ -242,14 +246,15 @@ def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spade # Run second Estimated Coverage if not args.skipEstimatedCoverage: - runs['second_Coverage'] = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir) + runs['second_Coverage'] = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads) else: print '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis' runs['second_Coverage'] = skipped # Run second FastQC if not args.skipFastQC: - runs['second_FastQC'] = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files)[0:4] + run_successfully, pass_qc, time_taken, failing, maximumReadsLength, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files) + runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing] else: print '--skipFastQC set. Skipping Second FastQC analysis' runs['second_FastQC'] = skipped @@ -266,20 +271,47 @@ def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spade # Run SPAdes if not args.skipSPAdes: - run_successfully, pass_qc, time_taken, failing, contigs = spades.runSpades(sampleName, outdir, threads, fastq_files, args.spadesNotUseCareful, spadesMaxMemory, args.spadesMinCoverageAssembly, args.spadesMinContigsLength, genomeSize, args.spadesKmers, maximumReadsLength, args.spadesDefaultKmers, args.spadesMinCoverageContigs) + run_successfully, pass_qc, time_taken, failing, contigs_spades = spades.runSpades(sampleName, outdir, threads, fastq_files, args.spadesNotUseCareful, spadesMaxMemory, args.spadesMinCoverageAssembly, args.spadesMinContigsLength, genomeSize, args.spadesKmers, maximumReadsLength, args.spadesDefaultKmers, args.spadesMinCoverageContigs) runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing] if run_successfully: # Run Pilon + contigs = contigs_spades + if not args.skipPilon: - run_successfully, _, time_taken, failing, assembly_polished = pilon.runPilon(jar_path_pilon, contigs, fastq_files, threads, outdir, args.pilonKeepFiles, args.pilonKeepSPAdesAssembly) + run_successfully, _, time_taken, failing, assembly_polished, bam_file, pilon_folder = pilon.runPilon(jar_path_pilon, contigs_spades, fastq_files, threads, outdir) runs['Pilon'] = [run_successfully, None, time_taken, failing] if run_successfully: contigs = assembly_polished + + # Run Assembly Mapping check + if bam_file is not None: + if not args.skipAssemblyMapping: + run_successfully, pass_qc, time_taken, failing, assembly_filtered = assembly_mapping.runAssemblyMapping(bam_file, contigs_spades, threads, outdir, args.assemblyMinCoverageContigs, assembly_polished, args.assemblyKeepPilonContigs) + runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing] + + if run_successfully: + contigs = assembly_filtered + else: + print '--skipAssemblyMapping set. Skipping Assembly Mapping check' + runs['Assembly_Mapping'] = skipped + else: + print 'Pilon did not produce the bam file! Assembly Mapping check' + runs['Assembly_Mapping'] = skipped + + if assembly_polished is not None and not args.pilonKeepSPAdesAssembly: + os.remove(contigs_spades) + + if not args.pilonKeepFiles: + utils.removeDirectory(pilon_folder) + else: - print '--skipPilon set. Skipping Pilon correction' + print '--skipPilon set. Skipping Pilon correction and Assembly Mapping check' runs['Pilon'] = skipped + runs['Assembly_Mapping'] = skipped + + print '\n' + 'Assembly to use: ' + contigs + '\n' # Run MLST if not args.skipMLST: @@ -288,18 +320,20 @@ def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spade print '--skipMLST set. Skipping MLST analysis' runs['MLST'] = skipped else: - print 'SPAdes did not run successfully! Skipping Pilon correction and MLST analysis' + print 'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis' runs['Pilon'] = skipped + runs['Assembly_Mapping'] = skipped runs['MLST'] = skipped else: - print '--skipSPAdes set. Skipping SPAdes Pilon correction and MLST analysis' + print '--skipSPAdes set. Skipping SPAdes Pilon correction, Assembly Mapping check and MLST analysis' runs['SPAdes'] = skipped runs['Pilon'] = skipped + runs['Assembly_Mapping'] = skipped runs['MLST'] = skipped else: print 'Moving to the next sample' - for step in ('first_Coverage', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'SPAdes', 'Pilon', 'MLST'): + for step in ('first_Coverage', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'SPAdes', 'Pilon', 'Assembly_Mapping', 'MLST'): runs[step] = not_run # Remove Trimmomatic directory with cleaned reads @@ -317,8 +351,9 @@ def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spade pass_fastqc = runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1]) pass_trimmomatic = runs['Trimmomatic'][1] is not False pass_spades = runs['SPAdes'][1] is not False + pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False pass_mlst = runs['MLST'][1] is not False - pass_qc = all([pass_fastqIntegrity, pass_cov, pass_fastqc, pass_trimmomatic, pass_spades, pass_mlst]) + pass_qc = all([pass_fastqIntegrity, pass_cov, pass_fastqc, pass_trimmomatic, pass_spades, pass_assemblyMapping, pass_mlst]) return run_successfully, pass_qc, runs diff --git a/README.md b/README.md index 48a0631..b7c051e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ INNUca - Reads Control and Assembly Requirements ------------ - - Illumina paired-end reads + - Illumina paired-end reads (gzip compressed) - Expected species name - Expected genome size in Mb @@ -48,7 +48,7 @@ Usage [-o /output/directory/] [-j N] [--doNotUseProvidedSoftware] [--pairEnd_filesSeparation "_left/rigth.fq.gz" "_left/rigth.fq.gz"] [--skipEstimatedCoverage] [--skipFastQC] [--skipTrimmomatic] - [--skipSPAdes] [--skipPilon] [--skipMLST] + [--skipSPAdes] [--skipPilon] [--skipAssemblyMapping] [--skipMLST] [--adapters adaptersFile.fasta | --doNotSearchAdapters] [--doNotTrimCrops | [[--trimCrop N] [--trimHeadCrop N]]] [--trimSlidingWindow window:meanQuality] [--trimMinLength N] @@ -98,9 +98,12 @@ Usage --skipTrimmomatic Tells the programme to not run Trimmomatic (default: False) --skipSPAdes Tells the programme to not run SPAdes and consequently - MLST analysis (requires SPAdes contigs) (default: - False) - --skipPilon Tells the programme to not run Pilon correction + Pilon correction, Assembly Mapping check and MLST analysis + (SPAdes contigs required) (default: False) + --skipPilon Tells the programme to not run Pilon correction and + consequently Assembly Mapping check (bam files required) + (default: False) + --skipAssemblyMapping Tells the programme to not run Assembly Mapping check') (default: False) --skipMLST Tells the programme to not run MLST analysis (default: False) @@ -141,13 +144,14 @@ Usage --careful option (default: False) --spadesMinContigsLength N Filter SPAdes contigs for length greater or equal than - this value (default: 200) - --spadesMaxMemory N The maximum amount of RAM Gb for SPAdes to use - (default: 25) + this value (default: maximum reads size or 200 bp) + --spadesMaxMemory N The maximum amount of RAM Gb for SPAdes to use (default: + free memory available at the beginning of INNUca) --spadesMinCoverageAssembly 10 The minimum number of reads to consider an edge in the de Bruijn graph (or path I am not sure). Can also be - auto or off (default: 'off') + + auto or off (default: 'off') --spadesMinCoverageContigs N Minimum contigs coverage. After assembly only keep contigs with reported coverage equal or above this @@ -166,6 +170,15 @@ Usage Tells INNUca.py to not remove the unpolished SPAdes assembly (default: False) + Assembly Mapping options: + --assemblyMinCoverageContigs + Minimum contigs average coverage. After mapping reads + back to the contigs, only keep contigs with at least + this average coverage (default: 2) + --assemblyKeepPilonContigs + Tells INNUca.py to not remove the polished Pilon contigs + (default: False) + Combine INNUca reports diff --git a/modules/assembly_mapping.py b/modules/assembly_mapping.py new file mode 100644 index 0000000..37142fe --- /dev/null +++ b/modules/assembly_mapping.py @@ -0,0 +1,322 @@ +import utils +import os +from functools import partial +import multiprocessing + + +def sequenceHeaders(fastaFile): + headers = [] + + with open(fastaFile, 'rtU') as lines: + for line in lines: + line = line.splitlines()[0] + if len(line) > 0: + if line[0] == '>': + headers.append(line[1:]) + + if len(headers) == 0: + headers = None + + return headers + + +# Get genome coverage data +def compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter): + genome_coverage_data_file = os.path.join(outdir, 'samtools_depth.sequence_' + str(counter) + '.tab') + command = ['samtools', 'depth', '-a', '-r', sequence_to_analyse, alignment_file, '>', genome_coverage_data_file] + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) + return run_successfully, genome_coverage_data_file + + +def calculate_genome_coverage(genome_coverage_data_file): + sequence = None + position = 0 + coverage = 0 + problems_found = False + + if os.path.getsize(genome_coverage_data_file) > 0: + with open(genome_coverage_data_file, 'rtU') as reader: + for line in reader: + line = line.splitlines()[0] + if len(line) > 0: + line = line.split('\t') + if position == 0 and sequence is None: + sequence = line[0] + else: + if sequence != line[0]: + print 'WARNING: different sequences found in the same file' + problems_found = True + break + if int(line[1]) == position + 1: + coverage += int(line[2]) + else: + print 'WARNING: missing some positions!' + problems_found = True + break + position += 1 + + return problems_found, position, coverage + + +def get_sequence_coverage(alignment_file, sequence_to_analyse, outdir, counter): + problems_found = False + position = 0 + coverage = 0 + + run_successfully, genome_coverage_data_file = compute_genome_coverage_data(alignment_file, sequence_to_analyse, outdir, counter) + if run_successfully: + problems_found, position, coverage = calculate_genome_coverage(genome_coverage_data_file) + if not problems_found and position == 0: + # Assuming SPAdes headers renamed (with sample name at the beginning) + position = int(sequence_to_analyse.rsplit('_', 6)[4]) + else: + problems_found = True + + try: + os.remove(genome_coverage_data_file) + except Exception as e: + print e + + utils.saveVariableToPickle([sequence_to_analyse, run_successfully, problems_found, position, coverage], outdir, str('coverage.sequence_' + str(counter))) + + +def sample_coverage(referenceFile, alignment_file, outdir, threads): + coverage_outdir = os.path.join(outdir, 'samtools_depth', '') + utils.removeDirectory(coverage_outdir) + os.makedirs(coverage_outdir) + + sequences = sequenceHeaders(referenceFile) + + pool = multiprocessing.Pool(processes=threads) + counter = 0 + for sequence in sequences: + pool.apply_async(get_sequence_coverage, args=(alignment_file, sequence, coverage_outdir, counter,)) + counter += 1 + pool.close() + pool.join() + + sample_coverage_no_problems = True + mean_coverage_data = {} + files = [f for f in os.listdir(coverage_outdir) if not f.startswith('.') and os.path.isfile(os.path.join(coverage_outdir, f))] + for file_found in files: + if file_found.startswith('coverage.sequence_') and file_found.endswith('.pkl'): + file_path = os.path.join(coverage_outdir, file_found) + + if sample_coverage_no_problems: + sequence_to_analyse, run_successfully, problems_found, position, coverage = utils.extractVariableFromPickle(file_path) + if run_successfully and not problems_found: + mean_coverage_data[sequence_to_analyse] = {'position': position, 'coverage': coverage, 'mean_coverage': round((float(coverage) / float(position)), 2)} + else: + print 'WARNING: it was not possible to compute coverage information for sequence ' + sequence_to_analyse + sample_coverage_no_problems = False + + os.remove(file_path) + + return sample_coverage_no_problems, mean_coverage_data + + +def save_assembly_coverage_report(mean_coverage_data, outdir, minCoverageAssembly): + pass_qc = False + failing_reason = None + + report_file = os.path.join(outdir, 'assembly_coverage_report.txt') + + position = 0 + coverage = 0 + sequences_2_keep = [] + with open(report_file, 'wt') as writer: + writer.write('#by_contigs' + '\n') + for sequence in sorted(mean_coverage_data): + position += mean_coverage_data[sequence]['position'] + coverage += mean_coverage_data[sequence]['coverage'] + + if mean_coverage_data[sequence]['mean_coverage'] >= minCoverageAssembly: + sequences_2_keep.append(sequence) + + writer.write(str('>' + sequence) + '\n' + str(mean_coverage_data[sequence]['mean_coverage']) + '\n') + writer.flush() + writer.write('#general' + '\n' + str(round((float(coverage) / float(position)), 2)) + '\n') + + if round((float(coverage) / float(position)), 2) >= 30: + pass_qc = True + print 'Assembly coverage: ' + str(round((float(coverage) / float(position)), 2)) + 'x' + else: + failing_reason = 'Assembly coverage: ' + str(round((float(coverage) / float(position)), 2)) + 'x (lower than 30x)' + + return pass_qc, failing_reason, sequences_2_keep + + +def subsampleContigs(fastaFile, list_sequences, outputFile, pilon_run_successfuly): + writer = open(outputFile, 'wt') + + number_sequences = 0 + number_bases = 0 + seqHeader = '' + seqSequence = '' + + with open(fastaFile, 'rtU') as lines: + for line in lines: + if len(line.splitlines()[0]) > 0: + if line[0] == '>': + if seqHeader != '': + sequenced_found = False + + header_2_search = None + if not pilon_run_successfuly: + header_2_search = seqHeader[1:] + else: + header_2_search = seqHeader[1:].rsplit('_', 1)[0] + + if header_2_search in list_sequences: + sequenced_found = True + + if sequenced_found: + writer.write(seqHeader + '\n') + writer.write(seqSequence + '\n') + writer.flush() + + seqSequence = seqSequence.splitlines() + for seq in seqSequence: + number_bases += len(seq) + + number_sequences += 1 + + seqHeader = '' + seqSequence = '' + seqHeader = line.splitlines()[0] + else: + seqSequence = seqSequence + line + + sequenced_found = False + + header_2_search = None + if not pilon_run_successfuly: + header_2_search = seqHeader[1:] + else: + header_2_search = seqHeader[1:].rsplit('_', 1)[0] + + if header_2_search in list_sequences: + sequenced_found = True + + if sequenced_found: + writer.write(seqHeader + '\n') + writer.write(seqSequence + '\n') + writer.flush() + + seqSequence = seqSequence.splitlines() + for seq in seqSequence: + number_bases += len(seq) + + number_sequences += 1 + + writer.close() + + return number_sequences, number_bases + + +def getting_mapping_statistics(alignment_file): + command = ['samtools', 'flagstat', alignment_file] + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True) + + dict_mapping_statistics = {} + if run_successfully: + stdout = stdout.splitlines() + for line in stdout: + line = line.splitlines()[0] + if len(line) > 0: + line = line.split(' ', 3) + field = line[3].split('(', 1) + if len(field) == 0: + field = field[0].replace(' ', '_') + else: + field = field[0].rsplit(' ', 1)[0].replace(' ', '_') + dict_mapping_statistics[field] = {'qc_passed': int(line[0]), 'qc_failed': int(line[2])} + return run_successfully, dict_mapping_statistics + + +def save_mapping_statistics(dict_mapping_statistics, outdir): + pass_qc = False + failing_reason = None + + report_file = os.path.join(outdir, 'assembly_mapping_report.txt') + + total_reads = 0 + total_mapped_reads = 0 + with open(report_file, 'wt') as writer: + for field in sorted(dict_mapping_statistics): + writer.write(str('#' + field) + '\n' + str('>' + 'qc_passed') + '\n' + str(dict_mapping_statistics[field]['qc_passed']) + '\n' + str('>' + 'qc_failed') + '\n' + str(dict_mapping_statistics[field]['qc_failed']) + '\n') + + if field == 'in_total': + total_reads = dict_mapping_statistics[field]['qc_passed'] + dict_mapping_statistics[field]['qc_failed'] + elif field == 'mapped': + total_mapped_reads = dict_mapping_statistics[field]['qc_passed'] + dict_mapping_statistics[field]['qc_failed'] + + if total_mapped_reads > 0 and total_reads > 0: + if round((float(total_mapped_reads) / float(total_reads)), 2) >= 0.66: + pass_qc = True + print 'Mapped reads: ' + str(round((float(total_mapped_reads) / float(total_reads)), 2) * 100) + '%' + else: + failing_reason = 'Mapped reads: ' + str(round((float(total_mapped_reads) / float(total_reads)), 2) * 100) + '% (lower than 66%)' + else: + failing_reason = 'No reads were mapped' + + return pass_qc, failing_reason + + +assemblyMapping_timer = partial(utils.timer, name='Assembly mapping check') + + +@assemblyMapping_timer +def runAssemblyMapping(alignment_file, reference_file, threads, outdir, minCoverageAssembly, assembly_pilon, keep_pilon_assembly): + pass_qc = False + pass_qc_coverage = False + pass_qc_mapping = False + + failing = {} + + assemblyMapping_folder = os.path.join(outdir, 'assemblyMapping', '') + utils.removeDirectory(assemblyMapping_folder) + os.mkdir(assemblyMapping_folder) + + assembly_filtered = None + + # Get assembly coverage + sample_coverage_no_problems, mean_coverage_data = sample_coverage(reference_file, alignment_file, assemblyMapping_folder, threads) + if sample_coverage_no_problems: + pass_qc_coverage, failing_reason, sequences_2_keep = save_assembly_coverage_report(mean_coverage_data, outdir, minCoverageAssembly) + + assembly_filtered = os.path.splitext(assembly_pilon)[0] + '.filtered.fasta' + + assembly = reference_file if assembly_pilon is None else assembly_pilon + + pilon_run_successfuly = True if assembly_pilon is not None else False + + subsampleContigs(assembly, sequences_2_keep, assembly_filtered, pilon_run_successfuly) + + if not pass_qc_coverage: + failing['Coverage'] = [failing_reason] + else: + failing['Coverage'] = ['Did not run'] + + if not keep_pilon_assembly: + os.remove(assembly_pilon) + + # Save mapping statistics + sample_mapping_statistics_no_problems, dict_mapping_statistics = getting_mapping_statistics(alignment_file) + if sample_mapping_statistics_no_problems: + pass_qc_mapping, failing_reason = save_mapping_statistics(dict_mapping_statistics, outdir) + + if not pass_qc_mapping: + failing['Mapping'] = [failing_reason] + else: + failing['Mapping'] = ['Did not run'] + + run_successfully = sample_coverage_no_problems and sample_mapping_statistics_no_problems + pass_qc = pass_qc_coverage and pass_qc_mapping + + if not pass_qc: + print 'Sample FAILS Assembly Mapping check with: ' + str(failing) + + utils.removeDirectory(assemblyMapping_folder) + + return run_successfully, pass_qc, failing, assembly_filtered diff --git a/modules/combine_reports.py b/modules/combine_reports.py index d854906..76af515 100644 --- a/modules/combine_reports.py +++ b/modules/combine_reports.py @@ -54,7 +54,7 @@ def combine_reports(args): if len(directories) == 0: sys.exit('No samples found') - files_to_search = ['coverage_report.txt', 'spades_report.txt', 'pilon_report.txt', 'mlst_report.txt'] + files_to_search = ['coverage_report.txt', 'spades_report.txt', 'pilon_report.txt', 'assembly_coverage_report.txt', 'assembly_mapping_report.txt', 'mlst_report.txt'] for directory in directories: sample = directory @@ -65,7 +65,7 @@ def combine_reports(args): if len(files) == 0: print 'No files found! Continue to the next sample' else: - results[sample] = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'] + results[sample] = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'] for file_found in files: name_file_found = file_found @@ -143,6 +143,45 @@ def combine_reports(args): else: break elif name_file_found == files_to_search[3]: + coverage = False + with open(file_found, 'rtU') as reader: + for line in reader: + if len(line) > 0: + line = line.splitlines()[0].split(' ')[0] + if line.startswith('#'): + if line.startswith('general', 1): + coverage = True + else: + if coverage: + results[sample][6] = line + coverage = False + elif name_file_found == files_to_search[4]: + total = False + total_reads = 0 + mapped = False + mapped_reads = 0 + with open(file_found, 'rtU') as reader: + for line in reader: + if len(line) > 0: + line = line.splitlines()[0].split(' ')[0] + if line.startswith('#'): + total = False + mapped = False + if line.startswith('in_total', 1): + total = True + elif line.startswith('mapped', 1): + mapped = True + elif line.startswith('>'): + continue + else: + if total: + total_reads = int(line) + total = False + elif mapped: + mapped_reads = int(line) + mapped = False + results[sample][7] = str(round((float(mapped_reads) / float(total_reads)), 2) * 100) + elif name_file_found == files_to_search[5]: species = False st = False with open(file_found, 'rtU') as reader: @@ -156,10 +195,10 @@ def combine_reports(args): st = True else: if species: - results[sample][6] = line + results[sample][8] = line species = False if st: - results[sample][7] = line + results[sample][9] = line st = False if len(results) == 0: @@ -167,7 +206,7 @@ def combine_reports(args): print '\n' + 'Writing results...' report = open(os.path.join(outdir, str('combine_samples_report.' + time.strftime("%Y%m%d-%H%M%S") + '.tab')), 'wt') - report.write('#samples' + '\t' + 'first_coverage' + '\t' + 'second_Coverage' + '\t' + 'SPAdes_number_contigs' + '\t' + 'SPAdes_number_bp' + '\t' + 'Pilon_changes' + '\t' + 'Pilon_contigs_changed' + '\t' + 'scheme' + '\t' + 'ST' + '\n') + report.write('#samples' + '\t' + 'first_coverage' + '\t' + 'second_Coverage' + '\t' + 'SPAdes_number_contigs' + '\t' + 'SPAdes_number_bp' + '\t' + 'Pilon_changes' + '\t' + 'Pilon_contigs_changed' + '\t' + 'assembly_coverage' + '\t' + 'mapped_reads_percentage' + '\t' + 'scheme' + '\t' + 'ST' + '\n') report.flush() for sample in results: diff --git a/modules/estimated_coverage.py b/modules/estimated_coverage.py index b2e2b49..0d4eebc 100644 --- a/modules/estimated_coverage.py +++ b/modules/estimated_coverage.py @@ -1,10 +1,11 @@ import utils import os from functools import partial +import multiprocessing # Count sequenced bases -def countSequencedBases(fastq_file): +def countSequencedBases(fastq_file, outdir): run_successfully = False bases = None @@ -12,27 +13,28 @@ def countSequencedBases(fastq_file): # Determine compression type filetype = utils.compressionType(fastq_file) - if filetype == 'gz': - command[0] = 'gunzip' - elif filetype == 'bz2': - command[0] = 'bunzip2' + if filetype != 'gz' and filetype != 'bz2': + utils.saveVariableToPickle([run_successfully, bases], outdir, str('estimate_coverage.' + os.path.basename(fastq_file))) else: - return run_successfully, bases - - # Number of characters - command[18] = '--chars' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None) - if run_successfully: - bases = int(stdout.splitlines()[0]) + if filetype == 'gz': + command[0] = 'gunzip' + else: + command[0] = 'bunzip2' - # Number of lines - command[18] = '--lines' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None) + # Number of characters + command[18] = '--chars' + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) if run_successfully: - lines = int(stdout.splitlines()[0]) - bases = bases - lines + bases = int(stdout.splitlines()[0]) - return run_successfully, bases + # Number of lines + command[18] = '--lines' + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) + if run_successfully: + lines = int(stdout.splitlines()[0]) + bases = bases - lines + + utils.saveVariableToPickle([run_successfully, bases], outdir, str('estimate_coverage.' + os.path.basename(fastq_file))) coverage_timer = partial(utils.timer, name='Estimated Coverage analysis') @@ -40,21 +42,36 @@ def countSequencedBases(fastq_file): # Get estimated coverage @coverage_timer -def getEstimatedCoverage(fastq_files, estimatedGenomeSizeMb, outdir): +def getEstimatedCoverage(fastq_files, estimatedGenomeSizeMb, outdir, threads): run_successfully = False pass_qc = False failing = {} failing['sample'] = False # Run Estimated Coverage - numberBases = 0 + + # Get number bases for each fastq file + pool = multiprocessing.Pool(processes=threads) for fastq in fastq_files: - # Get number bases for each fastq file - run_successfully, bases = countSequencedBases(fastq) - if not run_successfully: - break - else: - numberBases = numberBases + bases + pool.apply_async(countSequencedBases, args=(fastq, outdir,)) + pool.close() + pool.join() + + numberBases = 0 + file_problems = False + files = [f for f in os.listdir(outdir) if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))] + for file_found in files: + if file_found.startswith('estimate_coverage.') and file_found.endswith('.pkl'): + file_path = os.path.join(outdir, file_found) + + if not file_problems: + run_successfully, bases = utils.extractVariableFromPickle(file_path) + if run_successfully: + numberBases += bases + else: + file_problems = True + + os.remove(file_path) if run_successfully: estimatedCoverage = numberBases / float(estimatedGenomeSizeMb * 1000000) diff --git a/modules/fastQintegrity.py b/modules/fastQintegrity.py index c600d9d..7916c2a 100644 --- a/modules/fastQintegrity.py +++ b/modules/fastQintegrity.py @@ -18,7 +18,7 @@ def fastQintegrity(fastq, outdir): command[0] = 'bunzip2' if command[0] != '': - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) if os.path.isfile(temporary_output_file): os.remove(temporary_output_file) diff --git a/modules/fastqc.py b/modules/fastqc.py index 8d2afe2..02b9810 100644 --- a/modules/fastqc.py +++ b/modules/fastqc.py @@ -37,7 +37,7 @@ def fastQC(fastqc_folder, threads, adaptersFasta, fastq_files): adaptersTEMP = adapters2fastQC(fastqc_folder, adaptersFasta) print 'Scanning for adapters contamination using ' + adaptersFasta command[9] = '--adapters ' + adaptersTEMP - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) # Remove temporary files os.rmdir(os.path.join(fastqc_folder, 'temp.fastqc_temporary_dir', '')) diff --git a/modules/mlst.py b/modules/mlst.py index c3aca06..9c35bf9 100644 --- a/modules/mlst.py +++ b/modules/mlst.py @@ -7,7 +7,7 @@ def getScheme(species): command = ['which', 'mlst'] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) mlst_folder = stdout.splitlines()[0] mlst_db_path = os.path.join(os.path.dirname(os.path.dirname(mlst_folder)), 'db', 'species_scheme_map.tab') @@ -36,7 +36,7 @@ def runMlst(contigs, scheme, outdir): command = ['mlst', contigs] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if run_successfully: scheme_mlst = stdout.splitlines()[0].split('\t')[1].split('_')[0] diff --git a/modules/pilon.py b/modules/pilon.py index f40fae3..0aaf4f4 100644 --- a/modules/pilon.py +++ b/modules/pilon.py @@ -10,7 +10,7 @@ def indexSequenceBowtie2(referenceFile, threads): run_successfully = True else: command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) return run_successfully @@ -27,7 +27,7 @@ def mappingBowtie2(fastq_files, referenceFile, threads, outdir): command[8] = '-U ' + fastq_files else: command[8] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if not run_successfully: sam_file = None @@ -41,7 +41,7 @@ def sortAlignment(alignment_file, output_file, sortByName_True, threads): command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file] if sortByName_True: command[6] = '-n' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if not run_successfully: output_file = None return run_successfully, output_file @@ -50,14 +50,14 @@ def sortAlignment(alignment_file, output_file, sortByName_True, threads): # Index alignment file def indexAlignment(alignment_file): command = ['samtools', 'index', alignment_file] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) return run_successfully def pilon(jar_path_pilon, assembly, bam_file, outdir): assembly_polished = os.path.splitext(assembly)[0] + '.polished.fasta' command = ['java', '-jar', jar_path_pilon, '--genome', assembly, '--frags', bam_file, '--outdir', outdir, '--output', os.path.basename(os.path.splitext(assembly_polished)[0]), '--changes', '--vcf'] - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if not run_successfully: assembly_polished = None return run_successfully, assembly_polished @@ -89,7 +89,7 @@ def parsePilonResult(assembly_polished, outdir): @pilon_timer -def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, keepFiles, keepSPAdesAssembly): +def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir): failing = {} failing['sample'] = False @@ -102,6 +102,7 @@ def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, keepFiles, os.symlink(assembly, assembly_link) assembly_polished = None + bam_file = None # Index assembly using Bowtie2 run_successfully = indexSequenceBowtie2(assembly_link, threads) @@ -124,10 +125,7 @@ def runPilon(jar_path_pilon, assembly, fastq_files, threads, outdir, keepFiles, parsePilonResult(assembly_polished, outdir) shutil.copyfile(assembly_polished, os.path.join(outdir, os.path.basename(assembly_polished))) assembly_polished = os.path.join(outdir, os.path.basename(assembly_polished)) - if not keepSPAdesAssembly: - os.remove(assembly) - - if not keepFiles: - utils.removeDirectory(pilon_folder) + else: + bam_file = None - return run_successfully, None, failing, assembly_polished + return run_successfully, None, failing, assembly_polished, bam_file, pilon_folder diff --git a/modules/spades.py b/modules/spades.py index 0e38aaf..71a232e 100644 --- a/modules/spades.py +++ b/modules/spades.py @@ -18,7 +18,7 @@ def spades(spades_folder, threads, fastq_files, notUseCareful, maxMemory, minCov kmers = ','.join(map(str, kmers)) command[9] = str('-k ' + kmers) - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) return run_successfully, contigs @@ -67,10 +67,35 @@ def define_kmers(kmers, maximumReadsLength): return kmers_use +def define_minContigsLength(maximumReadsLength, minContigsLength): + minimum_length = 200 + if minContigsLength is not None: + minimum_length = minContigsLength + else: + if maximumReadsLength > minimum_length: + minimum_length = maximumReadsLength + + return minimum_length + + def define_memory(maxMemory, threads): - GB_per_thread = 800 / 1024.0 - available_memory_GB = (os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_AVPHYS_PAGES')) / (1024.0 ** 3) + GB_per_thread = 1024 / 1024.0 + minimum_required_memory_GB = GB_per_thread * threads + + available_memory_GB = utils.get_free_memory() + if available_memory_GB == 0: + print 'WARNING: it was not possible to determine the free available memory!' + if maxMemory is not None: + print 'Setting SPAdes maximum memory to the one provided by user' + time.sleep(10) + return maxMemory + else: + print 'Trying use the minimum memory required for SPAdes to run' + available_memory_GB = minimum_required_memory_GB + else: + available_memory_GB = available_memory_GB / (1024.0 ** 2) + if maxMemory is None: if minimum_required_memory_GB > available_memory_GB: print 'WARNNING: the minimum memory required to run SPAdes with ' + str(threads) + ' threads (' + str(round(minimum_required_memory_GB, 1)) + ' GB) are higher than the available memory (' + str(round(available_memory_GB, 1)) + ' GB)!' @@ -125,6 +150,7 @@ def runSpades(sampleName, outdir, threads, fastq_files, notUseCareful, maxMemory if run_successfully: shutil.copyfile(contigs, os.path.join(outdir, 'SPAdes_original_assembly.contigs.fasta')) + minContigsLength = define_minContigsLength(maximumReadsLength, minContigsLength) print 'Filtering for contigs with at least ' + str(minContigsLength) + ' nucleotides and a coverage of ' + str(minCoverageContigs) contigsFiltered, number_contigs, number_bases = renameFilterContigs(sampleName, outdir, contigs, minContigsLength, minCoverageContigs) print str(number_bases) + ' assembled nucleotides in ' + str(number_contigs) + ' contigs' diff --git a/modules/trimmomatic.py b/modules/trimmomatic.py index b3cfbcc..10c11fa 100644 --- a/modules/trimmomatic.py +++ b/modules/trimmomatic.py @@ -43,16 +43,16 @@ def trimmomatic(jar_path_trimmomatic, sampleName, trimmomatic_folder, threads, a adaptersFasta = concatenateFastaFiles(adapters_files, trimmomatic_folder, 'concatenated_adaptersFile.fasta') command[12] = 'ILLUMINACLIP:' + adaptersFasta + ':3:30:10:6:true' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if not run_successfully: print 'Trimmomatic fail! Trying run with Phred+33 enconding defined...' command[18] = '-phred33' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) if not run_successfully: print 'Trimmomatic fail again! Trying run with Phred+64 enconding defined...' command[18] = '-phred64' - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) return run_successfully @@ -100,7 +100,7 @@ def controlForZeroReads(fastq_files): command[0] = 'bunzip2' if command[0] != '': - run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None) + run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, False) if run_successfully: stdout = stdout.splitlines() diff --git a/modules/utils.py b/modules/utils.py index b49d2ca..c454397 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -29,8 +29,9 @@ def parseArguments(version): general_options.add_argument('--skipEstimatedCoverage', action='store_true', help='Tells the programme to not estimate coverage depth based on number of sequenced nucleotides and expected genome size') general_options.add_argument('--skipFastQC', action='store_true', help='Tells the programme to not run FastQC analysis') general_options.add_argument('--skipTrimmomatic', action='store_true', help='Tells the programme to not run Trimmomatic') - general_options.add_argument('--skipSPAdes', action='store_true', help='Tells the programme to not run SPAdes and consequently MLST analysis (requires SPAdes contigs)') - general_options.add_argument('--skipPilon', action='store_true', help='Tells the programme to not run Pilon correction') + general_options.add_argument('--skipSPAdes', action='store_true', help='Tells the programme to not run SPAdes and consequently Pilon correction, Assembly Mapping check and MLST analysis (SPAdes contigs required)') + general_options.add_argument('--skipPilon', action='store_true', help='Tells the programme to not run Pilon correction and consequently Assembly Mapping check (bam files required)') + general_options.add_argument('--skipAssemblyMapping', action='store_true', help='Tells the programme to not run Assembly Mapping check') general_options.add_argument('--skipMLST', action='store_true', help='Tells the programme to not run MLST analysis') adapters_options = parser.add_mutually_exclusive_group() @@ -49,8 +50,8 @@ def parseArguments(version): spades_options = parser.add_argument_group('SPAdes options') spades_options.add_argument('--spadesNotUseCareful', action='store_true', help='Tells SPAdes to only perform the assembly without the --careful option') - spades_options.add_argument('--spadesMinContigsLength', type=int, metavar='N', help='Filter SPAdes contigs for length greater or equal than this value', required=False, default=200) - spades_options.add_argument('--spadesMaxMemory', type=int, metavar='N', help='The maximum amount of RAM Gb for SPAdes to use', required=False) + spades_options.add_argument('--spadesMinContigsLength', type=int, metavar='N', help='Filter SPAdes contigs for length greater or equal than this value (default: maximum reads size or 200 bp)', required=False) + spades_options.add_argument('--spadesMaxMemory', type=int, metavar='N', help='The maximum amount of RAM Gb for SPAdes to use (default: free memory available at the beginning of INNUca)', required=False) spades_options.add_argument('--spadesMinCoverageAssembly', type=spades_cov_cutoff, metavar='10', help='The minimum number of reads to consider an edge in the de Bruijn graph (or path I am not sure). Can also be auto or off', required=False, default=2) spades_options.add_argument('--spadesMinCoverageContigs', type=int, metavar='N', help='Minimum contigs coverage. After assembly only keep contigs with reported coverage equal or above this value', required=False, default=5) @@ -62,6 +63,10 @@ def parseArguments(version): pilon_options.add_argument('--pilonKeepFiles', action='store_true', help='Tells INNUca.py to not remove the output of Pilon') pilon_options.add_argument('--pilonKeepSPAdesAssembly', action='store_true', help='Tells INNUca.py to not remove the unpolished SPAdes assembly') + assembly_mapping_options = parser.add_argument_group('Assembly Mapping options') + assembly_mapping_options.add_argument('--assemblyMinCoverageContigs', type=int, metavar='N', help='Minimum contigs average coverage. After mapping reads back to the contigs, only keep contigs with at least this average coverage', required=False, default=2) + assembly_mapping_options.add_argument('--assemblyKeepPilonContigs', action='store_true', help='Tells INNUca.py to not remove the polished Pilon contigs') + args = parser.parse_args() if args.doNotTrimCrops and (args.trimCrop or args.trimHeadCrop): @@ -103,14 +108,16 @@ def spades_cov_cutoff(argument): argparse.ArgumentParser.error('--spadesMinCoverageAssembly must be positive integer, auto or off') -def runCommandPopenCommunicate(command, shell_True, timeout_sec_None): +def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True): run_successfully = False if isinstance(command, basestring): command = shlex.split(command) else: command = shlex.split(' '.join(command)) - print 'Running: ' + ' '.join(command) + if print_comand_True: + print 'Running: ' + ' '.join(command) + if shell_True: command = ' '.join(command) proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) @@ -127,6 +134,8 @@ def runCommandPopenCommunicate(command, shell_True, timeout_sec_None): if proc.returncode == 0: run_successfully = True else: + if not print_comand_True: + print 'Running: ' + str(command) print 'STDOUT' print stdout.decode("utf-8") print 'STDERR' @@ -169,7 +178,7 @@ def checkPrograms(programs_version_dictionary): listMissings = [] for program in programs: which_program[1] = program - run_successfully, stdout, stderr = runCommandPopenCommunicate(which_program, False, None) + run_successfully, stdout, stderr = runCommandPopenCommunicate(which_program, False, None, False) if not run_successfully: listMissings.append(program + ' not found in PATH.') else: @@ -181,7 +190,7 @@ def checkPrograms(programs_version_dictionary): programs[program].append(stdout.splitlines()[0]) else: check_version = [stdout.splitlines()[0], programs[program][0]] - run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None) + run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False) if stdout == '': stdout = stderr if program == 'bunzip2': @@ -231,7 +240,7 @@ def setPATHvariable(doNotUseProvidedSoftware, script_path): # Search for fastq files in the directory # pairEnd_filesSeparation_list can be None def searchFastqFiles(directory, pairEnd_filesSeparation_list, listAllFilesOnly): - filesExtensions = ['fastq.gz', 'fq.gz', 'fastq.bz2', 'fq.bz2'] + filesExtensions = ['fastq.gz', 'fq.gz'] pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] if pairEnd_filesSeparation_list is not None: @@ -410,7 +419,7 @@ def compressionType(file): return None -steps = ('FastQ_Integrity', 'first_Coverage', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'SPAdes', 'Pilon', 'MLST') +steps = ('FastQ_Integrity', 'first_Coverage', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'SPAdes', 'Pilon', 'Assembly_Mapping', 'MLST') def sampleReportLine(run_report): @@ -451,13 +460,13 @@ def write_sample_report(samples_report_path, sample, run_successfully, pass_qc, def timer(function, name): @wraps(function) def wrapper(*args, **kwargs): - print('RUNNING {0}\n'.format(name)) + print('\n' + 'RUNNING {0}\n'.format(name)) start_time = time.time() results = list(function(*args, **kwargs)) # guarantees return is a list to allow .insert() time_taken = runTime(start_time) - print('END {0}\n'.format(name)) + print('END {0}'.format(name)) results.insert(2, time_taken) return results @@ -484,5 +493,110 @@ def write_fail_report(fail_report_path, run_report): writer_failReport.write('\n'.join(failures)) -def getJarPath(): - pass +def parse_free_output(free_output): + free_output = free_output.splitlines() + + dict_free_output = {} + + memory_fields = [] + + counter = 0 + for line in free_output: + if len(line) > 0: + fields = line.split(':', 1) + + values = [] + if len(fields) == 1: + fields = fields[0].split(' ') + + for field in fields: + if field != '': + values.append(field) + elif len(fields) == 2: + fields = fields[1].split(' ') + + for field in fields: + if field != '': + values.append(field) + + if counter == 0: + memory_fields = values + elif counter == 1: + dict_free_output['memory'] = {} + for i in range(0, len(memory_fields)): + dict_free_output['memory'][memory_fields[i]] = values[i] + elif counter == 2 and len(values) == 2: + dict_free_output['buffers'] = {memory_fields[1]: values[0], memory_fields[2]: values[1]} + elif counter == 3 or (counter == 2 and len(values) == 3): + dict_free_output['swap'] = {} + for i in range(0, 3): + dict_free_output['swap'][memory_fields[i]] = values[i] + + counter += 1 + + return dict_free_output + + +def get_free_memory_free(dict_free_output): + cached = 0 + + mem_cached_found = False + for mem_type in dict_free_output['memory']: + if 'cache' in mem_type: + cached = int(dict_free_output['memory'][mem_type]) + mem_cached_found = True + + if not mem_cached_found: + print 'WARNING: it was not possible to determine the cached memory!' + + return cached + + +def get_free_memory_os(): + free_memory_Kb = 0 + try: + free_memory_Kb = (os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_AVPHYS_PAGES')) / 1024.0 + print 'The strict free memory available determined is ' + str(round((free_memory_Kb / (1024.0 ** 2)), 1)) + ' Gb' + except Exception as e: + print e + print 'WARNING: it was not possible to determine the free memory available!' + + return free_memory_Kb + + +def get_free_memory(): + free_memory_Kb = 0 + + command = ['free', '-k'] + run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, None, False) + + if run_successfully: + dict_free_output = parse_free_output(stdout) + + cached = get_free_memory_free(dict_free_output) + + try: + free_memory_Kb = int(dict_free_output['memory']['free']) + cached + print 'For the memory use (in Kb) below, the free memory available (free + cached) is ' + str(round((free_memory_Kb / (1024.0 ** 2)), 1)) + ' Gb' + print dict_free_output['memory'] + except: + print 'WARNING: it was impossible to determine the free memory using the free command!' + free_memory_Kb = get_free_memory_os() + else: + print 'WARNING: it was impossible to determine the free memory using the free command!' + free_memory_Kb = get_free_memory_os() + + return free_memory_Kb + + +def get_cpu_information(outdir, time_str): + with open(os.path.join(outdir, 'cpu_information.' + time_str + '.cpu.txt'), 'wt') as writer: + command = ['cat', '/proc/cpuinfo'] + run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, None, False) + if run_successfully: + writer.write(stdout) + + with open(os.path.join(outdir, 'cpu_information.' + time_str + '.slurm.txt'), 'wt') as writer: + for environment in sorted(os.environ): + if environment.startswith('SLURM_'): + writer.write('#' + environment + '\n' + os.environ[environment] + '\n')