Skip to content

Commit

Permalink
Alt insert size metrics (#9)
Browse files Browse the repository at this point in the history
* Add samtools stats-based insert size calculation

* Incorporate alignment qc metrics from qualimap and samtools

* Use unix dialect for CSV
  • Loading branch information
dfornika authored Dec 15, 2023
1 parent 9dd1a85 commit 206a8e7
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 25 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,13 @@ flowchart TD
filtlong --> minimap2
minimap2 --> alignments
alignments --> qualimap(qualimap_bamqc)
alignments --> samtools_stats(samtools_stats)
alignments --> freebayes(freebayes)
alignments --> samtools_mpileup(samtools_mpileup)
samtools_mpileup --> generate_low_coverage_bed(generate_low_coverage_bed)
samtools_mpileup --> percent_coverage_by_depth(percent_coverage_by_depth)
qualimap --> combined_alignment_qc(combined_alignment_qc)
samtools_stats --> combined_alignment_qc
```

## Outputs
Expand Down
94 changes: 94 additions & 0 deletions bin/combine_alignment_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import sys

def parse_samtools_stats_summary(samtools_stats_summary_file):
    """
    Parse the first record of a samtools stats summary CSV.

    Each key is suffixed with '__samtools' so the fields can be
    distinguished from qualimap fields when records are merged.

    :param samtools_stats_summary_file: Path to a comma-delimited samtools stats summary file.
    :type samtools_stats_summary_file: str
    :return: First data record, with every key suffixed with '__samtools'.
    :rtype: dict[str, str]
    :raises ValueError: If the file contains no data rows.
    """
    with open(samtools_stats_summary_file, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            # Only the first data row is needed; return immediately
            # instead of reading the whole file into memory.
            return {k + '__samtools': v for k, v in row.items()}

    raise ValueError('No data rows found in: ' + samtools_stats_summary_file)


def parse_qualimap_bamqc_genome_results(qualimap_bamqc_genome_results_file):
    """
    Parse the first record of a qualimap bamqc genome results CSV.

    Each key is suffixed with '__qualimap' so the fields can be
    distinguished from samtools fields when records are merged.

    :param qualimap_bamqc_genome_results_file: Path to a comma-delimited qualimap bamqc genome results file.
    :type qualimap_bamqc_genome_results_file: str
    :return: First data record, with every key suffixed with '__qualimap'.
    :rtype: dict[str, str]
    :raises ValueError: If the file contains no data rows.
    """
    with open(qualimap_bamqc_genome_results_file, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            # Only the first data row is needed; return immediately
            # instead of reading the whole file into memory.
            return {k + '__qualimap': v for k, v in row.items()}

    raise ValueError('No data rows found in: ' + qualimap_bamqc_genome_results_file)


def main(args):
    """
    Merge selected samtools stats and qualimap bamqc metrics and write
    a single-row CSV to stdout.

    :param args: Parsed command-line arguments with attributes
        samtools_stats_summary, qualimap_bamqc_genome_results,
        sample_id and read_type.
    """
    # Metrics taken from the samtools stats summary, in output column order.
    samtools_fields = [
        'reads_mapped',
        'reads_mapped_and_paired',
        'reads_unmapped',
        'reads_properly_paired',
        'reads_paired',
        'reads_duplicated',
        'error_rate',
        'non-primary_alignments',
        'supplementary_alignments',
        'average_length',
        'average_first_fragment_length',
        'average_last_fragment_length',
        'insert_size_average',
        'insert_size_standard_deviation',
    ]

    # Metrics taken from the qualimap bamqc genome results, in output column order.
    qualimap_fields = [
        'mean_depth_coverage',
        'stdev_depth_coverage',
    ]

    samtools_record = parse_samtools_stats_summary(args.samtools_stats_summary)
    qualimap_record = parse_qualimap_bamqc_genome_results(args.qualimap_bamqc_genome_results)

    # Strip the source suffixes back off while selecting only the fields we report.
    combined_data = {}
    for field in samtools_fields:
        combined_data[field] = samtools_record[field + '__samtools']
    for field in qualimap_fields:
        combined_data[field] = qualimap_record[field + '__qualimap']

    # Optional identifying columns come first, when supplied.
    output_fields = []
    for optional_field in ['sample_id', 'read_type']:
        optional_value = getattr(args, optional_field)
        if optional_value:
            combined_data[optional_field] = optional_value
            output_fields.append(optional_field)

    output_fields.extend(samtools_fields)
    output_fields.extend(qualimap_fields)

    writer = csv.DictWriter(sys.stdout, dialect='unix', fieldnames=output_fields, quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
    writer.writeheader()
    writer.writerow(combined_data)

if __name__ == '__main__':
    # Help text added for consistency with the pipeline's other bin/ scripts.
    parser = argparse.ArgumentParser(description='Combine alignment QC metrics from samtools stats and qualimap bamqc into a single CSV record')
    parser.add_argument('--samtools-stats-summary', required=True, help='samtools stats summary CSV file')
    parser.add_argument('--qualimap-bamqc-genome-results', required=True, help='qualimap bamqc genome results CSV file')
    parser.add_argument('--sample-id', help='sample id to include in the output')
    parser.add_argument('--read-type', help='read type (e.g. short or long) to include in the output')
    args = parser.parse_args()
    main(args)
132 changes: 132 additions & 0 deletions bin/parse_samtools_stats_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import sys


def parse_samtools_stats_summary(samtools_stats_summary_file):
    """
    Parse a samtools stats 'Summary Numbers' (SN) section into a dict.

    Keys are normalized (spaces to underscores, parentheses and trailing
    colons removed). Values are converted to int, float or bool based on
    the field name; percentage fields (keys ending in '_%') are stored
    as floats under the key with the '_%' suffix removed.

    :param samtools_stats_summary_file: Path to a tab-delimited samtools stats summary file.
    :type samtools_stats_summary_file: str
    :return: Parsed summary fields, keyed by normalized field name.
    :rtype: dict
    """
    output_data = {}
    int_fields = [
        'raw_total_sequences',
        'filtered_sequences',
        'sequences',
        'first_fragments',
        'last_fragments',
        'reads_mapped',
        'reads_mapped_and_paired',
        'reads_unmapped',
        'reads_properly_paired',
        'reads_paired',
        'reads_duplicated',
        'reads_MQ0',
        'reads_QC_failed',
        'non-primary_alignments',
        'supplementary_alignments',
        'total_length',
        'total_first_fragment_length',
        'total_last_fragment_length',
        'bases_mapped',
        'bases_mapped_cigar',
        'bases_trimmed',
        'bases_duplicated',
        'mismatches',
        'maximum_length',
        'maximum_first_fragment_length',
        'maximum_last_fragment_length',
        'inward_oriented_pairs',
        'outward_oriented_pairs',
        'pairs_with_other_orientation',
        'pairs_on_different_chromosomes',
    ]
    float_fields = [
        'error_rate',
        'average_length',
        'average_first_fragment_length',
        'average_last_fragment_length',
        'average_quality',
        'insert_size_average',
        'insert_size_standard_deviation',
    ]
    with open(samtools_stats_summary_file, 'r') as f:
        for line in f:
            line_split = line.strip().split('\t')
            if len(line_split) < 2:
                # Skip blank or malformed lines that have no value field
                # (previously these raised IndexError).
                continue
            key = line_split[0].strip().rstrip(':').replace(' ', '_').replace('(', '').replace(')', '')
            if key == '1st_fragments':
                key = 'first_fragments'
            # Percentage fields (e.g. 'percentage of properly paired reads (%)')
            # are stored as floats under the key with the '_%' suffix removed.
            # Previously the value was silently dropped because the suffix-strip
            # branch short-circuited the rest of the elif chain.
            is_percentage = key.endswith('%')
            if is_percentage:
                key = key.replace('_%', '')
            value = line_split[1]
            if is_percentage or key in float_fields:
                try:
                    output_data[key] = float(value)
                except ValueError:
                    output_data[key] = None
            elif key in int_fields:
                try:
                    output_data[key] = int(value)
                except ValueError:
                    output_data[key] = None
            elif key == 'is_sorted':
                output_data[key] = value == '1'
            else:
                output_data[key] = value

    return output_data


def main(args):
    """
    Parse a samtools stats summary file and write the selected fields
    to stdout as a single-row CSV.

    :param args: Parsed command-line arguments with attributes
        input and sample_id.
    """
    # Output columns, in order. 'sample_id' is prepended when supplied.
    output_fields = [
        'raw_total_sequences',
        'filtered_sequences',
        'first_fragments',
        'last_fragments',
        'reads_mapped',
        'reads_mapped_and_paired',
        'reads_unmapped',
        'reads_properly_paired',
        'reads_paired',
        'reads_duplicated',
        'reads_MQ0',
        'reads_QC_failed',
        'non-primary_alignments',
        'supplementary_alignments',
        'total_length',
        'total_first_fragment_length',
        'total_last_fragment_length',
        'bases_mapped',
        'bases_mapped_cigar',
        'bases_trimmed',
        'bases_duplicated',
        'mismatches',
        'error_rate',
        'average_length',
        'average_first_fragment_length',
        'average_last_fragment_length',
        'average_quality',
        'insert_size_average',
        'insert_size_standard_deviation',
        'inward_oriented_pairs',
        'outward_oriented_pairs',
        'pairs_with_other_orientation',
        'pairs_on_different_chromosomes',
    ]

    parsed_summary = parse_samtools_stats_summary(args.input)

    if args.sample_id:
        parsed_summary['sample_id'] = args.sample_id
        output_fields = ['sample_id'] + output_fields

    # extrasaction='ignore' drops any parsed fields not listed above.
    writer = csv.DictWriter(sys.stdout, fieldnames=output_fields, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
    writer.writeheader()
    writer.writerow(parsed_summary)

if __name__ == '__main__':
    # Command-line entry point.
    arg_parser = argparse.ArgumentParser(description='Parse samtools stats summary file')
    arg_parser.add_argument('-i', '--input', type=str, required=True, help='samtools stats summary file')
    arg_parser.add_argument('-s', '--sample-id', type=str, required=True, help='sample id')
    main(arg_parser.parse_args())
12 changes: 0 additions & 12 deletions bin/qualimap_bamqc_genome_results_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,6 @@ def main(args):
if line.startswith('number of secondary alignments'):
num_secondary_alignments = int(line.split('=')[1].strip().replace(',', ''))
output_data['num_secondary_alignments'] = num_secondary_alignments
if line.startswith('mean insert size'):
mean_insert_size = line.split('=')[1].strip().replace(',', '')
output_data['mean_insert_size'] = float(mean_insert_size)
if line.startswith('median insert size'):
median_insert_size = line.split('=')[1].strip().replace(',', '')
output_data['median_insert_size'] = float(median_insert_size)
if line.startswith('std insert size'):
stdev_insert_size = line.split('=')[1].strip().replace(',', '')
output_data['stdev_insert_size'] = float(stdev_insert_size)
if line.startswith('duplication rate'):
duplication_rate = line.split('=')[1].strip().replace('%', '')
output_data['duplication_rate_percent'] = round(float(duplication_rate), 2)
Expand Down Expand Up @@ -91,9 +82,6 @@ def main(args):
'num_reads',
'num_mapped_reads',
'percent_mapped_reads',
'mean_insert_size',
'median_insert_size',
'stdev_insert_size',
'mean_mapping_quality',
'error_rate',
'number_of_mismatches',
Expand Down
11 changes: 10 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ include { minimap2 } from './modules/alignment_variants.nf
include { freebayes } from './modules/alignment_variants.nf'
include { qualimap_bamqc } from './modules/alignment_variants.nf'
include { samtools_mpileup } from './modules/alignment_variants.nf'
include { samtools_stats } from './modules/alignment_variants.nf'
include { combine_alignment_qc } from './modules/alignment_variants.nf'
include { generate_low_coverage_bed } from './modules/alignment_variants.nf'
include { percent_coverage_by_depth } from './modules/alignment_variants.nf'
include { pipeline_provenance } from './modules/provenance.nf'
Expand Down Expand Up @@ -84,6 +86,10 @@ workflow {

samtools_mpileup(ch_alignments.join(ch_ref))

samtools_stats(ch_alignments)

combine_alignment_qc(qualimap_bamqc.out.alignment_qc.join(samtools_stats.out.stats_summary_csv, by: [0, 1]))

ch_depths = samtools_mpileup.out.depths

generate_low_coverage_bed(ch_depths)
Expand All @@ -93,7 +99,9 @@ workflow {
// Collect multi-sample outputs
if (params.collect_outputs) {
fastp.out.fastp_csv.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_fastp.csv", storeDir: "${params.outdir}")
qualimap_bamqc.out.alignment_qc.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv", storeDir: "${params.outdir}")
qualimap_bamqc.out.alignment_qc.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv", storeDir: "${params.outdir}")
samtools_stats.out.stats_summary_csv.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_samtools_stats_summary.csv", storeDir: "${params.outdir}")
combine_alignment_qc.out.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_combined_alignment_qc.csv", storeDir: "${params.outdir}")
}

// Collect Provenance
Expand All @@ -112,6 +120,7 @@ workflow {
ch_provenance = ch_provenance.join(minimap2.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(qualimap_bamqc.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(samtools_mpileup.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(samtools_stats.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(freebayes.out.provenance).map{ it -> [it[0], it[1] << it[2]] }

collect_provenance(ch_provenance)
Expand Down
Loading

0 comments on commit 206a8e7

Please sign in to comment.