From a75153a2dd2d6f5e290edfa2dd0899fd725f0f25 Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Fri, 10 May 2019 10:05:09 +0100 Subject: [PATCH 1/2] Add panelcn.MOPS Dockerfile Also break up copwriter Dockerfile steps so that they can be reused --- docker-images/copywriter/Dockerfile | 5 +++-- docker-images/panelcn-mops/Dockerfile | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100755 docker-images/panelcn-mops/Dockerfile diff --git a/docker-images/copywriter/Dockerfile b/docker-images/copywriter/Dockerfile index ac63f8d..6a13fdc 100755 --- a/docker-images/copywriter/Dockerfile +++ b/docker-images/copywriter/Dockerfile @@ -4,7 +4,8 @@ RUN apt-get update \ && apt-get install -y --no-install-recommends libcurl4-openssl-dev libxml2-dev libssl-dev \ && rm -rf /var/lib/apt/lists/* -RUN R -e "install.packages(c('optparse', 'dplyr', 'tidyr', 'stringr', 'BiocManager')); \ - BiocManager::install('CopywriteR', version = '3.8');" +RUN R -e "install.packages(c('optparse', 'dplyr', 'tidyr', 'stringr', 'BiocManager'));" + +RUN R -e "BiocManager::install('CopywriteR', version = '3.8');" CMD ["bash"] diff --git a/docker-images/panelcn-mops/Dockerfile b/docker-images/panelcn-mops/Dockerfile new file mode 100755 index 0000000..a340618 --- /dev/null +++ b/docker-images/panelcn-mops/Dockerfile @@ -0,0 +1,12 @@ +FROM r-base:3.5.1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends libcurl4-openssl-dev libxml2-dev libssl-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN R -e "install.packages(c('optparse', 'dplyr', 'tidyr', 'stringr', 'BiocManager'));" + +RUN R -e "install.packages(c('purrr')); \ + BiocManager::install('panelcn.mops', version = '3.8');" + +CMD ["bash"] From 7e25fb7f3590a6fdb12db543a68b7d6c47e192da Mon Sep 17 00:00:00 2001 From: Stef Piatek Date: Fri, 10 May 2019 10:07:23 +0100 Subject: [PATCH 2/2] Add panelcn.MOPS to callers --- .../panelcn_mops/panelcn_mops_runner.R | 62 ++++++++++++++++ cnv_patissier.py | 27 +++++-- scripts/panelcn_mops.py | 70 +++++++++++++++++++ .../output_parsing/panelcn_mops/results.txt | 5 ++ tests/unit/test_panelcn_mops.py | 61 ++++++++++++++++ 5 files changed, 218 insertions(+), 7 deletions(-) create mode 100755 cnv-caller-resources/panelcn_mops/panelcn_mops_runner.R create mode 100755 scripts/panelcn_mops.py create mode 100755 tests/test_files/output_parsing/panelcn_mops/results.txt create mode 100755 tests/unit/test_panelcn_mops.py diff --git a/cnv-caller-resources/panelcn_mops/panelcn_mops_runner.R b/cnv-caller-resources/panelcn_mops/panelcn_mops_runner.R new file mode 100755 index 0000000..63f414c --- /dev/null +++ b/cnv-caller-resources/panelcn_mops/panelcn_mops_runner.R @@ -0,0 +1,62 @@ +library(dplyr) +library(optparse) +library(panelcn.mops) +library(purrr) + + +detect_cnvs <- function(sample_number) { + " + Function to detect CNV for each column of unknown Granges object + Takes sample_number + Returns table of only positive CNVs with sample name + " + # Set up test data + test_and_control <- unknowns[, sample_number] + elementMetadata(test_and_control) <- cbind(elementMetadata(test_and_control), elementMetadata(normals)) + # Detect CNVs + result_list <- runPanelcnMops(test_and_control, countWindows = count_windows, selectedGenes = selected_gene, maxControls = 30) + # Output results from S4 object to table + file_names <- colnames(elementMetadata(unknowns)) + raw_results <- createResultTable( + resultlist = result_list, XandCB = test_and_control, + countWindows = count_windows, + selectedGenes = selected_gene, + sampleNames = file_names + ) + + # Process table for output + cnv_results <- raw_results[[1]] %>% + filter(CN != "CN2") %>% + mutate(Sample = unknown_samples$sample_name[sample_number]) %>% + return() +} + + +# Parse options +option_list <- list( + make_option(c("--output-path"), type = "character"), + make_option(c("--gene"), type = "character"), + make_option(c("--chrom-prefix"), type = "character", default = NULL) +) +opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) + +# Set up parameters +bed <- file.path(opt$output_path, "capture.bed") +split_bed <- file.path(opt$output_path, "capture_split.bed") +splitROIs(bed, split_bed) + +all_samples <- read.table(file.path(opt$output_path, "samples.tsv"), header = TRUE, stringsAsFactors = FALSE) +unknown_samples <- all_samples[all_samples$sample_type == "unknown", ] +normal_samples <- all_samples[all_samples$sample_type == "normal_panel", ] +selected_gene <- opt$gene + +# Get counts +count_windows <- getWindows(split_bed) +normals <- countBamListInGRanges(countWindows = count_windows, bam.files = normal_samples$bam_path, read.width = 150) +unknowns <- countBamListInGRanges(countWindows = count_windows, bam.files = unknown_samples$bam_path, read.width = 150) + +# Run CNV calling +called_cnvs <- map_dfr(.x = seq(1, nrow(unknown_samples)), .f = detect_cnvs) + +# Save results +write.table(called_cnvs, file = file.path(opt$output_path, "calls.tsv"), sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/cnv_patissier.py b/cnv_patissier.py index fe22c55..b99b311 100755 --- a/cnv_patissier.py +++ b/cnv_patissier.py @@ -3,14 +3,26 @@ import pathlib from scripts.db_session import DbSession -from scripts import utils, copywriter, codex2, cnv_kit, decon, excavator2, exome_depth, gatk, savvy_cnv, xhmm - +from scripts import ( + utils, + copywriter, + codex2, + cnv_kit, + decon, + excavator2, + exome_depth, + gatk, + panelcn_mops, + savvy_cnv, + xhmm, +) def init_db(capture_name): db_path = f"{utils.get_cnv_patissier_dir()}/output/{capture_name}.sqlite" DbSession.global_init(db_path) + if __name__ == "__main__": parser = ArgumentParser(description="Ochrestrating your CNV-caller bakeoff") parser.add_argument("capture_name", help="After following the setup in the README.md, please give the capture name") @@ -18,7 +30,7 @@ def init_db(capture_name): start_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%m-%S") - capture_name = args.capture_name + capture_name = args.capture_name init_db(capture_name) cnv_pat_dir = utils.get_cnv_patissier_dir() sample_sheet_path = pathlib.Path(cnv_pat_dir, "input", capture_name, "sample-sheets") @@ -26,11 +38,11 @@ def init_db(capture_name): genes = [path.stem for path in list(sample_sheet_path.glob("*.txt"))] assert genes, "No genes found in path!" - for gene in genes: + for gene in sorted(genes): # cnv_caller = cnv_kit.CNVKit(capture_name, gene, start_time) # cnv_caller.main() cnv_caller = codex2.CODEX2(capture_name, gene, start_time) - cnv_caller.main() + cnv_caller.main() cnv_caller = copywriter.Copywriter(capture_name, gene, start_time) cnv_caller.main() cnv_caller = decon.DECoN(capture_name, gene, start_time) @@ -45,10 +57,11 @@ def init_db(capture_name): cnv_caller.main() cnv_caller = gatk.GATKCase(capture_name, gene, start_time) cnv_caller.main() + cnv_caller = panelcn_mops.panelcnMOPS(capture_name, gene, start_time) + cnv_caller.main() cnv_caller = savvy_cnv.SavvyCNV(capture_name, gene, start_time) cnv_caller.main() cnv_caller = xhmm.XHMM(capture_name, gene, start_time) cnv_caller.main() - -print("Congrats, you're all done") \ No newline at end of file + print("Congrats, you're all done") diff --git a/scripts/panelcn_mops.py b/scripts/panelcn_mops.py new file mode 100755 index 0000000..f2074bf --- /dev/null +++ b/scripts/panelcn_mops.py @@ -0,0 +1,70 @@ +""" +panelcn.MOPS set up using basic instructions in manual https://bioconductor.org/packages/release/bioc/vignettes/panelcn.mops/inst/doc/panelcn.mops.pdf + +Used splitting of BED file with defaults (100bp bins with 50 bp overlap) +""" + +import csv +import pathlib + +from . import base_classes + + +class panelcnMOPS(base_classes.BaseCNVTool): + def __init__(self, capture, gene, start_time, normal_panel=True): + self.run_type = "panelcn_mops" + super().__init__(capture, gene, start_time, normal_panel=normal_panel) + self.extra_db_fields = ["gene", "exon", "rc", "medrc", "rc.norm", "medrc.norm", "lowqual", "cn"] + self.settings = {**self.settings, "docker_image": "stefpiatek/panelcn_mops:1.4.0"} + + def parse_output_file(self, file_path, sample_id): + cnvs = [] + with open(file_path, "r") as handle: + output = csv.DictReader(handle, delimiter="\t") + for row in output: + if row["Sample"] == sample_id: + cnv = {key.lower(): value for key, value in row.items()} + cnv["chrom"] = f"{self.settings['chromosome_prefix']}{cnv.pop('chr')}" + cnv["sample_id"] = cnv.pop("sample") + copy_number = int(cnv["cn"].lstrip("CN")) + if copy_number < 2: + cnv["alt"] = "DEL" + elif copy_number > 2: + cnv["alt"] = "DUP" + else: + raise Exception(f"row doesn't have a copy number change {cnv}") + cnvs.append(cnv) + + return cnvs + + def run_command(self, args): + self.run_docker_subprocess(["Rscript", f"/mnt/cnv-caller-resources/panelcn_mops/panelcn_mops_runner.R", *args]) + + def run_workflow(self): + bed_path = self.settings["capture_path"].replace("/mnt", base_classes.cnv_pat_dir) + + pathlib.Path(self.output_base).mkdir(parents=True, exist_ok=True) + + with open(bed_path, "r") as input_bed: + with open(f"{self.output_base}/capture.bed", "w") as output_bed: + for line in input_bed: + chrom, start, end, gene = line.split() + output_bed.write(f"{chrom}\t{start}\t{end}\t{gene}.{chrom}.{start}.{end}\n") + + with open(f"{self.output_base}/samples.tsv", "w") as handle: + handle.write(f"bam_path\tsample_name\tsample_type\n") + for bam in self.settings["unknown_bams"] + self.settings["normal_bams"]: + sample = self.bam_to_sample[bam] + if bam in self.settings["unknown_bams"]: + sample_type = "unknown" + else: + sample_type = "normal_panel" + + handle.write(f"{bam}\t{sample}\t{sample_type}\n") + + self.run_command([f"--output-path={self.docker_output_base}", f"--gene={self.gene}"]) + + sample_names = [f"{self.bam_to_sample[unknown_bam]}" for unknown_bam in self.settings["unknown_bams"]] + output_paths = [f"{self.output_base}/calls.tsv" for sample_name in sample_names] + + return output_paths, sample_names diff --git a/tests/test_files/output_parsing/panelcn_mops/results.txt b/tests/test_files/output_parsing/panelcn_mops/results.txt new file mode 100755 index 0000000..0d60e10 --- /dev/null +++ b/tests/test_files/output_parsing/panelcn_mops/results.txt @@ -0,0 +1,5 @@ +Sample Chr Gene Exon Start End RC medRC RC.norm medRC.norm lowQual CN +sample_1 2 test test.NA.chr2.5700.5750 5700 5750 1430 2546 1537 2508 CN1 +sample_1 17 test test.NA.chr17.100.4000 100 4000 1430 2546 1537 2508 CN1 +sample_2 2 test test.NA.chr2.5700.5750 5700 5750 5430 2546 3537 2508 CN3 +sample_2 17 test test.NA.chr17.100.4000 100 4000 5430 2546 3537 2508 CN3 diff --git a/tests/unit/test_panelcn_mops.py b/tests/unit/test_panelcn_mops.py new file mode 100755 index 0000000..8d67ed2 --- /dev/null +++ b/tests/unit/test_panelcn_mops.py @@ -0,0 +1,61 @@ +import pathlib + +from scripts.panelcn_mops import panelcnMOPS + + +class TestParseOutputFile: + def setup(self): + self.caller = panelcnMOPS("capture", "gene_1", "time") + self.caller.settings = {"chromosome_prefix": "chr"} + self.output = pathlib.Path("tests/test_files/output_parsing/panelcn_mops/results.txt") + self.del_expected_output = [ + { + "chrom": "chr2", + "start": "5700", + "end": "5750", + "gene": "test", + "exon": "test.NA.chr2.5700.5750", + "alt": "DEL", + "sample_id": "sample_1", + "rc": "1430", + "medrc": "2546", + "rc.norm": "1537", + "medrc.norm": "2508", + "lowqual": "", + "cn": "CN1", + }, + { + "chrom": "chr17", + "start": "100", + "end": "4000", + "gene": "test", + "exon": "test.NA.chr17.100.4000", + "alt": "DEL", + "sample_id": "sample_1", + "rc": "1430", + "medrc": "2546", + "rc.norm": "1537", + "medrc.norm": "2508", + "lowqual": "", + "cn": "CN1", + }, + ] + + def test_del(self): + parsed = self.caller.parse_output_file(self.output, "sample_1") + assert parsed == self.del_expected_output + + def test_dup(self): + expected_output = list(self.del_expected_output) + for row in expected_output: + row["rc"] = "5430" + row["rc.norm"] = "3537" + row["alt"] = "DUP" + row["sample_id"] = "sample_2" + row["cn"] = "CN3" + parsed = self.caller.parse_output_file(self.output, "sample_2") + assert parsed == expected_output + + def test_normal(self): + parsed = self.caller.parse_output_file(self.output, "sample_3") + assert parsed == []