diff --git a/scripts/base_classes.py b/scripts/base_classes.py index ad10157..86ad064 100755 --- a/scripts/base_classes.py +++ b/scripts/base_classes.py @@ -4,7 +4,6 @@ import csv import datetime -import glob import json import os import pathlib @@ -35,10 +34,16 @@ class BaseCNVTool: + """ + Base Class for every CNV caller to inherit from. + + .main method will cause the subclass to run fully + """ def __init__(self, capture, gene, start_time, normal_panel=True): self.session = DbSession.factory() self.start_time = start_time self.capture = capture + # to be overwritten, any extra fields will be added to a json representation in the database just in case self.extra_db_fields = [] self.gene = gene self.script_dirs = [f"{cnv_pat_dir}/{folder}" for folder in ["scripts", "cnv-caller-resources"]] @@ -49,22 +54,30 @@ def __init__(self, capture, gene, start_time, normal_panel=True): # will not be done during pytest running of tests self.max_cpu = cnv_pat_settings["max_cpu"] self.max_mem = cnv_pat_settings["max_mem"] + + # Get paths for different variables self.sample_sheet = f"{cnv_pat_dir}/input/{capture}/sample-sheets/{gene}.txt" self.output_base, self.docker_output_base = self.base_output_dirs() + # Get ids and paths for samples split by those as reference (normal) and to be tested (unknown) normal_sample_ids, normal_bams = utils.SampleUtils.select_samples(self.sample_sheet, normal_panel=True) unknown_sample_ids, unknown_bams = utils.SampleUtils.select_samples(self.sample_sheet, normal_panel=False) self.bam_mount = utils.SampleUtils.get_mount_point(unknown_bams + normal_bams) normal_docker_bams = [f"/mnt/bam-input/{bam.split(self.bam_mount)[-1]}" for bam in normal_bams] unknown_docker_bams = [f"/mnt/bam-input/{bam.split(self.bam_mount)[-1]}" for bam in unknown_bams] + # This dictionary is useful when you only have the bam file, but want the id bam_to_sample = utils.SampleUtils.get_bam_to_id(self.sample_sheet) self.bam_to_sample = { 
f"/mnt/bam-input/{bam.split(self.bam_mount)[-1]}": sample_id for (bam, sample_id) in bam_to_sample.items() } + + # this dictionary is useful when you have the sample id, but want the bam file self.sample_to_bam = {sample_id: bam for (bam, sample_id) in self.bam_to_sample.items()} + # save settings, these will be recorded in the successful-run-settings.toml for each run + # if any of the values in the settings dict change, the run will start again and override the previous one self.settings = { "normal_bams": normal_docker_bams, "ref_fasta": f"/mnt/ref_genome/{cnv_pat_settings['genome_fasta_path'].split('/')[-1]}", @@ -103,6 +116,10 @@ def check_chrom_prefix(self, bed_file): ) def delete_unused_runs(self): + """ + Delete runs using docker so that the root-created (ugh docker for GATK forces you to be root) + directories are deleted + """ logger.info(f"Removing any old or unsuccessful runs for {self.capture}, {self.run_type}, {self.gene}") subprocess.run( [ @@ -192,7 +209,9 @@ def get_normal_panel_duration(self): return duration def parse_output_file(file_path, sample_id=None): - """Dummy method to be overwritten by each CNV-caller class""" + """Dummy method to be overwritten by each CNV-caller class + This will parse the output data into a common format, with the extra_db_fields still in + """ pass @staticmethod @@ -462,16 +481,25 @@ def upload_run_data(self, sample_names): @logger.catch(reraise=True) def main(self): + """ + Looks for a run settings file, if it doesn't exist or has different values to the current settings - run + otherwise skip this caller for this capture and gene. + + Run the workflows and then write to database, finally writing settings toml file so you know that + everything has actually completed properly.
+= """ previous_run_settings_path = ( f"{cnv_pat_dir}/successful-run-settings/{self.capture}/{self.run_type}/{self.gene}.toml" ) if self.run_required(previous_run_settings_path): + # cohort is specifically for the normal panel running if self.run_type.endswith("cohort"): self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"]) self.settings["start_datetime"] = datetime.datetime.now() self.run_workflow() self.settings["end_datetime"] = datetime.datetime.now() else: + # if output will be the final run for this caller, then record the results into the database self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"]) self.settings["start_datetime"] = datetime.datetime.now() output_paths, sample_ids = self.run_workflow() diff --git a/scripts/cnv_kit.py b/scripts/cnv_kit.py index d28e33e..4a06eab 100755 --- a/scripts/cnv_kit.py +++ b/scripts/cnv_kit.py @@ -5,13 +5,10 @@ """ import csv -import subprocess import os import pathlib -import toml - -from . import utils, base_classes +from . import base_classes class CNVKit(base_classes.BaseCNVTool): diff --git a/scripts/decon.py b/scripts/decon.py index 823c723..5e86929 100755 --- a/scripts/decon.py +++ b/scripts/decon.py @@ -9,6 +9,9 @@ class DECoN(base_classes.BaseCNVTool): + """ + DECoN class, main in BaseCNVTool will cause self.run_workflow() to be triggered. 
+ """ def __init__(self, capture, gene, start_time, normal_panel=True): self.run_type = "decon" super().__init__(capture, gene, start_time, normal_panel=normal_panel) @@ -61,6 +64,11 @@ def run_command(self, args): base_classes.logger.info(f"Completed {self.run_type}: {args[0]} {args[-1]}") def run_workflow(self): + """ + Will run entire workflow and return the final output data paths, and the sample names analysed + :return: (output_paths, sample_names) + + """ pathlib.Path(self.output_base).mkdir(parents=True, exist_ok=True) with open(f"{self.output_base}/bams.txt", "w") as handle: diff --git a/scripts/utils.py b/scripts/utils.py index 3f1a0e7..ea57d6b 100755 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -48,8 +48,11 @@ def get_bam_to_id(cls, sample_sheet): return {path: sample_id for (path, sample_id) in zip(paths, sample_ids)} @classmethod - def select_samples(cls, sample_sheet, normal_panel): - """returns (sample_ids, sample_paths) from a gene's sample sheet""" + def select_samples(cls, sample_sheet: str, normal_panel: bool) -> (str, str): + """ + Filters sample sheet ids and paths by normal_panel status + returns (sample_ids, sample_paths) from a gene's sample sheet + """ if normal_panel: cnv_statuses = ["normal-panel"] else: