Skip to content
This repository has been archived by the owner on Sep 13, 2023. It is now read-only.

Commit

Permalink
Merge pull request #61 from stefpiatek/setup_checks
Browse files Browse the repository at this point in the history
Add setup checks
  • Loading branch information
stefpiatek authored Apr 18, 2019
2 parents 0a1a831 + 51fa506 commit ae0840d
Show file tree
Hide file tree
Showing 25 changed files with 350 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ tests/coverage_html/
! tests/coverage_html/.gitkeep
*.gz
*.html
tests/test_files/input/checks/sample*


# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ At least Python3.6 and Docker (at least engine 1.10) are required for this proje
- Only create a sample sheet where you have at least 30 samples which are known not to have CNV, and have samples with known CNVs. You do not need to have a sample sheet for every gene in your capture.
- The column names are:
- sample_id: unique name for the sample
- sample_path: full path to the bam file
- sample_path: full path to the bam file, no '-' allowed in the filename (but in path is fine)
- result_type : samples that are known to have no CNVs can be either `normal-panel` or `normal`. Samples which have a CNV are `positive`
- There should be at least 30 `normal-panel` samples, as many `positive` samples and a similar number of `normal` samples
- Data for positive CNV samples:
Expand Down Expand Up @@ -187,3 +187,28 @@ To run the tests
# in the root directory of cnv-patissier, with your environment activated
python -m pytest tests/
```
## FAQ and common issues
### What will you change?
- The bam files and the reference genome are mounted as read only
- Only the `output` and `test` directories are mounted as writeable
### What is collected in the final SQLite database for sharing?
- Each CNV call, with all metadata from each caller
- The BAM header for each file used, the docker mount path of the BAM file, result type and the sample name
- Information about the run duration and gene of interest
### What can the user change?
- Ideally nothing, the files in the `scripts` and `cnv-caller-resources` directory should never be altered
### BAM index
- Please make sure the BAM is indexed, and that this is newer than the BAM file (touch if necessary)
- Some tools require the index and will fail if it does not exist
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ pytest-cov==2.6.0
pytest-cover==3.0.0
pytest-flake8==1.0.2
six==1.11.0
SQLAlchemy==1.2.15
SQLAlchemy==1.3.3
toml==0.10.0
80 changes: 69 additions & 11 deletions scripts/base_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def __init__(self, capture, gene, start_time, normal_panel=True):
"capture": capture,
"gene": gene,
"start_time": start_time,
# To add in after sure all genes run correctly
# "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet),
"sample_sheet_md5sum": self.get_md5sum(self.sample_sheet)[0],
"capture_path": f"/mnt/input/{capture}/bed/{capture}.bed",
"unknown_bams": unknown_docker_bams,
}
Expand All @@ -88,6 +87,21 @@ def base_output_dirs(self):

return (output_base, docker_output_base)

def check_chrom_prefix(self, bed_file):
    """Validate every line of a BED file against the configured chromosome prefix.

    Builds the set of expected line starts (``chromosome_prefix`` from
    self.settings, followed by a chromosome name and a tab) and raises
    Exception on the first line that does not begin with one of them.
    Blank lines also fail the check.

    :param bed_file: path to the BED file to validate
    :raises Exception: describing the offending line, its number, and the
        expected format, so the user can fix either the settings or the file
    """
    # fixed typo: "chromsome" -> "chromosome" (docstring and loop variable)
    chromosome_names = [x for x in range(1, 23)] + ["X", "Y", "M", "mt"]
    chromosomes = [f"{self.settings['chromosome_prefix']}{chromosome}\t" for chromosome in chromosome_names]
    with open(bed_file, "r") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not any(line.startswith(chrom) for chrom in chromosomes):
                raise Exception(
                    "BED file contains line which has an invalid chromosome:\n"
                    f"Line number: {line_number}\n"
                    "Line: '{}'\n".format(line.replace("\t", "<tab>").rstrip())
                    + "Expected format: '{}'\n".format(chromosomes[0].replace("\t", "<tab>start<tab>end<tab>gene"))
                    + "Please update 'chromosome_prefix' in local settings file, or alter the BED file."
                )

def delete_unused_runs(self):
logger.info(f"Removing any old or unsuccessful runs for {self.capture}, {self.run_type}, {self.gene}")
subprocess.run(
Expand Down Expand Up @@ -156,7 +170,7 @@ def get_bam_header(self, sample_id):
def get_md5sum(self, file_path):
    """Return ``(md5sum, path)`` for *file_path*.

    Runs the system ``md5sum`` binary and rewrites the local cnv-patissier
    directory in the reported path to the portable 'cnv-pat' prefix so the
    stored path is machine-independent.
    """
    result = subprocess.run(["md5sum", file_path], check=True, stdout=subprocess.PIPE)
    checksum, raw_path = result.stdout.decode("utf-8").split()
    return checksum, raw_path.replace(cnv_pat_dir, "cnv-pat")

def get_normal_panel_time(self):
normal_path = (
Expand Down Expand Up @@ -225,6 +239,46 @@ def parse_vcf(input_vcf, sample_id):
cnvs.append(cnv)
return cnvs

def prerun_steps(self, sample_sheet_path, ref_genome_path):
    """
    Returns dictionary of sample_id: bam header
    Checks:
      - filenames have no invalid characters (check_files)
      - file paths exist (check_files)
      - file paths are unique (check_files)
      - sample_ids are unique (check_unique)
      - reference genome files exist (.fasta/.fa, .fai, .dict)
      - SN tag in bam header (from get_bam_header)
      - sample id matches the sample ID given (from get_bam_header)
    """
    bam_headers = {}
    sample_paths = []
    sample_ids = []
    with open(sample_sheet_path, "r") as handle:
        sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
        for line in sample_sheet:
            sample_paths.append(line["sample_path"])
            sample_ids.append(line["sample_id"])

    utils.SampleUtils.check_files(sample_paths)
    utils.SampleUtils.check_unique(sample_ids, "sample_id")

    # The sequence dictionary sits next to the fasta but replaces the
    # .fasta/.fa extension with .dict. BUG FIX: str.rstrip strips a *set of
    # characters*, not a suffix (e.g. "data.fasta".rstrip(".fasta") -> "d"),
    # so trim the extension explicitly instead.
    dict_base = ref_genome_path
    for suffix in (".fasta", ".fa"):
        if dict_base.endswith(suffix):
            dict_base = dict_base[: -len(suffix)]
            break
    required = [
        pathlib.Path(ref_genome_path),
        pathlib.Path(ref_genome_path + ".fai"),
        pathlib.Path(dict_base + ".dict"),
    ]
    for ref_genome in required:
        # raise (not assert) so the check survives `python -O`
        if not ref_genome.exists():
            raise Exception(f"{ref_genome} does not exist\nPlease edit your settings file or create the file")

    for sample_id, sample_path in zip(sample_ids, sample_paths):
        bam_headers[sample_id] = self.get_bam_header(sample_id)
        # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
        time.sleep(5)

    return bam_headers

def process_caller_output(self, sample_path, sample_id=None):
try:
cnvs = self.parse_output_file(sample_path, sample_id)
Expand Down Expand Up @@ -300,12 +354,15 @@ def upload_all_known_data(self):
known_cnv_table = self.upload_samples(self.sample_sheet)
self.upload_positive_cnvs(known_cnv_table)

def upload_all_md5sums(self, run_id):
    """Record the md5sum of every pipeline script for a run.

    Recursively collects all .py and .R files under each directory in
    self.script_dirs and upserts a File row keyed on
    (run_id, relative_path) with the file's md5sum.

    :param run_id: id of the models.Run row these files belong to
    """
    for folder in self.script_dirs:
        folder_path = pathlib.Path(folder)
        # list(...) instead of a redundant [x for x in ...] copy
        scripts = list(folder_path.glob("**/*.py")) + list(folder_path.glob("**/*.R"))
        for file in scripts:
            md5sum, file_path = self.get_md5sum(file)
            # removed leftover debug print(md5sum, file_path)
            Queries.update_or_create(
                models.File, self.session, defaults={"run_id": run_id, "relative_path": file_path}, md5sum=md5sum
            )

def upload_cnv_caller(self):
    """Ensure a Caller row exists in the database for this run type."""
    defaults = {"name": self.run_type}
    Queries.get_or_create(models.Caller, self.session, defaults=defaults)
Expand Down Expand Up @@ -338,9 +395,7 @@ def upload_samples(self, sample_sheet_path):

sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
for line in sample_sheet:
bam_header = self.get_bam_header(line["sample_id"])
# to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
time.sleep(5)
bam_header = self.bam_headers[line["sample_id"]]
sample_defaults = {"name": line["sample_id"], "path": line["sample_path"], "gene_id": gene_instance.id}
sample_data = {"bam_header": bam_header, "result_type": line["result_type"]}

Expand Down Expand Up @@ -400,7 +455,9 @@ def upload_run_data(self, sample_names):

run_defaults = {"gene_id": gene_instance.id, "caller_id": caller_instance.id}
upload_data = {"samples": json.dumps(sample_ids), "duration": duration}
Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
run_instance, created = Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
self.session.commit()
self.upload_all_md5sums(run_instance.id)
self.session.commit()

@logger.catch(reraise=True)
Expand All @@ -410,17 +467,18 @@ def main(self):
)
if self.run_required(previous_run_settings_path):
if self.run_type.endswith("cohort"):
self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
self.settings["start_datetime"] = datetime.datetime.now()
self.run_workflow()
self.settings["end_datetime"] = datetime.datetime.now()
else:
self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
self.settings["start_datetime"] = datetime.datetime.now()
output_paths, sample_ids = self.run_workflow()
self.settings["end_datetime"] = datetime.datetime.now()
self.upload_all_known_data()
self.upload_all_called_cnvs(output_paths, sample_ids)
self.upload_run_data(sample_ids)
# self.upload_file_data()
self.write_settings_toml()

def write_settings_toml(self):
Expand Down
25 changes: 21 additions & 4 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ def get_cnv_patissier_dir():
class SampleUtils:
@classmethod
def check_files(cls, paths):
"""Returns common root path for a list of paths"""
files = [pathlib.Path(path) for path in paths]
""" Takes in a list of paths and raises Exception if:
- File doesn't exist
- Has invalid character in filename
- Paths are not unique
"""
cls.check_unique(paths, "sample_path")

files = [pathlib.Path(path).resolve() for path in paths]
for file in files:
# CODEX2 automatically replaces '-' with '.'
if "-" in file.name:
raise Exception(
f"File {file} has a '-' in which is not allowed, please rename (or make a temporary copy of) "
Expand All @@ -22,8 +30,17 @@ def check_files(cls, paths):
if not file.exists():
raise Exception(f"File {file} does not exist")

@classmethod
def check_unique(cls, items, data_type):
"""If items are not unique, returns exception with duplicate items listed """
non_unique = set(item for item in items if items.count(item) > 1)
if non_unique:
non_unique_out = "\n ".join(non_unique)
raise Exception(f"The the following {data_type}(s) are not unique:\n {non_unique_out}")

@classmethod
def get_bam_to_id(cls, sample_sheet):
"""Returns dictionary of bam paths to sample ids from the sample sheet """
normal_id, normal_path = cls.select_samples(sample_sheet, normal_panel=True)
unknown_id, unknown_path = cls.select_samples(sample_sheet, normal_panel=False)
paths = normal_path + unknown_path
Expand All @@ -47,8 +64,8 @@ def select_samples(cls, sample_sheet, normal_panel):
if sample["result_type"] == cnv_status:
output_ids.append(sample["sample_id"].strip())
output_paths.append(sample["sample_path"].strip())
assert len(set(output_ids)) == len(output_ids), "sample sheet sample_ids must be unique"
assert len(set(output_paths)) == len(output_paths), "sample sheet sample_paths must be unique"
if normal_panel:
assert len(output_ids) >= 30, "There must be 30 normal-panel samples in the sample sheet"
return output_ids, output_paths

@classmethod
Expand Down
26 changes: 10 additions & 16 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def cleanup_after_xhmm():
subprocess.run(["rm", f"{vcf_path}.gz.tbi"], check=True)


@pytest.yield_fixture(scope="session")
@pytest.yield_fixture(scope="class")
def db():
"""Session-wide test database."""

Expand Down Expand Up @@ -110,37 +110,37 @@ def populate_db(db):
"@SQ\tSN:chr22\tLN:51304566",
"@SQ\tSN:chrX\tLN:155270560",
"@SQ\tSN:chrY\tLN:59373566",
"@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:12S13548",
"@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:10S21354",
"@PG\tID:18\tPN:bwa\tCL:bwa\tmem\t-M\t-t\t25\t-R\t@RG\tVN:0.7.15-r1140",
"@PG\tID:SAMBLASTER\tCL:samblaster\t-i\tstdin\t-o\tstdout\tVN:0.1.24\r\n",
]
)
sample_1 = {
"bam_header": bam_header,
"gene_id": 1,
"name": "12S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam",
"name": "10S21354",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam",
"result_type": "positive",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 1}, **sample_1)
sample_2 = {
"bam_header": bam_header.replace("12S13548", "92S13548"),
"bam_header": bam_header.replace("10S21354", "92S13548"),
"gene_id": 1,
"name": "92S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/92S13548_sorted.bam",
"result_type": "normal",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 2}, **sample_2)
sample_3 = {
"bam_header": bam_header.replace("12S13548", "02S13548"),
"bam_header": bam_header.replace("10S21354", "02S13548"),
"gene_id": 1,
"name": "02S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/02S13548_sorted.bam",
"result_type": "normal-panel",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 3}, **sample_3)
sample_4 = {
"bam_header": bam_header.replace("12S13548", "2S13548"),
"bam_header": bam_header.replace("10S21354", "2S13548"),
"gene_id": 2,
"name": "2S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/2S13548_sorted.bam",
Expand Down Expand Up @@ -174,17 +174,11 @@ def populate_db(db):

# files
file_1 = {
"caller_id": 1,
"gene_id": 1,
"relative_path": "scripts/base_classes.py",
"run_id": 1,
"relative_path": "cnv-pat/scripts/base_classes.py",
"md5sum": "cef8890c1c8051d0c87919cf5e30fb54",
}
Queries.update_or_create(models.File, session, defaults={"id": 1}, **file_1)
file_2 = {
"caller_id": 1,
"gene_id": 1,
"relative_path": "scripts/__init__.py",
"md5sum": "d41d8cd98f00b204e9800998ecf8427e",
}
file_2 = {"run_id": 1, "relative_path": "cnv-pat/scripts/__init__.py", "md5sum": "d41d8cd98f00b204e9800998ecf8427e"}
Queries.update_or_create(models.File, session, defaults={"id": 2}, **file_2)
session.commit()
3 changes: 3 additions & 0 deletions tests/test_files/input/bed/chr-prefix.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr19 1206912 1207203 STK11
chr19 1218415 1218500 STK11
chr19 1219322 1219413 STK11
4 changes: 4 additions & 0 deletions tests/test_files/input/bed/chr-prefix_blank.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chr19 1206912 1207203 STK11


chr19 1219322 1219413 STK11
3 changes: 3 additions & 0 deletions tests/test_files/input/bed/no-prefix.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
19 1206912 1207203 STK11
19 1218415 1218500 STK11
19 1219322 1219413 STK11
4 changes: 4 additions & 0 deletions tests/test_files/input/bed/no-prefix_header.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chrom start end gene
19 1206912 1207203 STK11
19 1218415 1218500 STK11
19 1219322 1219413 STK11

This file was deleted.

This file was deleted.

5 changes: 5 additions & 0 deletions tests/test_files/input/capture/sample-sheets/gene-lt_30.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample_id sample_path result_type target_gene chromosome start end genome_build
0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19
1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19
30 /path/30.sorted.bam normal ATM 11 108093559 108236235 hg19
31 /path/31.sorted.bam positive ATM 11 108235809 108236235 hg19
Loading

0 comments on commit ae0840d

Please sign in to comment.