diff --git a/.gitignore b/.gitignore
index 51ce685..bae900d 100755
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@ tests/coverage_html/
 !tests/coverage_html/.gitkeep
 *.gz
 *.html
+tests/test_files/input/checks/sample*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index f6acc11..64c1a8f 100755
--- a/README.md
+++ b/README.md
@@ -153,7 +153,7 @@ At least Python3.6 and Docker (at least engine 1.10) are required for this project
 - Only create a sample sheet where you have at least 30 samples which are known not to have CNV, and have samples with known CNVs. You do not need to have a sample sheet for every gene in your capture.
 - The column names are:
   - sample_id: unique name for the sample
-  - sample_path: full path to the bam file
+  - sample_path: full path to the bam file; the filename itself must not contain '-' (directories in the path may)
   - result_type : samples that are known to have no CNVs can be either `normal-panel` or `normal`. Samples which have a CNV are `positive`
 - There should be at least 30 `normal-panel` samples, as many `positive` samples and a similar number of `normal` samples
 - Data for positive CNV samples:
@@ -187,3 +187,28 @@ To run the tests
 # in the root directory of cnv-patissier, with your environment activated
 python -m pytest tests/
 ```
+
+## FAQ and common issues
+
+### What will cnv-patissier change?
+
+- The bam files and the reference genome are mounted as read-only
+- Only the `output` and `test` directories are mounted as writeable
+
+
+### What is collected in the final SQLite database for sharing?
+
+- Each CNV call, with all metadata from each caller
+- The BAM header of each file used, the Docker mount path of the BAM file, the result type and the sample name
+- Information about the run duration and the gene of interest
+
+
+### What can the user change?
+
+- Ideally nothing; the files in the `scripts` and `cnv-caller-resources` directories should never be altered
+
+
+### BAM index
+
+- Please make sure each BAM file is indexed, and that the index is newer than the BAM file (`touch` the index if necessary)
+- Some tools require the index and will fail if it does not exist
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b2bdfa0..87f0a9a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,5 +22,5 @@ pytest-cov==2.6.0
 pytest-cover==3.0.0
 pytest-flake8==1.0.2
 six==1.11.0
-SQLAlchemy==1.2.15
+SQLAlchemy==1.3.3
 toml==0.10.0
diff --git a/scripts/base_classes.py b/scripts/base_classes.py
index a642bcf..e04b5cb 100755
--- a/scripts/base_classes.py
+++ b/scripts/base_classes.py
@@ -75,8 +75,7 @@ def __init__(self, capture, gene, start_time, normal_panel=True):
             "capture": capture,
             "gene": gene,
             "start_time": start_time,
-            # To add in after sure all genes run correctly
-            # "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet),
+            "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet)[0],
             "capture_path": f"/mnt/input/{capture}/bed/{capture}.bed",
             "unknown_bams": unknown_docker_bams,
         }
@@ -88,6 +87,21 @@ def base_output_dirs(self):
 
         return (output_base, docker_output_base)
 
+    def check_chrom_prefix(self, bed_file):
+        """Raises an exception if the configured chromosome prefix doesn't match the chromosomes in the BED file"""
+        chromosome_names = [x for x in range(1, 23)] + ["X", "Y", "M", "mt"]
+        chromosomes = [f"{self.settings['chromosome_prefix']}{chromosome}\t" for chromosome in chromosome_names]
+        with open(bed_file, "r") as handle:
+            for line_number, line in enumerate(handle, start=1):
+                if not any([line.startswith(chrom) for chrom in chromosomes]):
+                    raise Exception(
+                        "BED file contains a line with an invalid chromosome:\n"
+                        f"Line number: {line_number}\n"
+                        "Line: '{}'\n".format(line.rstrip())
+                        + "Expected format: '{}start\tend\tgene'\n".format(chromosomes[0])
+                        + "Please update 'chromosome_prefix' in the local settings file, or alter the BED file."
+                    )
+
     def delete_unused_runs(self):
         logger.info(f"Removing any old or unsuccessful runs for {self.capture}, {self.run_type}, {self.gene}")
         subprocess.run(
@@ -156,7 +170,7 @@ def get_bam_header(self, sample_id):
     def get_md5sum(self, file_path):
         md5sum_proc = subprocess.run(["md5sum", file_path], check=True, stdout=subprocess.PIPE)
         md5sum, path = str(md5sum_proc.stdout, "utf-8").split()
-        return md5sum, path
+        return md5sum, path.replace(cnv_pat_dir, "cnv-pat")
 
     def get_normal_panel_time(self):
         normal_path = (
@@ -225,6 +239,46 @@ def parse_vcf(input_vcf, sample_id):
             cnvs.append(cnv)
         return cnvs
 
+    def prerun_steps(self, sample_sheet_path, ref_genome_path):
+        """
+        Returns a dictionary of sample_id: BAM header
+        Checks:
+          - filenames have no invalid characters (check_files)
+          - file paths exist (check_files)
+          - file paths are unique (check_files)
+          - sample_ids are unique (check_unique)
+          - reference genome files exist
+          - SN tag is present in the BAM header (from get_bam_header)
+          - the sample in the BAM header matches the sample_id given (from get_bam_header)
+
+        """
+        bam_headers = {}
+        sample_paths = []
+        sample_ids = []
+        with open(sample_sheet_path, "r") as handle:
+            sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
+            for line in sample_sheet:
+                sample_paths.append(line["sample_path"])
+                sample_ids.append(line["sample_id"])
+
+        utils.SampleUtils.check_files(sample_paths)
+        utils.SampleUtils.check_unique(sample_ids, "sample_id")
+        for extension in ["", ".fai", ".dict"]:
+            if extension == ".dict":
+                ref_genome_path = str(pathlib.Path(ref_genome_path).with_suffix(""))
+            ref_genome = pathlib.Path(ref_genome_path + extension)
+            assert (
+                ref_genome.exists()
+            ), f"{ref_genome} does not exist\nPlease edit your settings file or create the file"
+
+        for sample_id, sample_path in zip(sample_ids, sample_paths):
+            bam_header = self.get_bam_header(sample_id)
+            bam_headers[sample_id] = bam_header
+            # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
+            time.sleep(5)
+
+        return bam_headers
+
     def process_caller_output(self, sample_path, sample_id=None):
         try:
             cnvs = self.parse_output_file(sample_path, sample_id)
@@ -300,12 +354,15 @@ def upload_all_known_data(self):
         known_cnv_table = self.upload_samples(self.sample_sheet)
         self.upload_positive_cnvs(known_cnv_table)
 
-    def upload_all_md5sums(self):
+    def upload_all_md5sums(self, run_id):
        for folder in self.script_dirs:
             folder_path = pathlib.Path(folder)
-            for file in folder_path.glob("*.[pR]*"):
+            files = list(folder_path.glob("**/*.py")) + list(folder_path.glob("**/*.R"))
+            for file in files:
                 md5sum, file_path = self.get_md5sum(file)
-                print(md5sum, file_path)
+                Queries.update_or_create(
+                    models.File, self.session, defaults={"run_id": run_id, "relative_path": file_path}, md5sum=md5sum
+                )
 
     def upload_cnv_caller(self):
         Queries.get_or_create(models.Caller, self.session, defaults=dict(name=self.run_type))
@@ -338,9 +395,7 @@ def upload_samples(self, sample_sheet_path):
             sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
 
             for line in sample_sheet:
-                bam_header = self.get_bam_header(line["sample_id"])
-                # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
-                time.sleep(5)
+                bam_header = self.bam_headers[line["sample_id"]]
 
                 sample_defaults = {"name": line["sample_id"], "path": line["sample_path"], "gene_id": gene_instance.id}
                 sample_data = {"bam_header": bam_header, "result_type": line["result_type"]}
@@ -400,7 +455,9 @@ def upload_run_data(self, sample_names):
         run_defaults = {"gene_id": gene_instance.id, "caller_id": caller_instance.id}
         upload_data = {"samples": json.dumps(sample_ids), "duration": duration}
 
-        Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
+        run_instance, created = Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
+        self.session.commit()
+        self.upload_all_md5sums(run_instance.id)
         self.session.commit()
 
     @logger.catch(reraise=True)
@@ -410,17 +467,18 @@ def main(self):
         )
         if self.run_required(previous_run_settings_path):
             if self.run_type.endswith("cohort"):
+                self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
                 self.settings["start_datetime"] = datetime.datetime.now()
                 self.run_workflow()
                 self.settings["end_datetime"] = datetime.datetime.now()
             else:
+                self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
                 self.settings["start_datetime"] = datetime.datetime.now()
                 output_paths, sample_ids = self.run_workflow()
                 self.settings["end_datetime"] = datetime.datetime.now()
                 self.upload_all_known_data()
                 self.upload_all_called_cnvs(output_paths, sample_ids)
                 self.upload_run_data(sample_ids)
-                # self.upload_file_data()
                 self.write_settings_toml()
 
     def write_settings_toml(self):
diff --git a/scripts/utils.py b/scripts/utils.py
index c529fc1..3f1a0e7 100755
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -11,9 +11,17 @@ def get_cnv_patissier_dir():
 class SampleUtils:
     @classmethod
     def check_files(cls, paths):
-        """Returns common root path for a list of paths"""
-        files = [pathlib.Path(path) for path in paths]
+        """Takes in a list of paths and raises an Exception if:
+            - a file doesn't exist
+            - a filename contains an invalid character
+            - the paths are not unique
+
+        """
+        cls.check_unique(paths, "sample_path")
+
+        files = [pathlib.Path(path).resolve() for path in paths]
         for file in files:
+            # CODEX2 automatically replaces '-' with '.'
if "-" in file.name: raise Exception( f"File {file} has a '-' in which is not allowed, please rename (or make a temporary copy of) " @@ -22,8 +30,17 @@ def check_files(cls, paths): if not file.exists(): raise Exception(f"File {file} does not exist") + @classmethod + def check_unique(cls, items, data_type): + """If items are not unique, returns exception with duplicate items listed """ + non_unique = set(item for item in items if items.count(item) > 1) + if non_unique: + non_unique_out = "\n ".join(non_unique) + raise Exception(f"The the following {data_type}(s) are not unique:\n {non_unique_out}") + @classmethod def get_bam_to_id(cls, sample_sheet): + """Returns dictionary of bam paths to sample ids from the sample sheet """ normal_id, normal_path = cls.select_samples(sample_sheet, normal_panel=True) unknown_id, unknown_path = cls.select_samples(sample_sheet, normal_panel=False) paths = normal_path + unknown_path @@ -47,8 +64,8 @@ def select_samples(cls, sample_sheet, normal_panel): if sample["result_type"] == cnv_status: output_ids.append(sample["sample_id"].strip()) output_paths.append(sample["sample_path"].strip()) - assert len(set(output_ids)) == len(output_ids), "sample sheet sample_ids must be unique" - assert len(set(output_paths)) == len(output_paths), "sample sheet sample_paths must be unique" + if normal_panel: + assert len(output_ids) >= 30, "There must be 30 normal-panel samples in the sample sheet" return output_ids, output_paths @classmethod diff --git a/tests/conftest.py b/tests/conftest.py index eb4b848..8b9277c 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ def cleanup_after_xhmm(): subprocess.run(["rm", f"{vcf_path}.gz.tbi"], check=True) -@pytest.yield_fixture(scope="session") +@pytest.yield_fixture(scope="class") def db(): """Session-wide test database.""" @@ -110,7 +110,7 @@ def populate_db(db): "@SQ\tSN:chr22\tLN:51304566", "@SQ\tSN:chrX\tLN:155270560", "@SQ\tSN:chrY\tLN:59373566", - "@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:12S13548", + "@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:10S21354", "@PG\tID:18\tPN:bwa\tCL:bwa\tmem\t-M\t-t\t25\t-R\t@RG\tVN:0.7.15-r1140", "@PG\tID:SAMBLASTER\tCL:samblaster\t-i\tstdin\t-o\tstdout\tVN:0.1.24\r\n", ] @@ -118,13 +118,13 @@ def populate_db(db): sample_1 = { "bam_header": bam_header, "gene_id": 1, - "name": "12S13548", - "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam", + "name": "10S21354", + "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam", "result_type": "positive", } Queries.update_or_create(models.Sample, session, defaults={"id": 1}, **sample_1) sample_2 = { - "bam_header": bam_header.replace("12S13548", "92S13548"), + "bam_header": bam_header.replace("10S21354", "92S13548"), "gene_id": 1, "name": "92S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/92S13548_sorted.bam", @@ -132,7 +132,7 @@ def populate_db(db): } Queries.update_or_create(models.Sample, session, defaults={"id": 2}, **sample_2) sample_3 = { - "bam_header": bam_header.replace("12S13548", "02S13548"), + "bam_header": bam_header.replace("10S21354", "02S13548"), "gene_id": 1, "name": "02S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/02S13548_sorted.bam", @@ -140,7 +140,7 @@ def populate_db(db): } Queries.update_or_create(models.Sample, session, defaults={"id": 3}, **sample_3) sample_4 = { - "bam_header": bam_header.replace("12S13548", "2S13548"), 
+ "bam_header": bam_header.replace("10S21354", "2S13548"), "gene_id": 2, "name": "2S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/2S13548_sorted.bam", @@ -174,17 +174,11 @@ def populate_db(db): # files file_1 = { - "caller_id": 1, - "gene_id": 1, - "relative_path": "scripts/base_classes.py", + "run_id": 1, + "relative_path": "cnv-pat/scripts/base_classes.py", "md5sum": "cef8890c1c8051d0c87919cf5e30fb54", } Queries.update_or_create(models.File, session, defaults={"id": 1}, **file_1) - file_2 = { - "caller_id": 1, - "gene_id": 1, - "relative_path": "scripts/__init__.py", - "md5sum": "d41d8cd98f00b204e9800998ecf8427e", - } + file_2 = {"run_id": 1, "relative_path": "cnv-pat/scripts/__init__.py", "md5sum": "d41d8cd98f00b204e9800998ecf8427e"} Queries.update_or_create(models.File, session, defaults={"id": 2}, **file_2) session.commit() diff --git a/tests/test_files/input/bed/chr-prefix.bed b/tests/test_files/input/bed/chr-prefix.bed new file mode 100755 index 0000000..289c25a --- /dev/null +++ b/tests/test_files/input/bed/chr-prefix.bed @@ -0,0 +1,3 @@ +chr19 1206912 1207203 STK11 +chr19 1218415 1218500 STK11 +chr19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/chr-prefix_blank.bed b/tests/test_files/input/bed/chr-prefix_blank.bed new file mode 100755 index 0000000..2989614 --- /dev/null +++ b/tests/test_files/input/bed/chr-prefix_blank.bed @@ -0,0 +1,4 @@ +chr19 1206912 1207203 STK11 + + +chr19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/no-prefix.bed b/tests/test_files/input/bed/no-prefix.bed new file mode 100755 index 0000000..e21cca3 --- /dev/null +++ b/tests/test_files/input/bed/no-prefix.bed @@ -0,0 +1,3 @@ +19 1206912 1207203 STK11 +19 1218415 1218500 STK11 +19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/no-prefix_header.bed b/tests/test_files/input/bed/no-prefix_header.bed new file mode 100755 index 0000000..3f14d2a --- /dev/null +++ b/tests/test_files/input/bed/no-prefix_header.bed @@ -0,0 +1,4 @@ +chrom start end gene +19 1206912 1207203 STK11 +19 1218415 1218500 STK11 +19 1219322 1219413 STK11 diff --git a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt b/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt deleted file mode 100755 index b020624..0000000 --- a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt +++ /dev/null @@ -1,7 +0,0 @@ -sample_id sample_path result_type target_gene chromosome start end genome_build -17328 /path/17327.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17329.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17398 /path/17398.sorted.bam normal ATM 11 108093559 108236235 hg19 -17334 /path/17334.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt b/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt deleted file mode 100755 index 87b9027..0000000 --- a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt +++ /dev/null @@ -1,7 +0,0 @@ -sample_id sample_path result_type target_gene chromosome start end genome_build -17327 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17329 /path/17328.sorted.bam 
normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17398 /path/17398.sorted.bam normal ATM 11 108093559 108236235 hg19 -17334 /path/17334.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt b/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt new file mode 100755 index 0000000..9d7155b --- /dev/null +++ b/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt @@ -0,0 +1,5 @@ +sample_id sample_path result_type target_gene chromosome start end genome_build +0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +30 /path/30.sorted.bam normal ATM 11 108093559 108236235 hg19 +31 /path/31.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene_samples.txt b/tests/test_files/input/capture/sample-sheets/gene_samples.txt index fcce9cc..be527b5 100755 --- a/tests/test_files/input/capture/sample-sheets/gene_samples.txt +++ b/tests/test_files/input/capture/sample-sheets/gene_samples.txt @@ -1,7 +1,33 @@ sample_id sample_path result_type target_gene chromosome start end genome_build -17327 /path/17327.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17329 /path/17329.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17331 /path/17331.sorted.bam normal ATM 11 108093559 108236235 hg19 -17332 /path/17332.sorted.bam positive ATM 11 108235809 108236235 hg19 +0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +2 /path/2.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +3 /path/3.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +4 /path/4.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +5 /path/5.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +6 /path/6.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +7 /path/7.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +8 /path/8.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +9 /path/9.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +10 /path/10.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +11 /path/11.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +12 /path/12.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +13 /path/13.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +14 /path/14.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +15 /path/15.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +16 /path/16.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +17 /path/17.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +18 /path/18.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +19 /path/19.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +20 /path/20.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +21 /path/21.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +22 /path/22.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +23 /path/23.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +24 /path/24.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +25 /path/25.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +26 /path/26.sorted.bam normal-panel ATM 11 108093559 
+27	/path/27.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+28	/path/28.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+29	/path/29.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+30	/path/30.sorted.bam	normal	ATM	11	108093559	108236235	hg19
+31	/path/31.sorted.bam	positive	ATM	11	108235809	108236235	hg19
diff --git a/tests/test_files/input/checks/.gitkeep b/tests/test_files/input/checks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/input/gene_1.txt b/tests/test_files/input/gene_1.txt
index 79e9cd6..184534f 100755
--- a/tests/test_files/input/gene_1.txt
+++ b/tests/test_files/input/gene_1.txt
@@ -1,3 +1,3 @@
 sample_id	sample_path	result_type	target_gene	capture_name	chromosome	start	end	cnv_call
 12S13548	/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam	positive	gene_1	ICR	chr17	1200	1500	DUP
-10S21354	/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam	normal-panel	gene_1	ICR	chr17	NA	NA	NA
+12S13548	/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam	normal-panel	gene_1	ICR	chr17	NA	NA	NA
diff --git a/tests/test_files/reference/genome.dict b/tests/test_files/reference/genome.dict
new file mode 100755
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome.fa b/tests/test_files/reference/genome.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome.fa.fai b/tests/test_files/reference/genome.fa.fai
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_dict.fa b/tests/test_files/reference/genome_no_dict.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_dict.fa.fai b/tests/test_files/reference/genome_no_dict.fa.fai
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_fai.dict b/tests/test_files/reference/genome_no_fai.dict
new file mode 100755
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_fai.fa b/tests/test_files/reference/genome_no_fai.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_base_classes.py b/tests/unit/test_base_classes.py
index 060c195..d98db51 100755
--- a/tests/unit/test_base_classes.py
+++ b/tests/unit/test_base_classes.py
@@ -1,15 +1,50 @@
+import pathlib
 import subprocess
 
 import pytest
 
 from scripts.base_classes import BaseCNVTool
-from scripts import models
+from scripts import models, utils
+
+cnv_pat_dir = utils.get_cnv_patissier_dir()
 
 
 def instance_data(instance):
     return {k: v for k, v in instance.__dict__.items() if k != "_sa_instance_state"}
 
 
+@pytest.mark.usefixtures("db", "db_session")
+class TestCheckChromPrefix:
+    def setup(self):
+        self.caller = BaseCNVTool("capture", "gene", "time")
+        self.caller.settings = {"chromosome_prefix": "chr"}
+
+    def test_chr_prefix_working(self):
+        self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix.bed")
+
+    def test_chr_prefix_mismatch(self):
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix.bed")
+
+    def test_blank_lines(self):
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix_blank.bed")
+
+    def test_no_prefix_working(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix.bed")
+
+    def test_no_prefix_mismatch(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix.bed")
+
+    def test_header(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix_header.bed")
+
+
 @pytest.mark.usefixtures("db", "db_session")
 class TestFilterCNVs:
     def setup(self):
@@ -82,15 +117,22 @@ class TestGetBAMHeader:
     def setup(self):
         self.caller = BaseCNVTool("capture", "gene", "time")
         self.caller.run_type = "example_type"
-        self.caller.sample_to_bam = {"sample_1": "/mnt/test_files/input/bam_header.bam"}
+        self.caller.sample_to_bam = {
+            "12S13548": "/mnt/test_files/input/bam_header.bam",
+            "sample_mismatch": "/mnt/test_files/input/bam_header.bam",
+        }
         self.caller.bam_mount = "/mnt/data/"
 
     def test_basic(self):
         with open("tests/test_files/input/bam_header.sam") as handle:
             header_list = handle.readlines()
         expected_header = "".join(header_list).replace("\n", "\r\n")
-        output_header = self.caller.get_bam_header("sample_1")
-        assert expected_header == output_header
+        output_header = self.caller.get_bam_header("12S13548")
+        assert expected_header == output_header
+
+    def test_sample_mismatch(self):
+        with pytest.raises(AssertionError):
+            self.caller.get_bam_header("sample_mismatch")
 
 
 @pytest.mark.usefixtures("db", "db_session")
@@ -100,15 +142,47 @@ def setup(self):
         self.caller.run_type = "example_type"
 
     def test_simple(self):
-        expected = ("d41d8cd98f00b204e9800998ecf8427e", "input/.gitkeep")
-        output = self.caller.get_md5sum("input/.gitkeep")
+        expected = ("d41d8cd98f00b204e9800998ecf8427e", "cnv-pat/input/.gitkeep")
+        output = self.caller.get_md5sum(pathlib.Path("input/.gitkeep").resolve())
         assert expected == output
 
     def test_missing_file(self):
         with pytest.raises(subprocess.CalledProcessError):
            output = self.caller.get_md5sum("does_not_exist.txt")
 
 
-@pytest.mark.dev
+
+@pytest.mark.usefixtures("db", "db_session", "populate_db")
+class TestPreRunSteps:
+    def setup(self):
+        self.test_file_prefix = f"{cnv_pat_dir}/tests/test_files/input/checks"
+
+        with open(f"{self.test_file_prefix}/sample_sheet_working.txt", "w") as handle:
+            handle.write("sample_id\tsample_path\n")
+            handle.write(f"12S13548\t{cnv_pat_dir}/tests/test_files/input/bam_header.bam\n")
+
+        self.caller = BaseCNVTool("ICR", "gene_1", "time")
+        self.caller.bam_mount = "/mnt/data/"
+        self.caller.run_type = "example_type"
+        self.caller.sample_to_bam = {"12S13548": "/mnt/test_files/input/bam_header.bam"}
+        self.header_file = "tests/test_files/input/bam_header.sam"
+        with open("tests/test_files/input/bam_header.sam") as handle:
+            header_list = handle.readlines()
+
+        self.expected_header = {"12S13548": "".join(header_list).replace("\n", "\r\n")}
+
+    def test_header(self):
+        header = self.caller.prerun_steps(
+            f"{self.test_file_prefix}/sample_sheet_working.txt", "tests/test_files/reference/genome.fa"
+        )
+        assert header == self.expected_header
+
+    @pytest.mark.parametrize("genome", ["no_genome.fa", "genome_no_fai.fa", "genome_no_dict.fa"])
+    def test_missing_reference(self, genome):
+        with pytest.raises(AssertionError):
+            genome_path = f"tests/test_files/reference/{genome}"
+            self.caller.prerun_steps(f"{self.test_file_prefix}/sample_sheet_working.txt", genome_path)
+
+
 @pytest.mark.usefixtures("db", "db_session", "populate_db")
 class TestUploadAllMd5sums:
     def setup(self):
@@ -116,9 +190,10 @@ def setup(self):
         self.caller.run_type = "example_type"
 
     def test_working(self):
-        self.caller.upload_all_md5sums()
-        assert False
-
+        before_upload = self.caller.session.query(models.File).all()
+        self.caller.upload_all_md5sums(1)
+        after_upload = self.caller.session.query(models.File).all()
+        assert len(before_upload) < len(after_upload)
 
 
 @pytest.mark.usefixtures("db", "db_session", "populate_db")
@@ -189,10 +264,7 @@ def setup(self):
         }
         self.caller.bam_mount = "/mnt/data/"
         self.caller.settings = {"genome_build_name": "hg19"}
-        self.header_file = "tests/test_files/input/bam_header.sam"
-        with open("tests/test_files/input/bam_header.sam") as handle:
-            header_list = handle.readlines()
-        self.expected_header = "".join(header_list).replace("\n", "\r\n")
+        self.caller.bam_headers = {"12S13548": "header1", "10S21354": "header2"}
         self.sample_sheet = "tests/test_files/input/gene_1.txt"
         self.expected_output = [
             {
@@ -206,21 +278,15 @@ def setup(self):
         ]
 
     def test_basic(self):
-        """Existing instance should stay the same, 10S shouldn't exist, then be uploaded
+        """Existing instance should stay the same, 12S13548 shouldn't exist, then be uploaded
 
         Data returned should be a list of dictionaries for upload of known cnv information from sample sheet"""
-        existing_1 = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
-        existing_data_1 = instance_data(existing_1)
-        no_instance = self.caller.session.query(models.Sample).filter_by(name="10S21354").first()
+        no_instance = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
 
         output_table = self.caller.upload_samples(self.sample_sheet)
 
         uploaded_1 = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
-        uploaded_2 = self.caller.session.query(models.Sample).filter_by(name="10S21354").first()
 
-        assert existing_data_1 == instance_data(uploaded_1)
-        assert no_instance is None
-        assert uploaded_2.name == "10S21354"
-        assert uploaded_2.bam_header == existing_data_1["bam_header"]
-        assert uploaded_2.result_type == "normal-panel"
+        assert not no_instance
+        assert uploaded_1.name == "12S13548"
         assert output_table == self.expected_output
 
@@ -233,9 +299,9 @@ def setup(self):
             {
                 "cnv": {"alt": "DUP", "genome_build": "hg19", "chrom": "chr17", "start": "1200", "end": "1500"},
                 "sample_defaults": {
-                    "name": "12S13548",
+                    "name": "10S21354",
                     "gene_id": 1,
-                    "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam",
+                    "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam",
                 },
             }
         ]
@@ -277,7 +343,7 @@ def setup(self):
             "start": "10",
             "end": "120",
             "chrom": "chr1",
-            "sample_id": "12S13548",
+            "sample_id": "10S21354",
             "alt": "DEL",
             "json_data": {"extra_field1": "extra_data1", "extra_field2": "extra_data2"},
         }
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index b916ab1..e1ddace 100755
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,3 +1,5 @@
+import pathlib
+
 import pytest
 
 from scripts import utils
@@ -5,31 +7,95 @@
 cnv_pat_dir = utils.get_cnv_patissier_dir()
 
 
+class TestCheckFiles:
+    def setup(self):
+        self.test_file_prefix = f"{cnv_pat_dir}/tests/test_files/input/checks"
+
+        for sample in range(40):
+            with open(f"{self.test_file_prefix}/sample{sample}.txt", "w") as handle:
+                handle.write("dummy")
+
+    def test_no_issues(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(40)]
+        utils.SampleUtils.check_files(paths)
+
+    def test_dash_in_name(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample-{sample}.txt") for sample in range(1)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files(paths)
+
+    def test_file_doesnt_exist(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(50, 60)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files(paths)
+
+    def test_not_unique(self):
+        path_1 = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(30)]
+        path_2 = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(10, 12)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files([*path_1, *path_2])
+
+
+class TestCheckUnique:
+    def test_no_issues(self):
+        samples = [f"sample_{number}" for number in range(50)]
+        utils.SampleUtils.check_unique(samples, "samples")
+
+    def test_not_unique(self):
+        sample_1 = [f"sample_{number}" for number in range(50)]
+        sample_2 = [f"sample_{number}" for number in range(10, 12)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_unique([*sample_1, *sample_2], "samples")
+
+
+class TestGetBAMtoID:
+    def setup(self):
+        self.test_file = f"{cnv_pat_dir}/tests/test_files/input/checks/sample_sheet_bam_to_id.txt"
+        expected_ids = {}
+        with open(self.test_file, "w") as handle:
+            handle.write("sample_id\tsample_path\tresult_type\n")
+
+            for number in range(35):
+                if number < 30:
+                    result = "normal-panel"
+                elif number % 2:
+                    result = "positive"
+                else:
+                    result = "normal"
+                sample = f"sample{number}"
+                path = f"input/sample{number}.bam"
+                handle.write(f"{sample}\t{path}\t{result}\n")
+
+                expected_ids[path] = sample
+
+        self.expected_ids = expected_ids
+
+    def test_working(self):
+        bam_to_id = utils.SampleUtils.get_bam_to_id(self.test_file)
+        assert bam_to_id == self.expected_ids
+
+
 class TestSampleUtilsSelectSamples:
     def setup(self):
         self.sample_path_prefix = f"{cnv_pat_dir}/tests/test_files/input/capture/sample-sheets/gene"
 
     def test_normal_panel(self):
-        expected_ids = [str(number) for number in range(17327, 17331)]
-        expected_paths = [f"/path/{number}.sorted.bam" for number in range(17327, 17331)]
+        expected_ids = [str(number) for number in range(0, 30)]
+        expected_paths = [f"/path/{number}.sorted.bam" for number in range(0, 30)]
         ids, paths = utils.SampleUtils.select_samples(f"{self.sample_path_prefix}_samples.txt", normal_panel=True)
         assert ids == expected_ids
         assert paths == expected_paths
 
     def test_unknown_cases(self):
-        expected_ids = [str(number) for number in range(17331, 17333)]
-        expected_paths = [f"/path/{number}.sorted.bam" for number in range(17331, 17333)]
+        expected_ids = [str(number) for number in range(30, 32)]
+        expected_paths = [f"/path/{number}.sorted.bam" for number in range(30, 32)]
         ids, paths = utils.SampleUtils.select_samples(f"{self.sample_path_prefix}_samples.txt", normal_panel=False)
         assert ids == expected_ids
         assert paths == expected_paths
 
-    def test_dup_id(self):
-        with pytest.raises(AssertionError):
-            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-dup-sample-id_samples.txt", True)
-
-    def test_dup_path(self):
+    def test_lt_30_normal_panel(self):
         with pytest.raises(AssertionError):
-            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-dup-sample-path_samples.txt", True)
+            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-lt_30.txt", True)
 
 
 class TestSampleUtilsGetMountPoint: