Skip to content
This repository has been archived by the owner on Sep 13, 2023. It is now read-only.

Commit

Permalink
Merge pull request #61 from stefpiatek/setup_checks
Browse files Browse the repository at this point in the history
Add setup checks
  • Loading branch information
stefpiatek authored Apr 18, 2019
2 parents 0a1a831 + 51fa506 commit ae0840d
Show file tree
Hide file tree
Showing 25 changed files with 350 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ tests/coverage_html/
! tests/coverage_html/.gitkeep
*.gz
*.html
tests/test_files/input/checks/sample*


# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ At least Python3.6 and Docker (at least engine 1.10) are required for this proje
- Only create a sample sheet where you have at least 30 samples which are known not to have CNV, and have samples with known CNVs. You do not need to have a sample sheet for every gene in your capture.
- The column names are:
- sample_id: unique name for the sample
- sample_path: full path to the bam file
- sample_path: full path to the bam file, no '-' allowed in the filename (but in path is fine)
- result_type : samples that are known to have no CNVs can be either `normal-panel` or `normal`. Samples which have a CNV are `positive`
- There should be at least 30 `normal-panel` samples, as many `positive` samples and a similar number of `normal` samples
- Data for positive CNV samples:
Expand Down Expand Up @@ -187,3 +187,28 @@ To run the tests
# in the root directory of cnv-patissier, with your environment activated
python -m pytest tests/
```
## FAQ and common issues
### What will you change?
- The bam files and the reference genome are mounted as read only
- Only the `output` and `test` directories are mounted as writeable
### What is collected in the final SQLite database for sharing?
- Each CNV call, with all metadata from each caller
- The BAM header for each file used, the docker mount path of the BAM file, result type and the sample name
- Information about the run duration and gene of interest
### What can the user change?
- Ideally nothing, the files in the `scripts` and `cnv-caller-resources` directory should never be altered
### BAM index
- Please make sure the BAM is indexed, and that this is newer than the BAM file (touch if necessary)
- Some tools require the index and will fail if it does not exist
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ pytest-cov==2.6.0
pytest-cover==3.0.0
pytest-flake8==1.0.2
six==1.11.0
SQLAlchemy==1.2.15
SQLAlchemy==1.3.3
toml==0.10.0
80 changes: 69 additions & 11 deletions scripts/base_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def __init__(self, capture, gene, start_time, normal_panel=True):
"capture": capture,
"gene": gene,
"start_time": start_time,
# To add in after sure all genes run correctly
# "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet),
"sample_sheet_md5sum": self.get_md5sum(self.sample_sheet)[0],
"capture_path": f"/mnt/input/{capture}/bed/{capture}.bed",
"unknown_bams": unknown_docker_bams,
}
Expand All @@ -88,6 +87,21 @@ def base_output_dirs(self):

return (output_base, docker_output_base)

def check_chrom_prefix(self, bed_file):
    """Validate every line of a BED file against the configured chromosome prefix.

    Builds the set of expected line starts (``chromosome_prefix`` from
    self.settings, followed by a chromosome name and a tab) and raises
    Exception on the first line that does not begin with one of them.
    Blank lines also fail the check.

    :param bed_file: path to the BED file to validate
    :raises Exception: describing the offending line, its number, and the
        expected format, so the user can fix either the settings or the file
    """
    # fixed typo: "chromsome" -> "chromosome" (docstring and loop variable)
    chromosome_names = [x for x in range(1, 23)] + ["X", "Y", "M", "mt"]
    chromosomes = [f"{self.settings['chromosome_prefix']}{chromosome}\t" for chromosome in chromosome_names]
    with open(bed_file, "r") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not any(line.startswith(chrom) for chrom in chromosomes):
                raise Exception(
                    "BED file contains line which has an invalid chromosome:\n"
                    f"Line number: {line_number}\n"
                    "Line: '{}'\n".format(line.replace("\t", "<tab>").rstrip())
                    + "Expected format: '{}'\n".format(chromosomes[0].replace("\t", "<tab>start<tab>end<tab>gene"))
                    + "Please update 'chromosome_prefix' in local settings file, or alter the BED file."
                )

def delete_unused_runs(self):
logger.info(f"Removing any old or unsuccessful runs for {self.capture}, {self.run_type}, {self.gene}")
subprocess.run(
Expand Down Expand Up @@ -156,7 +170,7 @@ def get_bam_header(self, sample_id):
def get_md5sum(self, file_path):
    """Return ``(md5sum, path)`` for *file_path*.

    Runs the system ``md5sum`` binary and rewrites the local cnv-patissier
    directory in the reported path to the portable 'cnv-pat' prefix so the
    stored path is machine-independent.
    """
    result = subprocess.run(["md5sum", file_path], check=True, stdout=subprocess.PIPE)
    checksum, raw_path = result.stdout.decode("utf-8").split()
    return checksum, raw_path.replace(cnv_pat_dir, "cnv-pat")

def get_normal_panel_time(self):
normal_path = (
Expand Down Expand Up @@ -225,6 +239,46 @@ def parse_vcf(input_vcf, sample_id):
cnvs.append(cnv)
return cnvs

def prerun_steps(self, sample_sheet_path, ref_genome_path):
    """
    Returns dictionary of sample_id: bam header
    Checks:
      - filenames have no invalid characters (check_files)
      - file paths exist (check_files)
      - file paths are unique (check_files)
      - sample_ids are unique (check_unique)
      - reference genome files exist (.fasta/.fa, .fai, .dict)
      - SN tag in bam header (from get_bam_header)
      - sample id matches the sample ID given (from get_bam_header)
    """
    bam_headers = {}
    sample_paths = []
    sample_ids = []
    with open(sample_sheet_path, "r") as handle:
        sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
        for line in sample_sheet:
            sample_paths.append(line["sample_path"])
            sample_ids.append(line["sample_id"])

    utils.SampleUtils.check_files(sample_paths)
    utils.SampleUtils.check_unique(sample_ids, "sample_id")

    # The sequence dictionary sits next to the fasta but replaces the
    # .fasta/.fa extension with .dict. BUG FIX: str.rstrip strips a *set of
    # characters*, not a suffix (e.g. "data.fasta".rstrip(".fasta") -> "d"),
    # so trim the extension explicitly instead.
    dict_base = ref_genome_path
    for suffix in (".fasta", ".fa"):
        if dict_base.endswith(suffix):
            dict_base = dict_base[: -len(suffix)]
            break
    required = [
        pathlib.Path(ref_genome_path),
        pathlib.Path(ref_genome_path + ".fai"),
        pathlib.Path(dict_base + ".dict"),
    ]
    for ref_genome in required:
        # raise (not assert) so the check survives `python -O`
        if not ref_genome.exists():
            raise Exception(f"{ref_genome} does not exist\nPlease edit your settings file or create the file")

    for sample_id, sample_path in zip(sample_ids, sample_paths):
        bam_headers[sample_id] = self.get_bam_header(sample_id)
        # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
        time.sleep(5)

    return bam_headers

def process_caller_output(self, sample_path, sample_id=None):
try:
cnvs = self.parse_output_file(sample_path, sample_id)
Expand Down Expand Up @@ -300,12 +354,15 @@ def upload_all_known_data(self):
known_cnv_table = self.upload_samples(self.sample_sheet)
self.upload_positive_cnvs(known_cnv_table)

def upload_all_md5sums(self, run_id):
    """Record the md5sum of every pipeline script for a run.

    Recursively collects all .py and .R files under each directory in
    self.script_dirs and upserts a File row keyed on
    (run_id, relative_path) with the file's md5sum.

    :param run_id: id of the models.Run row these files belong to
    """
    for folder in self.script_dirs:
        folder_path = pathlib.Path(folder)
        # list(...) instead of a redundant [x for x in ...] copy
        scripts = list(folder_path.glob("**/*.py")) + list(folder_path.glob("**/*.R"))
        for file in scripts:
            md5sum, file_path = self.get_md5sum(file)
            # removed leftover debug print(md5sum, file_path)
            Queries.update_or_create(
                models.File, self.session, defaults={"run_id": run_id, "relative_path": file_path}, md5sum=md5sum
            )

def upload_cnv_caller(self):
    """Ensure a Caller row exists in the database for this run type."""
    defaults = {"name": self.run_type}
    Queries.get_or_create(models.Caller, self.session, defaults=defaults)
Expand Down Expand Up @@ -338,9 +395,7 @@ def upload_samples(self, sample_sheet_path):

sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
for line in sample_sheet:
bam_header = self.get_bam_header(line["sample_id"])
# to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
time.sleep(5)
bam_header = self.bam_headers[line["sample_id"]]
sample_defaults = {"name": line["sample_id"], "path": line["sample_path"], "gene_id": gene_instance.id}
sample_data = {"bam_header": bam_header, "result_type": line["result_type"]}

Expand Down Expand Up @@ -400,7 +455,9 @@ def upload_run_data(self, sample_names):

run_defaults = {"gene_id": gene_instance.id, "caller_id": caller_instance.id}
upload_data = {"samples": json.dumps(sample_ids), "duration": duration}
Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
run_instance, created = Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
self.session.commit()
self.upload_all_md5sums(run_instance.id)
self.session.commit()

@logger.catch(reraise=True)
Expand All @@ -410,17 +467,18 @@ def main(self):
)
if self.run_required(previous_run_settings_path):
if self.run_type.endswith("cohort"):
self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
self.settings["start_datetime"] = datetime.datetime.now()
self.run_workflow()
self.settings["end_datetime"] = datetime.datetime.now()
else:
self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
self.settings["start_datetime"] = datetime.datetime.now()
output_paths, sample_ids = self.run_workflow()
self.settings["end_datetime"] = datetime.datetime.now()
self.upload_all_known_data()
self.upload_all_called_cnvs(output_paths, sample_ids)
self.upload_run_data(sample_ids)
# self.upload_file_data()
self.write_settings_toml()

def write_settings_toml(self):
Expand Down
25 changes: 21 additions & 4 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ def get_cnv_patissier_dir():
class SampleUtils:
@classmethod
def check_files(cls, paths):
"""Returns common root path for a list of paths"""
files = [pathlib.Path(path) for path in paths]
""" Takes in a list of paths and raises Exception if:
- File doesn't exist
- Has invalid character in filename
- Paths are not unique
"""
cls.check_unique(paths, "sample_path")

files = [pathlib.Path(path).resolve() for path in paths]
for file in files:
# CODEX2 automatically replaces '-' with '.'
if "-" in file.name:
raise Exception(
f"File {file} has a '-' in which is not allowed, please rename (or make a temporary copy of) "
Expand All @@ -22,8 +30,17 @@ def check_files(cls, paths):
if not file.exists():
raise Exception(f"File {file} does not exist")

@classmethod
def check_unique(cls, items, data_type):
"""If items are not unique, returns exception with duplicate items listed """
non_unique = set(item for item in items if items.count(item) > 1)
if non_unique:
non_unique_out = "\n ".join(non_unique)
raise Exception(f"The the following {data_type}(s) are not unique:\n {non_unique_out}")

@classmethod
def get_bam_to_id(cls, sample_sheet):
"""Returns dictionary of bam paths to sample ids from the sample sheet """
normal_id, normal_path = cls.select_samples(sample_sheet, normal_panel=True)
unknown_id, unknown_path = cls.select_samples(sample_sheet, normal_panel=False)
paths = normal_path + unknown_path
Expand All @@ -47,8 +64,8 @@ def select_samples(cls, sample_sheet, normal_panel):
if sample["result_type"] == cnv_status:
output_ids.append(sample["sample_id"].strip())
output_paths.append(sample["sample_path"].strip())
assert len(set(output_ids)) == len(output_ids), "sample sheet sample_ids must be unique"
assert len(set(output_paths)) == len(output_paths), "sample sheet sample_paths must be unique"
if normal_panel:
assert len(output_ids) >= 30, "There must be 30 normal-panel samples in the sample sheet"
return output_ids, output_paths

@classmethod
Expand Down
26 changes: 10 additions & 16 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def cleanup_after_xhmm():
subprocess.run(["rm", f"{vcf_path}.gz.tbi"], check=True)


@pytest.yield_fixture(scope="session")
@pytest.yield_fixture(scope="class")
def db():
"""Session-wide test database."""

Expand Down Expand Up @@ -110,37 +110,37 @@ def populate_db(db):
"@SQ\tSN:chr22\tLN:51304566",
"@SQ\tSN:chrX\tLN:155270560",
"@SQ\tSN:chrY\tLN:59373566",
"@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:12S13548",
"@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:10S21354",
"@PG\tID:18\tPN:bwa\tCL:bwa\tmem\t-M\t-t\t25\t-R\t@RG\tVN:0.7.15-r1140",
"@PG\tID:SAMBLASTER\tCL:samblaster\t-i\tstdin\t-o\tstdout\tVN:0.1.24\r\n",
]
)
sample_1 = {
"bam_header": bam_header,
"gene_id": 1,
"name": "12S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam",
"name": "10S21354",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam",
"result_type": "positive",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 1}, **sample_1)
sample_2 = {
"bam_header": bam_header.replace("12S13548", "92S13548"),
"bam_header": bam_header.replace("10S21354", "92S13548"),
"gene_id": 1,
"name": "92S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/92S13548_sorted.bam",
"result_type": "normal",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 2}, **sample_2)
sample_3 = {
"bam_header": bam_header.replace("12S13548", "02S13548"),
"bam_header": bam_header.replace("10S21354", "02S13548"),
"gene_id": 1,
"name": "02S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/02S13548_sorted.bam",
"result_type": "normal-panel",
}
Queries.update_or_create(models.Sample, session, defaults={"id": 3}, **sample_3)
sample_4 = {
"bam_header": bam_header.replace("12S13548", "2S13548"),
"bam_header": bam_header.replace("10S21354", "2S13548"),
"gene_id": 2,
"name": "2S13548",
"path": "/mnt/data/181225_NB503215_run/analysis/Alignments/2S13548_sorted.bam",
Expand Down Expand Up @@ -174,17 +174,11 @@ def populate_db(db):

# files
file_1 = {
"caller_id": 1,
"gene_id": 1,
"relative_path": "scripts/base_classes.py",
"run_id": 1,
"relative_path": "cnv-pat/scripts/base_classes.py",
"md5sum": "cef8890c1c8051d0c87919cf5e30fb54",
}
Queries.update_or_create(models.File, session, defaults={"id": 1}, **file_1)
file_2 = {
"caller_id": 1,
"gene_id": 1,
"relative_path": "scripts/__init__.py",
"md5sum": "d41d8cd98f00b204e9800998ecf8427e",
}
file_2 = {"run_id": 1, "relative_path": "cnv-pat/scripts/__init__.py", "md5sum": "d41d8cd98f00b204e9800998ecf8427e"}
Queries.update_or_create(models.File, session, defaults={"id": 2}, **file_2)
session.commit()
3 changes: 3 additions & 0 deletions tests/test_files/input/bed/chr-prefix.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr19 1206912 1207203 STK11
chr19 1218415 1218500 STK11
chr19 1219322 1219413 STK11
4 changes: 4 additions & 0 deletions tests/test_files/input/bed/chr-prefix_blank.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chr19 1206912 1207203 STK11


chr19 1219322 1219413 STK11
3 changes: 3 additions & 0 deletions tests/test_files/input/bed/no-prefix.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
19 1206912 1207203 STK11
19 1218415 1218500 STK11
19 1219322 1219413 STK11
4 changes: 4 additions & 0 deletions tests/test_files/input/bed/no-prefix_header.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
chrom start end gene
19 1206912 1207203 STK11
19 1218415 1218500 STK11
19 1219322 1219413 STK11

This file was deleted.

This file was deleted.

5 changes: 5 additions & 0 deletions tests/test_files/input/capture/sample-sheets/gene-lt_30.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
sample_id sample_path result_type target_gene chromosome start end genome_build
0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19
1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19
30 /path/30.sorted.bam normal ATM 11 108093559 108236235 hg19
31 /path/31.sorted.bam positive ATM 11 108235809 108236235 hg19
Loading

0 comments on commit ae0840d

Please sign in to comment.