diff --git a/.gitignore b/.gitignore
index 51ce685..bae900d 100755
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@ tests/coverage_html/
 !tests/coverage_html/.gitkeep
 *.gz
 *.html
+tests/test_files/input/checks/sample*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index f6acc11..64c1a8f 100755
--- a/README.md
+++ b/README.md
@@ -153,7 +153,7 @@ At least Python3.6 and Docker (at least engine 1.10) are required for this project
 - Only create a sample sheet where you have at least 30 samples which are known not to have CNV, and have samples with known CNVs. You do not need to have a sample sheet for every gene in your capture.
 - The column names are:
   - sample_id: unique name for the sample
-  - sample_path: full path to the bam file
+  - sample_path: full path to the bam file; the filename itself must not contain '-' (directories in the path may)
   - result_type : samples that are known to have no CNVs can be either `normal-panel` or `normal`. Samples which have a CNV are `positive`
 - There should be at least 30 `normal-panel` samples, as many `positive` samples and a similar number of `normal` samples
 - Data for positive CNV samples:
@@ -187,3 +187,28 @@ To run the tests
 # in the root directory of cnv-patissier, with your environment activated
 python -m pytest tests/
 ```
+
+## FAQ and common issues
+
+### What will cnv-patissier change?
+
+- The bam files and the reference genome are mounted as read-only
+- Only the `output` and `test` directories are mounted as writeable
+
+
+### What is collected in the final SQLite database for sharing?
+
+- Each CNV call, with all metadata from each caller
+- The BAM header of each file used, the Docker mount path of the BAM file, the result type and the sample name
+- Information about the run duration and the gene of interest
+
+
+### What can the user change?
+
+- Ideally nothing; the files in the `scripts` and `cnv-caller-resources` directories should never be altered
+
+
+### BAM index
+
+- Please make sure each BAM file is indexed, and that the index is newer than the BAM file (`touch` the index if necessary)
+- Some tools require the index and will fail if it does not exist
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b2bdfa0..87f0a9a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,5 +22,5 @@ pytest-cov==2.6.0
 pytest-cover==3.0.0
 pytest-flake8==1.0.2
 six==1.11.0
-SQLAlchemy==1.2.15
+SQLAlchemy==1.3.3
 toml==0.10.0
diff --git a/scripts/base_classes.py b/scripts/base_classes.py
index a642bcf..e04b5cb 100755
--- a/scripts/base_classes.py
+++ b/scripts/base_classes.py
@@ -75,8 +75,7 @@ def __init__(self, capture, gene, start_time, normal_panel=True):
             "capture": capture,
             "gene": gene,
             "start_time": start_time,
-            # To add in after sure all genes run correctly
-            # "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet),
+            "sample_sheet_md5sum": self.get_md5sum(self.sample_sheet)[0],
             "capture_path": f"/mnt/input/{capture}/bed/{capture}.bed",
             "unknown_bams": unknown_docker_bams,
         }
@@ -88,6 +87,21 @@ def base_output_dirs(self):
 
         return (output_base, docker_output_base)
 
+    def check_chrom_prefix(self, bed_file):
+        """Raises an exception if the configured chromosome prefix doesn't match the chromosomes in the BED file"""
+        chromosome_names = [x for x in range(1, 23)] + ["X", "Y", "M", "mt"]
+        chromosomes = [f"{self.settings['chromosome_prefix']}{chromosome}\t" for chromosome in chromosome_names]
+        with open(bed_file, "r") as handle:
+            for line_number, line in enumerate(handle, start=1):
+                if not any([line.startswith(chrom) for chrom in chromosomes]):
+                    raise Exception(
+                        "BED file contains a line with an invalid chromosome:\n"
+                        f"Line number: {line_number}\n"
+                        "Line: '{}'\n".format(line.rstrip())
+                        + "Expected format: '{}start\tend\tgene'\n".format(chromosomes[0])
+                        + "Please update 'chromosome_prefix' in the local settings file, or alter the BED file."
+                    )
+
     def delete_unused_runs(self):
         logger.info(f"Removing any old or unsuccessful runs for {self.capture}, {self.run_type}, {self.gene}")
         subprocess.run(
@@ -156,7 +170,7 @@ def get_bam_header(self, sample_id):
     def get_md5sum(self, file_path):
         md5sum_proc = subprocess.run(["md5sum", file_path], check=True, stdout=subprocess.PIPE)
         md5sum, path = str(md5sum_proc.stdout, "utf-8").split()
-        return md5sum, path
+        return md5sum, path.replace(cnv_pat_dir, "cnv-pat")
 
     def get_normal_panel_time(self):
         normal_path = (
@@ -225,6 +239,46 @@ def parse_vcf(input_vcf, sample_id):
             cnvs.append(cnv)
         return cnvs
 
+    def prerun_steps(self, sample_sheet_path, ref_genome_path):
+        """
+        Returns a dictionary of sample_id: BAM header
+        Checks:
+          - filenames have no invalid characters (check_files)
+          - file paths exist (check_files)
+          - file paths are unique (check_files)
+          - sample_ids are unique (check_unique)
+          - reference genome files exist
+          - SN tag is present in the BAM header (from get_bam_header)
+          - the sample in the BAM header matches the sample_id given (from get_bam_header)
+
+        """
+        bam_headers = {}
+        sample_paths = []
+        sample_ids = []
+        with open(sample_sheet_path, "r") as handle:
+            sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
+            for line in sample_sheet:
+                sample_paths.append(line["sample_path"])
+                sample_ids.append(line["sample_id"])
+
+        utils.SampleUtils.check_files(sample_paths)
+        utils.SampleUtils.check_unique(sample_ids, "sample_id")
+        for extension in ["", ".fai", ".dict"]:
+            if extension == ".dict":
+                ref_genome_path = str(pathlib.Path(ref_genome_path).with_suffix(""))
+            ref_genome = pathlib.Path(ref_genome_path + extension)
+            assert (
+                ref_genome.exists()
+            ), f"{ref_genome} does not exist\nPlease edit your settings file or create the file"
+
+        for sample_id, sample_path in zip(sample_ids, sample_paths):
+            bam_header = self.get_bam_header(sample_id)
+            bam_headers[sample_id] = bam_header
+            # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
+            time.sleep(5)
+
+        return bam_headers
+
     def process_caller_output(self, sample_path, sample_id=None):
         try:
             cnvs = self.parse_output_file(sample_path, sample_id)
@@ -300,12 +354,15 @@ def upload_all_known_data(self):
         known_cnv_table = self.upload_samples(self.sample_sheet)
         self.upload_positive_cnvs(known_cnv_table)
 
-    def upload_all_md5sums(self):
+    def upload_all_md5sums(self, run_id):
        for folder in self.script_dirs:
             folder_path = pathlib.Path(folder)
-            for file in folder_path.glob("*.[pR]*"):
+            files = list(folder_path.glob("**/*.py")) + list(folder_path.glob("**/*.R"))
+            for file in files:
                 md5sum, file_path = self.get_md5sum(file)
-                print(md5sum, file_path)
+                Queries.update_or_create(
+                    models.File, self.session, defaults={"run_id": run_id, "relative_path": file_path}, md5sum=md5sum
+                )
 
     def upload_cnv_caller(self):
         Queries.get_or_create(models.Caller, self.session, defaults=dict(name=self.run_type))
@@ -338,9 +395,7 @@ def upload_samples(self, sample_sheet_path):
             sample_sheet = csv.DictReader(handle, dialect="excel", delimiter="\t")
 
             for line in sample_sheet:
-                bam_header = self.get_bam_header(line["sample_id"])
-                # to avoid `docker: Error response from daemon: container did not start before the specified timeout.`
-                time.sleep(5)
+                bam_header = self.bam_headers[line["sample_id"]]
 
                 sample_defaults = {"name": line["sample_id"], "path": line["sample_path"], "gene_id": gene_instance.id}
                 sample_data = {"bam_header": bam_header, "result_type": line["result_type"]}
@@ -400,7 +455,9 @@ def upload_run_data(self, sample_names):
         run_defaults = {"gene_id": gene_instance.id, "caller_id": caller_instance.id}
         upload_data = {"samples": json.dumps(sample_ids), "duration": duration}
 
-        Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
+        run_instance, created = Queries.update_or_create(models.Run, self.session, defaults=run_defaults, **upload_data)
+        self.session.commit()
+        self.upload_all_md5sums(run_instance.id)
         self.session.commit()
 
     @logger.catch(reraise=True)
@@ -410,17 +467,18 @@ def main(self):
         )
         if self.run_required(previous_run_settings_path):
             if self.run_type.endswith("cohort"):
+                self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
                 self.settings["start_datetime"] = datetime.datetime.now()
                 self.run_workflow()
                 self.settings["end_datetime"] = datetime.datetime.now()
             else:
+                self.bam_headers = self.prerun_steps(self.sample_sheet, cnv_pat_settings["genome_fasta_path"])
                 self.settings["start_datetime"] = datetime.datetime.now()
                 output_paths, sample_ids = self.run_workflow()
                 self.settings["end_datetime"] = datetime.datetime.now()
                 self.upload_all_known_data()
                 self.upload_all_called_cnvs(output_paths, sample_ids)
                 self.upload_run_data(sample_ids)
-                # self.upload_file_data()
                 self.write_settings_toml()
 
     def write_settings_toml(self):
diff --git a/scripts/utils.py b/scripts/utils.py
index c529fc1..3f1a0e7 100755
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -11,9 +11,17 @@ def get_cnv_patissier_dir():
 class SampleUtils:
     @classmethod
     def check_files(cls, paths):
-        """Returns common root path for a list of paths"""
-        files = [pathlib.Path(path) for path in paths]
+        """Takes in a list of paths and raises an Exception if:
+            - a file doesn't exist
+            - a filename contains an invalid character
+            - the paths are not unique
+
+        """
+        cls.check_unique(paths, "sample_path")
+
+        files = [pathlib.Path(path).resolve() for path in paths]
         for file in files:
+            # CODEX2 automatically replaces '-' with '.'
if "-" in file.name: raise Exception( f"File {file} has a '-' in which is not allowed, please rename (or make a temporary copy of) " @@ -22,8 +30,17 @@ def check_files(cls, paths): if not file.exists(): raise Exception(f"File {file} does not exist") + @classmethod + def check_unique(cls, items, data_type): + """If items are not unique, returns exception with duplicate items listed """ + non_unique = set(item for item in items if items.count(item) > 1) + if non_unique: + non_unique_out = "\n ".join(non_unique) + raise Exception(f"The the following {data_type}(s) are not unique:\n {non_unique_out}") + @classmethod def get_bam_to_id(cls, sample_sheet): + """Returns dictionary of bam paths to sample ids from the sample sheet """ normal_id, normal_path = cls.select_samples(sample_sheet, normal_panel=True) unknown_id, unknown_path = cls.select_samples(sample_sheet, normal_panel=False) paths = normal_path + unknown_path @@ -47,8 +64,8 @@ def select_samples(cls, sample_sheet, normal_panel): if sample["result_type"] == cnv_status: output_ids.append(sample["sample_id"].strip()) output_paths.append(sample["sample_path"].strip()) - assert len(set(output_ids)) == len(output_ids), "sample sheet sample_ids must be unique" - assert len(set(output_paths)) == len(output_paths), "sample sheet sample_paths must be unique" + if normal_panel: + assert len(output_ids) >= 30, "There must be 30 normal-panel samples in the sample sheet" return output_ids, output_paths @classmethod diff --git a/tests/conftest.py b/tests/conftest.py index eb4b848..8b9277c 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ def cleanup_after_xhmm(): subprocess.run(["rm", f"{vcf_path}.gz.tbi"], check=True) -@pytest.yield_fixture(scope="session") +@pytest.yield_fixture(scope="class") def db(): """Session-wide test database.""" @@ -110,7 +110,7 @@ def populate_db(db): "@SQ\tSN:chr22\tLN:51304566", "@SQ\tSN:chrX\tLN:155270560", "@SQ\tSN:chrY\tLN:59373566", - "@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:12S13548", + "@RG\tID:18\tCN:GOSH\tDS:2018-12-25\tDT:2019-01-01\tLB:L001\tPG:PipelineV1\tPL:NB503215\tSM:10S21354", "@PG\tID:18\tPN:bwa\tCL:bwa\tmem\t-M\t-t\t25\t-R\t@RG\tVN:0.7.15-r1140", "@PG\tID:SAMBLASTER\tCL:samblaster\t-i\tstdin\t-o\tstdout\tVN:0.1.24\r\n", ] @@ -118,13 +118,13 @@ def populate_db(db): sample_1 = { "bam_header": bam_header, "gene_id": 1, - "name": "12S13548", - "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam", + "name": "10S21354", + "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam", "result_type": "positive", } Queries.update_or_create(models.Sample, session, defaults={"id": 1}, **sample_1) sample_2 = { - "bam_header": bam_header.replace("12S13548", "92S13548"), + "bam_header": bam_header.replace("10S21354", "92S13548"), "gene_id": 1, "name": "92S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/92S13548_sorted.bam", @@ -132,7 +132,7 @@ def populate_db(db): } Queries.update_or_create(models.Sample, session, defaults={"id": 2}, **sample_2) sample_3 = { - "bam_header": bam_header.replace("12S13548", "02S13548"), + "bam_header": bam_header.replace("10S21354", "02S13548"), "gene_id": 1, "name": "02S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/02S13548_sorted.bam", @@ -140,7 +140,7 @@ def populate_db(db): } Queries.update_or_create(models.Sample, session, defaults={"id": 3}, **sample_3) sample_4 = { - "bam_header": bam_header.replace("12S13548", "2S13548"), 
+ "bam_header": bam_header.replace("10S21354", "2S13548"), "gene_id": 2, "name": "2S13548", "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/2S13548_sorted.bam", @@ -174,17 +174,11 @@ def populate_db(db): # files file_1 = { - "caller_id": 1, - "gene_id": 1, - "relative_path": "scripts/base_classes.py", + "run_id": 1, + "relative_path": "cnv-pat/scripts/base_classes.py", "md5sum": "cef8890c1c8051d0c87919cf5e30fb54", } Queries.update_or_create(models.File, session, defaults={"id": 1}, **file_1) - file_2 = { - "caller_id": 1, - "gene_id": 1, - "relative_path": "scripts/__init__.py", - "md5sum": "d41d8cd98f00b204e9800998ecf8427e", - } + file_2 = {"run_id": 1, "relative_path": "cnv-pat/scripts/__init__.py", "md5sum": "d41d8cd98f00b204e9800998ecf8427e"} Queries.update_or_create(models.File, session, defaults={"id": 2}, **file_2) session.commit() diff --git a/tests/test_files/input/bed/chr-prefix.bed b/tests/test_files/input/bed/chr-prefix.bed new file mode 100755 index 0000000..289c25a --- /dev/null +++ b/tests/test_files/input/bed/chr-prefix.bed @@ -0,0 +1,3 @@ +chr19 1206912 1207203 STK11 +chr19 1218415 1218500 STK11 +chr19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/chr-prefix_blank.bed b/tests/test_files/input/bed/chr-prefix_blank.bed new file mode 100755 index 0000000..2989614 --- /dev/null +++ b/tests/test_files/input/bed/chr-prefix_blank.bed @@ -0,0 +1,4 @@ +chr19 1206912 1207203 STK11 + + +chr19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/no-prefix.bed b/tests/test_files/input/bed/no-prefix.bed new file mode 100755 index 0000000..e21cca3 --- /dev/null +++ b/tests/test_files/input/bed/no-prefix.bed @@ -0,0 +1,3 @@ +19 1206912 1207203 STK11 +19 1218415 1218500 STK11 +19 1219322 1219413 STK11 diff --git a/tests/test_files/input/bed/no-prefix_header.bed b/tests/test_files/input/bed/no-prefix_header.bed new file mode 100755 index 0000000..3f14d2a --- /dev/null +++ b/tests/test_files/input/bed/no-prefix_header.bed @@ -0,0 +1,4 @@ +chrom start end gene +19 1206912 1207203 STK11 +19 1218415 1218500 STK11 +19 1219322 1219413 STK11 diff --git a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt b/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt deleted file mode 100755 index b020624..0000000 --- a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-id_samples.txt +++ /dev/null @@ -1,7 +0,0 @@ -sample_id sample_path result_type target_gene chromosome start end genome_build -17328 /path/17327.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17329.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17398 /path/17398.sorted.bam normal ATM 11 108093559 108236235 hg19 -17334 /path/17334.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt b/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt deleted file mode 100755 index 87b9027..0000000 --- a/tests/test_files/input/capture/sample-sheets/gene-dup-sample-path_samples.txt +++ /dev/null @@ -1,7 +0,0 @@ -sample_id sample_path result_type target_gene chromosome start end genome_build -17327 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17329 /path/17328.sorted.bam 
normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17398 /path/17398.sorted.bam normal ATM 11 108093559 108236235 hg19 -17334 /path/17334.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt b/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt new file mode 100755 index 0000000..9d7155b --- /dev/null +++ b/tests/test_files/input/capture/sample-sheets/gene-lt_30.txt @@ -0,0 +1,5 @@ +sample_id sample_path result_type target_gene chromosome start end genome_build +0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +30 /path/30.sorted.bam normal ATM 11 108093559 108236235 hg19 +31 /path/31.sorted.bam positive ATM 11 108235809 108236235 hg19 diff --git a/tests/test_files/input/capture/sample-sheets/gene_samples.txt b/tests/test_files/input/capture/sample-sheets/gene_samples.txt index fcce9cc..be527b5 100755 --- a/tests/test_files/input/capture/sample-sheets/gene_samples.txt +++ b/tests/test_files/input/capture/sample-sheets/gene_samples.txt @@ -1,7 +1,33 @@ sample_id sample_path result_type target_gene chromosome start end genome_build -17327 /path/17327.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17328 /path/17328.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17329 /path/17329.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17330 /path/17330.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 -17331 /path/17331.sorted.bam normal ATM 11 108093559 108236235 hg19 -17332 /path/17332.sorted.bam positive ATM 11 108235809 108236235 hg19 +0 /path/0.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +1 /path/1.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +2 /path/2.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +3 /path/3.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +4 /path/4.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +5 /path/5.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +6 /path/6.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +7 /path/7.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +8 /path/8.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +9 /path/9.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +10 /path/10.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +11 /path/11.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +12 /path/12.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +13 /path/13.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +14 /path/14.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +15 /path/15.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +16 /path/16.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +17 /path/17.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +18 /path/18.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +19 /path/19.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +20 /path/20.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +21 /path/21.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +22 /path/22.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +23 /path/23.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +24 /path/24.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +25 /path/25.sorted.bam normal-panel ATM 11 108093559 108236235 hg19 +26 /path/26.sorted.bam normal-panel ATM 11 108093559 
+27	/path/27.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+28	/path/28.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+29	/path/29.sorted.bam	normal-panel	ATM	11	108093559	108236235	hg19
+30	/path/30.sorted.bam	normal	ATM	11	108093559	108236235	hg19
+31	/path/31.sorted.bam	positive	ATM	11	108235809	108236235	hg19
diff --git a/tests/test_files/input/checks/.gitkeep b/tests/test_files/input/checks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/input/gene_1.txt b/tests/test_files/input/gene_1.txt
index 79e9cd6..184534f 100755
--- a/tests/test_files/input/gene_1.txt
+++ b/tests/test_files/input/gene_1.txt
@@ -1,3 +1,3 @@
 sample_id	sample_path	result_type	target_gene	capture_name	chromosome	start	end	cnv_call
 12S13548	/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam	positive	gene_1	ICR	chr17	1200	1500	DUP
-10S21354	/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam	normal-panel	gene_1	ICR	chr17	NA	NA	NA
+12S13548	/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam	normal-panel	gene_1	ICR	chr17	NA	NA	NA
diff --git a/tests/test_files/reference/genome.dict b/tests/test_files/reference/genome.dict
new file mode 100755
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome.fa b/tests/test_files/reference/genome.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome.fa.fai b/tests/test_files/reference/genome.fa.fai
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_dict.fa b/tests/test_files/reference/genome_no_dict.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_dict.fa.fai b/tests/test_files/reference/genome_no_dict.fa.fai
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_fai.dict b/tests/test_files/reference/genome_no_fai.dict
new file mode 100755
index 0000000..e69de29
diff --git a/tests/test_files/reference/genome_no_fai.fa b/tests/test_files/reference/genome_no_fai.fa
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_base_classes.py b/tests/unit/test_base_classes.py
index 060c195..d98db51 100755
--- a/tests/unit/test_base_classes.py
+++ b/tests/unit/test_base_classes.py
@@ -1,15 +1,50 @@
+import pathlib
 import subprocess
 
 import pytest
 
 from scripts.base_classes import BaseCNVTool
-from scripts import models
+from scripts import models, utils
+
+cnv_pat_dir = utils.get_cnv_patissier_dir()
 
 
 def instance_data(instance):
     return {k: v for k, v in instance.__dict__.items() if k != "_sa_instance_state"}
 
 
+@pytest.mark.usefixtures("db", "db_session")
+class TestCheckChromPrefix:
+    def setup(self):
+        self.caller = BaseCNVTool("capture", "gene", "time")
+        self.caller.settings = {"chromosome_prefix": "chr"}
+
+    def test_chr_prefix_working(self):
+        self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix.bed")
+
+    def test_chr_prefix_mismatch(self):
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix.bed")
+
+    def test_blank_lines(self):
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix_blank.bed")
+
+    def test_no_prefix_working(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix.bed")
+
+    def test_no_prefix_mismatch(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/chr-prefix.bed")
+
+    def test_header(self):
+        self.caller.settings = {"chromosome_prefix": ""}
+        with pytest.raises(Exception):
+            self.caller.check_chrom_prefix("tests/test_files/input/bed/no-prefix_header.bed")
+
+
 @pytest.mark.usefixtures("db", "db_session")
 class TestFilterCNVs:
     def setup(self):
@@ -82,15 +117,22 @@ class TestGetBAMHeader:
     def setup(self):
         self.caller = BaseCNVTool("capture", "gene", "time")
         self.caller.run_type = "example_type"
-        self.caller.sample_to_bam = {"sample_1": "/mnt/test_files/input/bam_header.bam"}
+        self.caller.sample_to_bam = {
+            "12S13548": "/mnt/test_files/input/bam_header.bam",
+            "sample_mismatch": "/mnt/test_files/input/bam_header.bam",
+        }
         self.caller.bam_mount = "/mnt/data/"
 
     def test_basic(self):
         with open("tests/test_files/input/bam_header.sam") as handle:
             header_list = handle.readlines()
         expected_header = "".join(header_list).replace("\n", "\r\n")
-        output_header = self.caller.get_bam_header("sample_1")
-        assert expected_header == output_header
+        output_header = self.caller.get_bam_header("12S13548")
+        assert expected_header == output_header
+
+    def test_sample_mismatch(self):
+        with pytest.raises(AssertionError):
+            self.caller.get_bam_header("sample_mismatch")
 
 
 @pytest.mark.usefixtures("db", "db_session")
@@ -100,15 +142,47 @@ def setup(self):
         self.caller.run_type = "example_type"
 
     def test_simple(self):
-        expected = ("d41d8cd98f00b204e9800998ecf8427e", "input/.gitkeep")
-        output = self.caller.get_md5sum("input/.gitkeep")
+        expected = ("d41d8cd98f00b204e9800998ecf8427e", "cnv-pat/input/.gitkeep")
+        output = self.caller.get_md5sum(pathlib.Path("input/.gitkeep").resolve())
         assert expected == output
 
     def test_missing_file(self):
         with pytest.raises(subprocess.CalledProcessError):
            output = self.caller.get_md5sum("does_not_exist.txt")
 
 
-@pytest.mark.dev
+
+@pytest.mark.usefixtures("db", "db_session", "populate_db")
+class TestPreRunSteps:
+    def setup(self):
+        self.test_file_prefix = f"{cnv_pat_dir}/tests/test_files/input/checks"
+
+        with open(f"{self.test_file_prefix}/sample_sheet_working.txt", "w") as handle:
+            handle.write("sample_id\tsample_path\n")
+            handle.write(f"12S13548\t{cnv_pat_dir}/tests/test_files/input/bam_header.bam\n")
+
+        self.caller = BaseCNVTool("ICR", "gene_1", "time")
+        self.caller.bam_mount = "/mnt/data/"
+        self.caller.run_type = "example_type"
+        self.caller.sample_to_bam = {"12S13548": "/mnt/test_files/input/bam_header.bam"}
+        self.header_file = "tests/test_files/input/bam_header.sam"
+        with open("tests/test_files/input/bam_header.sam") as handle:
+            header_list = handle.readlines()
+
+        self.expected_header = {"12S13548": "".join(header_list).replace("\n", "\r\n")}
+
+    def test_header(self):
+        header = self.caller.prerun_steps(
+            f"{self.test_file_prefix}/sample_sheet_working.txt", "tests/test_files/reference/genome.fa"
+        )
+        assert header == self.expected_header
+
+    @pytest.mark.parametrize("genome", ["no_genome.fa", "genome_no_fai.fa", "genome_no_dict.fa"])
+    def test_missing_reference(self, genome):
+        with pytest.raises(AssertionError):
+            genome_path = f"tests/test_files/reference/{genome}"
+            self.caller.prerun_steps(f"{self.test_file_prefix}/sample_sheet_working.txt", genome_path)
+
+
 @pytest.mark.usefixtures("db", "db_session", "populate_db")
 class TestUploadAllMd5sums:
     def setup(self):
@@ -116,9 +190,10 @@ def setup(self):
         self.caller.run_type = "example_type"
 
     def test_working(self):
-        self.caller.upload_all_md5sums()
-        assert False
-
+        before_upload = self.caller.session.query(models.File).all()
+        self.caller.upload_all_md5sums(1)
+        after_upload = self.caller.session.query(models.File).all()
+        assert len(before_upload) < len(after_upload)
 
 
 @pytest.mark.usefixtures("db", "db_session", "populate_db")
@@ -189,10 +264,7 @@ def setup(self):
         }
         self.caller.bam_mount = "/mnt/data/"
         self.caller.settings = {"genome_build_name": "hg19"}
-        self.header_file = "tests/test_files/input/bam_header.sam"
-        with open("tests/test_files/input/bam_header.sam") as handle:
-            header_list = handle.readlines()
-        self.expected_header = "".join(header_list).replace("\n", "\r\n")
+        self.caller.bam_headers = {"12S13548": "header1", "10S21354": "header2"}
         self.sample_sheet = "tests/test_files/input/gene_1.txt"
         self.expected_output = [
             {
@@ -206,21 +278,15 @@ def setup(self):
         ]
 
     def test_basic(self):
-        """Existing instance should stay the same, 10S shouldn't exist, then be uploaded
+        """Existing instance should stay the same, 12S13548 shouldn't exist, then be uploaded
 
         Data returned should be a list of dictionaries for upload of known cnv information from sample sheet"""
-        existing_1 = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
-        existing_data_1 = instance_data(existing_1)
-        no_instance = self.caller.session.query(models.Sample).filter_by(name="10S21354").first()
+        no_instance = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
 
         output_table = self.caller.upload_samples(self.sample_sheet)
 
         uploaded_1 = self.caller.session.query(models.Sample).filter_by(name="12S13548").first()
-        uploaded_2 = self.caller.session.query(models.Sample).filter_by(name="10S21354").first()
 
-        assert existing_data_1 == instance_data(uploaded_1)
-        assert no_instance is None
-        assert uploaded_2.name == "10S21354"
-        assert uploaded_2.bam_header == existing_data_1["bam_header"]
-        assert uploaded_2.result_type == "normal-panel"
+        assert not no_instance
+        assert uploaded_1.name == "12S13548"
         assert output_table == self.expected_output
 
@@ -233,9 +299,9 @@ def setup(self):
             {
                 "cnv": {"alt": "DUP", "genome_build": "hg19", "chrom": "chr17", "start": "1200", "end": "1500"},
                 "sample_defaults": {
-                    "name": "12S13548",
+                    "name": "10S21354",
                     "gene_id": 1,
-                    "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/12S13548_sorted.bam",
+                    "path": "/mnt/data/181225_NB503215_run/analysis/Alignments/10S21354_sorted.bam",
                 },
             }
         ]
@@ -277,7 +343,7 @@ def setup(self):
             "start": "10",
             "end": "120",
             "chrom": "chr1",
-            "sample_id": "12S13548",
+            "sample_id": "10S21354",
             "alt": "DEL",
             "json_data": {"extra_field1": "extra_data1", "extra_field2": "extra_data2"},
         }
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index b916ab1..e1ddace 100755
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,3 +1,5 @@
+import pathlib
+
 import pytest
 
 from scripts import utils
@@ -5,31 +7,95 @@
 cnv_pat_dir = utils.get_cnv_patissier_dir()
 
 
+class TestCheckFiles:
+    def setup(self):
+        self.test_file_prefix = f"{cnv_pat_dir}/tests/test_files/input/checks"
+
+        for sample in range(40):
+            with open(f"{self.test_file_prefix}/sample{sample}.txt", "w") as handle:
+                handle.write("dummy")
+
+    def test_no_issues(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(40)]
+        utils.SampleUtils.check_files(paths)
+
+    def test_dash_in_name(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample-{sample}.txt") for sample in range(1)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files(paths)
+
+    def test_file_doesnt_exist(self):
+        paths = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(50, 60)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files(paths)
+
+    def test_not_unique(self):
+        path_1 = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(30)]
+        path_2 = [pathlib.Path(f"{self.test_file_prefix}/sample{sample}.txt") for sample in range(10, 12)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_files([*path_1, *path_2])
+
+
+class TestCheckUnique:
+    def test_no_issues(self):
+        samples = [f"sample_{number}" for number in range(50)]
+        utils.SampleUtils.check_unique(samples, "samples")
+
+    def test_not_unique(self):
+        sample_1 = [f"sample_{number}" for number in range(50)]
+        sample_2 = [f"sample_{number}" for number in range(10, 12)]
+        with pytest.raises(Exception):
+            utils.SampleUtils.check_unique([*sample_1, *sample_2], "samples")
+
+
+class TestGetBAMtoID:
+    def setup(self):
+        self.test_file = f"{cnv_pat_dir}/tests/test_files/input/checks/sample_sheet_bam_to_id.txt"
+        expected_ids = {}
+        with open(self.test_file, "w") as handle:
+            handle.write("sample_id\tsample_path\tresult_type\n")
+
+            for number in range(35):
+                if number < 30:
+                    result = "normal-panel"
+                elif number % 2:
+                    result = "positive"
+                else:
+                    result = "normal"
+                sample = f"sample{number}"
+                path = f"input/sample{number}.bam"
+                handle.write(f"{sample}\t{path}\t{result}\n")
+
+                expected_ids[path] = sample
+
+        self.expected_ids = expected_ids
+
+    def test_working(self):
+        bam_to_id = utils.SampleUtils.get_bam_to_id(self.test_file)
+        assert bam_to_id == self.expected_ids
+
+
 class TestSampleUtilsSelectSamples:
     def setup(self):
         self.sample_path_prefix = f"{cnv_pat_dir}/tests/test_files/input/capture/sample-sheets/gene"
 
     def test_normal_panel(self):
-        expected_ids = [str(number) for number in range(17327, 17331)]
-        expected_paths = [f"/path/{number}.sorted.bam" for number in range(17327, 17331)]
+        expected_ids = [str(number) for number in range(0, 30)]
+        expected_paths = [f"/path/{number}.sorted.bam" for number in range(0, 30)]
         ids, paths = utils.SampleUtils.select_samples(f"{self.sample_path_prefix}_samples.txt", normal_panel=True)
         assert ids == expected_ids
         assert paths == expected_paths
 
     def test_unknown_cases(self):
-        expected_ids = [str(number) for number in range(17331, 17333)]
-        expected_paths = [f"/path/{number}.sorted.bam" for number in range(17331, 17333)]
+        expected_ids = [str(number) for number in range(30, 32)]
+        expected_paths = [f"/path/{number}.sorted.bam" for number in range(30, 32)]
         ids, paths = utils.SampleUtils.select_samples(f"{self.sample_path_prefix}_samples.txt", normal_panel=False)
         assert ids == expected_ids
         assert paths == expected_paths
 
-    def test_dup_id(self):
-        with pytest.raises(AssertionError):
-            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-dup-sample-id_samples.txt", True)
-
-    def test_dup_path(self):
+    def test_lt_30_normal_panel(self):
         with pytest.raises(AssertionError):
-            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-dup-sample-path_samples.txt", True)
+            utils.SampleUtils.select_samples(f"{self.sample_path_prefix}-lt_30.txt", True)
 
 
 class TestSampleUtilsGetMountPoint: