diff --git a/docker-compose.env b/docker-compose.env index 610154a..c405504 100644 --- a/docker-compose.env +++ b/docker-compose.env @@ -2,5 +2,6 @@ SAMPLE_DIR=../../../data/sr2silo/daemon_test/samples/A1_05_2024_10_08/20241024_2 SAMPLE_ID=A1_05_2024_10_08 BATCH_ID=20241024_2411515907 TIMELINE_FILE=../../../data/sr2silo/daemon_test/timeline.tsv +PRIMER_FILE=./tests/data/samples/primers.yaml NEXTCLADE_REFERENCE=sars-cov2 RESULTS_DIR=./results diff --git a/docker-compose.yml b/docker-compose.yml index 1e16388..4a8159a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ services: volumes: - ${SAMPLE_DIR}:/app/sample - ${TIMELINE_FILE}:/app/timeline.tsv + - ${PRIMER_FILE}:/app/primers.yaml - ${RESULTS_DIR}:/app/results - ./scripts/vp_config.json:/app/scripts/vp_config.json environment: @@ -12,6 +13,7 @@ services: - SAMPLE_ID=${SAMPLE_ID} - BATCH_ID=${BATCH_ID} - TIMELINE_FILE=${TIMELINE_FILE} + - PRIMER_FILE=${PRIMER_FILE} - RESULTS_DIR=${RESULTS_DIR} volumes: diff --git a/pyproject.toml b/pyproject.toml index 35778cc..b0d357e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ packages = [{ include = "sr2silo", from = "src" }] [tool.poetry.dependencies] python = "^3.10" pysam = "^0.22.1" +pyyaml = "^6.0.2" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1" diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index 2ab0b93..fc1ac98 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -10,6 +10,7 @@ from pathlib import Path import click +import yaml from sr2silo.convert import bam_to_sam from sr2silo.process import pair_normalize_reads @@ -90,7 +91,7 @@ def convert_to_iso_date(date: str) -> str: return date_obj.date().isoformat() -def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: +def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) -> dict: """ Get metadata for a given sample and batch directory. Cross-references the directory with the timeline file to get the metadata. @@ -99,6 +100,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: sample_id (str): The sample ID to use for metadata. batch_id (str): The batch ID to use for metadata. timeline (Path): The timeline file to cross-reference the metadata. + primers (Path): The primers file to cross-reference the metadata. Returns: dict: A dictionary containing the metadata. @@ -122,7 +124,10 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: # timline has headers: # sample_id batch_id read_length primer_protocol location_code sampling_date location_name # get read length, primer protocol, location name - # double checl if location code and location code are the same + # double check if location code and location code are the same + if not timeline.is_file(): + logging.error(f"Timeline file not found or is not a file: {timeline}") + raise FileNotFoundError(f"Timeline file not found or is not a file: {timeline}") with timeline.open() as f: reader = csv.reader(f, delimiter="\t") for row in reader: @@ -162,6 +167,28 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: raise ValueError( f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}" ) + # Read the primers yaml to get additional metadata + # find the key with matching primer_protocol and get the "name" value + # as the canonical name of the primer protocol + if not primers.is_file(): + logging.error(f"Primers file not found or is not a file: {primers}") + raise FileNotFoundError(f"Primers file not found or is not a file: {primers}") + # Load YAML file + with open(primers, "r") as file: + primers = yaml.safe_load(file) + logging.debug(f"Primers: {primers}") + logging.debug(f" Type of primers: {type(primers)}") + for primer in primers.keys(): + if primer == metadata["primer_protocol"]: + logging.info( + f"Enriching metadata with primer data e.g. primer_protocol_name" + ) + metadata["primer_protocol_name"] = primers[primer]["name"] + break + else: + raise ValueError( + f"No matching entry found in primers for primer_protocol {metadata['primer_protocol']}" + ) return metadata @@ -172,6 +199,7 @@ def process_directory( result_dir: Path, nextclade_reference: str, timeline_file: Path, + primers_file: Path, file_name: str = "REF_aln_trim.bam", ) -> None: """Process all files in a given directory. @@ -182,6 +210,7 @@ def process_directory( result_dir (Path): The directory to save the results. nextclade_reference (str): The reference to use for nextclade. timeline_file (Path): The timeline file to cross-reference the metadata. + primers_file (Path): The primers file to cross-reference the metadata. file_name (str): The name of the file to process Returns: @@ -202,7 +231,7 @@ def process_directory( raise FileNotFoundError(f"Input file not found: {sample_fp}") # Get Sample and Batch metadata and write to a file - metadata = get_metadata(sample_id, batch_id, timeline_file) + metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file) # add nextclade reference to metadata metadata["nextclade_reference"] = nextclade_reference metadata_file = result_dir / "metadata.json" @@ -239,6 +268,7 @@ def process_directory( @click.option( "--timeline_file", envvar="TIMELINE_FILE", help="Path to the timeline file." ) +@click.option("--primer_file", envvar="PRIMER_FILE", help="Path to the primers file.") @click.option( "--nextclade_reference", envvar="NEXTCLADE_REFERENCE", @@ -246,12 +276,19 @@ def process_directory( help="Nextclade reference.", ) def main( - sample_dir, sample_id, batch_id, result_dir, timeline_file, nextclade_reference + sample_dir, + sample_id, + batch_id, + result_dir, + timeline_file, + primer_file, + nextclade_reference, ): """Process a sample directory.""" logging.info(f"Processing sample directory: {sample_dir}") logging.info(f"Saving results to: {result_dir}") logging.info(f"Using timeline file: {timeline_file}") + logging.info(f"Using primers file: {primer_file}") logging.info(f"Using Nextclade reference: {nextclade_reference}") logging.info(f"Using sample_id: {sample_id}") logging.info(f"Using batch_id: {batch_id}") @@ -262,6 +299,7 @@ def main( batch_id=batch_id, result_dir=Path("results"), timeline_file=Path("timeline.tsv"), + primers_file=Path("primers.yaml"), nextclade_reference=nextclade_reference, ) diff --git a/tests/data/samples/primers.yaml b/tests/data/samples/primers.yaml new file mode 100644 index 0000000..cc6d51f --- /dev/null +++ b/tests/data/samples/primers.yaml @@ -0,0 +1,44 @@ +v532: + name: SARS-CoV-2 ARTIC V5.3.2 + alias: + - SARS-CoV-2 ARTIC V5.3.2 NEB Ultra II + - SARS-CoV-2 ARTIC V5.3.2 NexteraXT + inserts_bedfile: references/primers/v532/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v532/SARS-CoV-2.primer.bed + primers_file: references/primers/v532/SARS-CoV-2.tsv + primers_fasta: references/primers/v532/SARS-CoV-2.primer.fasta +v41: + name: SARS-CoV-2 ARTIC V4.1 + alias: + - SARS-CoV-2 ARTIC V4.1 NEB Ultra II + - SARS-CoV-2 ARTIC V4.1 NexteraXT + inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed + primers_file: references/primers/v41/SARS-CoV-2.tsv + primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta +v4: + name: SARS-CoV-2 ARTIC V4 + alias: + - SARS-CoV-2 ARTIC V4 NEB Ultra II + - SARS-CoV-2 ARTIC V4 NexteraXT + inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed + primers_file: references/primers/v4/SARS-CoV-2.tsv + primers_fasta: references/primers/v4/ARTIC_v4.fasta +v3: + name: SARS-CoV-2 ARTIC V3 + alias: + - ARTIC_NEB + - ARTICV3_NEB + - COVID_ARTIC_V3_NEB + # Not 100% exact, because Illumina introduces additional controls + - Illumina_COVIDSeq + - COVIDSeq_RUO_Custom + # TODO check with Viollier what custom is + # Nimagen: not 100% sure. + # TODO check with GFB and Christian + - EASYSEQ_SARS_COV2_WHOLE_GENOME_NGS + inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed + primers_bedfile: references/primers/v3/nCoV-2019.primer.bed + primers_file: references/primers/v3/nCoV-2019.tsv + primers_fasta: references/primers/v3/ARTIC_v3.fasta diff --git a/tests/test_scripts.py b/tests/test_scripts.py index b15c0c2..8d00f3d 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -20,6 +20,7 @@ def test_get_metadata(): sample_id="A1_05_2024_10_08", batch_id="20241024_2411515907", timeline=Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + primers=Path("tests/data/samples/primers.yaml"), ) print(metadata) @@ -35,6 +36,7 @@ def test_get_metadata(): "read_length": "250", "primer_protocol": "v532", "location_name": "Lugano (TI)", + "primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2", } assert metadata == expected_metadata @@ -49,5 +51,6 @@ def test_process_directory(): Path("tests/output"), "nextstrain/sars-cov-2/wuhan-hu-1/orfs", Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + Path("tests/data/samples/primers.yaml"), ) assert True