From d2669a7f67e5c054f50199b7ec1d1f59a9f722a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 11:33:25 +0100 Subject: [PATCH 1/7] add test primer data --- tests/data/samples/primers.yaml | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/data/samples/primers.yaml diff --git a/tests/data/samples/primers.yaml b/tests/data/samples/primers.yaml new file mode 100644 index 0000000..cc6d51f --- /dev/null +++ b/tests/data/samples/primers.yaml @@ -0,0 +1,44 @@ +v532: + name: SARS-CoV-2 ARTIC V5.3.2 + alias: + - SARS-CoV-2 ARTIC V5.3.2 NEB Ultra II + - SARS-CoV-2 ARTIC V5.3.2 NexteraXT + inserts_bedfile: references/primers/v532/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v532/SARS-CoV-2.primer.bed + primers_file: references/primers/v532/SARS-CoV-2.tsv + primers_fasta: references/primers/v532/SARS-CoV-2.primer.fasta +v41: + name: SARS-CoV-2 ARTIC V4.1 + alias: + - SARS-CoV-2 ARTIC V4.1 NEB Ultra II + - SARS-CoV-2 ARTIC V4.1 NexteraXT + inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed + primers_file: references/primers/v41/SARS-CoV-2.tsv + primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta +v4: + name: SARS-CoV-2 ARTIC V4 + alias: + - SARS-CoV-2 ARTIC V4 NEB Ultra II + - SARS-CoV-2 ARTIC V4 NexteraXT + inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed + primers_file: references/primers/v4/SARS-CoV-2.tsv + primers_fasta: references/primers/v4/ARTIC_v4.fasta +v3: + name: SARS-CoV-2 ARTIC V3 + alias: + - ARTIC_NEB + - ARTICV3_NEB + - COVID_ARTIC_V3_NEB + # Not 100% exact, because Illumina introduces additional controls + - Illumina_COVIDSeq + - COVIDSeq_RUO_Custom + # TODO check with Viollier what custom is + # Nimagen: not 100% sure. + # TODO check with GFB and Christian + - EASYSEQ_SARS_COV2_WHOLE_GENOME_NGS + inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed + primers_bedfile: references/primers/v3/nCoV-2019.primer.bed + primers_file: references/primers/v3/nCoV-2019.tsv + primers_fasta: references/primers/v3/ARTIC_v3.fasta From c98e8e7efecf27129beb3a918e2241bf44723c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 11:35:11 +0100 Subject: [PATCH 2/7] add catch error for filepath timeline --- scripts/vp_transformer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index 2ab0b93..737a386 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -122,7 +122,10 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: # timline has headers: # sample_id batch_id read_length primer_protocol location_code sampling_date location_name # get read length, primer protocol, location name - # double checl if location code and location code are the same + # double check if location code and location code are the same + if not timeline.is_file(): + logging.error(f"Timeline file not found or is not a file: {timeline}") + raise FileNotFoundError(f"Timeline file not found or is not a file: {timeline}") with timeline.open() as f: reader = csv.reader(f, delimiter="\t") for row in reader: From 59ad5fdde43235afd1b415722a22eb2afd14c622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 12:02:25 +0100 Subject: [PATCH 3/7] adding cannonical primer name --- docker-compose.env | 1 + docker-compose.yml | 2 ++ pyproject.toml | 1 + scripts/vp_transformer.py | 41 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/docker-compose.env b/docker-compose.env index 610154a..c405504 100644 --- a/docker-compose.env +++ b/docker-compose.env @@ -2,5 +2,6 @@ SAMPLE_DIR=../../../data/sr2silo/daemon_test/samples/A1_05_2024_10_08/20241024_2 SAMPLE_ID=A1_05_2024_10_08 BATCH_ID=20241024_2411515907 TIMELINE_FILE=../../../data/sr2silo/daemon_test/timeline.tsv +PRIMER_FILE=./tests/data/samples/primers.yaml NEXTCLADE_REFERENCE=sars-cov2 RESULTS_DIR=./results diff --git a/docker-compose.yml b/docker-compose.yml index 1e16388..4a8159a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ services: volumes: - ${SAMPLE_DIR}:/app/sample - ${TIMELINE_FILE}:/app/timeline.tsv + - ${PRIMER_FILE}:/app/primers.yaml - ${RESULTS_DIR}:/app/results - ./scripts/vp_config.json:/app/scripts/vp_config.json environment: @@ -12,6 +13,7 @@ services: - SAMPLE_ID=${SAMPLE_ID} - BATCH_ID=${BATCH_ID} - TIMELINE_FILE=${TIMELINE_FILE} + - PRIMER_FILE=${PRIMER_FILE} - RESULTS_DIR=${RESULTS_DIR} volumes: diff --git a/pyproject.toml b/pyproject.toml index 35778cc..b0d357e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ packages = [{ include = "sr2silo", from = "src" }] [tool.poetry.dependencies] python = "^3.10" pysam = "^0.22.1" +pyyaml = "^6.0.2" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1" diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index 737a386..a375a06 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -10,6 +10,7 @@ from pathlib import Path import click +import yaml from sr2silo.convert import bam_to_sam from sr2silo.process import pair_normalize_reads @@ -90,7 +91,7 @@ def convert_to_iso_date(date: str) -> str: return date_obj.date().isoformat() -def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: +def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) -> dict: """ Get metadata for a given sample and batch directory. Cross-references the directory with the timeline file to get the metadata. @@ -99,6 +100,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: sample_id (str): The sample ID to use for metadata. batch_id (str): The batch ID to use for metadata. timeline (Path): The timeline file to cross-reference the metadata. + primers (Path): The primers file to cross-reference the metadata. Returns: dict: A dictionary containing the metadata. @@ -165,6 +167,28 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: raise ValueError( f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}" ) + # Read the primers yaml to get additional metadata + # find the key with matching primer_protocol and ge the "name" value + # as the canonical name of the primer protocol + if not primers.is_file(): + logging.error(f"Primers file not found or is not a file: {primers}") + raise FileNotFoundError(f"Primers file not found or is not a file: {primers}") + # Load YAML file + with open(primers, "r") as file: + primers = yaml.safe_load(file) + logging.debug(f"Primers: {primers}") + logging.debug(f" Type of primers: {type(primers)}") + for primer in primers.keys(): + if primer == metadata["primer_protocol"]: + logging.info( + f"Enriching metadata with primer data e.g. primer_protocol_name" + ) + metadata["primer_protocol_name"] = primers[primer]["name"] + break + else: + raise ValueError( + f"No matching entry found in primers for primer_protocol {metadata['primer_protocol']}" + ) return metadata @@ -175,6 +199,7 @@ def process_directory( result_dir: Path, nextclade_reference: str, timeline_file: Path, + primers_file: Path, file_name: str = "REF_aln_trim.bam", ) -> None: """Process all files in a given directory. @@ -185,6 +210,7 @@ def process_directory( result_dir (Path): The directory to save the results. nextclade_reference (str): The reference to use for nextclade. timeline_file (Path): The timeline file to cross-reference the metadata. + primers_file (Path): The primers file to cross-reference the metadata. file_name (str): The name of the file to process Returns: @@ -205,7 +231,7 @@ def process_directory( raise FileNotFoundError(f"Input file not found: {sample_fp}") # Get Sample and Batch metadata and write to a file - metadata = get_metadata(sample_id, batch_id, timeline_file) + metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file) # add nextclade reference to metadata metadata["nextclade_reference"] = nextclade_reference metadata_file = result_dir / "metadata.json" @@ -242,6 +268,7 @@ def process_directory( @click.option( "--timeline_file", envvar="TIMELINE_FILE", help="Path to the timeline file." ) +@click.option("--primer_file", envvar="PRIMER_FILE", help="Path to the primers file.") @click.option( "--nextclade_reference", envvar="NEXTCLADE_REFERENCE", @@ -249,12 +276,19 @@ def process_directory( help="Nextclade reference.", ) def main( - sample_dir, sample_id, batch_id, result_dir, timeline_file, nextclade_reference + sample_dir, + sample_id, + batch_id, + result_dir, + timeline_file, + primer_file, + nextclade_reference, ): """Process a sample directory.""" logging.info(f"Processing sample directory: {sample_dir}") logging.info(f"Saving results to: {result_dir}") logging.info(f"Using timeline file: {timeline_file}") + logging.info(f"Using primers file: {primer_file}") logging.info(f"Using Nextclade reference: {nextclade_reference}") logging.info(f"Using sample_id: {sample_id}") logging.info(f"Using batch_id: {batch_id}") @@ -265,6 +299,7 @@ def main( batch_id=batch_id, result_dir=Path("results"), timeline_file=Path("timeline.tsv"), + primers_file=Path("primers.yaml"), nextclade_reference=nextclade_reference, ) From 25a83e3a8f28f6f260ddd574881e4d814b76b0f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 12:54:30 +0100 Subject: [PATCH 4/7] adjust test --- tests/test_scripts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index b15c0c2..7a342f7 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -49,5 +49,6 @@ def test_process_directory(): Path("tests/output"), "nextstrain/sars-cov-2/wuhan-hu-1/orfs", Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + Path("tests/data/samples/primers.yaml"), ) assert True From 9529b48772a0fe06fcddc77620ecd7b39b40533b Mon Sep 17 00:00:00 2001 From: Gordon Julian Koehn Date: Fri, 22 Nov 2024 12:54:56 +0100 Subject: [PATCH 5/7] Update scripts/vp_transformer.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- scripts/vp_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index a375a06..fc1ac98 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -168,7 +168,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) - f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}" ) # Read the primers yaml to get additional metadata - # find the key with matching primer_protocol and ge the "name" value + # find the key with matching primer_protocol and get the "name" value # as the canonical name of the primer protocol if not primers.is_file(): logging.error(f"Primers file not found or is not a file: {primers}") From c585f6256e2531c4f489b01431e76e78c4c69f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 12:59:01 +0100 Subject: [PATCH 6/7] adjust test --- tests/test_scripts.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 7a342f7..ef3f15e 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -20,6 +20,7 @@ def test_get_metadata(): sample_id="A1_05_2024_10_08", batch_id="20241024_2411515907", timeline=Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + primers=Path("tests/data/samples/primers.yaml"), ) print(metadata) @@ -35,6 +36,8 @@ def test_get_metadata(): "read_length": "250", "primer_protocol": "v532", "location_name": "Lugano (TI)", + "primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2", + "nextclade_reference": "sars-cov-2", } assert metadata == expected_metadata From 78a8bb7174acf474ded4441e0e5fc5ac4498e9ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gordon=20J=2E=20K=C3=B6hn?= Date: Fri, 22 Nov 2024 13:08:09 +0100 Subject: [PATCH 7/7] adjust test --- tests/test_scripts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index ef3f15e..8d00f3d 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -37,7 +37,6 @@ def test_get_metadata(): "primer_protocol": "v532", "location_name": "Lugano (TI)", "primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2", - "nextclade_reference": "sars-cov-2", } assert metadata == expected_metadata