diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d66b49..2288dd0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,8 +29,6 @@ repos: hooks: - id: poetry-check - id: poetry-lock - - id: poetry-export - args: ["-f", "requirements.txt", "-o", "requirements.txt"] - repo: https://github.com/charliermarsh/ruff-pre-commit rev: 'v0.0.245' hooks: diff --git a/docker-compose.env b/docker-compose.env index d86f043..261ea9f 100644 --- a/docker-compose.env +++ b/docker-compose.env @@ -2,5 +2,6 @@ SAMPLE_DIR=./tests/data/samples/A1_05_2024_10_08/20241024_2411515907/alignments SAMPLE_ID=A1_05_2024_10_08 BATCH_ID=20241024_2411515907 TIMELINE_FILE=./tests/data/samples/timeline_A1_05_2024_10_08.tsv +PRIMER_FILE=./tests/data/samples/primers.yaml NEXTCLADE_REFERENCE=sars-cov2 RESULTS_DIR=./results diff --git a/docker-compose.yml b/docker-compose.yml index 1e16388..4a8159a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,6 +4,7 @@ services: volumes: - ${SAMPLE_DIR}:/app/sample - ${TIMELINE_FILE}:/app/timeline.tsv + - ${PRIMER_FILE}:/app/primers.yaml - ${RESULTS_DIR}:/app/results - ./scripts/vp_config.json:/app/scripts/vp_config.json environment: @@ -12,6 +13,7 @@ services: - SAMPLE_ID=${SAMPLE_ID} - BATCH_ID=${BATCH_ID} - TIMELINE_FILE=${TIMELINE_FILE} + - PRIMER_FILE=${PRIMER_FILE} - RESULTS_DIR=${RESULTS_DIR} volumes: diff --git a/pyproject.toml b/pyproject.toml index 0dcccfa..b0d357e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,8 +9,7 @@ packages = [{ include = "sr2silo", from = "src" }] [tool.poetry.dependencies] python = "^3.10" pysam = "^0.22.1" -click = "^8.0.0" -schedule = "^1.2.2" +pyyaml = "^6.0.2" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 63b42db..0000000 --- a/requirements.txt +++ /dev/null @@ -1,28 +0,0 @@ -pysam==0.22.1 ; python_full_version >= "3.12.6" and python_full_version < "4.0.0" \ - --hash=sha256:098e0bf12d8b0399613065843310c91ba31a02d014b1f6b4e9d7f2d0d1254ff8 \ - --hash=sha256:17fac22fc89c86241a71084ca097878c61c97f6ff5fd4535d718681a849852a7 \ - --hash=sha256:18a0b97be95bd71e584de698441c46651cdff378db1c9a4fb3f541e560253b22 \ - --hash=sha256:18d886d50d75d8f853057fbbb284f0f0e98afad1f76b1a6f55660ea167d31c17 \ - --hash=sha256:1be663a73cf56ddd1d309b91d314a0c94c9bf352eaa3c6eda30cef12699843f0 \ - --hash=sha256:2b6cf1871c99cfc9c01261ec5f628519c2c889f0ff070e7a26aa5adbf9f69af1 \ - --hash=sha256:44420290a619c02da48ca0956548eb82a1665ae97b6ee69c094f9da5a6206431 \ - --hash=sha256:4447fdc2630519a00b6bf598995f1440e6f398eb0c084a7c141db026990ae07a \ - --hash=sha256:4aff9b41856d5dba6585ffd60884b8f3778c5d2688f33989662aabe7f4cd0fe0 \ - --hash=sha256:4dfae1de006d1c6491a59b00052a3f67c53a136165cf4edd7789b5dcb1e6806f \ - --hash=sha256:503c833e6cf348d87aec9113b1386d5c85c031d64deb914c29f5ad1792d103e6 \ - --hash=sha256:78ed746a39c9cebe489b8f0f86cf23c09c942e76c901260fb2794906e4cd0e26 \ - --hash=sha256:79cd94eeb96541385fa99e759a8f83d21428e092c8b577d50b4eee5823e757cd \ - --hash=sha256:860c7c78ddb1539b83d5476502ba14c8b4e8435810dc7a5b715196da3dfb86b6 \ - --hash=sha256:ab3343f221994d163e1ba2691430ce0f6e7da13762473e0d7f9a2d5db3bec235 \ - --hash=sha256:acff506c921af36f364c5a87f3a30b3c105ebeb270d0e821c2ca571eaf60ca20 \ - --hash=sha256:aeb31472365014fd8b37da4a88af758094b5872a8a16a25635a52cf8ceff5a9f \ - --hash=sha256:af9fb53157ba2431b7b20a550c0223f4a039304c9f180d8da98ea9d2d3ef3fbf \ - --hash=sha256:b1addca11c5cfceefaebdfcf3d83bc42f4b89fb1e8ae645a4bdab971cbcd2bc0 \ - --hash=sha256:c0e051fda433c1c7ff94532f60477bb83b97f4bb183567a0ae23f340e1c200b4 \ - --hash=sha256:c71ea45461ee596949061f321a799a97c418164485fdd7e8db89aea2ff979092 \ - --hash=sha256:cd9d457063272df16136640515183ea501bf3371f140a134b2f0a42f425a37d9 \ - --hash=sha256:d3fd6fe5aca79933632f38e5b568ce8d4e67e5c4f3bd39bff55fd9646af814d2 \ - --hash=sha256:e72e129d245574801125029a5892c9e18d2956b13c4203ea585cbd64ccde9351 \ - --hash=sha256:f18e72013ef2db9a9bb7e8ac421934d054427f6c03e66ce8abc39b09c846ba72 \ - --hash=sha256:f8f00bb1fb977fc33c87cf5fe9023eefc2ba3d43d30ab4875a1765827018c949 \ - --hash=sha256:faa5298291b54f185c7b8f84510224918bddc64bbdcb2e8426ff43e83452310f diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index 2ab0b93..fc1ac98 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -10,6 +10,7 @@ from pathlib import Path import click +import yaml from sr2silo.convert import bam_to_sam from sr2silo.process import pair_normalize_reads @@ -90,7 +91,7 @@ def convert_to_iso_date(date: str) -> str: return date_obj.date().isoformat() -def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: +def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) -> dict: """ Get metadata for a given sample and batch directory. Cross-references the directory with the timeline file to get the metadata. @@ -99,6 +100,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: sample_id (str): The sample ID to use for metadata. batch_id (str): The batch ID to use for metadata. timeline (Path): The timeline file to cross-reference the metadata. + primers (Path): The primers file to cross-reference the metadata. Returns: dict: A dictionary containing the metadata. @@ -122,7 +124,10 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: # timline has headers: # sample_id batch_id read_length primer_protocol location_code sampling_date location_name # get read length, primer protocol, location name - # double checl if location code and location code are the same + # double check if location code and location code are the same + if not timeline.is_file(): + logging.error(f"Timeline file not found or is not a file: {timeline}") + raise FileNotFoundError(f"Timeline file not found or is not a file: {timeline}") with timeline.open() as f: reader = csv.reader(f, delimiter="\t") for row in reader: @@ -162,6 +167,28 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict: raise ValueError( f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}" ) + # Read the primers yaml to get additional metadata + # find the key with matching primer_protocol and get the "name" value + # as the canonical name of the primer protocol + if not primers.is_file(): + logging.error(f"Primers file not found or is not a file: {primers}") + raise FileNotFoundError(f"Primers file not found or is not a file: {primers}") + # Load YAML file + with open(primers, "r") as file: + primers = yaml.safe_load(file) + logging.debug(f"Primers: {primers}") + logging.debug(f" Type of primers: {type(primers)}") + for primer in primers.keys(): + if primer == metadata["primer_protocol"]: + logging.info( + f"Enriching metadata with primer data e.g. primer_protocol_name" + ) + metadata["primer_protocol_name"] = primers[primer]["name"] + break + else: + raise ValueError( + f"No matching entry found in primers for primer_protocol {metadata['primer_protocol']}" + ) return metadata @@ -172,6 +199,7 @@ def process_directory( result_dir: Path, nextclade_reference: str, timeline_file: Path, + primers_file: Path, file_name: str = "REF_aln_trim.bam", ) -> None: """Process all files in a given directory. @@ -182,6 +210,7 @@ def process_directory( result_dir (Path): The directory to save the results. nextclade_reference (str): The reference to use for nextclade. timeline_file (Path): The timeline file to cross-reference the metadata. + primers_file (Path): The primers file to cross-reference the metadata. file_name (str): The name of the file to process Returns: @@ -202,7 +231,7 @@ def process_directory( raise FileNotFoundError(f"Input file not found: {sample_fp}") # Get Sample and Batch metadata and write to a file - metadata = get_metadata(sample_id, batch_id, timeline_file) + metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file) # add nextclade reference to metadata metadata["nextclade_reference"] = nextclade_reference metadata_file = result_dir / "metadata.json" @@ -239,6 +268,7 @@ def process_directory( @click.option( "--timeline_file", envvar="TIMELINE_FILE", help="Path to the timeline file." ) +@click.option("--primer_file", envvar="PRIMER_FILE", help="Path to the primers file.") @click.option( "--nextclade_reference", envvar="NEXTCLADE_REFERENCE", @@ -246,12 +276,19 @@ def process_directory( help="Nextclade reference.", ) def main( - sample_dir, sample_id, batch_id, result_dir, timeline_file, nextclade_reference + sample_dir, + sample_id, + batch_id, + result_dir, + timeline_file, + primer_file, + nextclade_reference, ): """Process a sample directory.""" logging.info(f"Processing sample directory: {sample_dir}") logging.info(f"Saving results to: {result_dir}") logging.info(f"Using timeline file: {timeline_file}") + logging.info(f"Using primers file: {primer_file}") logging.info(f"Using Nextclade reference: {nextclade_reference}") logging.info(f"Using sample_id: {sample_id}") logging.info(f"Using batch_id: {batch_id}") @@ -262,6 +299,7 @@ def main( batch_id=batch_id, result_dir=Path("results"), timeline_file=Path("timeline.tsv"), + primers_file=Path("primers.yaml"), nextclade_reference=nextclade_reference, ) diff --git a/tests/data/samples/primers.yaml b/tests/data/samples/primers.yaml new file mode 100644 index 0000000..cc6d51f --- /dev/null +++ b/tests/data/samples/primers.yaml @@ -0,0 +1,44 @@ +v532: + name: SARS-CoV-2 ARTIC V5.3.2 + alias: + - SARS-CoV-2 ARTIC V5.3.2 NEB Ultra II + - SARS-CoV-2 ARTIC V5.3.2 NexteraXT + inserts_bedfile: references/primers/v532/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v532/SARS-CoV-2.primer.bed + primers_file: references/primers/v532/SARS-CoV-2.tsv + primers_fasta: references/primers/v532/SARS-CoV-2.primer.fasta +v41: + name: SARS-CoV-2 ARTIC V4.1 + alias: + - SARS-CoV-2 ARTIC V4.1 NEB Ultra II + - SARS-CoV-2 ARTIC V4.1 NexteraXT + inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed + primers_file: references/primers/v41/SARS-CoV-2.tsv + primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta +v4: + name: SARS-CoV-2 ARTIC V4 + alias: + - SARS-CoV-2 ARTIC V4 NEB Ultra II + - SARS-CoV-2 ARTIC V4 NexteraXT + inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed + primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed + primers_file: references/primers/v4/SARS-CoV-2.tsv + primers_fasta: references/primers/v4/ARTIC_v4.fasta +v3: + name: SARS-CoV-2 ARTIC V3 + alias: + - ARTIC_NEB + - ARTICV3_NEB + - COVID_ARTIC_V3_NEB + # Not 100% exact, because Illumina introduces additional controls + - Illumina_COVIDSeq + - COVIDSeq_RUO_Custom + # TODO check with Viollier what custom is + # Nimagen: not 100% sure. + # TODO check with GFB and Christian + - EASYSEQ_SARS_COV2_WHOLE_GENOME_NGS + inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed + primers_bedfile: references/primers/v3/nCoV-2019.primer.bed + primers_file: references/primers/v3/nCoV-2019.tsv + primers_fasta: references/primers/v3/ARTIC_v3.fasta diff --git a/tests/test_scripts.py b/tests/test_scripts.py index b15c0c2..8d00f3d 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -20,6 +20,7 @@ def test_get_metadata(): sample_id="A1_05_2024_10_08", batch_id="20241024_2411515907", timeline=Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + primers=Path("tests/data/samples/primers.yaml"), ) print(metadata) @@ -35,6 +36,7 @@ def test_get_metadata(): "read_length": "250", "primer_protocol": "v532", "location_name": "Lugano (TI)", + "primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2", } assert metadata == expected_metadata @@ -49,5 +51,6 @@ def test_process_directory(): Path("tests/output"), "nextstrain/sars-cov-2/wuhan-hu-1/orfs", Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"), + Path("tests/data/samples/primers.yaml"), ) assert True