Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding canonical primer names #44

Merged
merged 7 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker-compose.env
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ SAMPLE_DIR=../../../data/sr2silo/daemon_test/samples/A1_05_2024_10_08/20241024_2
SAMPLE_ID=A1_05_2024_10_08
BATCH_ID=20241024_2411515907
TIMELINE_FILE=../../../data/sr2silo/daemon_test/timeline.tsv
PRIMER_FILE=./tests/data/samples/primers.yaml
NEXTCLADE_REFERENCE=sars-cov2
RESULTS_DIR=./results
2 changes: 2 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ services:
volumes:
- ${SAMPLE_DIR}:/app/sample
- ${TIMELINE_FILE}:/app/timeline.tsv
- ${PRIMER_FILE}:/app/primers.yaml
- ${RESULTS_DIR}:/app/results
- ./scripts/vp_config.json:/app/scripts/vp_config.json
environment:
Expand All @@ -12,6 +13,7 @@ services:
- SAMPLE_ID=${SAMPLE_ID}
- BATCH_ID=${BATCH_ID}
- TIMELINE_FILE=${TIMELINE_FILE}
- PRIMER_FILE=${PRIMER_FILE}
- RESULTS_DIR=${RESULTS_DIR}

volumes:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ packages = [{ include = "sr2silo", from = "src" }]
[tool.poetry.dependencies]
python = "^3.10"
pysam = "^0.22.1"
pyyaml = "^6.0.2"

[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"
Expand Down
46 changes: 42 additions & 4 deletions scripts/vp_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path

import click
import yaml

from sr2silo.convert import bam_to_sam
from sr2silo.process import pair_normalize_reads
Expand Down Expand Up @@ -90,7 +91,7 @@ def convert_to_iso_date(date: str) -> str:
return date_obj.date().isoformat()


def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
def get_metadata(sample_id: str, batch_id: str, timeline: Path, primers: Path) -> dict:
"""
Get metadata for a given sample and batch directory.
Cross-references the directory with the timeline file to get the metadata.
Expand All @@ -99,6 +100,7 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
sample_id (str): The sample ID to use for metadata.
batch_id (str): The batch ID to use for metadata.
timeline (Path): The timeline file to cross-reference the metadata.
primers (Path): The primers file to cross-reference the metadata.

Returns:
dict: A dictionary containing the metadata.
Expand All @@ -122,7 +124,10 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
# timline has headers:
# sample_id batch_id read_length primer_protocol location_code sampling_date location_name
# get read length, primer protocol, location name
# double checl if location code and location code are the same
# double check if location code and location code are the same
if not timeline.is_file():
logging.error(f"Timeline file not found or is not a file: {timeline}")
raise FileNotFoundError(f"Timeline file not found or is not a file: {timeline}")
with timeline.open() as f:
reader = csv.reader(f, delimiter="\t")
for row in reader:
Expand Down Expand Up @@ -162,6 +167,28 @@ def get_metadata(sample_id: str, batch_id: str, timeline: Path) -> dict:
raise ValueError(
f"No matching entry found in timeline for sample_id {metadata['sample_id']} and batch_id {metadata['batch_id']}"
)
# Read the primers yaml to get additional metadata
# find the key with matching primer_protocol and get the "name" value
# as the canonical name of the primer protocol
if not primers.is_file():
logging.error(f"Primers file not found or is not a file: {primers}")
raise FileNotFoundError(f"Primers file not found or is not a file: {primers}")
# Load YAML file
with open(primers, "r") as file:
primers = yaml.safe_load(file)
logging.debug(f"Primers: {primers}")
logging.debug(f" Type of primers: {type(primers)}")
for primer in primers.keys():
if primer == metadata["primer_protocol"]:
logging.info(
f"Enriching metadata with primer data e.g. primer_protocol_name"
)
metadata["primer_protocol_name"] = primers[primer]["name"]
break
else:
raise ValueError(
f"No matching entry found in primers for primer_protocol {metadata['primer_protocol']}"
)
return metadata


Expand All @@ -172,6 +199,7 @@ def process_directory(
result_dir: Path,
nextclade_reference: str,
timeline_file: Path,
primers_file: Path,
file_name: str = "REF_aln_trim.bam",
) -> None:
"""Process all files in a given directory.
Expand All @@ -182,6 +210,7 @@ def process_directory(
result_dir (Path): The directory to save the results.
nextclade_reference (str): The reference to use for nextclade.
timeline_file (Path): The timeline file to cross-reference the metadata.
primers_file (Path): The primers file to cross-reference the metadata.
file_name (str): The name of the file to process

Returns:
Expand All @@ -202,7 +231,7 @@ def process_directory(
raise FileNotFoundError(f"Input file not found: {sample_fp}")

# Get Sample and Batch metadata and write to a file
metadata = get_metadata(sample_id, batch_id, timeline_file)
metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file)
# add nextclade reference to metadata
metadata["nextclade_reference"] = nextclade_reference
metadata_file = result_dir / "metadata.json"
Expand Down Expand Up @@ -239,19 +268,27 @@ def process_directory(
@click.option(
"--timeline_file", envvar="TIMELINE_FILE", help="Path to the timeline file."
)
@click.option("--primer_file", envvar="PRIMER_FILE", help="Path to the primers file.")
@click.option(
"--nextclade_reference",
envvar="NEXTCLADE_REFERENCE",
default="sars-cov-2",
help="Nextclade reference.",
)
def main(
sample_dir, sample_id, batch_id, result_dir, timeline_file, nextclade_reference
sample_dir,
sample_id,
batch_id,
result_dir,
timeline_file,
primer_file,
nextclade_reference,
):
"""Process a sample directory."""
logging.info(f"Processing sample directory: {sample_dir}")
logging.info(f"Saving results to: {result_dir}")
logging.info(f"Using timeline file: {timeline_file}")
logging.info(f"Using primers file: {primer_file}")
logging.info(f"Using Nextclade reference: {nextclade_reference}")
logging.info(f"Using sample_id: {sample_id}")
logging.info(f"Using batch_id: {batch_id}")
Expand All @@ -262,6 +299,7 @@ def main(
batch_id=batch_id,
result_dir=Path("results"),
timeline_file=Path("timeline.tsv"),
primers_file=Path("primers.yaml"),
nextclade_reference=nextclade_reference,
)

Expand Down
44 changes: 44 additions & 0 deletions tests/data/samples/primers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
v532:
name: SARS-CoV-2 ARTIC V5.3.2
alias:
- SARS-CoV-2 ARTIC V5.3.2 NEB Ultra II
- SARS-CoV-2 ARTIC V5.3.2 NexteraXT
inserts_bedfile: references/primers/v532/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v532/SARS-CoV-2.primer.bed
primers_file: references/primers/v532/SARS-CoV-2.tsv
primers_fasta: references/primers/v532/SARS-CoV-2.primer.fasta
v41:
name: SARS-CoV-2 ARTIC V4.1
alias:
- SARS-CoV-2 ARTIC V4.1 NEB Ultra II
- SARS-CoV-2 ARTIC V4.1 NexteraXT
inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed
primers_file: references/primers/v41/SARS-CoV-2.tsv
primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta
v4:
name: SARS-CoV-2 ARTIC V4
alias:
- SARS-CoV-2 ARTIC V4 NEB Ultra II
- SARS-CoV-2 ARTIC V4 NexteraXT
inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed
primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed
primers_file: references/primers/v4/SARS-CoV-2.tsv
primers_fasta: references/primers/v4/ARTIC_v4.fasta
v3:
name: SARS-CoV-2 ARTIC V3
alias:
- ARTIC_NEB
- ARTICV3_NEB
- COVID_ARTIC_V3_NEB
# Not 100% exact, because Illumina introduces additional controls
- Illumina_COVIDSeq
- COVIDSeq_RUO_Custom
# TODO check with Viollier what custom is
# Nimagen: not 100% sure.
# TODO check with GFB and Christian
- EASYSEQ_SARS_COV2_WHOLE_GENOME_NGS
inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed
primers_bedfile: references/primers/v3/nCoV-2019.primer.bed
primers_file: references/primers/v3/nCoV-2019.tsv
primers_fasta: references/primers/v3/ARTIC_v3.fasta
3 changes: 3 additions & 0 deletions tests/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def test_get_metadata():
sample_id="A1_05_2024_10_08",
batch_id="20241024_2411515907",
timeline=Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"),
primers=Path("tests/data/samples/primers.yaml"),
)

print(metadata)
Expand All @@ -35,6 +36,7 @@ def test_get_metadata():
"read_length": "250",
"primer_protocol": "v532",
"location_name": "Lugano (TI)",
"primer_protocol_name": "SARS-CoV-2 ARTIC V5.3.2",
}

assert metadata == expected_metadata
Expand All @@ -49,5 +51,6 @@ def test_process_directory():
Path("tests/output"),
"nextstrain/sars-cov-2/wuhan-hu-1/orfs",
Path("tests/data/samples/timeline_A1_05_2024_10_08.tsv"),
Path("tests/data/samples/primers.yaml"),
)
assert True
Loading