Skip to content

Commit

Permalink
feat: submission to Loculus (#55)
Browse files Browse the repository at this point in the history
* starting point

* posting metadata works

* placeholder structure

* works - first programmatic submissions

* with this

* long line

* fix link
  • Loading branch information
gordonkoehn authored Dec 3, 2024
1 parent 4f3efe1 commit bfa659c
Show file tree
Hide file tree
Showing 4 changed files with 289 additions and 11 deletions.
137 changes: 137 additions & 0 deletions scripts/pathinder-client/submit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Credits to Chaoran Chen for this script.
https://github.com/microbio-hackathon-2024/pathinder-client/blob/main/submit.py
Use with:
python3 submit.py --input ../../loculus_submission/metadata.tsv --group-id 1 --username testuser
pw: testuser
"""

from __future__ import annotations

import csv
import getpass
import tempfile

import click
import requests

# Keycloak OpenID Connect token endpoint; the username/password are POSTed
# here to obtain the access token used for the submission request.
KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token"
# Submission endpoint template; "{group_id}" is filled in with str.format()
# right before the upload request is sent.
SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN"


def generate_placeholder_fasta(submission_ids: list[str]) -> str:
    """
    Build placeholder FASTA text with one record per submission ID.

    Every record is a ">" header line carrying the ID followed by the dummy
    sequence "NNN". Records are separated by newlines; the result has no
    trailing newline and is the empty string for an empty ID list.
    """
    # One two-line record per ID, stitched together with newlines.
    return "\n".join(f">{sid}\nNNN" for sid in submission_ids)


def get_submission_ids_from_tsv(file_path: str) -> list[str]:
    """
    Read a TSV file and return the values of its "submissionId" column.

    Args:
        file_path: Path to a tab-separated file whose first line is a header.

    Returns:
        The "submissionId" value of every data row, in file order.

    Raises:
        ValueError: If the file is empty (no header at all) or the header
            has no "submissionId" column.
    """
    # newline="" is what the csv module expects so it can handle line
    # endings (including newlines inside quoted fields) itself.
    with open(file_path, "r", newline="", encoding="utf-8") as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter="\t")

        # fieldnames is None for an empty file; treat that like a missing
        # column instead of failing with a TypeError on the "in" test.
        header = reader.fieldnames
        if header is None or "submissionId" not in header:
            raise ValueError(
                'Error: "submissionId" column not found in the TSV file.'
            )

        return [row["submissionId"] for row in reader]


def ask_for_password() -> str:
    """
    Ask the user for their password on the terminal.

    The prompt is delegated to getpass, so the typed characters are not
    echoed back. Returns whatever the user entered.
    """
    entered_password = getpass.getpass(prompt="Enter your password: ")
    return entered_password


def get_loculus_authentication_token(
    username: str, password: str, timeout: float = 30.0
) -> str:
    """
    Request an access token from the Keycloak authentication server.

    Args:
        username: Account name sent with the password grant.
        password: Password belonging to ``username``.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout unless one is passed, so without it a stalled
            connection would block forever.

    Returns:
        The "access_token" value of the JSON response.

    Raises:
        Exception: If the server does not answer with status 200, or if the
            response body carries no access token.
    """
    response = requests.post(
        KEYCLOAK_TOKEN_URL,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={
            "username": username,
            "password": password,
            "grant_type": "password",
            "client_id": "backend-client",
        },
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(
            f"Error: Unable to authenticate. Status code: {response.status_code}, Response: {response.text}"
        )

    access_token = response.json().get("access_token")
    if not access_token:
        # Fail here rather than hand None to the caller, which would only
        # surface later as a confusing "Bearer None" rejection.
        raise Exception(
            "Error: Unable to authenticate. "
            "The response did not contain an access token."
        )
    return access_token


def submit(
    authentication_token: str,
    group_id: int,
    tsv_path: str,
    fasta_path: str,
    timeout: float = 60.0,
) -> None:
    """
    Submit the metadata and sequence files to Loculus via a POST request.

    Args:
        authentication_token: Bearer token sent in the Authorization header.
        group_id: ID of the submitting group; inserted into the submission
            URL and into the review link printed on success.
        tsv_path: Path of the metadata TSV, uploaded as "metadataFile".
        fasta_path: Path of the FASTA file, uploaded as "sequenceFile".
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout unless one is passed.

    Raises:
        Exception: If the server does not answer with status 200.
    """
    submission_url = SUBMISSION_URL.format(group_id=group_id)

    # Both files are opened in binary mode and streamed as multipart parts.
    with open(tsv_path, "rb") as tsv_file, open(fasta_path, "rb") as fasta_file:
        response = requests.post(
            submission_url,
            headers={
                "Authorization": f"Bearer {authentication_token}",
                "accept": "application/json",
            },
            files={"metadataFile": tsv_file, "sequenceFile": fasta_file},
            timeout=timeout,
        )

    if response.status_code != 200:
        raise Exception(
            f"Error: Unable to submit. Status code: {response.status_code}, Response: {response.text}"
        )

    print("Upload successful.")
    # The link points at the instance the upload went to and at the
    # submitting group, instead of a fixed host and a fixed group 1.
    # NOTE(review): SUBMISSION_URL targets the "test" organism while this
    # link uses "salmonella" - confirm which organism path is intended.
    print(
        "You can approve the upload for release at:\n\n"
        f"https://wise-seqs.loculus.org/salmonella/submission/{group_id}/review"
    )


@click.command()
@click.option("--input", required=True, help="Path to the input TSV file")
@click.option(
    "--group-id",
    required=True,
    type=int,
    help="The ID of the group for which you are submitting",
)
@click.option("--username", required=True, help="Your username")
def main(input: str, group_id: int, username: str):
    """Submit a metadata TSV with placeholder sequences to Loculus."""
    # Local import: only needed here to remove the temporary FASTA file.
    import os

    password = ask_for_password()
    authentication_token = get_loculus_authentication_token(username, password)
    submission_ids = get_submission_ids_from_tsv(input)
    placeholder_fasta_str = generate_placeholder_fasta(submission_ids)

    # Write the placeholder FASTA to a temporary file. delete=False keeps the
    # file on disk after it is closed so submit() can reopen it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") as fasta_file:
        fasta_file.write(placeholder_fasta_str.encode("utf-8"))
        placeholder_tmp_path = fasta_file.name

    try:
        submit(authentication_token, group_id, input, placeholder_tmp_path)
    finally:
        # Remove the temporary file whether or not the upload succeeded, so
        # repeated runs do not pile up files in the temp directory.
        os.unlink(placeholder_tmp_path)


# Run the click command when the file is executed as a script.
if __name__ == "__main__":
    main()
35 changes: 28 additions & 7 deletions scripts/vp_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import silo_input_transformer
from sr2silo.convert import bam_to_sam
from sr2silo.lapis import submit
from sr2silo.process import pair_normalize_reads
from sr2silo.translation import translate

Expand Down Expand Up @@ -462,7 +463,7 @@ def process_directory(
logging.error(f"Input file not found: {sample_fp}")
raise FileNotFoundError(f"Input file not found: {sample_fp}")

# Get Sample and Batch metadata and write to a file
##### Get Sample and Batch metadata and write to a file #####
metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file)
# add nextclade reference to metadata
metadata["nextclade_reference"] = nextclade_reference
Expand All @@ -472,25 +473,25 @@ def process_directory(
json.dump(metadata, f, indent=4)
logging.info(f"Metadata saved to: {metadata_file}")

# Convert BAM to SAM
##### Convert BAM to SAM #####
logging.info(f"Converting BAM to SAM")
bam_file = sample_fp
sam_data = bam_to_sam(bam_file)

# Process SAM to FASTA
##### Process SAM to FASTA #####
logging.info(f"Processing SAM to FASTA (pair, merge, and normalize reads)")
fasta_file = result_dir / "reads.fasta"
insertions_file = result_dir / "insertions.txt"
pair_normalize_reads(sam_data, fasta_file, insertions_file)

# Translate nucleotides to amino acids
##### Translate nucleotides to amino acids #####
logging.info(f"Aliging and translating sequences")
results_dir_translated = result_dir / "translated"
translate([fasta_file], results_dir_translated, nextclade_reference)

logging.info(f"Results saved to: {results_dir_translated}")

# Wrangle to Nextclade format // silo_input_transformer inputs
##### Wrangle to Nextclade format // silo_input_transformer inputs #####
result_dir_wrangled = result_dir / "wrangled"
path_to_files = wrangle_for_transformer(
input_dir=results_dir_translated,
Expand All @@ -501,7 +502,7 @@ def process_directory(
database_config=database_config,
)

# Transform to NDJSON
###### Transform to NDJSON ######
result_dir_transformed = result_dir / "transformed"
logging.debug(f"Transforming to NDJSON")
logging.debug(f"sequence_file_directory: {result_dir_wrangled}")
Expand All @@ -515,7 +516,27 @@ def process_directory(
reference_genomes_fp=path_to_files["reference_genomes_fp"],
)

# NEED A WORKDIR FOR THIS ALL TO RUN IN A DOCKER CONTAINER
##### PLACEHOLDER: for uploading S3 reference to SILO #####
# make new dir for upload_submissions
result_dir_submission = result_dir / "submission"
result_dir_submission.mkdir(parents=True, exist_ok=True)
# Placeholder s3 link
srLink = "s3://sr2silo01/silo_input.ndjson"
# make mock metadata.tsv file with the srLink with the header "submissionId | s3Link | versionComment"
# and one entry 001 | s3://sr2silo01/silo_input.ndjson | ""
submission_metadata_fp = result_dir_submission / "metadata.tsv"
with (submission_metadata_fp).open("w") as f:
f.write("submissionId\ts3Link\tversionComment\n")
f.write("001\t" + srLink + "\t\n")
logging.info(f"Submission metadata saved to: {submission_metadata_fp}")

##### Submit S3 reference to SILO #####
logging.info(f"Submitting to Loculus")
input_fp = submission_metadata_fp
username = "testuser"
password = "testuser"
group_id = 1
submit(input_fp, username, password, group_id)


@click.command()
Expand Down
124 changes: 124 additions & 0 deletions src/sr2silo/lapis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""Interactions with the Lapis API."""

from __future__ import annotations

import csv
import tempfile
from pathlib import Path

import requests

# TODO: move to environment variables
# Keycloak OpenID Connect token endpoint; the username/password are POSTed
# here to obtain the access token used for the submission request.
# The URL is split into adjacent string literals only to keep lines short.
KEYCLOAK_TOKEN_URL = (
"https:"
"//authentication-wise-seqs.loculus.org"
"/realms/loculus/protocol/openid-connect/token"
)
# Submission endpoint template; "{group_id}" is filled in with str.format()
# right before the upload request is sent.
SUBMISSION_URL = (
"https:"
"//backend-wise-seqs.loculus.org"
"/test/submit?groupId={group_id}&dataUseTermsType=OPEN"
)


def generate_placeholder_fasta(submission_ids: list[str]) -> str:
    """
    Produce placeholder FASTA content covering every given submission ID.

    For each ID a header line (">" followed by the ID) and the dummy sequence
    "NNN" are emitted. Lines are joined with newlines; there is no trailing
    newline, and an empty ID list yields the empty string.
    """
    lines: list[str] = []
    for identifier in submission_ids:
        # Header line first, then the placeholder sequence.
        lines.extend((f">{identifier}", "NNN"))
    return "\n".join(lines)


def get_submission_ids_from_tsv(file_path: str) -> list[str]:
    """
    Read a TSV file and return the values of its "submissionId" column.

    Args:
        file_path: Path to a tab-separated file whose first line is a header.

    Returns:
        The "submissionId" value of every data row, in file order.

    Raises:
        ValueError: If the file is empty (no header at all) or the header
            has no "submissionId" column.
    """
    # newline="" is what the csv module expects so it can handle line
    # endings (including newlines inside quoted fields) itself.
    with open(file_path, "r", newline="", encoding="utf-8") as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter="\t")

        # fieldnames is None for an empty file. Such a file has no
        # "submissionId" column either, so it is rejected the same way
        # instead of silently producing an empty submission.
        header = reader.fieldnames
        if header is None or "submissionId" not in header:
            raise ValueError(
                'Error: "submissionId" column not found in the TSV file.'
            )

        return [row["submissionId"] for row in reader]


def get_loculus_authentication_token(
    username: str, password: str, timeout: float = 30.0
) -> str:
    """
    Request an access token from the Keycloak authentication server.

    Args:
        username: Account name sent with the password grant.
        password: Password belonging to ``username``.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout unless one is passed, so without it a stalled
            connection would block forever.

    Returns:
        The "access_token" value of the JSON response.

    Raises:
        Exception: If the server does not answer with status 200, or if the
            response body carries no access token.
    """
    response = requests.post(
        KEYCLOAK_TOKEN_URL,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={
            "username": username,
            "password": password,
            "grant_type": "password",
            "client_id": "backend-client",
        },
        timeout=timeout,
    )

    if response.status_code != 200:
        # The first fragment ends with ", " so the two halves do not run
        # together as "...,Response:" in the final message.
        raise Exception(
            f"Error: Unable to authenticate. Status code: {response.status_code}, "
            f"Response: {response.text}"
        )

    access_token = response.json().get("access_token")
    if not access_token:
        # Fail here rather than hand None to the caller, which would only
        # surface later as a confusing "Bearer None" rejection.
        raise Exception(
            "Error: Unable to authenticate. "
            "The response did not contain an access token."
        )
    return access_token


def _submit(
    authentication_token: str,
    group_id: int,
    tsv_path: str,
    fasta_path: str,
    timeout: float = 60.0,
) -> None:
    """
    Submit the metadata and sequence files to Loculus via a POST request.

    Args:
        authentication_token: Bearer token sent in the Authorization header.
        group_id: ID of the submitting group; inserted into the submission
            URL and into the review link printed on success.
        tsv_path: Path of the metadata TSV, uploaded as "metadataFile".
        fasta_path: Path of the FASTA file, uploaded as "sequenceFile".
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout unless one is passed.

    Raises:
        Exception: If the server does not answer with status 200.
    """
    submission_url = SUBMISSION_URL.format(group_id=group_id)

    # Both files are opened in binary mode and streamed as multipart parts.
    with open(tsv_path, "rb") as tsv_file, open(fasta_path, "rb") as fasta_file:
        response = requests.post(
            submission_url,
            headers={
                "Authorization": f"Bearer {authentication_token}",
                "accept": "application/json",
            },
            files={"metadataFile": tsv_file, "sequenceFile": fasta_file},
            timeout=timeout,
        )

    if response.status_code != 200:
        raise Exception(
            f"Error: Unable to submit. Status code: {response.status_code}, "
            f"Response: {response.text}"
        )

    print("Upload successful.")
    # The link follows the submitting group instead of a fixed group 1.
    # NOTE(review): SUBMISSION_URL targets the "test" organism while this
    # link uses "salmonella" - confirm which organism path is intended.
    print(
        "You can approve the upload for release at:\n\n"
        f"https://wise-seqs.loculus.org/salmonella/submission/{group_id}/review"
    )


def submit(input_fp: Path, username: str, password: str, group_id: int) -> None:
    """
    Upload a metadata TSV file to a Loculus instance.

    A placeholder FASTA file with one "NNN" record per submission ID found
    in the metadata is generated and uploaded together with the TSV.

    Args:
        input_fp: Path to the metadata TSV; must have a "submissionId" column.
        username: Account name used to obtain the authentication token.
        password: Password belonging to ``username``.
        group_id: ID of the group the submission is made for.
    """
    authentication_token = get_loculus_authentication_token(username, password)
    submission_ids = get_submission_ids_from_tsv(str(input_fp))
    placeholder_fasta_str = generate_placeholder_fasta(submission_ids)

    # Write the placeholder FASTA to a temporary file. delete=False keeps the
    # file on disk after it is closed so _submit() can reopen it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") as fasta_file:
        fasta_file.write(placeholder_fasta_str.encode("utf-8"))
        placeholder_tmp_path = fasta_file.name

    try:
        _submit(authentication_token, group_id, str(input_fp), placeholder_tmp_path)
    finally:
        # Remove the temporary file whether or not the upload succeeded, so
        # repeated pipeline runs do not pile up files in the temp directory.
        Path(placeholder_tmp_path).unlink()
4 changes: 0 additions & 4 deletions src/sr2silo/silo.py

This file was deleted.

0 comments on commit bfa659c

Please sign in to comment.