From bfa659cc1eb009d23039f48b38628b9e877eec35 Mon Sep 17 00:00:00 2001 From: Gordon Julian Koehn Date: Tue, 3 Dec 2024 16:38:56 +0100 Subject: [PATCH] feat: submission to Loculus (#55) * starting point * posting metadata works * placeholder structure * works - first programmatic submissions * with this * long line * fix link --- scripts/pathinder-client/submit.py | 137 +++++++++++++++++++++++++++++ scripts/vp_transformer.py | 35 ++++++-- src/sr2silo/lapis.py | 124 ++++++++++++++++++++++++++ src/sr2silo/silo.py | 4 - 4 files changed, 289 insertions(+), 11 deletions(-) create mode 100644 scripts/pathinder-client/submit.py create mode 100644 src/sr2silo/lapis.py delete mode 100644 src/sr2silo/silo.py diff --git a/scripts/pathinder-client/submit.py b/scripts/pathinder-client/submit.py new file mode 100644 index 0000000..a693ceb --- /dev/null +++ b/scripts/pathinder-client/submit.py @@ -0,0 +1,137 @@ +"""Credits to Chaoran Chen for this script. + + https://github.com/microbio-hackathon-2024/pathinder-client/blob/main/submit.py + + Use with: + python3 submit.py --input ../../loculus_submission/metadata.tsv --group-id 1 --username testuser + + pw: testuser +""" + +from __future__ import annotations + +import csv +import getpass +import tempfile + +import click +import requests + +KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token" +SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN" + + +def generate_placeholder_fasta(submission_ids: list[str]) -> str: + """ + Generates a placeholder FASTA file for each submission ID with "NNN" as the sequence. + """ + fasta_entries = [] + for submission_id in submission_ids: + fasta_entries.append(f">{submission_id}") + fasta_entries.append("NNN") # Placeholder sequence + return "\n".join(fasta_entries) + + +def get_submission_ids_from_tsv(file_path: str) -> list[str]: + """ + Reads a TSV file and extracts submission IDs by parsing the "submissionId" column. + """ + submission_ids = [] + with open(file_path, "r") as tsv_file: + reader = csv.DictReader(tsv_file, delimiter="\t") + + # Check if "submissionId" exists in the header + if "submissionId" not in reader.fieldnames: + raise ValueError('Error: "submissionId" column not found in the TSV file.') + + # Extract submission IDs from the "submissionId" column + for row in reader: + submission_ids.append(row["submissionId"]) + + return submission_ids + + +def ask_for_password() -> str: + """ + Prompt the user for a password securely (without echoing the input). + """ + return getpass.getpass(prompt="Enter your password: ") + + +def get_loculus_authentication_token(username: str, password: str) -> str: + """ + Sends a request to the Keycloak authentication server to obtain a token. + """ + response = requests.post( + KEYCLOAK_TOKEN_URL, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data={ + "username": username, + "password": password, + "grant_type": "password", + "client_id": "backend-client", + }, + ) + + if response.status_code == 200: + return response.json().get("access_token") + else: + raise Exception( + f"Error: Unable to authenticate. Status code: {response.status_code}, Response: {response.text}" + ) + + +def submit( + authentication_token: str, group_id: int, tsv_path: str, fasta_path: str +) -> None: + """ + Submits the metadata and sequence files to Loculus via a POST request. + """ + submission_url = SUBMISSION_URL.format(group_id=group_id) + + with open(tsv_path, "rb") as tsv_file, open(fasta_path, "rb") as fasta_file: + response = requests.post( + submission_url, + headers={ + "Authorization": f"Bearer {authentication_token}", + "accept": "application/json", + }, + files={"metadataFile": tsv_file, "sequenceFile": fasta_file}, + ) + + if response.status_code == 200: + print("Upload successful.") + print( + "You can approve the upload for release at:\n\nhttps://microbioinfo-hackathon.loculus.org/salmonella/submission/1/review" + ) + else: + raise Exception( + f"Error: Unable to submit. Status code: {response.status_code}, Response: {response.text}" + ) + + +@click.command() +@click.option("--input", required=True, help="Path to the input TSV file") +@click.option( + "--group-id", + required=True, + type=int, + help="The ID of the group for which you are submitting", +) +@click.option("--username", required=True, help="Your username") +def main(input: str, group_id: int, username: str): + password = ask_for_password() + authentication_token = get_loculus_authentication_token(username, password) + submission_ids = get_submission_ids_from_tsv(input) + placeholder_fasta_str = generate_placeholder_fasta(submission_ids) + + # Write the placeholder FASTA to a temporary file + with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") as fasta_file: + fasta_file.write(placeholder_fasta_str.encode("utf-8")) + placeholder_tmp_path = fasta_file.name + + submit(authentication_token, group_id, input, placeholder_tmp_path) + + +if __name__ == "__main__": + main() diff --git a/scripts/vp_transformer.py b/scripts/vp_transformer.py index 4cf1dd8..460dea4 100644 --- a/scripts/vp_transformer.py +++ b/scripts/vp_transformer.py @@ -14,6 +14,7 @@ import silo_input_transformer from sr2silo.convert import bam_to_sam +from sr2silo.lapis import submit from sr2silo.process import pair_normalize_reads from sr2silo.translation import translate @@ -462,7 +463,7 @@ def process_directory( logging.error(f"Input file not found: {sample_fp}") raise FileNotFoundError(f"Input file not found: {sample_fp}") - # Get Sample and Batch metadata and write to a file + ##### Get Sample and Batch metadata and write to a file ##### metadata = get_metadata(sample_id, batch_id, timeline_file, primers_file) # add nextclade reference to metadata metadata["nextclade_reference"] = nextclade_reference @@ -472,25 +473,25 @@ def process_directory( json.dump(metadata, f, indent=4) logging.info(f"Metadata saved to: {metadata_file}") - # Convert BAM to SAM + ##### Convert BAM to SAM ##### logging.info(f"Converting BAM to SAM") bam_file = sample_fp sam_data = bam_to_sam(bam_file) - # Process SAM to FASTA + ##### Process SAM to FASTA ##### logging.info(f"Processing SAM to FASTA (pair, merge, and normalize reads)") fasta_file = result_dir / "reads.fasta" insertions_file = result_dir / "insertions.txt" pair_normalize_reads(sam_data, fasta_file, insertions_file) - # Translate nucleotides to amino acids + ##### Translate nucleotides to amino acids ##### logging.info(f"Aliging and translating sequences") results_dir_translated = result_dir / "translated" translate([fasta_file], results_dir_translated, nextclade_reference) logging.info(f"Results saved to: {results_dir_translated}") - # Wrangle to Nextclade format // silo_input_transformer inputs + ##### Wrangle to Nextclade format // silo_input_transformer inputs ##### result_dir_wrangled = result_dir / "wrangled" path_to_files = wrangle_for_transformer( input_dir=results_dir_translated, @@ -501,7 +502,7 @@ def process_directory( database_config=database_config, ) - # Transform to NDJSON + ###### Transform to NDJSON ###### result_dir_transformed = result_dir / "transformed" logging.debug(f"Transforming to NDJSON") logging.debug(f"sequence_file_directory: {result_dir_wrangled}") @@ -515,7 +516,27 @@ def process_directory( reference_genomes_fp=path_to_files["reference_genomes_fp"], ) - # NEED A WORKDIR FOR THIS ALL TO RUN IN A DOCKER CONTAINER + ##### PLACEHOLDER: for uploading S3 reference to SILO ##### + # make new dir for upload_submissions + result_dir_submission = result_dir / "submission" + result_dir_submission.mkdir(parents=True, exist_ok=True) + # Placeholder s3 link + srLink = "s3://sr2silo01/silo_input.ndjson" + # make mock metadata.tsv file with the srLink with the header "submissionId | s3Link | versionComment" + # and one entry 001 | s3://sr2silo01/silo_input.ndjson | "" + submission_metadata_fp = result_dir_submission / "metadata.tsv" + with (submission_metadata_fp).open("w") as f: + f.write("submissionId\ts3Link\tversionComment\n") + f.write("001\t" + srLink + "\t\n") + logging.info(f"Submission metadata saved to: {submission_metadata_fp}") + + ##### Submit S3 reference to SILO ##### + logging.info(f"Submitting to Loculus") + input_fp = submission_metadata_fp + username = "testuser" + password = "testuser" + group_id = 1 + submit(input_fp, username, password, group_id) @click.command() diff --git a/src/sr2silo/lapis.py b/src/sr2silo/lapis.py new file mode 100644 index 0000000..329a840 --- /dev/null +++ b/src/sr2silo/lapis.py @@ -0,0 +1,124 @@ +""""Interactions with the Lapis API.""" + +from __future__ import annotations + +import csv +import tempfile +from pathlib import Path + +import requests + +# TODO: move to environment variables +KEYCLOAK_TOKEN_URL = ( + "https:" + "//authentication-wise-seqs.loculus.org" + "/realms/loculus/protocol/openid-connect/token" +) +SUBMISSION_URL = ( + "https:" + "//backend-wise-seqs.loculus.org" + "/test/submit?groupId={group_id}&dataUseTermsType=OPEN" +) + + +def generate_placeholder_fasta(submission_ids: list[str]) -> str: + """ + Generates a placeholder FASTA file for each submission ID with "NNN" as + the sequence. + """ + fasta_entries = [] + for submission_id in submission_ids: + fasta_entries.append(f">{submission_id}") + fasta_entries.append("NNN") # Placeholder sequence + return "\n".join(fasta_entries) + + +def get_submission_ids_from_tsv(file_path: str) -> list[str]: + """ + Reads a TSV file and extracts submission IDs by parsing the "submissionId" column. + """ + submission_ids = [] + with open(file_path, "r") as tsv_file: + reader = csv.DictReader(tsv_file, delimiter="\t") + + # Check if "submissionId" exists in the header + if reader.fieldnames is not None and "submissionId" not in reader.fieldnames: + raise ValueError('Error: "submissionId" column not found in the TSV file.') + + # Extract submission IDs from the "submissionId" column + for row in reader: + submission_ids.append(row["submissionId"]) + + return submission_ids + + +def get_loculus_authentication_token(username: str, password: str) -> str: + """ + Sends a request to the Keycloak authentication server to obtain a token. + """ + response = requests.post( + KEYCLOAK_TOKEN_URL, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data={ + "username": username, + "password": password, + "grant_type": "password", + "client_id": "backend-client", + }, + ) + + if response.status_code == 200: + return response.json().get("access_token") + else: + raise Exception( + f"Error: Unable to authenticate. Status code: {response.status_code}," + f"Response: {response.text}" + ) + + +def _submit( + authentication_token: str, group_id: int, tsv_path: str, fasta_path: str +) -> None: + """ + Submits the metadata and sequence files to Loculus via a POST request. + """ + submission_url = SUBMISSION_URL.format(group_id=group_id) + + with open(tsv_path, "rb") as tsv_file, open(fasta_path, "rb") as fasta_file: + response = requests.post( + submission_url, + headers={ + "Authorization": f"Bearer {authentication_token}", + "accept": "application/json", + }, + files={"metadataFile": tsv_file, "sequenceFile": fasta_file}, + ) + + if response.status_code == 200: + print("Upload successful.") + print( + "You can approve the upload for release at:\n\n" + "https://wise-seqs.loculus.org/salmonella/submission/1/review" + ) + else: + raise Exception( + f"Error: Unable to submit. Status code: {response.status_code}, " + f"Response: {response.text}" + ) + + +def submit(input_fp: Path, username: str, password: str, group_id: int) -> None: + """ + Upload the a metadata tsv file to a loculus instance. + """ + + authentication_token = get_loculus_authentication_token(username, password) + submission_ids = get_submission_ids_from_tsv(str(input_fp)) + placeholder_fasta_str = generate_placeholder_fasta(submission_ids) + + # Write the placeholder FASTA to a temporary file + with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") as fasta_file: + fasta_file.write(placeholder_fasta_str.encode("utf-8")) + placeholder_tmp_path = fasta_file.name + + _submit(authentication_token, group_id, str(input_fp), placeholder_tmp_path) diff --git a/src/sr2silo/silo.py b/src/sr2silo/silo.py deleted file mode 100644 index 1b7d4bb..0000000 --- a/src/sr2silo/silo.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Place to interacting with the SILO database - (including the insert(JSON) method, potentially). - This module might handle S3 interactions as well. -"""