diff --git a/ingest/bin/reverse_reversed_sequences.py b/ingest/bin/reverse_reversed_sequences.py deleted file mode 100644 index 6ca5ed2..0000000 --- a/ingest/bin/reverse_reversed_sequences.py +++ /dev/null @@ -1,29 +0,0 @@ -import pandas as pd -import argparse -from Bio import SeqIO - -if __name__=="__main__": - parser = argparse.ArgumentParser( - description="Reverse-complement reverse-complemented sequence", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--metadata', type=str, required=True, help="input metadata") - parser.add_argument('--sequences', type=str, required=True, help="input sequences") - parser.add_argument('--output', type=str, required=True, help="output sequences") - args = parser.parse_args() - - metadata = pd.read_csv(args.metadata, sep='\t') - - # Read in fasta file - with open(args.sequences, 'r') as f_in: - with open(args.output, 'w') as f_out: - for seq in SeqIO.parse(f_in, 'fasta'): - # Check if metadata['reverse'] is True - if metadata.loc[metadata['accession'] == seq.id, 'reverse'].values[0] == True: - # Reverse-complement sequence - seq.seq = seq.seq.reverse_complement() - print("Reverse-complementing sequence:", seq.id) - - # Write sequences to file - SeqIO.write(seq, f_out, 'fasta') diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 64dbab1..92fb886 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -7,17 +7,31 @@ ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv' # Params for the transform rule transform: - # Fields to rename. + # NCBI fields to rename to Nextstrain field names. 
# This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution'] + field_map: [ + 'accession=genbank_accession', + 'accession-rev=genbank_accession_rev', + 'isolate-lineage=strain', + 'sourcedb=database', + 'geo-region=region', + 'geo-location=location', + 'host-name=host', + 'isolate-collection-date=date', + 'release-date=release_date', + 'update-date=update_date', + 'sra-accs=sra_accessions', + 'submitter-names=authors', + 'submitter-affiliation=institution', + ] # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names strain_regex: '^.+$' # Back up strain name field if 'strain' doesn't match regex above - strain_backup_fields: ['accession'] + strain_backup_fields: ['genbank_accession'] # List of date fields to standardize - date_fields: ['date', 'date_submitted'] + date_fields: ['date', 'release_date', 'update_date'] # Expected date formats present in date fields # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -47,14 +61,14 @@ transform: # User annotations file annotations: 'source-data/annotations.tsv' # ID field used to merge annotations - annotations_id: 'accession' + annotations_id: 'genbank_accession' # Field to use as the sequence ID in the FASTA file - id_field: 'accession' + id_field: 'genbank_accession' # Field to use as the sequence in the FASTA file sequence_field: 'sequence' # Final output columns for the metadata TSV metadata_columns: [ - 'accession', + 'genbank_accession', 'genbank_accession_rev', 'strain', 'date', @@ -62,11 +76,12 @@ transform: 'country', 'division', 'location', + 'length', 'host', - 'date_submitted', - 'sra_accession', + 'release_date', + 'update_date', + 
'sra_accessions', 'abbr_authors', - 'reverse', 'authors', 'institution' ] diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index eb79418..57b4f8c 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -1,17 +1,17 @@ +# Maps the NCBI output TSV column names back to the NCBI mnemonics. +# This list should match the list in +# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics key value -Accession genbank_accession_rev -Source database database -Isolate Lineage strain -Geographic Region region -Geographic Location location -Isolate Collection date collected -Release date submitted -Update date updated +Accession accession-rev +Source database sourcedb +Isolate Lineage isolate-lineage +Geographic Region geo-region +Geographic Location geo-location +Isolate Collection date isolate-collection-date +Release date release-date +Update date update-date Length length -Host Name host -Isolate Lineage source isolation_source -BioProjects bioproject_accession -BioSample accession biosample_accession -SRA Accessions sra_accession -Submitter Names authors -Submitter Affiliation submitting_organization +Host Name host-name +SRA Accessions sra-accs +Submitter Names submitter-names +Submitter Affiliation submitter-affiliation diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 3f32f9b..8d27193 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: rule format_ncbi_dataset_report: - # Formats the headers to be the same as before we used NCBI Datasets - # The only fields we do not have equivalents for are "title" and "publications" + # Formats the headers to match the NCBI mnemonic names input: 
dataset_package="data/ncbi_dataset.zip", ncbi_field_map=config["ncbi_field_map"], @@ -93,8 +92,8 @@ rule format_ncbi_dataset_report: --package {input.dataset_package} \ --fields {params.fields_to_include:q} \ | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ - | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \ - | tsv-select -H -f genbank_accession --rest last \ + | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \ + | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} """ @@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column genbank_accession_rev \ + --seq-id-column accession-rev \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \