[on hold] dedup ncbi segments #93

Draft: wants to merge 3 commits into base: master

2 changes: 1 addition & 1 deletion ingest/Snakefile
@@ -20,7 +20,7 @@ rule upload_all:
metadata="fauna/s3/metadata.done",

include: "rules/ingest_fauna.smk"
include: "rules/merge_segment_metadata.smk"
# include: "rules/merge_segment_metadata.smk"
include: "rules/upload_to_s3.smk"

# Allow users to import custom rules provided via the config.
63 changes: 59 additions & 4 deletions ingest/build-configs/ncbi/defaults/config.yaml
@@ -110,8 +110,8 @@ curate:
metadata_columns:
- strain
- virus
- isolate_id
- date
- date_released
- region
- country
- division
@@ -123,11 +123,39 @@ curate:
- submitting_lab
- authors
- PMID
- sra_accessions
- gisaid_clade
- h5_clade
- genbank_accession
- sra_accessions
- date_released
- n_segments
- segment_pb2
- segment_pb1
- segment_pa
- segment_ha
- segment_np
- segment_na
- segment_mp
- segment_ns
- isolate_id_pb2
- isolate_id_pb1
- isolate_id_pa
- isolate_id_ha
- isolate_id_np
- isolate_id_na
- isolate_id_mp
- isolate_id_ns
- genbank_accession_pb2
- genbank_accession_pb1
- genbank_accession_pa
- genbank_accession_ha
- genbank_accession_np
- genbank_accession_na
- genbank_accession_mp
- genbank_accession_ns
# Note - we've not exported the common fields:
# bioprojects, database, original_strain, sample_type, abbr_authors
# and also dropped many segment-specific fields.
# QUESTION: what's the difference between isolate_id and genbank_accession?


s3_dst:
ncbi: s3://nextstrain-data/files/workflows/avian-flu/h5n1/ncbi
@@ -142,3 +170,30 @@ join_ncbi_andersen:
ncbi_source: genbank
andersen_source: sra-via-andersen-lab
dedup_column: strain

grouping:
common_strain_fields:
# - strain # not needed - should improve
- date
- host
- region
- country
- division
- location
- bioprojects
- h5_clade
- database
- subtype
- authors
- abbr_authors
- submitting_lab
- originating_lab
- virus
- domestic_status
- PMID
- gisaid_clade
- original_strain
- sample_type
- sra_accessions
- date_released # enforce release date to be the same, or segment-specific?
accession: genbank_accession
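
For orientation, one grouped ("wide") record produced from this config carries the common strain fields once, plus per-segment presence, isolate-id and accession columns matching the metadata_columns above. A purely illustrative example as a Python literal (made-up values, only three of the eight segments shown):

# Purely illustrative: one grouped record after dedup/grouping, with made-up values.
example_wide_record = {
    "strain": "A/example-host/USA/1/2024",
    "virus": "avian_flu",
    "date": "2024-05-01",
    "date_released": "2024-06-15",
    "region": "North America",
    "country": "USA",
    "n_segments": 2,               # sum of the segment_* presence flags below
    "segment_ha": 1,
    "isolate_id_ha": "isolate-00001",
    "genbank_accession_ha": "XX000001",
    "segment_na": 1,
    "isolate_id_na": "isolate-00001",
    "genbank_accession_na": "XX000002",
    "segment_pb2": 0,              # no PB2 sequence for this strain
    "isolate_id_pb2": "",
    "genbank_accession_pb2": "",
    # ...pb1, pa, np, mp, ns follow the same pattern
}
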
92 changes: 63 additions & 29 deletions ingest/build-configs/ncbi/rules/curate.smk
@@ -104,47 +104,81 @@ rule curate:
--id-field {params.annotations_id} ) 2>> {log} > {output.curated_ndjson}
"""


rule split_curated_ndjson_by_segment:
"""
Split out the full curate NDJSON by segment, then we can deduplicate
records by strain name within each segment
"""
rule dedup_by_accession:
input:
curated_ndjson="ncbi/data/curated.ndjson",
ndjson="ncbi/data/curated.ndjson",
output:
metadata="ncbi/data/all_metadata_{segment}.tsv",
sequences="ncbi/results/sequences_{segment}.fasta",
ndjson="ncbi/data/curated_by_strain.ndjson",
params:
id_field=config["curate"]["output_id_field"],
sequence_field=config["curate"]["output_sequence_field"],
log:
"ncbi/logs/{segment}/split_curated_ndjson_by_segment.txt",
benchmark:
"ncbi/benchmarks/{segment}/split_curated_ndjson_by_segment.txt"
common_strain_fields = config["grouping"]["common_strain_fields"],
segments = config["segments"],
accession = config["grouping"]["accession"],
shell:
r"""
python3 scripts/group_metadata.py \
--metadata {input.ndjson} \
--common-strain-fields {params.common_strain_fields} \
--segments {params.segments} \
--accession {params.accession} \
--output-metadata {output.ndjson}
"""
(cat {input.curated_ndjson} \
| ./build-configs/ncbi/bin/filter-ndjson-by-segment \
--segment {wildcards.segment} \
| ./build-configs/ncbi/bin/dedup-by-strain \
| augur curate passthru \
--output-metadata {output.metadata} \
--output-fasta {output.sequences} \
--output-id-field {params.id_field} \
--output-seq-field {params.sequence_field} ) 2>> {log}
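
scripts/group_metadata.py itself is not part of this diff; a minimal sketch of the grouping it performs, matching the CLI used by rule dedup_by_accession, is below. The per-record field names ("segment", "sequence", "isolate_id") and the conflict handling are assumptions.

"""
Hypothetical sketch of scripts/group_metadata.py. Collapses the per-segment
curated NDJSON into one wide record per strain. A real implementation would
also drop duplicate accessions and check that the common strain fields agree
across segments.
"""
import argparse
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--metadata", required=True, help="curated NDJSON, one record per segment")
    parser.add_argument("--common-strain-fields", nargs="+", required=True)
    parser.add_argument("--segments", nargs="+", required=True)
    parser.add_argument("--accession", required=True, help="accession field, e.g. genbank_accession")
    parser.add_argument("--output-metadata", required=True)
    args = parser.parse_args()

    strains = {}  # strain name -> wide (grouped) record
    with open(args.metadata) as fh:
        for line in fh:
            record = json.loads(line)
            strain, segment = record["strain"], record["segment"]
            wide = strains.setdefault(
                strain, {field: record.get(field, "") for field in ["strain", *args.common_strain_fields]}
            )
            wide[f"segment_{segment}"] = 1
            wide[f"isolate_id_{segment}"] = record.get("isolate_id", "")
            wide[f"{args.accession}_{segment}"] = record.get(args.accession, "")
            wide[f"sequence_{segment}"] = record.get("sequence", "")

    with open(args.output_metadata, "w") as fh:
        for wide in strains.values():
            for segment in args.segments:
                wide.setdefault(f"segment_{segment}", 0)  # explicit absence flags
            fh.write(json.dumps(wide) + "\n")
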

rule write_sequences_from_ndjson:
input:
ndjson="ncbi/data/curated_by_strain.ndjson",
output:
sequences=expand("ncbi/results/sequences_{segment}.fasta", segment=config["segments"])
params:
sequences=[f"{segment}=ncbi/results/sequences_{segment}.fasta" for segment in config["segments"]]
shell:
r"""
python3 scripts/write_sequences_from_ndjson.py \
--input {input.ndjson} \
--output {params.sequences}
"""

# rule split_curated_ndjson_by_segment:
# """
# Split out the full curate NDJSON by segment, then we can deduplicate
# records by strain name within each segment
# """
# input:
# curated_ndjson="ncbi/data/curated.ndjson",
# output:
# metadata="ncbi/data/all_metadata_{segment}.tsv",
# sequences="ncbi/results/sequences_{segment}.fasta",
# params:
# id_field=config["curate"]["output_id_field"],
# sequence_field=config["curate"]["output_sequence_field"],
# log:
# "ncbi/logs/{segment}/split_curated_ndjson_by_segment.txt",
# benchmark:
# "ncbi/benchmarks/{segment}/split_curated_ndjson_by_segment.txt"
# shell:
# """
# (cat {input.curated_ndjson} \
# | ./build-configs/ncbi/bin/filter-ndjson-by-segment \
# --segment {wildcards.segment} \
# | ./build-configs/ncbi/bin/dedup-by-strain \
# | augur curate passthru \
# --output-metadata {output.metadata} \
# --output-fasta {output.sequences} \
# --output-id-field {params.id_field} \
# --output-seq-field {params.sequence_field} ) 2>> {log}
# """


rule subset_metadata:
input:
metadata="ncbi/data/all_metadata_{segment}.tsv",
metadata="ncbi/data/curated_by_strain.ndjson",
output:
subset_metadata="ncbi/data/metadata_{segment}.tsv",
subset_metadata = "ncbi/results/metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.subset_metadata}
r"""
cat {input.metadata} \
| augur curate passthru --output-metadata - \
| tsv-select -H -f {params.metadata_fields} \
> {output.subset_metadata}
"""
62 changes: 45 additions & 17 deletions ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
@@ -155,7 +155,7 @@ rule match_metadata_and_segment_fasta:
metadata = "andersen-lab/data/metadata.tsv",
fasta = "andersen-lab/data/{segment}.fasta"
output:
metadata = "andersen-lab/data/matched_metadata_{segment}.tsv",
presence_absence = "andersen-lab/results/presence_absence_{segment}.tsv",
fasta = "andersen-lab/results/sequences_{segment}.fasta",
params:
input_id_field="isolate_id",
@@ -172,30 +172,58 @@ rule match_metadata_and_segment_fasta:
--seq-field {params.sequence_field} \
--unmatched-reporting warn \
--duplicate-reporting warn \
--output-metadata {output.metadata} \
--output-metadata - \
--output-fasta {output.fasta} \
--output-id-field {params.output_id_field} \
--output-seq-field {params.sequence_field} \
| python scripts/segment_presence_absence.py \
--segment {wildcards.segment} \
1> {output.presence_absence} \
2> {log}
"""


rule reorder_metadata_columns:
"""
Using tsv-select to reorder the columns of the Andersen lab metadata to
exactly match the NCBI metadata columns. Ensures that we don't accidentally
append the wrong columns in joining steps.

tsv-select will exit with error if the column does not exist.
"""
rule add_segments_to_metadata:
input:
metadata = "andersen-lab/data/matched_metadata_{segment}.tsv",
metadata = "andersen-lab/data/metadata.tsv",
segments = expand("andersen-lab/results/presence_absence_{segment}.tsv", segment=config["segments"])
output:
reordered_metadata = "andersen-lab/data/metadata_{segment}.tsv"
metadata_partial = temp("andersen-lab/data/metadata_with_segments.tsv"), # augur merge doesn't allow writing to stdout :(
metadata = "andersen-lab/results/metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
# augur merge requires NAME=FILEPATH arguments, so we transform the inputs here:
metadata = lambda w,input: " ".join([f"main={input.metadata}", *[f"s_{idx}={s}" for idx,s in enumerate(input.segments)]]),
segments = config["segments"]
shell:
r"""
augur merge \
--metadata {params.metadata} \
--metadata-id-columns strain \
--no-source-columns \
--output-metadata {output.metadata_partial} \
&& cat {output.metadata_partial} \
| python scripts/add_n_segments_columns.py \
--segments {params.segments} \
> {output.metadata}
"""
tsv-select -H -f {params.metadata_fields} \
{input.metadata} > {output.reordered_metadata}
"""


# THIS RULE WAS UNUSED XXX
# rule reorder_metadata_columns:
# """
# Using tsv-select to reorder the columns of the Andersen lab metadata to
# exactly match the NCBI metadata columns. Ensures that we don't accidentally
# append the wrong columns in joining steps.

# tsv-select will exit with error if the column does not exist.
# """
# input:
# metadata = "andersen-lab/data/matched_metadata_{segment}.tsv",
# output:
# reordered_metadata = "andersen-lab/data/metadata_{segment}.tsv"
# params:
# metadata_fields=",".join(config["curate"]["metadata_columns"]),
# shell:
# """
# tsv-select -H -f {params.metadata_fields} \
# {input.metadata} > {output.reordered_metadata}
# """
40 changes: 20 additions & 20 deletions ingest/rules/merge_segment_metadata.smk
@@ -4,23 +4,23 @@ into a central metadata file.
"""


rule merge_segment_metadata:
"""
For each subtype's HA metadata file add a column "n_segments" which reports
how many segments have sequence data (no QC performed). This will force the
download & parsing of all segments for a given subtype. Note that this does
not currently consider the prescribed min lengths (see min_length function)
for each segment, but that would be a nice improvement.
"""
input:
segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
metadata = "{data_source}/data/metadata_ha.tsv",
output:
metadata = "{data_source}/results/metadata.tsv",
shell:
"""
python scripts/add_segment_counts.py \
--segments {input.segments} \
--metadata {input.metadata} \
--output {output.metadata}
"""
# rule merge_segment_metadata:
# """
# For each subtype's HA metadata file add a column "n_segments" which reports
# how many segments have sequence data (no QC performed). This will force the
# download & parsing of all segments for a given subtype. Note that this does
# not currently consider the prescribed min lengths (see min_length function)
# for each segment, but that would be a nice improvement.
# """
# input:
# segments = expand("{{data_source}}/data/metadata_{segment}.tsv", segment=config["segments"]),
# metadata = "{data_source}/data/metadata_ha.tsv",
# output:
# metadata = "{data_source}/results/metadata.tsv",
# shell:
# """
# python scripts/add_segment_counts.py \
# --segments {input.segments} \
# --metadata {input.metadata} \
# --output {output.metadata}
# """
29 changes: 29 additions & 0 deletions ingest/scripts/add_n_segments_columns.py
@@ -0,0 +1,29 @@
"""
Given a metadata TSV on STDIN, write out the same metadata TSV but with
an extra column "n_segments" which is the sum of all "segment_X" columns
for all X in --segments
TODO - use proper quoting once it's been decided
"""

import csv
import argparse
from sys import stdin,stdout

if __name__ == "__main__":
parser = argparse.ArgumentParser(description = __doc__)
parser.add_argument('--segments', metavar="SEGMENT", nargs="+", required=True, help="Segment names")
args = parser.parse_args()
reader = csv.DictReader(stdin, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
writer = csv.DictWriter( # copy/paste from the (unexported) `augur.io.metadata.write_records_to_tsv`
stdout,
[*list(reader.fieldnames), "n_segments"],
extrasaction='ignore',
delimiter='\t',
lineterminator='\n',
quoting=csv.QUOTE_NONE,
quotechar=None,
)
writer.writeheader()
for record in reader:
record['n_segments'] = sum([int(record.get(f"segment_{segment}", '0'))==1 for segment in args.segments])
writer.writerow(record)
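
A small worked example of the n_segments computation above (values illustrative):

# Worked example of the n_segments sum above, with made-up values.
segments = ["pb2", "pb1", "pa", "ha", "np", "na", "mp", "ns"]
record = {"strain": "A/example/2024", "segment_ha": "1", "segment_na": "1", "segment_pb2": "0"}
n_segments = sum(int(record.get(f"segment_{segment}", "0")) == 1 for segment in segments)
assert n_segments == 2  # only HA and NA have sequence data in this record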