Skip to content

Commit

Permalink
Merge pull request #96 from nextstrain/dedup-ncbi
Browse files Browse the repository at this point in the history
Dedup ncbi
  • Loading branch information
joverlee521 authored Oct 16, 2024
2 parents e98c98c + 12e8964 commit 01fa60e
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 39 deletions.
44 changes: 44 additions & 0 deletions ingest/build-configs/ncbi/bin/dedup-by-sample-id
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
Deduplicate records by the sample id of the `strain` field.
For example, the following strain names have duplicate sample id 24-005334-001
- A/chicken/Ohio/24-005334-001/2024
- A/Chicken/USA/24-005334-001-original/2024
Only keeps the first record of duplicates and keeps record if the sample id
cannot be parsed from `strain`.
"""
import json
import re
from sys import stderr, stdin, stdout


SAMPLE_ID_REGEX = r"^A\/[^\/]*\/[^\/]*\/(\d{2}-\d{6}-\d{3})[^\/]*\/\d{4}$"
STRAIN_FIELD = "strain"


if __name__ == "__main__":
seen_sample_ids = set()

for index, record in enumerate(stdin):
record = json.loads(record).copy()

strain = record[STRAIN_FIELD]
sample_id_matches = re.match(SAMPLE_ID_REGEX, strain)
if sample_id_matches is not None:
sample_id = sample_id_matches.group(1)
if sample_id in seen_sample_ids:
print(
f"Dropping record {index!r} because it has a duplicate ",
f"sample ID {sample_id!r} in the strain name {strain!r}",
file=stderr
)
continue

seen_sample_ids.add(sample_id)

# Not every strain name has a matching sample id
# Just keep records that don't match
json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
print()
3 changes: 2 additions & 1 deletion ingest/build-configs/ncbi/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ rule split_curated_ndjson_by_segment:
benchmark:
"ncbi/benchmarks/{segment}/split_curated_ndjson_by_segment.txt"
shell:
"""
r"""
(cat {input.curated_ndjson} \
| ./build-configs/ncbi/bin/filter-ndjson-by-segment \
--segment {wildcards.segment} \
| ./build-configs/ncbi/bin/dedup-by-strain \
| ./build-configs/ncbi/bin/dedup-by-sample-id \
| augur curate passthru \
--output-metadata {output.metadata} \
--output-fasta {output.sequences} \
Expand Down
7 changes: 4 additions & 3 deletions ingest/build-configs/ncbi/rules/ingest_andersen_lab.smk
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,12 @@ rule curate_metadata:
expected_date_formats=['%Y-%m-%d', '%Y', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%SZ'],
annotations_id=config["curate"]["annotations_id"],
shell:
"""
augur curate normalize-strings \
r"""
(augur curate normalize-strings \
--metadata {input.metadata} \
| ./build-configs/ncbi/bin/curate-andersen-lab-data \
| ./build-configs/ncbi/bin/dedup-by-strain \
| ./build-configs/ncbi/bin/dedup-by-sample-id \
| augur curate format-dates \
--date-fields {params.date_fields:q} \
--expected-date-formats {params.expected_date_formats:q} \
Expand All @@ -143,7 +144,7 @@ rule curate_metadata:
--annotations {input.annotations} \
--id-field {params.annotations_id} \
| augur curate passthru \
--output-metadata {output.metadata} 2>> {log}
--output-metadata {output.metadata}) 2>> {log}
"""

rule match_metadata_and_segment_fasta:
Expand Down
77 changes: 42 additions & 35 deletions ingest/build-configs/ncbi/rules/join_ncbi_andersen.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,66 +26,73 @@ rule select_missing_metadata:
"""


rule select_missing_strain_names:
rule append_missing_metadata_to_ncbi:
input:
ncbi_metadata = "ncbi/results/metadata.tsv",
missing_metadata = "joined-ncbi/data/missing_metadata.tsv",
output:
missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt",
joined_metadata = "joined-ncbi/data/all_metadata.tsv",
params:
sequence_id_column = config["curate"]["output_id_field"],
source_column_name = config["join_ncbi_andersen"]["source_column_name"],
ncbi_source = config["join_ncbi_andersen"]["ncbi_source"],
andersen_source = config["join_ncbi_andersen"]["andersen_source"],
dedup_column = config["join_ncbi_andersen"]["dedup_column"],
shell:
"""
tsv-select -H -f {params.sequence_id_column} \
{input.missing_metadata} \
> {output.missing_sequence_ids}
tsv-append \
--source-header {params.source_column_name} \
--file {params.ncbi_source}={input.ncbi_metadata} \
--file {params.andersen_source}={input.missing_metadata} \
| tsv-uniq -H -f {params.dedup_column} \
> {output.joined_metadata}
"""


rule select_missing_sequences:
rule dedup_metadata_by_sample_id:
input:
missing_sequence_ids = "joined-ncbi/data/missing_sequence_ids.txt",
andersen_sequences = "andersen-lab/results/sequences_{segment}.fasta",
all_joined_metadata="joined-ncbi/data/all_metadata.tsv",
output:
missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta",
joined_metadata="joined-ncbi/results/metadata.tsv",
params:
id_field=config["curate"]["output_id_field"],
log:
"logs/joined-ncbi/dedup_metadata_by_sample_id.txt",
shell:
"""
seqkit grep -f {input.missing_sequence_ids} \
{input.andersen_sequences} \
> {output.missing_sequences}
r"""
(augur curate passthru \
--id-column {params.id_field:q} \
--metadata {input.all_joined_metadata:q} \
| ./build-configs/ncbi/bin/dedup-by-sample-id \
| augur curate passthru \
--output-metadata {output.joined_metadata}) 2>> {log}
"""


rule append_missing_metadata_to_ncbi:
rule select_final_strain_names:
input:
ncbi_metadata = "ncbi/results/metadata.tsv",
missing_metadata = "joined-ncbi/data/missing_metadata.tsv",
joined_metadata="joined-ncbi/results/metadata.tsv",
output:
joined_metadata = "joined-ncbi/results/metadata.tsv",
strain_names = "joined-ncbi/data/final_strain_names.txt",
params:
source_column_name = config["join_ncbi_andersen"]["source_column_name"],
ncbi_source = config["join_ncbi_andersen"]["ncbi_source"],
andersen_source = config["join_ncbi_andersen"]["andersen_source"],
dedup_column = config["join_ncbi_andersen"]["dedup_column"],
sequence_id_column = config["curate"]["output_id_field"],
shell:
"""
tsv-append \
--source-header {params.source_column_name} \
--file {params.ncbi_source}={input.ncbi_metadata} \
--file {params.andersen_source}={input.missing_metadata} \
| tsv-uniq -H -f {params.dedup_column} \
> {output.joined_metadata}
r"""
tsv-select -H -f {params.sequence_id_column} \
{input.joined_metadata} \
> {output.strain_names}
"""


rule append_missing_sequences_to_ncbi:
rule select_final_sequences:
input:
strain_names = "joined-ncbi/data/final_strain_names.txt",
ncbi_sequences = "ncbi/results/sequences_{segment}.fasta",
missing_sequences = "joined-ncbi/data/missing_sequences_{segment}.fasta",
andersen_sequences = "andersen-lab/results/sequences_{segment}.fasta",
output:
joined_sequences = "joined-ncbi/results/sequences_{segment}.fasta",
shell:
"""
cat {input.ncbi_sequences} {input.missing_sequences} \
| seqkit rmdup \
> {output.joined_sequences}
r"""
cat {input.ncbi_sequences} {input.andersen_sequences} \
| seqkit grep -f {input.strain_names} \
> {output.joined_sequences}
"""

0 comments on commit 01fa60e

Please sign in to comment.