[MRG] Change column names in intermediate CSVs. (#133)
* change column names

* remove old notebooks

* fix mistake
ctb authored Jan 11, 2022
1 parent 80fa463 commit 79626e5
Showing 11 changed files with 80 additions and 2,818 deletions.
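In short, the intermediate CSVs switch from acc/ncbi_tax_name columns to ident/display_name, and the code that reads or writes them is updated to match. A minimal sketch of the new access pattern (the file path and values below are hypothetical, not taken from this repository):

    import csv

    # hypothetical per-sample info CSV written by the pipeline
    with open("outputs/gather/SAMPLE.genomes.info.csv", "rt") as fp:
        for row in csv.DictReader(fp):
            ident = row["ident"]                # previously row["acc"]
            display_name = row["display_name"]  # previously row["ncbi_tax_name"]
            print(ident, display_name)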
806 changes: 0 additions & 806 deletions gather-p8808mo11.ipynb

This file was deleted.

805 changes: 0 additions & 805 deletions gather-p8808mo9.ipynb

This file was deleted.

806 changes: 0 additions & 806 deletions gather-podar.ipynb

This file was deleted.

253 changes: 0 additions & 253 deletions gathergram.ipynb

This file was deleted.

68 changes: 0 additions & 68 deletions generate-private-info.py

This file was deleted.

106 changes: 53 additions & 53 deletions genome_grist/conf/Snakefile
@@ -173,8 +173,8 @@ checkpoint gather_reads_wc:


class Checkpoint_GatherResults:
"""Given a pattern containing {acc} and {sample}, this class
will generate the list of {acc} for that {sample}'s gather results.
"""Given a pattern containing {ident} and {sample}, this class
will generate the list of {ident} for that {sample}'s gather results.
Alternatively, you can omit {sample} from the pattern and include the
list of sample names in the second argument to the constructor.
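To make the docstring concrete, here is how the two calling styles appear in rule inputs elsewhere in this Snakefile (patterns taken from hunks later in this diff; outdir and SAMPLES are defined by the workflow):

    # pattern contains both {ident} and {sample}
    Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{ident}}.consensus.fa.gz")

    # pattern contains only {ident}; sample names passed explicitly
    Checkpoint_GatherResults("genbank_genomes/{ident}_genomic.fna.gz",
                             samples=SAMPLES)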
@@ -183,20 +183,20 @@ class Checkpoint_GatherResults:
self.pattern = pattern
self.samples = samples

- def get_genome_accs(self, sample):
+ def get_genome_idents(self, sample):
gather_csv = f'{outdir}/gather/{sample}.gather.csv'
assert os.path.exists(gather_csv), "gather output does not exist!?"

- genome_accs = []
+ genome_idents = []
with open(gather_csv, 'rt') as fp:
r = csv.DictReader(fp)
for row in r:
- acc = row['name'].split(' ')[0]
- genome_accs.append(acc)
- print(f'loaded {len(genome_accs)} accessions from {gather_csv}.',
+ ident = row['name'].split(' ')[0]
+ genome_idents.append(ident)
+ print(f'loaded {len(genome_idents)} identifiers from {gather_csv}.',
file=sys.stderr)

- return genome_accs
+ return genome_idents

def __call__(self, w):
# get 'sample' from wildcards?
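As get_genome_idents shows, an identifier is simply the first whitespace-separated token of the gather CSV's 'name' column; roughly (accession and organism name chosen for illustration only):

    row = {"name": "GCF_000005845.2 Escherichia coli str. K-12"}
    ident = row["name"].split(" ")[0]   # -> "GCF_000005845.2"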
@@ -221,9 +221,9 @@ class Checkpoint_GatherResults:
checkpoints.gather_reads_wc.get(**w)

# parse hitlist_genomes,
- genome_accs = self.get_genome_accs(w.sample)
+ genome_idents = self.get_genome_idents(w.sample)

- p = expand(self.pattern, acc=genome_accs, **w)
+ p = expand(self.pattern, ident=genome_idents, **w)

return p

@@ -243,18 +243,18 @@ class ListGatherGenomes(Checkpoint_GatherResults):
private_info = {}
for filename in private_databases_info:
for row in load_csv(filename):
- acc = row['acc']
+ ident = row['ident']

genome_dir = os.path.dirname(row['genome_filename'])
row['genome_filename'] = os.path.normpath(row['genome_filename'])

genome_dir = os.path.dirname(row['genome_filename'])
- info_filename = f'{acc}.info.csv'
+ info_filename = f'{ident}.info.csv'
info_filename = os.path.join(genome_dir, info_filename)

row['info_filename'] = os.path.normpath(info_filename)

- private_info[acc] = row
+ private_info[ident] = row

print(f"Loaded info on {len(private_info)} private genomes.")
return private_info
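For reference, the private-database info CSVs consumed here are expected to carry the renamed columns as well; a sketch of how one row is turned into an info_filename (all names and paths below are hypothetical):

    import os

    row = {"ident": "PRIV_001",
           "display_name": "example private organism",
           "genome_filename": "private/PRIV_001_genomic.fna.gz"}

    genome_dir = os.path.dirname(row["genome_filename"])          # "private"
    info_filename = os.path.join(genome_dir, f"{row['ident']}.info.csv")
    # -> "private/PRIV_001.info.csv"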
@@ -273,18 +273,18 @@ class ListGatherGenomes(Checkpoint_GatherResults):
assert os.path.exists(gather_csv), f"gather output does not exist for {sample}!?"

for row in load_csv(gather_csv):
- acc = row['name'].split(' ')[0]
+ ident = row['name'].split(' ')[0]

# if in private information, use that as genome source.
- if acc in private_info:
- info = private_info[acc]
+ if ident in private_info:
+ info = private_info[ident]
genome_filenames.append(info['genome_filename'])
genome_filenames.append(info['info_filename'])

# genbank: point at genbank_genomes
else:
- genome_filenames.append(f'genbank_genomes/{acc}_genomic.fna.gz')
- genome_filenames.append(f'genbank_genomes/{acc}.info.csv')
+ genome_filenames.append(f'genbank_genomes/{ident}_genomic.fna.gz')
+ genome_filenames.append(f'genbank_genomes/{ident}.info.csv')

return genome_filenames

@@ -293,12 +293,12 @@ class Checkpoint_GenomeFiles(Checkpoint_GatherResults):
checkpoints.copy_sample_genomes_to_output_wc.get(**w)

# parse hitlist_genomes,
- genome_accs = self.get_genome_accs(w.sample)
+ genome_idents = self.get_genome_idents(w.sample)

- if 'acc' in dict(w):
+ if 'ident' in dict(w):
p = expand(self.pattern, **w)
else:
- p = expand(self.pattern, acc=genome_accs, **w)
+ p = expand(self.pattern, ident=genome_idents, **w)

return p

@@ -326,7 +326,7 @@ rule combine_genome_info:
@toplevel
rule download_genbank_genomes:
input:
Checkpoint_GatherResults("genbank_genomes/{acc}_genomic.fna.gz",
Checkpoint_GatherResults("genbank_genomes/{ident}_genomic.fna.gz",
samples=SAMPLES)

@toplevel
@@ -343,8 +343,8 @@ rule map_reads:
@toplevel
rule build_consensus:
input:
Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{acc}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/leftover/{{sample}}.x.{{acc}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{ident}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/leftover/{{sample}}.x.{{ident}}.consensus.fa.gz"),

@toplevel
rule summarize_mapping:
@@ -556,10 +556,10 @@ rule count_trimmed_reads_wc:
# map abundtrim reads and produce a bam
rule minimap_wc:
input:
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
metagenome = outdir + "/abundtrim/{sample}.abundtrim.fq.gz",
output:
- bam = outdir + "/mapping/{sample}.x.{acc}.bam",
+ bam = outdir + "/mapping/{sample}.x.{ident}.bam",
conda: "env/minimap2.yml"
threads: 4
shell: """
@@ -604,12 +604,12 @@ rule bam_covered_regions_wc:
# calculating SNPs/etc.
rule mpileup_wc:
input:
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
- bam = outdir + "/{dir}/{sample}.x.{acc}.bam",
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
+ bam = outdir + "/{dir}/{sample}.x.{ident}.bam",
output:
- bcf = outdir + "/{dir}/{sample}.x.{acc}.bcf",
- vcf = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz",
- vcfi = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz.csi",
+ bcf = outdir + "/{dir}/{sample}.x.{ident}.bcf",
+ vcf = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz",
+ vcfi = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz.csi",
conda: "env/bcftools.yml"
shell: """
genomefile=$(mktemp -t grist.genome.XXXXXXX)
@@ -623,13 +623,13 @@ rule mpileup_wc:
# build new consensus
rule build_new_consensus_wc:
input:
- vcf = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz",
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
- regions = outdir + "/{dir}/{sample}.x.{acc}.regions.bed",
+ vcf = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz",
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
+ regions = outdir + "/{dir}/{sample}.x.{ident}.regions.bed",
output:
- mask = outdir + "/{dir}/{sample}.x.{acc}.mask.bed",
- genomefile = outdir + "/{dir}/{sample}.x.{acc}.fna.gz.sizes",
- consensus = outdir + "/{dir}/{sample}.x.{acc}.consensus.fa.gz",
+ mask = outdir + "/{dir}/{sample}.x.{ident}.mask.bed",
+ genomefile = outdir + "/{dir}/{sample}.x.{ident}.fna.gz.sizes",
+ consensus = outdir + "/{dir}/{sample}.x.{ident}.consensus.fa.gz",
conda: "env/bcftools.yml"
shell: """
genomefile=$(mktemp -t grist.genome.XXXXXXX)
@@ -645,7 +645,7 @@ rule build_new_consensus_wc:
# summarize depth into a CSV
rule summarize_samtools_depth_wc:
input:
Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{acc}}.depth.txt")
Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt")
output:
csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv"
shell: """
@@ -760,7 +760,7 @@ rule make_mapping_notebook_wc:
rule extract_leftover_reads_wc:
input:
csv = f'{outdir}/gather/{{sample}}.gather.csv',
- mapped = Checkpoint_GatherResults(f"{outdir}/mapping/{{sample}}.x.{{acc}}.mapped.fq.gz"),
+ mapped = Checkpoint_GatherResults(f"{outdir}/mapping/{{sample}}.x.{{ident}}.mapped.fq.gz"),
output:
touch(f"{outdir}/leftover/.leftover.{{sample}}")
conda: "env/sourmash.yml"
@@ -775,15 +775,15 @@ rule extract_leftover_reads_wc:
rule map_leftover_reads_wc:
input:
all_csv = f"{outdir}/mapping/{{sample}}.summary.csv",
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
leftover_reads_flag = f"{outdir}/leftover/.leftover.{{sample}}",
output:
- bam=outdir + "/leftover/{sample}.x.{acc}.bam",
+ bam=outdir + "/leftover/{sample}.x.{ident}.bam",
conda: "env/minimap2.yml"
threads: 4
shell: """
minimap2 -ax sr -t {threads} {input.query} \
- {outdir}/mapping/{wildcards.sample}.x.{wildcards.acc}.leftover.fq.gz | \
+ {outdir}/mapping/{wildcards.sample}.x.{wildcards.ident}.leftover.fq.gz | \
samtools view -b -F 4 - | samtools sort - > {output.bam}
"""

@@ -878,9 +878,9 @@ rule summarize_tax_wc:
# download genbank genome details; make an info.csv file for entry.
rule make_genbank_info_csv:
output:
- csvfile = 'genbank_genomes/{acc}.info.csv'
+ csvfile = 'genbank_genomes/{ident}.info.csv'
shell: """
- python -Werror -m genome_grist.genbank_genomes {wildcards.acc} \
+ python -Werror -m genome_grist.genbank_genomes {wildcards.ident} \
--output {output.csvfile}
"""

@@ -898,33 +898,33 @@ checkpoint copy_sample_genomes_to_output_wc:
# combined info.csv per sample
rule make_combined_info_csv_wc:
input:
- csvs = Checkpoint_GenomeFiles(f'{outdir}/genomes/{{acc}}.info.csv'),
+ csvs = Checkpoint_GenomeFiles(f'{outdir}/genomes/{{ident}}.info.csv'),
output:
genomes_info_csv = f"{outdir}/gather/{{sample}}.genomes.info.csv",
shell: """
python -Werror -m genome_grist.combine_csvs \
- --fields acc,ncbi_tax_name \
+ --fields ident,display_name \
{input.csvs} > {output}
"""

# download actual genomes!
rule download_matching_genome_wc:
input:
- csvfile = ancient('genbank_genomes/{acc}.info.csv')
+ csvfile = ancient('genbank_genomes/{ident}.info.csv')
output:
genome = "genbank_genomes/{acc}_genomic.fna.gz"
genome = "genbank_genomes/{ident}_genomic.fna.gz"
run:
with open(input.csvfile, 'rt') as infp:
r = csv.DictReader(infp)
rows = list(r)
assert len(rows) == 1
row = rows[0]
- acc = row['acc']
- assert wildcards.acc.startswith(acc)
+ ident = row['ident']
+ assert wildcards.ident.startswith(ident)
url = row['genome_url']
- name = row['ncbi_tax_name']
+ name = row['display_name']

print(f"downloading genome for acc {acc}/{name} from NCBI...",
print(f"downloading genome for ident {ident}/{name} from NCBI...",
file=sys.stderr)
with open(output.genome, 'wb') as outfp:
with urllib.request.urlopen(url) as response:
@@ -972,7 +972,7 @@ rule summarize_reads_info_wc:
rule create_sgc_conf_wc:
input:
csv = outdir + "/gather/{sample}.gather.csv",
- queries = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz")
+ queries = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz")
output:
conf = outdir + "/sgc/{sample}.conf"
run:
12 changes: 6 additions & 6 deletions genome_grist/copy_private_genomes.py
@@ -19,8 +19,8 @@ def main():
args = p.parse_args()

output_fp = open(args.output_csv, 'wt')
- w = csv.DictWriter(output_fp, fieldnames=['acc',
- 'ncbi_tax_name',
+ w = csv.DictWriter(output_fp, fieldnames=['ident',
+ 'display_name',
'genome_filename'])
w.writeheader()

@@ -42,15 +42,15 @@ def main():
break

record_name = record_name.split(' ', 1)
- acc, remainder = record_name
+ ident, remainder = record_name

print(f"read identifer '{acc}' and name '{remainder}'")
print(f"read identifer '{ident}' and name '{remainder}'")

- destfile = os.path.join(args.output_directory, f"{acc}_genomic.fna.gz")
+ destfile = os.path.join(args.output_directory, f"{ident}_genomic.fna.gz")
print(f"copying '{filename}' to '{destfile}'")
shutil.copyfile(filename, destfile)

- w.writerow(dict(acc=acc, ncbi_tax_name=remainder,
+ w.writerow(dict(ident=ident, display_name=remainder,
genome_filename=destfile))
n += 1
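With these changes, a private genome whose first FASTA record were named, say, "PRIV_001 example private organism" would be copied to {output_directory}/PRIV_001_genomic.fna.gz and recorded with a row equivalent to (values hypothetical):

    dict(ident="PRIV_001", display_name="example private organism",
         genome_filename="outputs/private/PRIV_001_genomic.fna.gz")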
