Skip to content

Commit

Permalink
update all the things
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jan 11, 2022
1 parent ff04270 commit 80fa463
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 15 deletions.
37 changes: 29 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
all: clean-test test

flakes:
flake8 --ignore=E501 genome_grist/ tests/

black:
black .

clean-test:
rm -fr outputs.test/

Expand All @@ -15,19 +21,34 @@ test:
genome-grist run tests/test-data/SRR5950647.conf count_trimmed_reads -j 8 -p
genome-grist run tests/test-data/SRR5950647.conf summarize_sample_info -j 8 -p

test-private:
### private genomes test stuff

test-private: outputs.private/abundtrim/podar.abundtrim.fq.gz \
databases/podar-ref.zip databases/podar-ref.info.csv \
databases/podar-ref.tax.csv
genome-grist run conf-private.yml summarize_gather summarize_mapping summarize_tax -j 4 -p

# download the (subsampled) reads for SRR606249
outputs.private/abundtrim/podar.abundtrim.fq.gz:
mkdir -p outputs.private/abundtrim
# download the reads
curl -L https://osf.io/ckbq3/download -o outputs.private2/abundtrim/podar.abundtrim.fq.gz
curl -L https://osf.io/ckbq3/download -o outputs.private/abundtrim/podar.abundtrim.fq.gz

# download the ref genomes
# download the ref genomes
databases/podar-ref:
mkdir -p databases/podar-ref
curl -L https://osf.io/vbhy5/download -o databases/podar-ref.tar.gz
cd databases/podar-ref/ && tar xzf ../podar-ref.tar.gz

# sketch the ref genomes
databases/podar-ref.zip: databases/podar-ref/
sourmash sketch dna -p k=31,scaled=1000 --name-from-first \
databases/podar-ref/*.fa -o databases/podar-ref.zip

flakes:
flake8 --ignore=E501 genome_grist/ tests/
# download taxonomy
databases/podar-ref.tax.csv:
curl -L https://osf.io/4yhjw/download -o databases/podar-ref.tax.csv

black:
black .
# create info file and genomes directory:
databases/podar-ref.info.csv:
python -m genome_grist.copy_private_genomes databases/podar-ref/*.fa -o databases/podar-ref.info.csv -d databases/podar-ref.d
python -m genome_grist.make_info_file databases/podar-ref.info.csv
13 changes: 13 additions & 0 deletions conf-private.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
sample:
- podar

outdir: outputs.private/

private_databases:
- databases/podar-ref.zip

private_databases_info:
- databases/podar-ref.info.csv

taxonomies:
- databases/podar-ref.tax.csv
15 changes: 8 additions & 7 deletions genome_grist/conf/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -242,16 +242,17 @@ class ListGatherGenomes(Checkpoint_GatherResults):
# get list of private genomes first...
private_info = {}
for filename in private_databases_info:
csv_dirpath = os.path.dirname(filename)
for row in load_csv(filename):
acc = row['acc']

# adjust filenames to be relative to CSV location -
newpath = os.path.join(csv_dirpath, row['info_filename'])
row['info_filename'] = os.path.normpath(newpath)
genome_dir = os.path.dirname(row['genome_filename'])
row['genome_filename'] = os.path.normpath(row['genome_filename'])

newpath = os.path.join(csv_dirpath, row['filename'])
row['filename'] = os.path.normpath(newpath)
genome_dir = os.path.dirname(row['genome_filename'])
info_filename = f'{acc}.info.csv'
info_filename = os.path.join(genome_dir, info_filename)

row['info_filename'] = os.path.normpath(info_filename)

private_info[acc] = row

Expand All @@ -277,7 +278,7 @@ class ListGatherGenomes(Checkpoint_GatherResults):
# if in private information, use that as genome source.
if acc in private_info:
info = private_info[acc]
genome_filenames.append(info['filename'])
genome_filenames.append(info['genome_filename'])
genome_filenames.append(info['info_filename'])

# genbank: point at genbank_genomes
Expand Down
64 changes: 64 additions & 0 deletions genome_grist/copy_private_genomes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#! /usr/bin/env python
"""
Copy private genomes into a new directory, properly named; create a summary
CSV for genome-grist.
"""
import sys
import argparse
import screed
import csv
import os
import shutil


def main():
p = argparse.ArgumentParser()
p.add_argument('genome_files', nargs='+')
p.add_argument('-o', '--output-csv', required=True)
p.add_argument('-d', '--output-directory', required=True)
args = p.parse_args()

output_fp = open(args.output_csv, 'wt')
w = csv.DictWriter(output_fp, fieldnames=['acc',
'ncbi_tax_name',
'genome_filename'])
w.writeheader()

try:
os.mkdir(args.output_directory)
print(f"Created genome directory '{args.output_directory}'")
except FileExistsError:
print(f"Genome directory '{args.output_directory}' already exists.")

print(f"Copying genomes into '{args.output_directory}'")

n = 0
for filename in args.genome_files:
print(f"---")
print(f"processing genome '{filename}'")

for record in screed.open(filename):
record_name = record.name
break

record_name = record_name.split(' ', 1)
acc, remainder = record_name

print(f"read identifer '{acc}' and name '{remainder}'")

destfile = os.path.join(args.output_directory, f"{acc}_genomic.fna.gz")
print(f"copying '{filename}' to '{destfile}'")
shutil.copyfile(filename, destfile)

w.writerow(dict(acc=acc, ncbi_tax_name=remainder,
genome_filename=destfile))
n += 1

output_fp.close()
print('---')
print(f"wrote {n} genome entries to '{args.output_csv}'")

return 0

if __name__ == '__main__':
sys.exit(main())
49 changes: 49 additions & 0 deletions genome_grist/make_info_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#! /usr/bin/env python
"""
Scan a list of genome files and create individual "info file" CSVs
for genome-grist to use for private genomes.
"""
import sys
import argparse
import screed
import csv
import os
import shutil


def main():
p = argparse.ArgumentParser()
p.add_argument('info_csv')
args = p.parse_args()

info_d = {}
with open(args.info_csv, 'r', newline="") as fp:
r = csv.DictReader(fp)
for row in r:
acc = row['acc']
info_d[acc] = row

print(f"loaded {len(info_d)} info files from '{args.info_csv}'")

n = 0
for acc, item_d in info_d.items():
# write .info.csv.
dirname = os.path.dirname(item_d['genome_filename'])
info_filename = os.path.join(dirname, f"{acc}.info.csv")
name = item_d['ncbi_tax_name']

with open(info_filename, 'wt') as fp:
w2 = csv.DictWriter(fp, fieldnames=['acc',
'ncbi_tax_name'])
w2.writeheader()
w2.writerow(dict(acc=acc, ncbi_tax_name=name))
print(f"Created info CSV '{info_filename}'")

n += 1

print(f"wrote {n} info files.")

return 0

if __name__ == '__main__':
sys.exit(main())

0 comments on commit 80fa463

Please sign in to comment.