diff --git a/Makefile b/Makefile
index ad1a6750..920b17c1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
 all: clean-test test
 
+flakes:
+	flake8 --ignore=E501 genome_grist/ tests/
+
+black:
+	black .
+
 clean-test:
 	rm -fr outputs.test/
 
@@ -15,19 +21,34 @@ test:
 	genome-grist run tests/test-data/SRR5950647.conf count_trimmed_reads -j 8 -p
 	genome-grist run tests/test-data/SRR5950647.conf summarize_sample_info -j 8 -p
 
-test-private:
+### private genomes test stuff
+
+test-private: outputs.private/abundtrim/podar.abundtrim.fq.gz \
+	databases/podar-ref.zip databases/podar-ref.info.csv \
+	databases/podar-ref.tax.csv
+	genome-grist run conf-private.yml summarize_gather summarize_mapping summarize_tax -j 4 -p
+
+# download the (subsampled) reads for SRR606249
+outputs.private/abundtrim/podar.abundtrim.fq.gz:
 	mkdir -p outputs.private/abundtrim
-	# download the reads
-	curl -L https://osf.io/ckbq3/download -o outputs.private2/abundtrim/podar.abundtrim.fq.gz
+	curl -L https://osf.io/ckbq3/download -o outputs.private/abundtrim/podar.abundtrim.fq.gz
 
-	# download the ref genomes
+# download the ref genomes
+databases/podar-ref:
 	mkdir -p databases/podar-ref
 	curl -L https://osf.io/vbhy5/download -o databases/podar-ref.tar.gz
 	cd databases/podar-ref/ && tar xzf ../podar-ref.tar.gz
 
+# sketch the ref genomes
+databases/podar-ref.zip: databases/podar-ref/
+	sourmash sketch dna -p k=31,scaled=1000 --name-from-first \
+	    databases/podar-ref/*.fa -o databases/podar-ref.zip
 
-flakes:
-	flake8 --ignore=E501 genome_grist/ tests/
+# download taxonomy
+databases/podar-ref.tax.csv:
+	curl -L https://osf.io/4yhjw/download -o databases/podar-ref.tax.csv
 
-black:
-	black .
+# create info file and genomes directory:
+databases/podar-ref.info.csv:
+	python -m genome_grist.copy_private_genomes databases/podar-ref/*.fa -o databases/podar-ref.info.csv -d databases/podar-ref.d
+	python -m genome_grist.make_info_file databases/podar-ref.info.csv
diff --git a/conf-private.yml b/conf-private.yml
new file mode 100644
index 00000000..e6e45daa
--- /dev/null
+++ b/conf-private.yml
@@ -0,0 +1,13 @@
+sample:
+- podar
+
+outdir: outputs.private/
+
+private_databases:
+- databases/podar-ref.zip
+
+private_databases_info:
+- databases/podar-ref.info.csv
+
+taxonomies:
+- databases/podar-ref.tax.csv
diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile
index bae4a56a..862b0310 100755
--- a/genome_grist/conf/Snakefile
+++ b/genome_grist/conf/Snakefile
@@ -242,16 +242,17 @@ class ListGatherGenomes(Checkpoint_GatherResults):
         # get list of private genomes first...
         private_info = {}
         for filename in private_databases_info:
-            csv_dirpath = os.path.dirname(filename)
             for row in load_csv(filename):
                 acc = row['acc']
-                # adjust filenames to be relative to CSV location
-
-                newpath = os.path.join(csv_dirpath, row['info_filename'])
-                row['info_filename'] = os.path.normpath(newpath)
+                genome_dir = os.path.dirname(row['genome_filename'])
+                row['genome_filename'] = os.path.normpath(row['genome_filename'])
 
-                newpath = os.path.join(csv_dirpath, row['filename'])
-                row['filename'] = os.path.normpath(newpath)
+                genome_dir = os.path.dirname(row['genome_filename'])
+                info_filename = f'{acc}.info.csv'
+                info_filename = os.path.join(genome_dir, info_filename)
+
+                row['info_filename'] = os.path.normpath(info_filename)
 
                 private_info[acc] = row
 
@@ -277,7 +278,7 @@ class ListGatherGenomes(Checkpoint_GatherResults):
             # if in private information, use that as genome source.
             if acc in private_info:
                 info = private_info[acc]
-                genome_filenames.append(info['filename'])
+                genome_filenames.append(info['genome_filename'])
                 genome_filenames.append(info['info_filename'])
 
             # genbank: point at genbank_genomes
diff --git a/genome_grist/copy_private_genomes.py b/genome_grist/copy_private_genomes.py
new file mode 100644
index 00000000..995f3224
--- /dev/null
+++ b/genome_grist/copy_private_genomes.py
@@ -0,0 +1,64 @@
+#! /usr/bin/env python
+"""
+Copy private genomes into a new directory, properly named; create a summary
+CSV for genome-grist.
+"""
+import sys
+import argparse
+import screed
+import csv
+import os
+import shutil
+
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument('genome_files', nargs='+')
+    p.add_argument('-o', '--output-csv', required=True)
+    p.add_argument('-d', '--output-directory', required=True)
+    args = p.parse_args()
+
+    output_fp = open(args.output_csv, 'wt')
+    w = csv.DictWriter(output_fp, fieldnames=['acc',
+                                              'ncbi_tax_name',
+                                              'genome_filename'])
+    w.writeheader()
+
+    try:
+        os.mkdir(args.output_directory)
+        print(f"Created genome directory '{args.output_directory}'")
+    except FileExistsError:
+        print(f"Genome directory '{args.output_directory}' already exists.")
+
+    print(f"Copying genomes into '{args.output_directory}'")
+
+    n = 0
+    for filename in args.genome_files:
+        print(f"---")
+        print(f"processing genome '{filename}'")
+
+        for record in screed.open(filename):
+            record_name = record.name
+            break
+
+        record_name = record_name.split(' ', 1)
+        acc, remainder = record_name
+
+        print(f"read identifier '{acc}' and name '{remainder}'")
+
+        destfile = os.path.join(args.output_directory, f"{acc}_genomic.fna.gz")
+        print(f"copying '{filename}' to '{destfile}'")
+        shutil.copyfile(filename, destfile)
+
+        w.writerow(dict(acc=acc, ncbi_tax_name=remainder,
+                        genome_filename=destfile))
+        n += 1
+
+    output_fp.close()
+    print('---')
+    print(f"wrote {n} genome entries to '{args.output_csv}'")
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/genome_grist/make_info_file.py b/genome_grist/make_info_file.py
new file mode 100644
index 00000000..f6ef3204
--- /dev/null
+++ b/genome_grist/make_info_file.py
@@ -0,0 +1,49 @@
+#! /usr/bin/env python
+"""
+Scan a list of genome files and create individual "info file" CSVs
+for genome-grist to use for private genomes.
+"""
+import sys
+import argparse
+import screed
+import csv
+import os
+import shutil
+
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument('info_csv')
+    args = p.parse_args()
+
+    info_d = {}
+    with open(args.info_csv, 'r', newline="") as fp:
+        r = csv.DictReader(fp)
+        for row in r:
+            acc = row['acc']
+            info_d[acc] = row
+
+    print(f"loaded {len(info_d)} info files from '{args.info_csv}'")
+
+    n = 0
+    for acc, item_d in info_d.items():
+        # write .info.csv.
+        dirname = os.path.dirname(item_d['genome_filename'])
+        info_filename = os.path.join(dirname, f"{acc}.info.csv")
+        name = item_d['ncbi_tax_name']
+
+        with open(info_filename, 'wt') as fp:
+            w2 = csv.DictWriter(fp, fieldnames=['acc',
+                                                'ncbi_tax_name'])
+            w2.writeheader()
+            w2.writerow(dict(acc=acc, ncbi_tax_name=name))
+        print(f"Created info CSV '{info_filename}'")
+
+        n += 1
+
+    print(f"wrote {n} info files.")
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())