From 980b5733164931838283b53f02993d0e826d9374 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 1 Oct 2024 11:52:03 -0700 Subject: [PATCH] add docs --- README.md | 37 +++++++++++++++++-- .../sourmash_plugin_directsketch/__init__.py | 4 +- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1f0a783..c77f0b3 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ For reference: To test `gbsketch`, you can download a csv file and run: ``` curl -JLO https://raw.githubusercontent.com/sourmash-bio/sourmash_plugin_directsketch/main/tests/test-data/acc.csv -sourmash scripts gbsketch acc.csv -o test-gbsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 +sourmash scripts gbsketch acc.csv -o test-gbsketch.zip -f out_fastas -k --failed test.failed.csv --checksum-fail test.checksum-failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 ``` To check that the `zip` was created properly, you can run: ``` @@ -99,10 +99,20 @@ summary of sketches: 1 sketches with protein, k=10, scaled=100, abund 5108 total hashes ``` +### Usage Considerations + +If you're building large databases (over 20k files), we highly recommend you use batched zipfiles to facilitate restart. +If you encounter unexpected failures and are using a single zipfile output (default), `gbsketch` will have to re-download and +re-sketch all files. If you instead set the number of accessions per batch using `--batch-size` (e.g. `--batch-size 10000`), then `gbsketch` can load any +batched zips that finished writing, and avoid re-generating those signatures. 
+ + Full Usage: ``` -usage: gbsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] [-g | -m] input_csv +usage: gbsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [--batch-size BATCH_SIZE] [-k] [--download-only] --failed FAILED --checksum-fail CHECKSUM_FAIL [-p PARAM_STRING] [-c CORES] + [-r RETRY_TIMES] [-g | -m] + input_csv download and sketch GenBank assembly datasets @@ -117,9 +127,14 @@ options: output zip file for the signatures -f FASTAS, --fastas FASTAS Write fastas here + --batch-size BATCH_SIZE + Write smaller zipfiles, each containing sigs associated with this number of accessions. This allows gbsketch to recover after unexpected failures, rather than needing to + restart sketching from scratch. Default: write all sigs to single zipfile. -k, --keep-fasta write FASTA files in addition to sketching. Default: do not write FASTA files --download-only just download genomes; do not sketch --failed FAILED csv of failed accessions and download links (should be mostly protein). + --checksum-fail CHECKSUM_FAIL + csv of accessions where the md5sum check failed or the md5sum file was improperly formatted or could not be downloaded -p PARAM_STRING, --param-string PARAM_STRING parameter string for sketching (default: k=31,scaled=1000) -c CORES, --cores CORES @@ -156,9 +171,18 @@ To run the test accession file at `tests/test-data/acc-url.csv`, run: sourmash scripts urlsketch tests/test-data/acc-url.csv -o test-urlsketch.zip -f out_fastas -k --failed test.failed.csv -p dna,k=21,k=31,scaled=1000,abund -p protein,k=10,scaled=100,abund -r 1 ``` +### Usage Considerations + +If you're building large databases (over 20k files), we highly recommend you use batched zipfiles to facilitate restart. +If you encounter unexpected failures and are using a single zipfile output (default), `urlsketch` will have to re-download and +re-sketch all files. 
If you instead set the number of accessions per batch using `--batch-size` (e.g. `--batch-size 10000`), then `urlsketch` can load any +batched zips that finished writing, and avoid re-generating those signatures. + Full Usage: ``` -usage: urlsketch [-h] [-q] [-d] [-o OUTPUT] [-f FASTAS] [-k] [--download-only] [--failed FAILED] [-p PARAM_STRING] [-c CORES] [-r RETRY_TIMES] input_csv +usage: urlsketch [-h] [-q] [-d] [-o OUTPUT] [--batch-size BATCH_SIZE] [-f FASTAS] [-k] [--download-only] --failed FAILED [--checksum-fail CHECKSUM_FAIL] [-p PARAM_STRING] [-c CORES] + [-r RETRY_TIMES] + input_csv download and sketch GenBank assembly datasets @@ -171,12 +195,17 @@ options: -d, --debug provide debugging output -o OUTPUT, --output OUTPUT output zip file for the signatures + --batch-size BATCH_SIZE + Write smaller zipfiles, each containing sigs associated with this number of accessions. This allows urlsketch to recover after unexpected failures, rather than needing to + restart sketching from scratch. Default: write all sigs to single zipfile. -f FASTAS, --fastas FASTAS Write fastas here -k, --keep-fasta, --keep-fastq write FASTA/Q files in addition to sketching. Default: do not write FASTA files --download-only just download genomes; do not sketch - --failed FAILED csv of failed accessions and download links (should be mostly protein). + --failed FAILED csv of failed accessions and download links. + --checksum-fail CHECKSUM_FAIL + csv of accessions where the md5sum check failed. If not provided, md5sum failures will be written to the download failures file (no additional md5sum information). 
-p PARAM_STRING, --param-string PARAM_STRING parameter string for sketching (default: k=31,scaled=1000) -c CORES, --cores CORES diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 1ae647d..acf9542 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -46,7 +46,7 @@ def __init__(self, p): p.add_argument('--batch-size', type=int, default = 0, help='Write smaller zipfiles, each containing sigs associated with this number of accessions. \ This allows gbsketch to recover after unexpected failures, rather than needing to \ - restart sketching from scratch.') + restart sketching from scratch. Default: write all sigs to single zipfile.') p.add_argument('-k', '--keep-fasta', action='store_true', help="write FASTA files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') @@ -121,7 +121,7 @@ def __init__(self, p): p.add_argument('--batch-size', type=int, default = 0, help='Write smaller zipfiles, each containing sigs associated with this number of accessions. \ This allows urlsketch to recover after unexpected failures, rather than needing to \ - restart sketching from scratch.') + restart sketching from scratch. Default: write all sigs to single zipfile.') p.add_argument('-f', '--fastas', help='Write fastas here', default = '.') p.add_argument('-k', '--keep-fasta', '--keep-fastq', action='store_true',