diff --git a/cravat/oc.py b/cravat/oc.py index 4cdfdda3..18244397 100644 --- a/cravat/oc.py +++ b/cravat/oc.py @@ -6,7 +6,7 @@ from cravat.cravat_report import parser as report_parser from cravat.vcfanno import vcfanno import sys -from pathlib import Path +import pathlib root_p = argparse.ArgumentParser( description="Open-CRAVAT genomic variant interpreter. https://github.com/KarchinLab/open-cravat" @@ -240,9 +240,16 @@ type = int, help = 'Number of CPU threads to use') vcfanno_p.add_argument('--temp-dir', - type = Path, - default = Path('temp-vcfanno'), + type = pathlib.Path, + default = pathlib.Path('temp-vcfanno'), help = 'Temporary directory for working files') +vcfanno_p.add_argument('-o','--output-path', + type = pathlib.Path, + help = 'Output vcf path (gzipped). Defaults to input_path.oc.vcf.gz') +vcfanno_p.add_argument('--chunk-size', + type = int, + default = 10**4, + help = 'Number of lines to annotate in each thread before syncing to disk. Affects performance.') vcfanno_p.set_defaults(func=vcfanno) def main(): diff --git a/cravat/vcfanno.py b/cravat/vcfanno.py index 3568985c..b1e58567 100644 --- a/cravat/vcfanno.py +++ b/cravat/vcfanno.py @@ -397,7 +397,10 @@ def process(self): def vcfanno(args): input_path = pathlib.Path(args.input_path) - output_path = pathlib.Path(str(input_path)+'.oc.vcf.gz') + if args.output_path is not None: + output_path = args.output_path + else: + output_path = pathlib.Path(str(input_path)+'.oc.vcf.gz') handler = logging.StreamHandler(sys.stdout) handler.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') @@ -413,7 +416,8 @@ def vcfanno(args): output_path = str(output_path), temp_dir = args.temp_dir, processors = args.threads if args.threads else mp.cpu_count(), - chunk_size=10**4, - chunk_log_frequency=50, - annotators=args.annotators) + chunk_size= args.chunk_size, + chunk_log_frequency = 50, + annotators = args.annotators, + ) anno.process()