From bbd432002cb9e84038e66f35c10b774c5f2a6e2b Mon Sep 17 00:00:00 2001 From: pfiziev Date: Mon, 16 Apr 2018 19:32:52 -0700 Subject: [PATCH] code refactoring --- ChromTime.py | 256 +++++++++++++++++++++----------------- README.md | 61 +++++---- call_boundary_dynamics.py | 15 ++- utils.py | 1 + 4 files changed, 187 insertions(+), 146 deletions(-) diff --git a/ChromTime.py b/ChromTime.py index 4a57148..181624d 100644 --- a/ChromTime.py +++ b/ChromTime.py @@ -606,112 +606,130 @@ def determine_block_boundaries(aligned_fnames, parser = argparse.ArgumentParser(description='ChromTime: Modeling Spatio-temporal Dynamics of Chromatin Marks') - parser.add_argument('-a', - '--aligned-reads', - dest='aligned_fnames', - nargs='+', - help='Bed files with aligned reads for each time point in the correct order') - - parser.add_argument('-c', - '--control-reads', - dest='control_fnames', - nargs='+', - help='Bed files with aligned reads for control (input) for each time point in the correct order') - - parser.add_argument('-i', - '--input-order-file', - dest='order_fname', - help='A file with paths to files with foreground and control aligned reads ' - '- one line per time point, in the right order.') - - parser.add_argument("-g", "--genome", dest="genome", - help="Genome. One of: [%s] or path to a file with chromosome sizes one per line" - % ', '.join(fname.replace('.txt', '') for fname in os.listdir(GENOMES_DIR))) - - parser.add_argument("-o", "--output-dir", dest="out_dir", - help="Output directory", metavar="DIRECTORY") - - parser.add_argument("-p", "--prefix", dest="prefix", - help="prefix for the output files") - - parser.add_argument("-b", "--bin-size", type=int, dest="bin_size", default=200, - help="Bin size in base pairs (Default: %(default)s)", metavar="INT") - - parser.add_argument("-t", "--threads", type=int, dest="n_threads", default=1, - help="Number of threads to use (Default: %(default)s)", - metavar="INT") - - parser.add_argument('-s', - '--shift', - type=int, - dest='shift', - help='Number of bases to shift each read (Default: %(default)s)', - default=100) - - parser.add_argument('-q', - '--q-value', - type=float, - dest='fdr_for_decoding', - help='False discovery rate (Q-value) for calling peaks at each time point (Default: %(default)s)', - default=0.05) - - parser.add_argument('--q-value-seed', - type=float, - dest='q_value_seed', - help='FDR threshold to call significant bins (Default: %(default)s)', - default=0.05) - - parser.add_argument('--p-value-extend', - type=float, - dest='p_value_extend', - help='FDR threshold to call significant bins (Default: %(default)s)', - default=0.15) - - parser.add_argument('--min-expected-reads', - type=int, - dest='min_expected_reads', - help='Minimum expected reads per bin for the background component (Default: %(default)s)', - default=1) - - parser.add_argument('--min-gap', - type=int, - dest='min_gap', - help='Minimum gap between significant regions before they are joined (Default: %(default)s)', - default=600) - - parser.add_argument("-n", - "--n-training-examples", - type=int, - dest="n_training_examples", - default=10000, - help="Number of training examples to use. (Default: %(default)s)", - metavar="INT") - - parser.add_argument("-m", "--model-file", dest="model_fname", - help="Pickled model file to load a learned the model from", metavar="FILE") - - parser.add_argument("-d", "--data-file", dest="data_fname", - help="Pickled data file to load", metavar="FILE") - - parser.add_argument("--skip-training", action="store_true", dest="skip_training", default=False, - help="Skip EM training (%(default)s)") - - parser.add_argument("--merge-peaks", action="store_true", dest="merge_peaks", default=False, - help="Merge significant peaks across time points instead of splitting them (%(default)s)") - - parser.add_argument("--no-output-signal-files", action="store_true", dest="no_output_signal_files", default=False, - help="Do not output signal files for each time point in wiggle format (%(default)s)") - parser.add_argument("--broad", action="store_true", dest="broad", default=False, - help="Use default settings for broad marks. " - "Equivalent to \"-b 500 --min-gap 1500 --merge-peaks\" (%(default)s)") + g1 = parser.add_argument_group('Input data from command line') + g1.add_argument('-a', + '--aligned-reads', + dest='aligned_fnames', + nargs='+', + help='BED files with aligned reads for each time point in the correct order') + + g1.add_argument('-c', + '--control-reads', + dest='control_fnames', + nargs='+', + help='BED files with aligned reads for control (input) for each time point in the correct order') + + g2 = parser.add_argument_group('Input data from order file') + g2.add_argument('-i', + '--input-order-file', + dest='order_fname', + help='A tab-separated file with paths to files with foreground and control aligned reads ' + '- one line per time point, in the right order.') + + g3 = parser.add_argument_group('Options') + g3.add_argument('-m', + '--mode', + dest='mode', + choices=['punctate', 'narrow', 'broad'], + default='narrow', + help='punctate: equivalent to \"-b 200 --min-gap 600 --min-dynamic-prior 0.05\", ' + 'narrow (default): equivalent to \"-b 200 --min-gap 600 --min-dynamic-prior 0\", ' + 'broad: equivalent to \"-b 500 --min-gap 1500 --merge-peaks\"') + + g3.add_argument("-g", "--genome", dest="genome", + help="Genome. One of: [%s] or path to a file with chromosome sizes one per line" + % ', '.join(fname.replace('.txt', '') for fname in os.listdir(GENOMES_DIR))) + + g3.add_argument("-o", "--output-dir", dest="out_dir", + help="Output directory", metavar="DIRECTORY") + + g3.add_argument("-p", "--prefix", dest="prefix", + help="prefix for the output files") + + g3.add_argument("-b", "--bin-size", type=int, dest="bin_size", default=200, + help="Bin size in base pairs (Default: %(default)s)", metavar="INT") + + g3.add_argument("-t", "--threads", type=int, dest="n_threads", default=1, + help="Number of threads to use (Default: %(default)s)", + metavar="INT") + + g3.add_argument('-s', + '--shift', + type=int, + dest='shift', + help='Number of bases to shift each read (Default: %(default)s)', + default=100) + + g3.add_argument('-q', + '--q-value', + type=float, + dest='fdr_for_decoding', + help='False discovery rate (Q-value) for calling peaks at each time point (Default: %(default)s)', + default=0.05) + + g3.add_argument('--q-value-seed', + type=float, + dest='q_value_seed', + help='FDR threshold to call significant bins (Default: %(default)s)', + default=0.05) + + g3.add_argument('--p-value-extend', + type=float, + dest='p_value_extend', + help='FDR threshold to call significant bins (Default: %(default)s)', + default=0.15) + + g3.add_argument('--min-expected-reads', + type=int, + dest='min_expected_reads', + help='Minimum expected reads per bin for the background component (Default: %(default)s)', + default=1) + + g3.add_argument('--min-gap', + type=int, + dest='min_gap', + help='Minimum gap between significant regions before they are joined (Default: %(default)s)', + default=600) + + + g3.add_argument("--merge-peaks", action="store_true", dest="merge_peaks", default=False, + help="Merge significant peaks across time points instead of splitting them (Default: %(default)s)") + + g3.add_argument("--min-dynamic-prior", + type=float, + dest="min_dynamic_prior", + default=0.0, + help="Minimum prior probability for each dynamic at each time point (Default: %(default)s)") + + g3.add_argument("--model-file", dest="model_fname", + help="Pickled model to load", + metavar="FILE") + + g3.add_argument("--data-file", dest="data_fname", + help="Pickled data to load", + metavar="FILE") + + g3.add_argument("--skip-training", action="store_true", dest="skip_training", default=False, + help="Skip EM training (Default: %(default)s)") + + g3.add_argument("-n", + "--n-training-examples", + type=int, + dest="n_training_examples", + default=10000, + help="Number of training examples to use. (Default: %(default)s)", + metavar="INT") + + g3.add_argument("--output-signal-files", action="store_true", dest="output_signal_files", default=False, + help="Output signal files for each time point in wiggle format (Default: %(default)s)") + + # below are legacy options - parser.add_argument("--atac", action="store_true", dest="atac", default=False, - # help=argparse.SUPPRESS, - help="Use default settings for ATAC-seq marks. " - "Equivalent to \"-b 50 --min-gap 150 -s 5 \" (%(default)s). " - "This option has not been tested extensively, so use with caution." - ) + parser.add_argument("--broad", action="store_true", dest="broad", default=False, + # help="Use default settings for broad marks. " + # "Equivalent to \"-b 500 --min-gap 1500 --merge-peaks\" (%(default)s)", + help=argparse.SUPPRESS) parser.add_argument("--output_empty_blocks", action="store_true", dest="output_empty_blocks", default=False, help=argparse.SUPPRESS) @@ -719,12 +737,6 @@ def determine_block_boundaries(aligned_fnames, parser.add_argument("--keep-fixed-priors", action="store_true", dest="keep_fixed_priors", default=False, help=argparse.SUPPRESS) - parser.add_argument("--min-dynamic-prior", - type=float, - dest="min_dynamic_prior", - default=0.0, - help=argparse.SUPPRESS) - parser.add_argument("--use-broad-window-for-background", action="store_true", dest="use_broad_window_for_background", @@ -738,16 +750,28 @@ def determine_block_boundaries(aligned_fnames, parser.print_help() exit(0) - if args.broad: + if args.broad or args.mode == 'broad': args.bin_size = 500 args.min_gap = 1500 args.merge_peaks = True - elif args.atac: - args.bin_size = 50 - args.min_gap = 150 + elif args.mode == 'punctate': + args.bin_size = 200 + args.min_gap = 600 + args.merge_peaks = False + args.min_dynamic_prior = 0.05 + elif args.mode == 'narrow': + args.bin_size = 200 + args.min_gap = 600 args.merge_peaks = False - args.shift = 5 + args.min_dynamic_prior = 0 + + + # elif args.atac: + # args.bin_size = 50 + # args.min_gap = 150 + # args.merge_peaks = False + # args.shift = 5 bin_size = args.bin_size min_gap = args.min_gap @@ -835,7 +859,7 @@ def determine_block_boundaries(aligned_fnames, min_gap=min_gap / bin_size, out_prefix=out_prefix, chrom_lengths=chrom_lengths, - output_signal_files=not args.no_output_signal_files, + output_signal_files=args.output_signal_files, min_expected_reads=args.min_expected_reads, use_broad_window_for_background=args.use_broad_window_for_background) diff --git a/README.md b/README.md index f32eafe..b7509f0 100644 --- a/README.md +++ b/README.md @@ -16,29 +16,41 @@ After the compilation has finished, you should be able to run ChromTime by typin $ python ChromTime.py usage: ChromTime.py [-h] [-a ALIGNED_FNAMES [ALIGNED_FNAMES ...]] [-c CONTROL_FNAMES [CONTROL_FNAMES ...]] [-i ORDER_FNAME] - [-g GENOME] [-o DIRECTORY] [-p PREFIX] [-b INT] [-t INT] - [-s SHIFT] [-q FDR_FOR_DECODING] - [--q-value-seed Q_VALUE_SEED] + [-m {punctate,narrow,broad}] [-g GENOME] [-o DIRECTORY] + [-p PREFIX] [-b INT] [-t INT] [-s SHIFT] + [-q FDR_FOR_DECODING] [--q-value-seed Q_VALUE_SEED] [--p-value-extend P_VALUE_EXTEND] [--min-expected-reads MIN_EXPECTED_READS] - [--min-gap MIN_GAP] [-n INT] [-m FILE] [-d FILE] - [--skip-training] [--merge-peaks] - [--no-output-signal-files] [--broad] [--atac] + [--min-gap MIN_GAP] [--merge-peaks] + [--min-dynamic-prior MIN_DYNAMIC_PRIOR] + [--model-file FILE] [--data-file FILE] [--skip-training] + [-n INT] [--output-signal-files] ChromTime: Modeling Spatio-temporal Dynamics of Chromatin Marks optional arguments: -h, --help show this help message and exit + + Input data from command line: -a ALIGNED_FNAMES [ALIGNED_FNAMES ...], --aligned-reads ALIGNED_FNAMES [ALIGNED_FNAMES ...] - Bed files with aligned reads for each time point in + BED files with aligned reads for each time point in the correct order -c CONTROL_FNAMES [CONTROL_FNAMES ...], --control-reads CONTROL_FNAMES [CONTROL_FNAMES ...] - Bed files with aligned reads for control (input) for + BED files with aligned reads for control (input) for each time point in the correct order + + Input data from order file: -i ORDER_FNAME, --input-order-file ORDER_FNAME - A file with paths to files with foreground and control - aligned reads - one line per time point, in the right - order. + A tab-separated file with paths to files with + foreground and control aligned reads - one line per + time point, in the right order. + + Options: + -m {punctate,narrow,broad}, --mode {punctate,narrow,broad} + punctate: equivalent to "-b 200 --min-gap 600 --min- + dynamic-prior 0.05", narrow (default): equivalent to + "-b 200 --min-gap 600 --min-dynamic-prior 0", broad: + equivalent to "-b 500 --min-gap 1500 --merge-peaks" -g GENOME, --genome GENOME Genome. One of: [hg18, hg19, mm10, mm9, zv9] or path to a file with chromosome sizes one per line @@ -64,23 +76,20 @@ After the compilation has finished, you should be able to run ChromTime by typin component (Default: 1) --min-gap MIN_GAP Minimum gap between significant regions before they are joined (Default: 600) + --merge-peaks Merge significant peaks across time points instead of + splitting them (Default: False) + --min-dynamic-prior MIN_DYNAMIC_PRIOR + Minimum prior probability for each dynamic at each + time point (Default: 0.0) + --model-file FILE Pickled model to load + --data-file FILE Pickled data to load + --skip-training Skip EM training (Default: False) -n INT, --n-training-examples INT Number of training examples to use. (Default: 10000) - -m FILE, --model-file FILE - Pickled model file to load a learned the model from - -d FILE, --data-file FILE - Pickled data file to load - --skip-training Skip EM training (False) - --merge-peaks Merge significant peaks across time points instead of - splitting them (False) - --no-output-signal-files - Do not output signal files for each time point in - wiggle format (False) - --broad Use default settings for broad marks. Equivalent to - "-b 500 --min-gap 1500 --merge-peaks" (False) - --atac Use default settings for ATAC-seq marks. Equivalent to - "-b 50 --min-gap 150 -s 5 " (False). This option has - not been tested extensively, so use with caution. + --output-signal-files + Output signal files for each time point in wiggle + format (Default: False) + ## Input diff --git a/call_boundary_dynamics.py b/call_boundary_dynamics.py index ae10f92..be7d3de 100644 --- a/call_boundary_dynamics.py +++ b/call_boundary_dynamics.py @@ -169,6 +169,7 @@ def __init__(self, self.n_dynamics = len(self.dynamics) + self.out_prefix = out_prefix self.model_fname = out_prefix + '.model.pickle' self.reset_model() @@ -1217,7 +1218,11 @@ def EM(self, blocks, MIN_DELTA_LOG_LIKELIHOOD=None, echo_level=ECHO_TO_SCREEN, b if bruteforce_debug: echo('delta log likelihood=', delta_likelihood, '\tblocks:', blocks) - echo('+' * 100, '\n', '+' * 100) + echo('+' * 100, '\n', '+' * 100 + '\n\n' + + 'Please submit an issue report to: https://github.com/ernstlab/ChromTime/issues\n' + + 'In the issue report, please attach the ChromTime\'s log file: ' + open_log.logfname + '\n' + 'and the pickled data file: ' + self.out_prefix + '.data.pickle' + '\n\n') + raise DecreasingLikelihoodException("Decreasing likelihood. LL delta: " + str(delta_likelihood), delta_likelihood, blocks) @@ -1236,6 +1241,7 @@ def EM(self, blocks, MIN_DELTA_LOG_LIKELIHOOD=None, echo_level=ECHO_TO_SCREEN, b new_priors = [(param_info[TOTAL_POSTERIORS_PER_DYNAMIC][d][t] / total_p) for d in xrange(self.n_dynamics)] + # loop until all priors are updated to be equal to at least min_dinamic_prior while any(p < self.min_dynamic_prior for p in new_priors): # check which priors go below the minimum dynamic prior set by the user dyns_to_update = [p > self.min_dynamic_prior for p in new_priors] @@ -1251,10 +1257,11 @@ def EM(self, blocks, MIN_DELTA_LOG_LIKELIHOOD=None, echo_level=ECHO_TO_SCREEN, b if n_dyns_to_fix > 0: echo('At time point', t, ', priors for the following dynamics will be set to', self.min_dynamic_prior, ':', - [self.dynamics[_d] for _d in xrange(self.n_dynamics) if not dyns_to_update[_d]] - ) + [self.dynamics[_d] for _d in xrange(self.n_dynamics) if not dyns_to_update[_d]]) + # echo([(param_info[TOTAL_POSTERIORS_PER_DYNAMIC][d][t] / total_p) # for d in xrange(self.n_dynamics)]) + # compute the denominator for the rest of the priors that will be updated _lambda = total_p_for_dyns_to_update / (1 - n_dyns_to_fix * self.min_dynamic_prior) @@ -1292,7 +1299,7 @@ def EM(self, blocks, MIN_DELTA_LOG_LIKELIHOOD=None, echo_level=ECHO_TO_SCREEN, b block_dynamics_jump_posteriors = param_info[DYNAMICS_JUMP_POSTERIORS][block_id][dynamic] for dist, weight in enumerate(block_dynamics_jump_posteriors[t]): if dist > 0: - + # the response for the regression is dist - 1 y.append(dist - 1) weights.append(weight) scale += weight diff --git a/utils.py b/utils.py index 1aacf92..0a268a1 100644 --- a/utils.py +++ b/utils.py @@ -74,6 +74,7 @@ def elapsed(message = None): def open_log(fname): """ Opens a log file """ + open_log.logfname = fname open_log.logfile = open(fname, 'w', 1)