From 19426c09bf76a53e96c8ccfbd5f02a0838dc525d Mon Sep 17 00:00:00 2001 From: Yutaka Saito Date: Fri, 12 Dec 2014 17:44:05 +0900 Subject: [PATCH] migrate v20131226 from Google Code --- ComMet/README | 101 +- ComMet/src/ComMet.cc | 8 +- ComMet/src/FrameworkComMet.cc | 17 +- ComMet/src/FrameworkComMet.h | 17 +- ComMet/src/Makefile | 1 + ComMet/src/ProbabilityModel.cc | 9 + ComMet/src/SlimProbabilityModel.cc | 2 +- ComMet/src/Utility.cc | 11 + ComMet/src/Utility.h | 8 +- README.txt | 1 + bsf-call/bsf-call | 1084 ++--------------- bsf-call/bsf-call.txt | 72 +- bsf-call/bsfcall.py | 1824 ++++++++++++++++++++++++++++ bsf-call/mapping-p.sh | 103 ++ bsf-call/mapping-s.sh | 65 + bsf-call/mc-detector.py | 84 ++ bsf-diff/src/__init__.pyc | Bin 135 -> 0 bytes bsf-diff/src/bsf_diff.pyc | Bin 10064 -> 0 bytes bsf-diff/src/bsfdiffexception.pyc | Bin 937 -> 0 bytes bsf-diff/src/gene.pyc | Bin 3362 -> 0 bytes bsf-diff/src/methylc.pyc | Bin 12799 -> 0 bytes bsf-diff/src/sampledata.pyc | Bin 3555 -> 0 bytes bsf-diff/src/significance_test.pyc | Bin 5347 -> 0 bytes bsf-diff/src/target_region.pyc | Bin 2997 -> 0 bytes bsf-diff/src/threadpool.pyc | Bin 5130 -> 0 bytes bsf-diff/test/TestSignificance.pyc | Bin 702 -> 0 bytes bsf-diff/test/__init__.pyc | Bin 131 -> 0 bytes bsf-diff/test/testGene.pyc | Bin 1617 -> 0 bytes bsf-diff/test/testSampleData.pyc | Bin 1257 -> 0 bytes bsf-diff/test/testThread.pyc | Bin 1271 -> 0 bytes bsf-diff/test/test_bed.pyc | Bin 1953 -> 0 bytes demo/demo.sh | 51 +- 32 files changed, 2391 insertions(+), 1067 deletions(-) create mode 100755 bsf-call/bsfcall.py create mode 100755 bsf-call/mapping-p.sh create mode 100755 bsf-call/mapping-s.sh create mode 100755 bsf-call/mc-detector.py delete mode 100644 bsf-diff/src/__init__.pyc delete mode 100644 bsf-diff/src/bsf_diff.pyc delete mode 100644 bsf-diff/src/bsfdiffexception.pyc delete mode 100644 bsf-diff/src/gene.pyc delete mode 100644 bsf-diff/src/methylc.pyc delete mode 100644 bsf-diff/src/sampledata.pyc delete mode 100644 bsf-diff/src/significance_test.pyc delete mode 100644 bsf-diff/src/target_region.pyc delete mode 100644 bsf-diff/src/threadpool.pyc delete mode 100644 bsf-diff/test/TestSignificance.pyc delete mode 100644 bsf-diff/test/__init__.pyc delete mode 100644 bsf-diff/test/testGene.pyc delete mode 100644 bsf-diff/test/testSampleData.pyc delete mode 100644 bsf-diff/test/testThread.pyc delete mode 100644 bsf-diff/test/test_bed.pyc diff --git a/ComMet/README b/ComMet/README index ba20d09..a449ee1 100644 --- a/ComMet/README +++ b/ComMet/README @@ -1,35 +1,40 @@ -= ComMet: an HMM-based approach to detection of differentially methylated regions +== ComMet: an HMM-based approach to detection of differentially methylated regions -== Required libraries - Boost C++ libraries - http://www.boost.org/ +==== Required libraries +Boost C++ libraries +http://www.boost.org/ -== Build - tar xzvf ComMet-xxx.tar.gz - cd ComMet-xxx - make +==== Build + +$ tar xzvf ComMet-xxx.tar.gz +$ cd ComMet-xxx +$ make You will find the executable in src directory. -== Usage +==== Usage + +For help, +$ ComMet --help -For help - ComMet --help +For test run, +$ ComMet example/example.in example/example.out1 example/example.out2 -For test run - ComMet example/example.in example/example.out1 example/example.out2 +For more output DMRs, +$ ComMet --threshold -5 example/example.in example/example.out1 example/example.out2 -For more accurate, but slower DMR detection - ComMet --dual example/example.in example/example.out1 example/example.out2 - ComMet --threshold -5 example/example.in example/example.out1 example/example.out2 - ComMet --dual --threshold -5 example/example.in example/example.out1 example/example.out2 +For more accurate DMRs, +$ ComMet --dual example/example.in example/example.out1 example/example.out2 +Combining them, +$ ComMet --dual --threshold -5 example/example.in example/example.out1 example/example.out2 -== Input format + +==== Input format See example/example.in @@ -45,8 +50,16 @@ Col.| Description reads supporting mC = C-C matches reads not supporting mC = otherwise +Make sure chromosome names and genomic positions are sorted by "sort -k1,1 -k2,2n". + +Note that input files do not contain strand information. +Normally, you should integrate both strands by summing the read counts at two neighbor CpGs, +i.e. the 5'-CpG-3' in the plus strand, and the neighboring 3'-GpC-5' in the minus strand. +Alternatively, if you are interested in strand-specific DMRs, you can prepare two input files +for plus and minus strands, and apply them to ComMet separately. -== Output format + +==== Output format output1 contains information of differential methylation at individual cytosines. See example/example.out1_ @@ -59,7 +72,7 @@ Col.| Description 4 | mC ratio in sample2 5 | prob. for hypermethylation (UP) in sample1 against sample2 6 | prob. for hypomethylation (DOWN) in sample1 against sample2 -7 | prob. for no methylation change (NC) between sample1 and sample2 +7 | prob. for no methylation change (NoCh) between sample1 and sample2 output2 contains information of detected DMRs. See example/example.out2_ @@ -73,7 +86,41 @@ Col.| Description 5 | log-likelihood ratio score -== Bisulfighter +==== FAQ + +Q. What is the meaning of the error "distance between neighbor CpGs must not be less than 2"? +A. +Your input file contains invalid genomic positions. +By definition of CpG, the base next to C must be G, and therefore two neighbor CpGs should be +separated by at least two bases. Your input file may violate this rule for several reasons. +First, the input file may contain two neighbor CpGs from different strands, +i.e. the 5'-CpG-3' in the plus strand, and the neighboring 3'-GpC-5' in the minus strand. +See the "Input format" section above for this issue. +Second, the input file may contain cytosines in non-CpG context; just remove them. + +Q. Does ComMet support DMRs in non-CpG context (CHG or CHH)? +A. +No. But we are planning to address this issue in the next version of ComMet. + +Q. The read counts in the example input file are decimals rather than integers. Why? +A. +Either decimals or integers can be used for read counts in input files. +The reason that the example input file contains decimals is that some alignment tools produce +probability-weighted read counts. Of course, you can use your favorite aligners for preparing +input files that may contain integers only. + + +==== History + +* version 1.0 +- tuned the default parameters +- added the FAQ in README + +* version 0.1 +- implemented the algorithms described in [Saito et al., Nucleic Acids Res, 2013] + + +==== Bisulfighter ComMet was first developed as the DMR detection module in Bisulfighter, a software package for mC calling and DMR detection. @@ -81,21 +128,21 @@ Compatibility between ComMet and the mC calling module in Bisulfighter is kept by input-output transformation scripts in util directory. -== License +==== License Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. http://creativecommons.org/licenses/by-nc-sa/3.0/ -== References +==== References -Y. Saito, J. Tsuji, T. Mituyama, +Yutaka Saito, Junko Tsuji, and Toutai Mituyama, Bisulfighter: accurate detection of methylated cytosines and differentially methylated regions, -submitted. +Nucleic Acids Research, accepted for publication, 2013. -== Contact +==== Contact -Yutaka SAITO +Yutaka Saito yutaka.saito AT aist.go.jp diff --git a/ComMet/src/ComMet.cc b/ComMet/src/ComMet.cc index 1981a97..7e8e22c 100644 --- a/ComMet/src/ComMet.cc +++ b/ComMet/src/ComMet.cc @@ -9,6 +9,7 @@ #include #include +#include "Utility.h" #include "FrameworkComMet.h" #include "NaiveModel.h" #include "CGIModel.h" @@ -23,6 +24,8 @@ namespace po = boost::program_options; int main(int argc, char** argv) { + progress(whoami()); + Options opts; // parse command line options @@ -44,10 +47,9 @@ main(int argc, char** argv) po::notify(vm); if (vm.count("help") || extra_args.size()<3) { - cout << "ComMet v0.1 - identification of differentially methylated regions (DMRs)" << endl - << endl + cout << endl << "Usage:" << endl - << " " << argv[0] << " [options] input output1 output2" << endl + << argv[0] << " [options] input output1 output2" << endl << endl << desc << endl; return 1; diff --git a/ComMet/src/FrameworkComMet.cc b/ComMet/src/FrameworkComMet.cc index 13dd511..8c775e2 100644 --- a/ComMet/src/FrameworkComMet.cc +++ b/ComMet/src/FrameworkComMet.cc @@ -23,7 +23,7 @@ add_options(po::options_description& opts) po::value(&nitr)->default_value(100), "set the number of training iterations") ("separate", - po::value(&dsep)->default_value(100), + po::value(&dsep)->default_value(5), "divide procedures when CpGs are separated by this distance (Kb)") ("threshold", po::value(&thsh)->default_value(0.0), @@ -31,21 +31,12 @@ add_options(po::options_description& opts) ("alpha", po::value(&alpha)->default_value(8.0), "set pseudocounts for the emission function") - //("gcpg", - // po::value(&gcpg)->default_value(1.0), - // "set gamma for the CpG gain function") - //("ggap", - // po::value(&ggap)->default_value(1.0), - // "set gamma for the GAP gain function") ("dual", po::value(&dual)->zero_tokens()->default_value(false), "use the dual model instead of the naive model") - //("viterbi", - // po::value(&dovtb)->zero_tokens()->default_value(false), - // "use Viterbi decoding instead of posterior decoding") - ("noslim", - po::value(&noslim)->zero_tokens()->default_value(false), - "use straightforward implementation instead of slim one") + //("noslim", + // po::value(&noslim)->zero_tokens()->default_value(false), + // "use straightforward implementation instead of slim one") ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "make verbose reports to stdout") diff --git a/ComMet/src/FrameworkComMet.h b/ComMet/src/FrameworkComMet.h index 927124d..3b93bf5 100644 --- a/ComMet/src/FrameworkComMet.h +++ b/ComMet/src/FrameworkComMet.h @@ -30,11 +30,8 @@ struct Options uint dsep; float thsh; float alpha; - static const float gcpg = 1.0; - static const float ggap = 1.0; bool dual; - static const bool dovtb = false; - bool noslim; + static const bool noslim = false; bool verbose; Options() : imc_file(), omc_file(), dmr_file() {} @@ -59,14 +56,6 @@ class App std::cout << "alpha must be larger than 2" << std::endl; return false; } - else if (! (opts_.gcpg > 0.0)) { - std::cout << "gcpg must be larger than 0" << std::endl; - return false; - } - else if (! (opts_.ggap > 0.0)) { - std::cout << "ggap must be larger than 0" << std::endl; - return false; - } return identify(); } @@ -76,10 +65,8 @@ class App { bool res = false; - progress("ComMet v0.1 - identification of differentially methylated regions (DMRs)"); - std::vector data; - progress("load methylated bases"); + std::vector data; res = load_data(data); if (!res) return false; diff --git a/ComMet/src/Makefile b/ComMet/src/Makefile index e66c184..d8cad05 100644 --- a/ComMet/src/Makefile +++ b/ComMet/src/Makefile @@ -43,6 +43,7 @@ all: $(PROGRAM1) Makefile clean: \rm -f *.o *~ $(PROGRAM1) + chmod 644 *.cc *.h *.hpp $(PROGRAM1): $(OBJ1) $(CXX) $(CPPFLAGS) -o $(PROGRAM1) $(OBJ1) $(LDFLAGS) $(LDLIBS) diff --git a/ComMet/src/ProbabilityModel.cc b/ComMet/src/ProbabilityModel.cc index 9d161a2..2c5a9ea 100644 --- a/ComMet/src/ProbabilityModel.cc +++ b/ComMet/src/ProbabilityModel.cc @@ -22,6 +22,15 @@ reset(const MethylList& met, ValueType alpha) return false; } + // dist + for (uint i=0; i!=met.pos_.size()-1; ++i) { + uint dist = met.pos_[i+1] - met.pos_[i]; + if (dist < 2) { + cout << "error: distance between neighbor CpGs must not be less than 2 " << met.name_ << endl; + return false; + } + } + // For the reuse of a class instance, clear() is necessary before resize() // because resize() a vector to the same size as the previous use DOES NOT // overwrite its values. diff --git a/ComMet/src/SlimProbabilityModel.cc b/ComMet/src/SlimProbabilityModel.cc index 683bc22..496585c 100644 --- a/ComMet/src/SlimProbabilityModel.cc +++ b/ComMet/src/SlimProbabilityModel.cc @@ -30,7 +30,7 @@ reset(const MethylList& met, ValueType alpha) for (uint i=0; i!=dpsize_-1; ++i) { Dist[i] = met.pos_[i+1] - met.pos_[i]; if (Dist[i] < 2) { - cout << "error: distance between neighbor CpGs must not be less than 2" << met.name_ << endl; + cout << "error: distance between neighbor CpGs must not be less than 2 " << met.name_ << endl; return false; } } diff --git a/ComMet/src/Utility.cc b/ComMet/src/Utility.cc index 96a3617..983382b 100644 --- a/ComMet/src/Utility.cc +++ b/ComMet/src/Utility.cc @@ -17,6 +17,17 @@ progress(const char* message) { cout << message << endl << flush; } +void +progress(const string& message) { + cout << message << endl << flush; +} + +string +whoami() { + string iam = "ComMet v1.0 - identification of differentially methylated regions (DMRs)"; + return iam; +} + uint round(float val) { return (uint) (val + 0.5); diff --git a/ComMet/src/Utility.h b/ComMet/src/Utility.h index 3830c31..0e302ff 100644 --- a/ComMet/src/Utility.h +++ b/ComMet/src/Utility.h @@ -15,12 +15,18 @@ #define PI 3.141592653589793238462643383279502884197169399375105820974944 #ifndef uint -#define uint unsigned int +#define uint unsigned int #endif void progress(const char* message); +void +progress(const std::string& message); + +std::string +whoami(); + uint round(float val); diff --git a/README.txt b/README.txt index b4f9801..4b884b6 100644 --- a/README.txt +++ b/README.txt @@ -38,6 +38,7 @@ II. PACKAGE COMPONENTS 3) bsf-diff An optional component for DMR identification using statistical test and signal smoothing. + This program is not officially released. You can use this one but we are not going to provide details about this. 4) script Scripts to generate simulated reads for differentially methylated diff --git a/bsf-call/bsf-call b/bsf-call/bsf-call index 33a05bc..ad9d862 100755 --- a/bsf-call/bsf-call +++ b/bsf-call/bsf-call @@ -10,1037 +10,133 @@ http://creativecommons.org/licenses/by-nc-sa/3.0/ __version__= "0.9" -import sys -import os -import glob -import threading -import subprocess -import Queue from optparse import OptionParser -from datetime import datetime -import md5 -from string import maketrans -import gzip -import logging - -class BsfCallBase(object): - def splitFilePath(self, filePath): - if self.isGzippedFile(filePath): - dir_name, file_name = os.path.split(filePath[0:-3]) - else: - dir_name, file_name = os.path.split(filePath) - base_name, ext = os.path.splitext(file_name) - if len(ext) > 1: - ext = ext[1:] - - return (dir_name, file_name, base_name, ext) - - - def readNameByReadFile(self, readFilePath): - dir_name, file_name, read_name, ext = self.splitFilePath(readFilePath) - - return read_name - - - def secondReadFilePathByFirstReadFilePath(self, readFile, secondReadType): - fpath = "" - - if self.isGzippedFile(readFile): - dir_name, file_name = os.path.split(readFile[0:-3]) - basename, ext = os.path.splitext(file_name) - fpath = "%s/%s2.%s.gz" % (dir_name, basename[0:-1], secondReadType) - else: - dir_name, file_name = os.path.split(readFile) - basename, ext = os.path.splitext(file_name) - fpath = "%s/%s2.%s" % (dir_name, basename[0:-1], secondReadType) - - return fpath - - - def pairedEndReadDirections(self): - return (1, 2) - - - def clearGap(self, seq): - return seq.replace("-", "") - - - def chrnoFromFastaDescription(self, description): - return description.strip()[1:].strip() - - - def complementStartPosition(self, genomeLen, subseqStart, subseqLen): - return genomeLen - subseqStart - subseqLen - - - def complementSeq(self, seq): - return seq.translate(maketrans("ATGCatgc", "TACGtacg"))[::-1] - - - def getMcContextType(self, bases, startPos, lastPos): - try: - if bases[startPos + 1] == "G": - return "CG" - else: - if bases[startPos + 2] == "G": - return "CHG" - else: - return "CHH" - except IndexError: - return None - - - def extractChrNo(self, chrName): - chr_no = chrName[3:] - if chr_no == "X": - return 23 - elif chr_no == "Y": - return 24 - else: - return int(chr_no) - - - def chrSort(self, a, b): - # chrno_a = self.extractChrNo(a) - # chrno_b = self.extractChrNo(b) - # - # return chrno_a - chrno_b - return cmp(a, b) - - def strands(self): - return ("+", "-") - - - def mcContextTypes(self): - return ("CG", "CHG", "CHH") - - - def gzipFile(self, filePath, wait = True): - dirpath, fname = os.path.split(filePath) - cmd = "gzip %s" % fname - p = subprocess.Popen(cmd, shell = True, cwd = dirpath) - if wait: - p.wait() - - - def isGzippedFile(self, filePath): - return filePath[-3:] == ".gz" - - -class BsfCall(BsfCallBase): - def __init__(self, refGenome, readFilePaths, cmdOpts): - self.refGenome = refGenome - self.readFilePaths = readFilePaths - self.reads = [] - self.opts = cmdOpts - - self.numReadsPerFile = 100 * 1000 * 1000 - #self.numReadsPerFile = 10 * 1000 * 1000 - #self.numReadsPerFile = 1 * 1000 * 1000 - - self.dataDir = None - self.genomeDir = None - self.mcContextDir = None - self.pairedEnd = None - self.pairedEndDirection = None - - self.setDataDir() - self.setLogger() - - logging.info("Reference genome: %s", self.refGenome) - logging.info("Read files: %s", self.readFilePaths) - logging.info("Options:") - logging.info(" threshold of read coverate: %d" % self.opts["coverage"]) - logging.info(" number of threads: %d" % self.opts["num_threads"]) - logging.info(" threshold of mC ratio: %f" % self.opts["lower_bound"]) - logging.info(" threshold of the alignment score at filtering: %d" % self.opts["alignment_score_threshold"]) - logging.info(" threshold of the mismap probability at filtering: %f" % self.opts["alignment_mismap_prob_threshold"]) - logging.info(" paired-end direction: %s" % self.opts["pe_direction"]) - if self.opts["last_opts"]: - logging.info(" options for LAST: %s" % self.opts["last_opts"]) - if self.opts["output"]: - logging.info(" output file: %s" % self.opts["output"]) - else: - logging.info(" output file: (stdout)") - logging.info(" work directory: %s" % self.dataDir) - - - def execute(self): - try: - logging.info("bsf-call start.") - - self.makeIndexFile() - self.prepare() - - result_dirs = [] - for read_attr in self.reads: - self.executeOneRead(read_attr) - result_dirs.append(read_attr["results_dir"]) - - mc_detector = McDetector(self.refGenome, result_dirs, self.mcContextDir, self.opts["lower_bound"], self.opts["coverage"]) - mc_detector.execute(self.opts["output"], self.opts["num_threads"]) - logging.info("bsf-call end.") - except: - logging.exception("Exception has occurred.") - - - def setDataDir(self): - if self.opts["work_dir"]: - self.dataDir = self.opts["work_dir"] - else: - self.dataDir = self.autoWorkDir() - - self.mcContextDir = "%s/mc_contexts" % self.dataDir - - os.mkdir(self.dataDir) - os.mkdir(self.mcContextDir) - - - def setLogger(self): - log_level = logging.INFO - - log_file = "%s/bsf-call.log" % self.dataDir - file_logger = logging.FileHandler(filename=log_file) - - file_logger.setLevel(log_level) - file_logger.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) - - logging.getLogger().addHandler(file_logger) - logging.getLogger().setLevel(log_level) - - - def prepare(self): - for file_no, read_path in enumerate(self.readFilePaths): - read = self.prepareForOneRead(read_path, file_no + 1) - self.reads.append(read) - - - def prepareForOneRead(self, readPath, readNo): - data_dir = "%s/%d"% (self.dataDir, readNo) - read = {"base_dir": data_dir, "path": readPath, "reads_dir": data_dir + "/reads", "results_dir": data_dir + "/results"} - - os.mkdir(data_dir) - os.mkdir(read["reads_dir"]) - os.mkdir(read["results_dir"]) - - direction = 1 - for read_path in readPath.split(","): - dir_name, file_name, base_name, ext = self.splitFilePath(read_path) - file_type = self.checkReadFileType(read_path) - read[direction] = {"name": base_name, "fname": file_name, "type": file_type, "path": read_path} - direction += 1 - - if self.isPairedEnd(read): - for direction in self.pairedEndReadDirections(): - if read[direction]["type"] == "sra": - self.sra2Fastq(read[direction]["path"], data_dir, False) - fastq = self.fastqDumpedFilePath(data_dir, read[direction]["name"]) - if os.path.exists(fastq): - read[direction]["name"] = self.readNameByReadFile(fastq) - read[direction]["path"] = fastq - read[direction]["type"] = "fastq" - read[direction]["fname"] = os.path.basename(read[direction]["path"]) - else: - if read[1]["type"] == "sra": - self.sra2Fastq(read[1]["path"], data_dir, True) - read_name = read[1]["name"] - for direction in self.pairedEndReadDirections(): - fastq = self.fastqDumpedFilePath(data_dir, read_name, direction) - if os.path.exists(fastq): - if not direction in read: - read[direction] = {} - read[direction]["name"] = self.readNameByReadFile(fastq) - read[direction]["path"] = fastq - read[direction]["type"] = "fastq" - read[direction]["fname"] = os.path.basename(read[direction]["path"]) - - return read - - - def executeOneRead(self, readAttr): - logging.info("Target read file: %s" % readAttr["path"]) - - self.pairedEnd = self.isPairedEnd(readAttr) - - logging.info("Paired-end: %s" % self.pairedEnd) - logging.info(": %s" % readAttr[1]["path"]) - if self.pairedEnd: - logging.info("Reverse: %s" % readAttr[2]["path"]) - - for direction in self.pairedEndReadDirections(): - if direction in readAttr: - self.splitRead(readAttr[direction]["path"], readAttr["reads_dir"], direction, readAttr[direction]["type"]) - for dumped_fastq_file in glob.glob("%s/*.fastq" % readAttr["base_dir"]): - self.gzipFile(dumped_fastq_file, False) - - self.runLast(readAttr) - - - def sra2Fastq(self, sraFile, outputDir, splitFiles = True): - cmd = "fastq-dump -O %s " % outputDir - if splitFiles: - cmd += "--split-files " - cmd += sraFile - logging.info(cmd) - p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) - out = p.stdout - p.wait() - out_data = out.read() - if len(out_data) > 0: - logging.info(out_data) - - - def splitRead(self, readFile, outputDir, readNo, fileType): - if fileType == "fasta": - self.splitFasta(readFile, outputDir, readNo) - elif fileType == "fastq": - self.splitFastq(readFile, outputDir, readNo) - - - def splitFasta(self, readFile, outputDir, readNo): - ext = "fasta" - num_reads = 0 - - out_file_path = self.splitedReadFilePath(outputDir, 1, self.numReadsPerFile, readNo, ext) - in_file = open(readFile, "r") - out_file = open(out_file_path, "w") - for line in in_file: - if line[0:1] == ">": - if num_reads != 0 and num_reads % self.numReadsPerFile == 0: - out_file.close() - self.gzipFile(out_file_path) - out_file_path = self.splitedReadFilePath(outputDir, num_reads + 1, num_reads + self.numReadsPerFile, readNo, ext) - out_file = open(out_file_path, "w") - out_file.write(line) - num_reads += 1 - else: - line = line.upper().replace("C", "t") - out_file.write(line) - - out_file.close() - in_file.close() - self.gzipFile(out_file_path) - logging.info("%s: the number of reads: %d" % (readFile, num_reads)) - - - def splitFastq(self, readFile, outputDir, readNo): - ext = "fastq" - num_lines_per_read = 4 - num_reads = 0 - line_no = 1 - - out_file_path = self.splitedReadFilePath(outputDir, 1, self.numReadsPerFile, readNo, ext) - out_fh = open(out_file_path, "w") - for line in open(readFile, "r"): - if line_no % num_lines_per_read == 2: - line = line.upper().replace("C", "t") - out_fh.write(line) - if line_no % num_lines_per_read == 0: - num_reads += 1 - if line_no % (num_lines_per_read * self.numReadsPerFile) == 0: - out_fh.close() - self.gzipFile(out_file_path) - out_file_path = self.splitedReadFilePath(outputDir, num_reads + 1, num_reads + self.numReadsPerFile, readNo, ext) - out_fh = open(out_file_path, "w") - line_no += 1 - out_fh.close() - self.gzipFile(out_file_path) - logging.info("%s: the number of reads: %d" % (readFile, num_reads)) - - - def makeIndexFile(self): - directions = [] - if not os.path.exists("%s.f.prj" % self.refGenome): - directions.append("f") - if not os.path.exists("%s.r.prj" % self.refGenome): - directions.append("r") - - if len(directions) > 0: - last_executor = LastExecutor(self.refGenome, self.dataDir) - last_executor.lastdb(directions, self.opts["num_threads"] > 1) - - - def runLast(self, readAttr): - is_paired_end = self.isPairedEnd(readAttr) - filter_option = self.filterOpts(is_paired_end) - if is_paired_end: - last_executor = LastExecutorPairedEnd(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"]) - else: - last_executor = LastExecutorSingle(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"]) - - last_executor.execute(readAttr, self.opts["num_threads"], self.lastalOpts(readAttr), self.mergeOpts(), filter_option) - - - def lastalOpts(self, readAttr): - return " ".join(self.opts["last_opts"].split(",")) - - - def mergeOpts(self): - return "" - - - def filterOpts(self, isPairedEnd): - option = "" - if isPairedEnd: - option = "-m%f" % self.opts["alignment_mismap_prob_threshold"] - else: - option = "-s%d -m%f" % (self.opts["alignment_score_threshold"], self.opts["alignment_mismap_prob_threshold"]) - - return option - - - def checkReadFileType(self, readFilePath): - name, ext = os.path.splitext(readFilePath) - if ext == ".gz": - name, ext = os.path.splitext(name) - - if len(ext) > 1: - ext = ext[1:] - - file_type = None - - if ext == "sra" or "fastq" or "fasta": - file_type = ext - elif ext == "fa": - file_type = "fasta" - else: - f = open(readFilePath, "r") - first_char = f.read(1) - if first_char == "@": - file_type = "fastq" - elif first_char == ">": - file_type = "fasta" - else: - file_type = "sra" - f.close() - - return file_type - - - def splitedReadFilePath(self, outputDir, start, end, readDirection, ext): - return "%s/%010d-%010d_%d.%s" % (outputDir, start, end, readDirection, ext) - - - def fastqDumpedFilePath(self, outputDir, readName, readDirection = None): - path = "%s/%s" % (outputDir, readName) - if readDirection: - path += "_%d" % readDirection - - return path + ".fastq" - - - def isPairedEnd(self, readAttr): - return 2 in readAttr - - - def autoWorkDir(self): - now = datetime.now() - - s = ",".join(self.readFilePaths) + self.refGenome - for key, value in self.opts.items(): - s += ("%s:%s" % (key, value)) - h = md5.new(s).hexdigest() - - return "%s-%06d-%s" % (now.strftime("%Y%m%d-%H%M%S"), now.microsecond, h[0:16]) - - - def bisulfite_f_seed (self): - filename = 'bisulfite_f.seed' - if not os.path.exists(filename): - file = open(filename, "w") - file.write(""" -# This subset seed pattern is suitable for aligning forward strands of -# bisulfite-converted DNA to unconverted DNA. -CT A G -CT A G -CT A G -CT A G -CT A G -CT A G -CTAG -CT A G -CTAG -CT A G -CT A G -CTAG -CTAG -""") - file.close() - return - - - def bisulfite_r_seed (self): - filename = 'bisulfite_r.seed' - if not os.path.exists(filename): - file = open(filename, "w") - file.write(""" -# This subset seed pattern is suitable for aligning reverse strands of -# bisulfite-converted DNA to unconverted DNA. -AG C T -AG C T -AG C T -AG C T -AG C T -AG C T -AGCT -AG C T -AGCT -AG C T -AG C T -AGCT -AGCT -""") - file.close() - return - - - def bisulfite_f_mat (self): - filename = 'bisulfite_f.mat' - if not os.path.exists(filename): - file = open(filename, "w") - file.write(""" -# This score matrix is suitable for aligning forward strands of -# bisulfite-converted DNA queries to unconverted reference sequences. - A C G T -A 6 -18 -18 -18 -C -18 6 -18 3 -G -18 -18 6 -18 -T -18 -18 -18 3 -""") - file.close() - return - - - def bisulfite_r_mat (self): - filename = 'bisulfite_r.mat' - if not os.path.exists(filename): - file = open(filename, "w") - file.write(""" -# This score matrix is suitable for aligning forward strands of -# bisulfite-converted DNA queries to unconverted reference sequences. - A C G T -A 3 -18 -18 -18 -C -18 6 -18 -18 -G 3 -18 6 -18 -T -18 -18 -18 6 -""") - file.close() - return - - -class LastExecutor(BsfCallBase): - def __init__(self, refGenome, baseDir=".", readsDir=None, resultsDir=None): - self.refGenome = refGenome - self.baseDir = baseDir - self.readsDir = readsDir - self.resultsDir = resultsDir - self.queue = Queue.Queue() - self.lock = threading.Lock() - - - def execute(self, readAttr, numThreads, lastalOpts = "", mergeOpts = "", filterOpts = ""): - self.enqueue(readAttr) - - threads = [] - for i in range(numThreads): - t = threading.Thread(target=self.worker, args=(readAttr, lastalOpts, mergeOpts, filterOpts)) - t.daemon = True - threads.append(t) - t.start() - - for thread in threads: - thread.join() - - - def worker(self, readAttr, lastalOpts, mergeOpts, filterOpts): - while True: - try: - fpath = self.queue.get_nowait() - self.runLast(fpath, readAttr, lastalOpts, mergeOpts, filterOpts) - except Queue.Empty: - break - - - def runLast(self, readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles = True): - cmd = self.batchCmd(readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles) - logging.debug(cmd) - p = subprocess.Popen(cmd, shell = True, stderr = subprocess.PIPE) - error = p.stderr - p.wait() - """ - error_msg = error.read() - if len(error_msg) > 0 and error_msg.lower().find("error") != -1: - read_name = self.readNameByReadFile(readFile) - self.writeError("%s: %s" % (read_name, error_msg)) - """ - - - def enqueue(self, readAttr): - for read_file in glob.glob("%s/*_1.%s.gz" % (self.readsDir, readAttr[1]["type"])): - self.queue.put(read_file) - - - def writeError(self, error): - self.lock.acquire() - try: - f = open(self.errorFilePath(), "a") - f.write(error) - f.close() - finally: - self.lock.release() - - - def lastdb(self, directions, parallel = False): - cmds = [] - for direction in directions: - cmds.append(self.lastdbCmd(direction)) - - if parallel: - processes = [] - for cmd in cmds: - logging.info(cmd) - p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) - out = p.stdout - processes.append({"p": p, "out": out}) - for process in processes: - process["p"].wait() - out_data = process["out"].read() - if len(out_data) > 0: - logging.info(out_data) - else: - for cmd in cmds: - logging.info(cmd) - p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) - out = p.stdout - p.wait() - out_data = out.read() - if len(out_data) > 0: - logging.info(out_data) - - - def lastdbCmd(self, direction): - return "lastdb -w2 -u bisulfite_%s.seed %s.%s %s" % (direction, self.refGenome, direction, self.refGenome) - - - def lastalCmd(self, readFile, direction, opts = ""): - if direction == "f": - s_opt = "-s1" - elif direction == "r": - s_opt = "-s0" - read_name = self.readNameByReadFile(readFile) - - return "zcat %s | lastal -p bisulfite_%s.mat %s %s %s.%s - > %s" % (readFile, direction, s_opt, opts, self.refGenome, direction, self.mappingResultFilePath(read_name, direction)) - - - def mergeCmd(self, forwardFile, reverseFile, outputFile, opts = "", rmInFiles = True): - cmd = "last-merge-batches.py %s %s %s > %s" % (opts, forwardFile, reverseFile, outputFile) - if rmInFiles: - cmd += "; rm %s %s" % (forwardFile, reverseFile) - - return cmd - - - def mappingAndMergeCmd(self, readFile, lastalOpts = "", mergeOpts = "", rmInFiles = True): - read_name = self.readNameByReadFile(readFile) - n, ext = os.path.splitext(readFile) - if ext == ".gz": - n, ext = os.path.splitext(n) - lastal_qopt = self.lastalQopt(ext[1:]) - lastal_opt = "%s %s" % (lastalOpts, lastal_qopt) - mapping_file_f = self.mappingResultFilePath(read_name, "f") - mapping_file_r = self.mappingResultFilePath(read_name, "r") - merged_file = self.mergeResultFilePath(read_name) - - return "%s; %s; %s" % (self.lastalCmd(readFile, "f", lastal_opt), self.lastalCmd(readFile, "r", lastal_opt), self.mergeCmd(mapping_file_f, mapping_file_r, merged_file, mergeOpts, rmInFiles)) - - - def mappingResultFilePath(self, readName, direction): - return "%s/%s_%s" % (self.resultsDir, readName, direction) - - - def mergeResultFilePath(self, readName): - return "%s/%s" % (self.resultsDir, readName) - - - def filterResultFilePath(self, readName): - return "%s/%s.maf" % (self.resultsDir, readName) - - - def lastalQopt(self, fileType): - opt = "" - if fileType == "fasta": - opt = "-Q0" - elif fileType == "fastq": - opt = "-Q1" - - return opt - - - def errorFilePath(self): - return "%s/error.txt" % self.baseDir - - -class LastExecutorSingle(LastExecutor): - def __init__(self, refGenome, baseDir, readsDir, resultsDir): - LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir) - - - def batchCmd(self, readFile, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True): - read_name = self.readNameByReadFile(readFile) - out_file = self.filterResultFilePath(read_name[0:-2]) - - cmds = [] - cmds.append(self.mappingAndMergeCmd(readFile, lastalOpts, mergeOpts, rmInFiles)) - cmds.append(self.filterCmd(self.mergeResultFilePath(read_name), out_file, filterOpts, rmInFiles)) - cmds.append("gzip %s" % out_file) - - return "; ".join(cmds) - - - def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True): - cmd = "last-map-probs.py %s %s > %s" % (opts, inputFile, outputFile) - if rmInFile: - cmd += "; rm %s" % inputFile - - return cmd - - -class LastExecutorPairedEnd(LastExecutor): - def __init__(self, refGenome, baseDir, readsDir, resultsDir): - LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir) - - - def batchCmd(self, readFile1, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True): - read_file2 = self.secondReadFilePathByFirstReadFilePath(readFile1, readAttr[2]["type"]) - read_files = (readFile1, read_file2) - - filter_cmd_in_file = "%s %s" % (self.mergeResultFilePath(read_files[0]), self.mergeResultFilePath(read_files[1])) - read_name1 = self.readNameByReadFile(read_files[0]) - read_name2 = self.readNameByReadFile(read_files[1]) - merge_result_file = "%s %s" % (self.mergeResultFilePath(read_name1), self.mergeResultFilePath(read_name2)) - out_file = self.filterResultFilePath(read_name1[0:-2]) - - cmds = [] - cmds.append(self.mappingAndMergeCmd(read_files[0], lastalOpts, mergeOpts, rmInFiles)) - cmds.append(self.mappingAndMergeCmd(read_files[1], lastalOpts, mergeOpts, rmInFiles)) - cmds.append(self.filterCmd(merge_result_file, out_file, filterOpts, rmInFiles)) - cmds.append("gzip %s" % out_file) - - return "; ".join(cmds) - - - def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True): - cmd = "last-pair-probs.py %s %s > %s" % (opts, inputFile, outputFile) - if rmInFile: - cmd += "; rm %s" % inputFile - - return cmd - - - -class McDetector(BsfCallBase): - def __init__(self, refGenome, resultDirs, mcCtxDir, lowerBound, coverageThreshold): - self.refGenome = refGenome - self.resultDirs = resultDirs - self.mcCtxDir = mcCtxDir - self.lowerBound = lowerBound - self.coverageThreshold = coverageThreshold - - self.refGenomeBuf = [] - self.targetChr = "" - self.targetSeqD = "" - self.targetSeqC = "" - self.targetSeqLen = 0 - - self.mcDetectFh = {} - - - def execute(self, outputFile, numWorkers=1): - for line in open(self.refGenome, "r"): - line = line.strip() - if line[0] == ">": - if self.targetChr != "": - self.executeOneChr() - self.targetChr = self.chrnoFromFastaDescription(line) - else: - self.refGenomeBuf.append(line.upper()) - self.executeOneChr() - self.output(outputFile) - - - def executeOneChr(self): - self.targetSeqD = "".join(self.refGenomeBuf) - self.targetSeqD = self.targetSeqD.upper() - del self.refGenomeBuf[:] - self.targetSeqLen = len(self.targetSeqD) - self.targetSeqC = self.complementSeq(self.targetSeqD) - self.targetSeqC = self.targetSeqC.upper() - - logging.info("mC detection start: %s (%d)" % (self.targetChr, self.targetSeqLen)) - - os.mkdir("%s/%s" % (self.mcCtxDir, self.targetChr)) - for result_dir in self.resultDirs: - for data_file in glob.glob("%s/*.maf.gz" % result_dir): - self.readLastResultFile(data_file) - self.targetSeqD = "" - self.targetSeqC = "" - self.targetSeqLen = 0 - self.closeMcDetectFileHandler() - - - def output(self, outputFile): - if outputFile: - out_f = open(outputFile, "w") - else: - out_f = sys.stdout - - for chr in sorted(os.listdir(self.mcCtxDir), self.chrSort): - self.outputByOneChr(chr, out_f) - - if outputFile: - out_f.close() - - - def readLastResultFile(self, resultFile): - logging.debug("MAF file: %s start." % resultFile) - chr_line = True - chr = "" - try: - f = gzip.open(resultFile, "r") - for line in f: - tag = line[0] - if tag != "s": - continue - line = line.strip() - tag, name, start, aln_size, strand, seq_size, alignment = line.split() - if chr_line: - if name != self.targetChr: - continue - chr = name - gl_alignment = self.clearGap(alignment) - ref_subseqlen = len(gl_alignment) - ref_start = int(start) - ref_seqlen = int(seq_size) - ref_seq = alignment - # alignment = self.clearGap(alignment) - # ref_subseqlen = int(aln_size) - # ref_start = int(start) - # ref_seqlen = int(seq_size) - # ref_seq = alignment - else: - if chr == "": - continue - if strand == "-": - ref_start = self.complementStartPosition(ref_seqlen, ref_start, ref_subseqlen) - ref_seq = self.complementSeq(ref_seq) - alignment = self.complementSeq(alignment) - read_seq = alignment.replace("t", "C") - self.extractMcContextsByOneRead(chr, strand, ref_seq.upper(), ref_start, ref_seqlen, read_seq) - chr_line = not chr_line - f.close() - logging.debug("MAF file: %s end." % resultFile) - except IOError, e: - logging.fatal("%s: %s" % (resultFile, e.args[0])) - if f: - f.close() - - - def extractMcContextsByOneRead(self, chr, strand, refSeq, refStart, refSeqLen, readSeq): - logging.debug("extractMcContextsByOneRead(%s, %s, %s, %d, %d, %s)" % (chr, strand, refSeq, refStart, refSeqLen, readSeq)) - - nogap_refseq = self.clearGap(refSeq) - bases = list(refSeq) - last_pos = len(nogap_refseq) - 1 - pos = -1 - while True: - try: - pos = bases.index("C", pos + 1) - num_gaps = refSeq.count("-", 0, pos) - real_pos = pos - num_gaps - ctx_type = self.getMcContextType(nogap_refseq, real_pos, last_pos) - ctx_pos = refStart + real_pos - if ctx_type == None: - if strand == "+": - ctx_type = self.getMcContextType(self.targetSeqD, ctx_pos, self.targetSeqLen - 1) - elif strand == "-": - ctx_type = self.getMcContextType(self.targetSeqC, ctx_pos, self.targetSeqLen - 1) - if strand == "-": - ctx_pos = refSeqLen - ctx_pos - 1 - self.mcDetectFileHandler(ctx_pos).write("%d\t%s\t%s\t%s\n" % (ctx_pos, strand, ctx_type, readSeq[pos])) - except IndexError: - logging.debug("extractMcContextsByOneRead#IndexError: %s %d %s %s %s %d" % (chr, ctx_pos, strand, ctx_type, readSeq, pos)) - except ValueError: - break - - - def mcDetectFileHandler(self, pos): - outfile = "%s/%s/%010d" % (self.mcCtxDir, self.targetChr, (int(pos) / 100000)) - if outfile not in self.mcDetectFh: - self.mcDetectFh[outfile] = open(outfile, "w") - - return self.mcDetectFh[outfile] - - - def closeMcDetectFileHandler(self): - for f in self.mcDetectFh.values(): - f.close() - self.mcDetectFh.clear() - - - def outputByOneChr(self, chr, outFh): - ctx_dir = "%s/%s" % (self.mcCtxDir, chr) - for fname in sorted(os.listdir(ctx_dir)): - self.outputByOneMcFile(chr, "%s/%s" % (ctx_dir, fname), outFh) - - - def outputByOneMcFile(self, chr, fpath, outFh): - mc_contexts = {} - f = open(fpath, "r") - for line in f: - try: - ctx_pos, strand, mc_ctx, base = line.strip().split("\t") - ctx_pos = int(ctx_pos) - if not ctx_pos in mc_contexts: - mc_contexts[ctx_pos] = {} - if not strand in mc_contexts[ctx_pos]: - mc_contexts[ctx_pos][strand] = {} - if not mc_ctx in mc_contexts[ctx_pos][strand]: - mc_contexts[ctx_pos][strand][mc_ctx] = [] - mc_contexts[ctx_pos][strand][mc_ctx].append(base) - except ValueError, e: - logging.warning("ValueError: %s: %s -> %s" % (fpath, line.strip(), e.args[0])) - - for pos in sorted(mc_contexts.keys()): - for strand in mc_contexts[pos].keys(): - for mc_ctx in mc_contexts[pos][strand].keys(): - coverage, mc_ratio = self.calcCoverage(mc_contexts[pos][strand][mc_ctx]) - if coverage >= self.coverageThreshold and mc_ratio >= self.lowerBound: - outFh.write("%s\t%d\t%s\t%s\t%.02f\t%d\n" % (chr, pos, strand, mc_ctx, mc_ratio, coverage)) - - f.close() - self.gzipFile(fpath, False) - - - def mcContextHash(self): - h = {} - for strand in self.strands(): - h[strand] = {} - for mc_ctx in self.mcContextTypes(): - h[strand][mc_ctx] = {} +import os +import sys +import bsfcall - return h +prog = 'bsf-call' +usage = """%prog [options] refgenome read1 read2 ... +ample: %prog -c 10 -m 0.1 -s 250 -o experiment.txt hg19/hg19.fa paired-sample1-1.sra,paired-sample1-2.sra single-sample1""" +description = "A mapping of the read bisulfite treated by LAST, to detect methylated cytosine (mC) of the results, and outputs the detection result to the file." +op = OptionParser(prog=prog, usage=usage, description=description, version="%s-%s" % (prog, __version__)) - def cSeq(self, seq): - return seq[0:1].upper() == "C" +op.add_option("-c", "--coverage", type="int", default=5, metavar="C", + help="threshold of read coverate (default: %default)") +op.add_option("-d", "--pe-direction", type="string", default="ff", + help="direction of paired-end probes: ff, fr, rf, rr (default: %default)") - def tSeq(self, seq): - return seq[0:1].upper() == "T" +op.add_option("-l", "--lower-bound", type="float", default=0.01, metavar="L", + help="threshold of mC ratio (default: %default)") +op.add_option("-p", "--multi-thread", type="int", default=1, metavar="P", + help="number of threads (default: %default)") - def calcCoverage(self, seqAry): - num_seq = len(seqAry) - c_ary = filter(self.cSeq, seqAry) - t_ary = filter(self.tSeq, seqAry) - num_c = len(c_ary) - num_t = len(t_ary) +op.add_option("-s", "", type="int", default=150, metavar="S", + help="threshold of the alignment score at filtering (default: %default)") - if num_c + num_t == 0: - return (num_seq, 0) - else: - return (num_seq, float(num_c) / (float(num_c) + float(num_t))) - +op.add_option("-m", "", type="float", default=0.01, metavar="M", + help="threshold of the mismap probability at filtering (default: %default)") -if __name__ == "__main__": - prog = 'bsf-call' - usage = """%prog [options] refgenome read1 read2 ... - example: %prog -c 10 -m 0.1 -s 250 -o experiment.txt hg19/hg19.fa paired-sample1-1.sra,paired-sample1-2.sra single-sample1""" - description = "A mapping of the read bisulfite treated by LAST, to detect methylated cytosine (mC) of the results, and outputs the detection result to the file." +op.add_option("", "--last", type="string", default="", metavar="OPT1,OPT2,...", + help="options for LAST (lastal command)") - op = OptionParser(prog=prog, usage=usage, description=description, version="%s-%s" % (prog, __version__)) +op.add_option("-o", "", type="string", default=None, metavar="FILE", + help="output file (default: stdout)") - op.add_option("-c", "--coverage", type="int", default=5, metavar="C", - help="threshold of read coverate (default: %default)") +op.add_option("-W", "", type="string", default=None, metavar="WORKDIR", + help="work directory (default: bsf-call_work)") - op.add_option("-d", "--pe-direction", type="string", default="ff", - help="direction of paired-end probes: ff, fr, rf, rr (default: %default)") +op.add_option("", "--work-auto", action="store_true", dest="auto_create_work_dir", default=False, + help="create work directory automatically") - op.add_option("-l", "--lower-bound", type="float", default=0.01, metavar="L", - help="threshold of mC ratio (default: %default)") +op.add_option("-n", "", action="store_true", dest="use_cluster", default=False, + help="run bsf-call on pc cluster") - op.add_option("-p", "--multi-thread", type="int", default=1, metavar="P", - help="number of threads (default: %default)") +op.add_option("-q", "", type="string", default="", metavar="QUEUE_LIST", + help="queue list") - op.add_option("-s", "", type="int", default=150, metavar="S", - help="threshold of the alignment score at filtering (default: %default)") +op.add_option("-M", "", type="string", metavar="MAPPING_DIR", + help="mapping result directory") - op.add_option("-m", "", type="float", default=0.01, metavar="M", - help="threshold of the mismap probability at filtering (default: %default)") +op.add_option("-T", "", type="string", metavar="LOCAL_DIR", + help="local directory") - op.add_option("", "--last", type="string", default="", metavar="OPT1,OPT2,...", - help="options for LAST (lastal command)") +op.add_option("-r", "", type="string", default="100M", metavar="SPLIT_READ_SIZE", + help="split read size") - op.add_option("-o", "", type="string", default=None, metavar="FILE", - help="output file (default: stdout)") +options, args = op.parse_args() - op.add_option("-W", "", type="string", default="bsf-call_work", metavar="WORKDIR", - help="work directory (default: %default)") +errors = [] - op.add_option("", "--work-auto", action="store_true", dest="auto_create_work_dir", default=False, - help="create work directory automatically") +work_dir = None +if options.W: + work_dir = options.W +else: + if not options.auto_create_work_dir: + work_dir = "bsf-call_work" - options, args = op.parse_args() +if options.M: + if len(args) < 1: + op.error("\n Reference genome is not specified.") - work_dir = None - if op.has_option("-W"): - work_dir = options.W - else: - if not options.auto_create_work_dir: - work_dir = "bsf-call_work" + for result_dir in options.M.split(","): + if not os.path.exists(result_dir): + errors.append("Mapping result directory: '%s' does not exists." % options.M) + ref_genome = args[0] + reads = None +else: if len(args) < 2: op.error("\n Reference genome and read sequence is not specified.") ref_genome = args[0] reads = args[1:] - errors = [] - - if not os.path.exists(ref_genome): - errors.append("Reference genome: '%s' does not exists." % ref_genome) for read_files in reads: for read_file in read_files.split(','): if not os.path.exists(read_file): errors.append("Read file: '%s' does not exists." % read_file) - # bisulfite_files = ("bisulfite_f.seed", "bisulfite_r.seed", "bisulfite_f.mat", "bisulfite_r.mat") - # for bisulfite_file in bisulfite_files: - # if not os.path.exists(bisulfite_file): - # errors.append("Bisulfite file: '%s' does not exists." % bisulfite_file) - - if work_dir and os.path.exists(work_dir): - errors.append("Work directory: '%s' already exists." % work_dir) - - if len(errors) > 0: - op.error("\n " + "\n ".join(errors)) - - cmd_opts = {} - cmd_opts["coverage"] = options.coverage - cmd_opts["pe_direction"] = options.pe_direction - cmd_opts["num_threads"] = options.multi_thread - cmd_opts["lower_bound"] = options.lower_bound - cmd_opts["alignment_score_threshold"] = options.s - cmd_opts["alignment_mismap_prob_threshold"] = options.m - cmd_opts["output"] = options.o - cmd_opts["last_opts"] = options.last - cmd_opts["work_dir"] = work_dir - - bsf_call = BsfCall(ref_genome, reads, cmd_opts) + errors.append("Working directory: '%s' already exists." % work_dir) + +if not os.path.exists(ref_genome): + errors.append("Reference genome: '%s' does not exists." % ref_genome) + +if len(errors) > 0: + op.error("\n " + "\n ".join(errors)) + +cmd_opts = {} +cmd_opts["coverage"] = options.coverage +cmd_opts["pe_direction"] = options.pe_direction +cmd_opts["num_threads"] = options.multi_thread +cmd_opts["lower_bound"] = options.lower_bound +cmd_opts["aln_score_thres"] = options.s +cmd_opts["aln_mismap_prob_thres"] = options.m +cmd_opts["output"] = options.o +cmd_opts["last_opts"] = options.last +cmd_opts["work_dir"] = work_dir +cmd_opts["use_cluster"] = options.use_cluster +cmd_opts["queue_list"] = options.q +cmd_opts["mapping_dir"] = options.M +cmd_opts["local_dir"] = options.T +cmd_opts["split_read_size"] = options.r + +if cmd_opts["use_cluster"]: + cmd_opts["num_threads"] = 1 + bsf_call = bsfcall.BsfCallCluster(ref_genome, reads, cmd_opts) +else: + bsf_call = bsfcall.BsfCall(ref_genome, reads, cmd_opts) + +if not cmd_opts["mapping_dir"]: bsf_call.bisulfite_f_seed() bsf_call.bisulfite_r_seed() bsf_call.bisulfite_f_mat() bsf_call.bisulfite_r_mat() - bsf_call.execute() - sys.exit(0) + +bsf_call.execute() + +sys.exit(0) diff --git a/bsf-call/bsf-call.txt b/bsf-call/bsf-call.txt index da01c09..fff4c7c 100644 --- a/bsf-call/bsf-call.txt +++ b/bsf-call/bsf-call.txt @@ -8,34 +8,58 @@ SYNOPSIS OPTIONS -c , --coverage= - Specify the threshold for valid read coverage for mC detection (default: 5). + Specifies the threshold for valid read coverage for mC detection (default: 5). -l , --lover-bound= - Specify the threshold for valid mC rate (default: 0.01). + Specifies the threshold for valid mC rate (default: 0.01). -s - Specify the threshold for valid alignment score (default: 150) + Specifies the threshold for valid alignment score (default: 150) -m - Specify the threshold for filtering mis-mapping probability (default: 0.01) + Specifies the threshold for filtering mis-mapping probability (default: 0.01) -p , --multi-thread= - Specify the number of threads (default: 1) + Specifies the number of threads (default: 1) -o - Specify the filename for the output (default: stdout) + Specifies the filename for the output (default: STDOUT) --last= - Specify options passed to lastal command. + Specifies options passed to lastal command. "-s" and "-Q" options are automatically specified by bsf-call. -W - Specify the working directory (default: ./bsf-call_work ). + Specifies the working directory (default: ./bsf-call_work ). - --work-wuto - Let bsf-call determine the working directory automatically. + --work-auto + Lets bsf-call determine the working directory automatically which differs from -W default value and the generated wroking directory has a format YYYYmmdd-HHMMSS-microsecond-MD5HASH. + MD5HASH part is a MD5 hash value (first 16 columns) computed from values of options and arguments. + e.g. 20130130-164854-371195-14a503deb0ef62a5 -W option cancels this option. + -n + Lets bsf-call run on parallel job scheduler. + Currently Sun Grid Engine is supported. + This option cancells -p or --multi-thread. + + -q , -q @,@,... + Specifies queue names used for bsf-call. + + -M , -M ,,... + Only mC-detection is performed by using mapping results stored in . + Subdirectories are searched for finding mapping files in MAF format. + + +REQUIRED FILES + The following files should stay in the same directory: + bsf-call + bsfcall.py + mapping-p.sh + mapping-s.sh + mc-detector.py + + REFERENCE_GENOME Specify the filename for the reference genome, which is a multi-fasta file. Example: hg19/hg19.fa @@ -53,9 +77,19 @@ SEQUENCE_FILES 3) mixture of single and paired reads single-sample1.fastq paired-sample1-1.fastq,paired-sample1-2.fastq -EXAMPLE +EXAMPLES + + Case 1) bsf-call -c 10 -m 0.1 -s 250 -o experiment.txt --last=-d108,-e120,-i1 -W mywork hg19/hg19.fa paired-sample1-1.fastq,paired-sample1-2.fastq single-sample1.fasta + Case 2) + bsf-call -n -c 10 -m 0.1 -s 250 -o result.txt --last=-d108,-e120,-i1 -W mywork hg19/hg19.fa paired-sample1-1.fastq,paired-sample1-2.fastq single-sample1.fasta + + Case 3) + bsf-call -n -M mapping_resdir1,mapping_resdir2 -c 10 -m 0.1 -o result.txt -W mywork hg19/hg19.fa + + + OUTPUT FORMAT bsf-call outputs six column tab-delimited text file. Col.| Description @@ -73,6 +107,22 @@ OUTPUT FORMAT chr10 71345 - CHH 0.07 5 chrX 52123 + CG 0.21 21 +WORKING DIRECTORY + $W represents a specified or default working directory. + $W/1 Results for single/primary reads are stored under this directory. + mapping_.err STDERR from a job scheduler go to this file. + mapping_.out STDOUT from a job scheduler go to this file. + $W/1/reads Single/primary reads divided into 100mil read per file are stored in this directory. + $W/1/results Mapping/filtering results + $W/2 Results for secondary reads + $W/mc_contexts mC-detection results + $W/mc_contexts/ Intermediate files + $W/mc_contexts/.tsv mC-detection results per chromosome + $W/mc_contexts/mc-detector-.err Error messages from mc-detector.py + $W/mc_contexts/mc-detector-.log Log messages from mc-detector.py + $W/mc_contexts/mc-detector-.out STDOUT from mc-detector.py + $W/bsf-call.log Log messages from bsf-call script. + AUTHOR Junko Tsuji Toutai Mituyama diff --git a/bsf-call/bsfcall.py b/bsf-call/bsfcall.py new file mode 100755 index 0000000..66d373d --- /dev/null +++ b/bsf-call/bsfcall.py @@ -0,0 +1,1824 @@ +#!/usr/bin/env python +""" +Bisulfighter::bsf-call + +Bisulfighter (http://epigenome.cbrc.jp/bisulfighter) +by National Institute of Advanced Industrial Science and Technology (AIST) +is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. +http://creativecommons.org/licenses/by-nc-sa/3.0/ +""" + +__version__= "0.9" + +import sys +import os +import glob +import threading +import subprocess +import Queue +from datetime import datetime +import hashlib +from string import maketrans +import gzip +import logging +from time import sleep +from shutil import copy + +class BsfCallBase(object): + """ + base class for BsfCall, LastExecutor, McDetector. + define functions that is used in each sub classes. + """ + + def splitFilePath(self, filePath): + """ + split file path to directory path, file name (with file extension), + file name (without file extension) and file extension. + if extension of filePath is '.gz', '.gz' extension is ignored. + """ + + if self.isGzippedFile(filePath): + dir_name, file_name = os.path.split(filePath[0:-3]) + else: + dir_name, file_name = os.path.split(filePath) + base_name, ext = os.path.splitext(file_name) + if len(ext) > 1: + ext = ext[1:] + + return (dir_name, file_name, base_name, ext) + + + def readNameByReadFile(self, readFilePath): + """ + get read name by read file path. + """ + + dir_name, file_name, read_name, ext = self.splitFilePath(readFilePath) + + return read_name + + + def secondReadFilePathByFirstReadFilePath(self, readFile, secondReadType = None): + """ + get second read file path by first read file path. + if first read file path is '/path/to/read_file_1.fastq' second read file + path is '/path/to/read_file_2.fastq' + if secondReadType is specified, the extension of second read file is its + value. + """ + + fpath = "" + + if self.isGzippedFile(readFile): + dir_name, file_name = os.path.split(readFile[0:-3]) + basename, ext = os.path.splitext(file_name) + if secondReadType: + ext = ".%s" % secondReadType + fpath = "%s/%s2%s.gz" % (dir_name, basename[0:-1], ext) + else: + dir_name, file_name = os.path.split(readFile) + basename, ext = os.path.splitext(file_name) + if secondReadType: + ext = ".%s" % secondReadType + fpath = "%s/%s2%s" % (dir_name, basename[0:-1], ext) + + return fpath + + + def pairedEndReadNumbers(self): + return (1, 2) + + + def clearGap(self, seq): + return seq.replace("-", "") + + + def complementStartPosition(self, genomeLen, subseqStart, subseqLen): + return genomeLen - subseqStart - subseqLen + + + def complementSeq(self, seq): + return seq.translate(maketrans("ATGCatgc", "TACGtacg"))[::-1] + + + def mcContextType(self, genomeSeq, cBasePos): + """ + get mC context type (CG, CHG, CHH) by genome sequence and C base position. + if no mC context found, return None. + """ + + try: + if genomeSeq[cBasePos + 1] == "G": + return "CG" + else: + if genomeSeq[cBasePos + 2] == "G": + return "CHG" + else: + return "CHH" + return None + + except IndexError: + return None + + + def chrSort(self, a, b): + return cmp(a, b) + + + def strands(self): + return ("+", "-") + + + def mcContextTypes(self): + return ("CG", "CHG", "CHH") + + + def gzipFile(self, filePath, wait = True, log = False): + """ + gzip file. If wait argument is False, without waiting for gzip process to be + completed, this function returns immediately. + """ + + if log: + logging.info("gzip start: %s" % filePath) + + dirpath, fname = os.path.split(filePath) + cmd = "gzip %s" % fname + p = subprocess.Popen(cmd, shell = True, cwd = dirpath) + if wait: + p.wait() + + if log: + logging.info("gzip done: %s" % filePath) + + + def isGzippedFile(self, filePath): + return filePath[-3:] == ".gz" + + + def scriptDir(self): + return os.path.dirname(os.path.abspath(sys.argv[0])) + + + def chrnoFromFastaDescription(self, description): + """ + get chromosome number by fasta description line. + """ + + return description.strip()[1:].strip() + + + def chrsByRefGenome(self, refGenome): + """ + get all chromosome numbers that is included in the reference genome. + """ + + chrs = [] + for line in open(refGenome, "r"): + line = line.strip() + if line[0] == ">": + chrs.append(self.chrnoFromFastaDescription(line)) + + return chrs + + + def oneChrGenomeSeq(self, refGenome, chrNo): + """ + get genome sequence of specified chrmosome number. + """ + + in_target_chr = False + buf = [] + for line in open(refGenome, 'r'): + line = line.strip() + if line[0] == ">": + if in_target_chr: + break + else: + cur_chr = self.chrnoFromFastaDescription(line) + if cur_chr == chrNo: + in_target_chr = True + else: + if in_target_chr: + buf.append(line) + + return "".join(buf) + + + def lastalOpts(self, lastOpt): + """ + get options for lastal by bsf-call --last option + """ + + return " ".join(lastOpt.split(",")) + + + def mergeOpts(self): + return "" + + + def filterOpts(self, mismapProb, scoreThres, isPairedEnd): + """ + get filtering option. this option is specified to last-map-probs.py or + last-pair-probs.py. + """ + + option = "" + if isPairedEnd: + option = "-m%s" % str(mismapProb) + else: + option = "-s%d -m%s" % (scoreThres, str(mismapProb)) + + return option + + + def isPairedEnd(self, readAttr): + return self.pairedEndReadNumbers()[1] in readAttr + + + def jobIdByQsubOutput(self, qsubOutput): + """ + get job id submitted by qsub command and its output. + """ + + fields = qsubOutput.strip().split() + + return fields[2] + + + def waitForSubmitJobs(self, jobIds, checkInterval = 10): + """ + wait for all jobs that have been submitted with qsub command to finish. + """ + + error_jobs = [] + while True: + all_done = True + qstat = os.popen("qstat") + for line in qstat: + fields = line.strip().split() + if fields[0] in jobIds: + if fields[4].find('E') > 0: + if fields[0] not in error_jobs: + logging.fatal("Error has occurred: Job ID=%s" % fields[0]) + error_jobs.append(fields[0]) + else: + all_done = False + break + qstat.close() + + if all_done: + break + else: + sleep(checkInterval) + + return + + + def logJobSubmit(self, msg, jobId, cmd = None): + logging.info("Submit job: %s --> Job ID = %s" % (msg, jobId)) + if cmd: + logging.info(cmd) + + +class BsfCall(BsfCallBase): + """ + class to execute bsf-call process. + """ + + def __init__(self, refGenome, readFilePaths, cmdOpts): + """ + constructor of BsfCall + """ + + self.refGenome = refGenome + self.readFilePaths = readFilePaths + self.reads = [] + self.opts = cmdOpts + + self.dataDir = None + self.genomeDir = None + self.mcContextDir = None + + self.readInFh1 = None + self.readInFh2 = None + self.numReads = {1: 0, 2: 0} + + self.setDataDir() + self.setLogger() + + logging.info("bsf-call start.") + self.logOption() + + self.numReadsPerFile = self.sizeForSplitRead(self.opts["split_read_size"]) + + if self.opts["mapping_dir"]: + self.opts["only_mcdetection"] = True + else: + self.opts["only_mcdetection"] = False + + + def execute(self): + """ + execute bsf-call process. + """ + + try: + if self.opts["mapping_dir"]: + result_dirs = self.opts["mapping_dir"].split(",") + else: + self.makeIndexFile() + self.processReads() + result_dirs = self.processMapping() + + self.processMcDetection(result_dirs, self.opts["local_dir"]) + + logging.info("bsf-call done.") + except: + logging.exception("Exception has occurred.") + + + def processMapping(self): + """ + run read mapping and filtering process. + """ + + logging.info("Mapping and filtering process start.") + + result_dirs = [] + for read_attr in self.reads: + self.processMappingOneRead(read_attr) + result_dirs.append(read_attr["results_dir"]) + + logging.info("Mapping and filtering process done.") + + return result_dirs + + + def processMcDetection(self, resultDirs, localDir = None): + """ + run mC detection process. + """ + + logging.info("mC detection process start.") + + mc_detector = McDetector(self.refGenome, resultDirs, self.mcContextDir, self.opts) + mc_detector.execute(self.opts["output"], self.opts["num_threads"]) + + logging.info("mC detection process done.") + + + def setDataDir(self): + """ + create directries to store the files of the bsf-call process. + """ + + if self.opts["work_dir"]: + self.dataDir = self.opts["work_dir"] + else: + self.dataDir = self.autoWorkDir() + + self.mcContextDir = "%s/mc_contexts" % self.dataDir + + if not os.path.exists(self.dataDir): + os.makedirs(self.dataDir) + + if not os.path.exists(self.mcContextDir): + os.makedirs(self.mcContextDir) + + + def setLogger(self): + """ + create logger to store the logs of the bsf-call process. + """ + + log_level = logging.INFO + + log_file = "%s/bsf-call.log" % self.dataDir + file_logger = logging.FileHandler(filename=log_file) + + file_logger.setLevel(log_level) + file_logger.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) + + logging.getLogger().addHandler(file_logger) + logging.getLogger().setLevel(log_level) + + + def logOption(self): + """ + output bsf-call option values and arguments to the log. + """ + + if self.opts["mapping_dir"]: + logging.info("Mapping result directory is specified. Only mC detection is executed.") + logging.info(" Mapping result directory: %s" % self.opts["mapping_dir"]) + logging.info(" Reference genome: %s" % self.refGenome) + else: + logging.info("Reference genome: %s" % self.refGenome) + logging.info("Read files: %s" % self.readFilePaths) + logging.info("Working directory: %s" % self.dataDir) + logging.info("Options:") + logging.info(" Threshold of the alignment score at filtering: %d" % self.opts["aln_score_thres"]) + logging.info(" Paired-end direction: %s" % self.opts["pe_direction"]) + logging.info(" Options for LAST: %s" % self.opts["last_opts"]) + + logging.info(" Threshold of read coverate: %d" % self.opts["coverage"]) + logging.info(" Threshold of mC ratio: %s" % str(self.opts["lower_bound"])) + logging.info(" Threshold of the mismap probability at filtering: %s" % str(self.opts["aln_mismap_prob_thres"])) + logging.info(" Working directory: %s" % self.dataDir) + logging.info(" Local directory: %s" % self.opts["local_dir"]) + logging.info(" Output file: %s" % (self.opts["output"] if self.opts["output"] else "(stdout)")) + logging.info(" Use cluster: %s" % ("Yes" if self.opts["use_cluster"] else "No")) + logging.info(" Queue name: %s" % self.opts["queue_list"]) + logging.info(" Number of threads: %d" % self.opts["num_threads"]) + logging.info(" Split read size: %s" % self.opts["split_read_size"]) + + + def sizeForSplitRead(self, size): + """ + get split read size. + """ + + unit = size[-1:].upper() + if unit == "K": + split_size = int(size[0:-1]) * 1000 + elif unit == "M": + split_size = int(size[0:-1]) * 1000 * 1000 + else: + split_size = int(size) + + return split_size + + + def processReads(self): + """ + process all read files. + """ + + for read_no, read_path in enumerate(self.readFilePaths): + read = self.processOneRead(read_path, read_no + 1) + self.reads.append(read) + + + def processOneRead(self, readPath, readNo): + """ + process one read. do the following: + create directories to store the splited read and result files. + if read is SRA file, convert to FASTQ file using fastq-dump command. + split read file. + """ + + logging.info("Process read file start: %d: %s" % (readNo, readPath)) + + data_dir = "%s/%d"% (self.dataDir, readNo) + read = {"base_dir": data_dir, "path": readPath, "reads_dir": data_dir + "/reads", "results_dir": data_dir + "/results"} + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + os.makedirs(read["reads_dir"]) + os.makedirs(read["results_dir"]) + + pe_no = self.pairedEndReadNumbers()[0] + for read_path in readPath.split(","): + dir_name, file_name, base_name, ext = self.splitFilePath(read_path) + file_type = self.checkReadFileType(read_path) + read[pe_no] = {"name": base_name, "fname": file_name, "type": file_type, "path": read_path} + pe_no += 1 + + if self.isPairedEnd(read): + for pe_no in self.pairedEndReadNumbers(): + if read[pe_no]["type"] == "sra": + self.sra2Fastq(read[pe_no]["path"], data_dir, False) + fastq_fpath = self.fastqDumpedFilePath(data_dir, read[pe_no]["name"]) + if os.path.exists(fastq_fpath): + read[pe_no]["name"] = self.readNameByReadFile(fastq_fpath) + read[pe_no]["path"] = fastq_fpath + read[pe_no]["type"] = "fastq" + read[pe_no]["fname"] = os.path.basename(read[pe_no]["path"]) + else: + if read[1]["type"] == "sra": + self.sra2Fastq(read[1]["path"], data_dir, True) + read_name = read[1]["name"] + for pe_no in self.pairedEndReadNumbers(): + fastq_fpath = self.fastqDumpedFilePath(data_dir, read_name, pe_no) + if os.path.exists(fastq_fpath): + if not pe_no in read: + read[pe_no] = {} + read[pe_no]["name"] = self.readNameByReadFile(fastq_fpath) + read[pe_no]["path"] = fastq_fpath + read[pe_no]["type"] = "fastq" + read[pe_no]["fname"] = os.path.basename(read[pe_no]["path"]) + + is_paired_end = self.isPairedEnd(read) + logging.info("Paired-end: %s" % is_paired_end) + logging.info(" Forward: %s" % read[1]["path"]) + if is_paired_end: + logging.info(" Reverse: %s" % read[2]["path"]) + + self.splitRead(read) + + # gzip FASTQ file which converted from SRA file in background. + for dumped_fastq_file in glob.glob("%s/*.fastq" % read["base_dir"]): + self.gzipFile(dumped_fastq_file, False) + + logging.info("Process read file done: %d: %s" % (readNo, readPath)) + + return read + + + def processMappingOneRead(self, readAttr): + """ + run mapping and filtering process for one read. + """ + + logging.info("Target read file: %s" % readAttr["path"]) + + self.runLast(readAttr) + + + def runLast(self, readAttr): + """ + run LAST programs to map reads and filtering. + """ + + is_paired_end = self.isPairedEnd(readAttr) + filter_option = self.filterOpts(self.opts["aln_mismap_prob_thres"], self.opts["aln_score_thres"], is_paired_end) + + if is_paired_end: + last_executor = LastExecutorPairedEnd(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"]) + else: + last_executor = LastExecutorSingle(self.refGenome, self.dataDir, readAttr["reads_dir"], readAttr["results_dir"]) + + last_executor.execute(readAttr, self.opts["num_threads"], self.lastalOpts(self.opts["last_opts"]), self.mergeOpts(), filter_option) + + + def sra2Fastq(self, sraFile, outputDir, splitFiles = True): + """ + convert SRA file to FASTQ file. + """ + + logging.info("Convert SRA file to FASTQ start: %s" % sraFile) + + cmd = "fastq-dump -O %s " % outputDir + if splitFiles: + cmd += "--split-files " + cmd += sraFile + logging.info(cmd) + p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) + out = p.stdout + p.wait() + out_data = out.read() + if len(out_data) > 0: + logging.info(out_data) + + logging.info("Convert SRA file to FASTQ done: %s" % sraFile) + + + def splitRead(self, readAttr): + """ + split read file. + """ + + logging.info("Split read file start: %s" % readAttr["path"]) + + is_paired_end = self.isPairedEnd(readAttr) + + self.readInFh1 = open(readAttr[1]["path"], "r") + if is_paired_end: + self.readInFh2 = open(readAttr[2]["path"], "r") + + out_fpath1 = self.splitedReadFilePath(readAttr["reads_dir"], 1, self.numReadsPerFile, 1, readAttr[1]["type"]) + while self.outputOneSplittedReadFile(self.readInFh1, out_fpath1, readAttr, 1): + out_fpath1 = self.splitedReadFilePath(readAttr["reads_dir"], self.numReads[1] + 1, self.numReads[1] + self.numReadsPerFile, 1, readAttr[1]["type"]) + + if is_paired_end: + out_fpath2 = self.secondReadFilePathByFirstReadFilePath(out_fpath1, readAttr[2]["type"]) + self.outputOneSplittedReadFile(self.readInFh2, out_fpath2, readAttr, 2) + + self.readInFh1.close() + if is_paired_end: + self.readInFh2.close() + + logging.info("Split read file done: %s" % readAttr["path"]) + logging.info("%s: the number of reads (1st read): %d" % (readAttr[1]["path"], self.numReads[1])) + if is_paired_end: + logging.info("%s: the number of reads (2nd read): %d" % (readAttr[2]["path"], self.numReads[2])) + + self.readInFh1 = None + self.readInFh2 = None + self.numReads[1] = 0 + self.numReads[2] = 0 + + + def outputOneSplittedReadFile(self, fh, outFilePath, readAttr, pairedEndReadNo): + """ + output one splitted read file. + """ + + if readAttr[pairedEndReadNo]["type"] == "fastq": + return self.outputOneSplittedFastqFile(fh, outFilePath, readAttr, pairedEndReadNo) + elif readAttr[pairedEndReadNo]["type"] == "fasta": + return self.outputOneSplittedFastaFile(fh, outFilePath, readAttr, pairedEndReadNo) + + + def outputOneSplittedFastqFile(self, fh, outFilePath, readAttr, pairedEndReadNo): + """ + output one splitted FASTQ file. + """ + + num_lines_per_read = 4 + line_no = 1 + out_fh = open(outFilePath, "w") + for line in fh: + if line_no % num_lines_per_read == 2: + line = line.upper().replace("C", "t") + out_fh.write(line) + + if line_no % num_lines_per_read == 0: + self.numReads[pairedEndReadNo] += 1 + + if line_no % (num_lines_per_read * self.numReadsPerFile) == 0: + out_fh.close() + self.afterProcessSplitRead(outFilePath, readAttr) + if self.isPairedEnd(readAttr) and pairedEndReadNo == 1: + out_fpath2 = self.secondReadFilePathByFirstReadFilePath(outFilePath, readAttr[2]["type"]) + self.outputOneSplittedReadFile(self.readInFh2, out_fpath2, readAttr, 2) + + return line_no + + line_no += 1 + + self.afterProcessSplitRead(outFilePath, readAttr) + + return None + + + def outputOneSplittedFastaFile(self, fh, outFilePath, readAttr, pairedEndReadNo): + """ + output one splitted FASTA file. + """ + + num_reads = 0 + line_no = 1 + out_fh = open(outFilePath, "w") + for line in fh: + if line[0:1] == ">": + out_fh.write(line) + num_reads += 1 + self.numReads[pairedEndReadNo] += 1 + else: + line = line.upper().replace("C", "t") + out_fh.write(line) + + if num_reads != 0 and num_reads % self.numReadsPerFile == 0: + out_fh.close() + self.afterProcessSplitRead(outFilePath, readAttr) + if self.isPairedEnd(readAttr) and pairedEndReadNo == 1: + out_fpath2 = self.secondReadFilePathByFirstReadFilePath(outFilePath, readAttr[2]["type"]) + self.outputOneSplittedReadFile(self.readInFh2, out_fpath2, readAttr, 2) + return line_no + + line_no += 1 + + self.afterProcessSplitRead(outFilePath, readAttr) + + return None + + + + def afterProcessSplitRead(self, readFile, readAttr = None): + """ + this function is called after output splitted one read file. + """ + + self.gzipFile(readFile, True, True) + + + def makeIndexFile(self): + """ + create index file of reference genome. + """ + + directions = [] + if not os.path.exists("%s.f.prj" % self.refGenome): + directions.append("f") + if not os.path.exists("%s.r.prj" % self.refGenome): + directions.append("r") + + if len(directions) > 0: + logging.info("Make index file start.") + last_executor = LastExecutor(self.refGenome, self.dataDir) + last_executor.lastdb(directions, self.opts["num_threads"] > 1) + logging.info("Make index file done.") + + + def checkReadFileType(self, readFilePath): + """ + get read file type. + """ + + name, ext = os.path.splitext(readFilePath) + if ext == ".gz": + name, ext = os.path.splitext(name) + + if len(ext) > 1: + ext = ext[1:] + + file_type = None + + if ext == "sra" or "fastq" or "fasta": + file_type = ext + elif ext == "fa": + file_type = "fasta" + else: + f = open(readFilePath, "r") + first_char = f.read(1) + if first_char == "@": + file_type = "fastq" + elif first_char == ">": + file_type = "fasta" + else: + file_type = "sra" + f.close() + + return file_type + + + def splitedReadFilePath(self, outputDir, start, end, readDirection, ext): + """ + get splitted read file path. + """ + + return "%s/%010d-%010d_%d.%s" % (outputDir, start, end, readDirection, ext) + + + def fastqDumpedFilePath(self, outputDir, readName, readDirection = None): + """ + get output file path for fastq-dump command. + """ + + path = "%s/%s" % (outputDir, readName) + if readDirection: + path += "_%d" % readDirection + + return path + ".fastq" + + + def autoWorkDir(self): + """ + get working directory path determined automatically. + """ + + now = datetime.now() + + s = ",".join(self.readFilePaths) + self.refGenome + for key, value in self.opts.items(): + s += ("%s:%s" % (key, value)) + h = hashlib.md5(s).hexdigest() + + return "%s-%06d-%s" % (now.strftime("%Y%m%d-%H%M%S"), now.microsecond, h[0:16]) + + + def bisulfite_f_seed(self): + """ + output bisulfite_f.seed file. + """ + + filename = 'bisulfite_f.seed' + if not os.path.exists(filename): + file = open(filename, "w") + file.write(""" +# This subset seed pattern is suitable for aligning forward strands of +# bisulfite-converted DNA to unconverted DNA. +CT A G +CT A G +CT A G +CT A G +CT A G +CT A G +CTAG +CT A G +CTAG +CT A G +CT A G +CTAG +CTAG +""") + file.close() + return + + + def bisulfite_r_seed(self): + """ + output bisulfite_r.seed file. + """ + + filename = 'bisulfite_r.seed' + if not os.path.exists(filename): + file = open(filename, "w") + file.write(""" +# This subset seed pattern is suitable for aligning reverse strands of +# bisulfite-converted DNA to unconverted DNA. +AG C T +AG C T +AG C T +AG C T +AG C T +AG C T +AGCT +AG C T +AGCT +AG C T +AG C T +AGCT +AGCT +""") + file.close() + return + + + def bisulfite_f_mat(self): + """ + output bisulfite_f.mat file. + """ + + filename = 'bisulfite_f.mat' + if not os.path.exists(filename): + file = open(filename, "w") + file.write(""" +# This score matrix is suitable for aligning forward strands of +# bisulfite-converted DNA queries to unconverted reference sequences. + A C G T +A 6 -18 -18 -18 +C -18 6 -18 3 +G -18 -18 6 -18 +T -18 -18 -18 3 +""") + file.close() + return + + + def bisulfite_r_mat(self): + """ + output bisulfite_r.mat file. + """ + + filename = 'bisulfite_r.mat' + if not os.path.exists(filename): + file = open(filename, "w") + file.write(""" +# This score matrix is suitable for aligning forward strands of +# bisulfite-converted DNA queries to unconverted reference sequences. + A C G T +A 3 -18 -18 -18 +C -18 6 -18 -18 +G 3 -18 6 -18 +T -18 -18 -18 6 +""") + file.close() + return + + +class BsfCallCluster(BsfCall): + """ + class to execute bsf-call process on pc cluster. + """ + + def __init__(self, refGenome, readFilePaths, cmdOpts): + """ + constructor of BsfCallCluster + """ + + BsfCall.__init__(self, refGenome, readFilePaths, cmdOpts) + + self.lastExecutor = LastExecutorCluster(self.refGenome, self.opts) + self.mappingJobIds = [] + + + def processMapping(self): + """ + if bsf-call is executed on cluster, mapping and filtering job is submitted + when read file is splitted. + therefore, in this function, only wait for all jobs to finish. + """ + + logging.info("Waiting for all jobs to finish.") + + self.waitForSubmitJobs(self.mappingJobIds) + + logging.info("Mapping and filtering process done.") + + return self.lastExecutor.resultDirs + + + def processMcDetection(self, resultDirs, localDir = None): + """ + for each chromosome number, submit mC detection process job to the cluster. + after all jobs have been finished, output mC detection result. + """ + + logging.info("mC detection process start.") + + chrs = self.chrsByRefGenome(self.refGenome) + job_ids = [] + for chr_no in chrs: + job_id = self.submitMcDetectionJob(resultDirs, chr_no) + job_ids.append(job_id) + sleep(1) + logging.info("Submitted jobs: %s" % ",".join(job_ids)) + + self.waitForSubmitJobs(job_ids) + + mc_detector = McDetector(self.refGenome, resultDirs, self.mcContextDir, self.opts) + + mc_detector.chrs = chrs + mc_detector.output(self.opts["output"]) + + logging.info("mC detection process done.") + + + def submitMcDetectionJob(self, resultDirs, chrNo): + """ + submit mC detection process job to the cluster. + """ + + argv = self.qsubRemoteCommandArgv(resultDirs, chrNo) + cmd = self.qsubCommand(chrNo, " ".join(map((lambda s: '"' + s + '"'), argv))) + + qsub = os.popen(cmd) + out = qsub.read() + qsub.close() + + job_id = self.jobIdByQsubOutput(out) + self.logJobSubmit("mC detection job: %s: Mapping result directories: %s" % (chrNo, resultDirs), job_id) + + return job_id + + + def qsubRemoteCommandArgv(self, resultDirs, chrNo): + """ + get arguments for mC detection program (mc-detector.py). + """ + + argv = [] + + argv.append(self.refGenome) + argv.append(",".join(resultDirs)) + argv.append(self.mcContextDir) + argv.append(chrNo) + argv.append(str(self.opts["only_mcdetection"])) + argv.append(str(self.opts["lower_bound"])) + argv.append(str(self.opts["coverage"])) + argv.append(str(self.opts["aln_mismap_prob_thres"])) + if self.opts["local_dir"]: + argv.append(self.opts["local_dir"]) + + return argv + + + def qsubCommand(self, chrNo, cmdArgs): + """ + get qsub command to submit mC detection job. + """ + + remote_cmd = os.path.join(self.scriptDir(), "mc-detector.py") + out_file = os.path.join(self.mcContextDir, "mc-detector-%s.out" % chrNo) + err_file = os.path.join(self.mcContextDir, "mc-detector-%s.err" % chrNo) + + if self.opts["queue_list"]: + cmd = "qsub -o %s -e %s -q %s -cwd -b y %s %s" % (out_file, err_file, self.opts["queue_list"], remote_cmd, cmdArgs) + else: + cmd = "qsub -o %s -e %s -cwd -b y %s %s" % (out_file, err_file, remote_cmd, cmdArgs) + + return cmd + + + def afterProcessSplitRead(self, readFile, readAttr = None): + """ + this function is called after output splitted one read file. + if bsf-call is executed on pc cluster, mapping and filtering job is submitted + after output one splitted read file. + """ + + if self.isPairedEnd(readAttr): + fpath, ext = os.path.splitext(readFile) + if fpath[-1:] == "2": + read_file = "%s1.%s" % (fpath[0:-1], readAttr[1]["type"]) + job_id = self.lastExecutor.executeOneRead(readFile, readAttr) + self.mappingJobIds.append(job_id) + else: + job_id = self.lastExecutor.executeOneRead(readFile, readAttr) + self.mappingJobIds.append(job_id) + + +class LastExecutor(BsfCallBase): + """ + class to run LAST programs to map read and filtering. + """ + + def __init__(self, refGenome, baseDir = ".", readsDir = None, resultsDir = None): + """ + constructor of LastExecutor + """ + + self.refGenome = refGenome + self.baseDir = baseDir + self.readsDir = readsDir + self.resultsDir = resultsDir + self.queue = Queue.Queue() + self.lock = threading.Lock() + + + def execute(self, readAttr, numThreads, lastalOpts = "", mergeOpts = "", filterOpts = ""): + """ + enqueue all splited read files to the queue. + create and start threads to execute read mapping and filtering process. + wait for all threads to finish. + """ + + self.enqueue(readAttr) + + threads = [] + for i in range(numThreads): + t = threading.Thread(target=self.worker, args=(readAttr, lastalOpts, mergeOpts, filterOpts)) + t.daemon = True + threads.append(t) + t.start() + + for thread in threads: + thread.join() + + + def worker(self, readAttr, lastalOpts, mergeOpts, filterOpts): + """ + thread worker to execute LAST. + dequeue read file path from the queue and execute read mapping filtering + process. + """ + + while True: + try: + fpath = self.queue.get_nowait() + self.runLast(fpath, readAttr, lastalOpts, mergeOpts, filterOpts) + except Queue.Empty: + break + + + def runLast(self, readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles = True): + """ + execute LAST programs to map read and filtering. + """ + + cmd = self.batchCmd(readFile, readAttr, lastalOpts, mergeOpts, filterOpts, rmInFiles) + logging.debug(cmd) + p = subprocess.Popen(cmd, shell = True, stderr = subprocess.PIPE) + error = p.stderr + p.wait() + + error_msg = error.read() + if len(error_msg) > 0: + logging.fatal(error_msg) + + + def enqueue(self, readAttr): + """ + enqueue all splitted read files to the queue. + """ + + for read_file in glob.glob("%s/*_1.%s.gz" % (self.readsDir, readAttr[1]["type"])): + self.queue.put(read_file) + + + def lastdb(self, directions, parallel = False): + """ + execute lastdb command to create index file of reference genome. + """ + + cmds = [] + for direction in directions: + cmds.append(self.lastdbCmd(direction)) + + if parallel: + processes = [] + for cmd in cmds: + logging.info(cmd) + p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) + out = p.stdout + processes.append({"p": p, "out": out}) + for process in processes: + process["p"].wait() + out_data = process["out"].read() + if len(out_data) > 0: + logging.info(out_data) + else: + for cmd in cmds: + logging.info(cmd) + p = subprocess.Popen(cmd, shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) + out = p.stdout + p.wait() + out_data = out.read() + if len(out_data) > 0: + logging.info(out_data) + + + def lastdbCmd(self, direction): + """ + get lastdb command to create index file of reference genome. + """ + + return "lastdb -w2 -u bisulfite_%s.seed %s.%s %s" % (direction, self.refGenome, direction, self.refGenome) + + + def lastalCmd(self, readFile, direction, opts = ""): + """ + get lastal command to map read. + """ + + s_opt = self.lastalSopt(direction) + read_name = self.readNameByReadFile(readFile) + + return "zcat %s | lastal -p bisulfite_%s.mat %s %s %s.%s - > %s" % (readFile, direction, s_opt, opts, self.refGenome, direction, self.mappingResultFilePath(read_name, direction)) + + + def mergeCmd(self, forwardFile, reverseFile, outputFile, opts = "", rmInFiles = True): + """ + get command to merge lastal output. + """ + + cmd = "last-merge-batches.py %s %s %s > %s" % (opts, forwardFile, reverseFile, outputFile) + if rmInFiles: + cmd += "; rm %s %s" % (forwardFile, reverseFile) + + return cmd + + + def mappingAndMergeCmd(self, readFile, lastalOpts = "", mergeOpts = "", rmInFiles = True): + """ + get read mapping and filtering command. + """ + + read_name = self.readNameByReadFile(readFile) + n, ext = os.path.splitext(readFile) + if ext == ".gz": + n, ext = os.path.splitext(n) + lastal_qopt = self.lastalQopt(ext[1:]) + lastal_opt = "%s %s" % (lastalOpts, lastal_qopt) + mapping_file_f = self.mappingResultFilePath(read_name, "f") + mapping_file_r = self.mappingResultFilePath(read_name, "r") + merged_file = self.mergeResultFilePath(read_name) + + return "%s; %s; %s" % (self.lastalCmd(readFile, "f", lastal_opt), self.lastalCmd(readFile, "r", lastal_opt), self.mergeCmd(mapping_file_f, mapping_file_r, merged_file, mergeOpts, rmInFiles)) + + + def mappingResultFilePath(self, readName, direction): + """ + get read mapping result file path. + """ + + return "%s/%s_%s" % (self.resultsDir, readName, direction) + + + def mergeResultFilePath(self, readName): + """ + get merge result file path. + """ + + return "%s/%s" % (self.resultsDir, readName) + + + def filterResultFilePath(self, readName): + """ + get filtering result file path. + """ + + return "%s/%s.maf" % (self.resultsDir, readName) + + + def lastalSopt(self, direction): + """ + get -s option for lastal. + """ + + opt = "" + if direction == "f": + opt = "-s1" + elif direction == "r": + opt = "-s0" + + return opt + + + def lastalQopt(self, fileType): + """ + get -Q option for lastal. + """ + + opt = "" + if fileType == "fasta": + opt = "-Q0" + elif fileType == "fastq": + opt = "-Q1" + + return opt + + +class LastExecutorCluster(LastExecutor): + """ + class to run LAST programs on pc cluster. + """ + + def __init__(self, refGenome, bsfCallOpts): + """ + constructor of LastExecutorCluster + """ + + self.refGenome = refGenome + self.opts = bsfCallOpts + + self.resultDirs = [] + + + def executeOneRead(self, readFile, readAttr): + """ + execute read mapping and filtering process with specified read. + on pc cluster, submit mapping and filtering job and return job id. + """ + + if readAttr["results_dir"] not in self.resultDirs: + self.resultDirs.append(readAttr["results_dir"]) + + lastal_opts = self.lastalOpts(self.opts["last_opts"]) + merge_opts = self.mergeOpts() + filter_opts = self.filterOpts(self.opts["aln_mismap_prob_thres"], self.opts["aln_score_thres"], self.isPairedEnd(readAttr)) + job_id = self.submitJob(readFile, readAttr, lastal_opts, merge_opts, filter_opts, self.opts["queue_list"]) + + return job_id + + + def submitJob(self, readFile, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", queueName = None): + """ + submit read mapping and filtering process job. + """ + + job_id = None + + read_name = self.readNameByReadFile(readFile)[0:-2] + out_file = self.qsubStdoutFilePath(readAttr["base_dir"], read_name) + err_file = self.qsubStderrFilePath(readAttr["base_dir"], read_name) + remote_cmd = self.remoteCommand(readAttr) + remote_cmd_args = " ".join(map((lambda s: '"' + s + '"'), self.remoteCommandArgv(read_name, readAttr, lastalOpts, filterOpts))) + + if queueName: + cmd = "qsub -o %s -e %s -q %s -cwd %s %s" % (out_file, err_file, queueName, remote_cmd, remote_cmd_args) + else: + cmd = "qsub -o %s -e %s -cwd %s %s" % (out_file, err_file, remote_cmd, remote_cmd_args) + + qsub = os.popen(cmd) + out = qsub.read() + qsub.close() + + job_id = self.jobIdByQsubOutput(out) + + dir_path, file_name, base_name, ext = self.splitFilePath(readFile) + self.logJobSubmit("Mapping and filtering: read: %s/%s" % (dir_path, base_name[0:-2]), job_id) + + return job_id + + + def remoteCommand(self, readAttr): + """ + get read mapping and filtering command path to submit by qsub command. + """ + + if self.isPairedEnd(readAttr): + return os.path.join(self.scriptDir(), "mapping-p.sh") + else: + return os.path.join(self.scriptDir(), "mapping-s.sh") + + + def remoteCommandArgv(self, readName, readAttr, lastalOpts, filterOpts): + """ + get read mapping and filtering command arguments. + """ + + argv = [] + + argv.append(readAttr["reads_dir"]) + argv.append(readAttr["results_dir"]) + argv.append(self.refGenome) + argv.append(filterOpts) + + argv.append(readName) + argv.append(readAttr[1]["type"]) + argv.append("%s %s" % (lastalOpts, self.lastalQopt(readAttr[1]["type"]))) + + if self.isPairedEnd(readAttr): + argv.append(readName) + argv.append(readAttr[2]["type"]) + argv.append("%s %s" % (lastalOpts, self.lastalQopt(readAttr[2]["type"]))) + + return argv + + + def qsubStdoutFilePath(self, dirPath, readName): + """ + get qsub command standard output file path. + """ + + return "%s/mapping_%s.out" % (dirPath, readName) + + + def qsubStderrFilePath(self, dirPath, readName): + """ + get qsub command standard error file path. + """ + + return "%s/mapping_%s.err" % (dirPath, readName) + + +class LastExecutorSingle(LastExecutor): + """ + class to run LAST programs to map single read and filtering. + """ + + def __init__(self, refGenome, baseDir, readsDir, resultsDir): + """ + constructor of LastExecutorSingle + """ + + LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir) + + + def batchCmd(self, readFile, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True): + """ + get batch command to map read and filtering. + """ + + read_name = self.readNameByReadFile(readFile) + out_file = self.filterResultFilePath(read_name[0:-2]) + + cmds = [] + cmds.append(self.mappingAndMergeCmd(readFile, lastalOpts, mergeOpts, rmInFiles)) + cmds.append(self.filterCmd(self.mergeResultFilePath(read_name), out_file, filterOpts, rmInFiles)) + cmds.append("gzip %s" % out_file) + + return "; ".join(cmds) + + + def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True): + """ + get filter command. + """ + + cmd = "last-map-probs.py %s %s > %s" % (opts, inputFile, outputFile) + if rmInFile: + cmd += "; rm %s" % inputFile + + return cmd + + +class LastExecutorPairedEnd(LastExecutor): + """ + class to run LAST programs to map paired-end read and filtering. + """ + + def __init__(self, refGenome, baseDir, readsDir, resultsDir): + """ + constructor of LastExecutorPairedEnd + """ + + LastExecutor.__init__(self, refGenome, baseDir, readsDir, resultsDir) + + + def batchCmd(self, readFile1, readAttr, lastalOpts = "", mergeOpts = "", filterOpts = "", rmInFiles = True): + """ + get batch command to map read and filtering. + """ + + read_file2 = self.secondReadFilePathByFirstReadFilePath(readFile1, readAttr[2]["type"]) + read_files = (readFile1, read_file2) + + filter_cmd_in_file = "%s %s" % (self.mergeResultFilePath(read_files[0]), self.mergeResultFilePath(read_files[1])) + read_name1 = self.readNameByReadFile(read_files[0]) + read_name2 = self.readNameByReadFile(read_files[1]) + merge_result_file = "%s %s" % (self.mergeResultFilePath(read_name1), self.mergeResultFilePath(read_name2)) + out_file = self.filterResultFilePath(read_name1[0:-2]) + + cmds = [] + cmds.append(self.mappingAndMergeCmd(read_files[0], lastalOpts, mergeOpts, rmInFiles)) + cmds.append(self.mappingAndMergeCmd(read_files[1], lastalOpts, mergeOpts, rmInFiles)) + cmds.append(self.filterCmd(merge_result_file, out_file, filterOpts, rmInFiles)) + cmds.append("gzip %s" % out_file) + + return "; ".join(cmds) + + + def filterCmd(self, inputFile, outputFile, opts = "", rmInFile = True): + """ + get filter command. + """ + + cmd = "last-pair-probs.py %s %s > %s" % (opts, inputFile, outputFile) + if rmInFile: + cmd += "; rm %s" % inputFile + + return cmd + + +class McDetector(BsfCallBase): + """ + class to execute mC detection process. + """ + + def __init__(self, refGenome, resultDirs, mcContextDir, options): + """ + constructor of McDetector + """ + + self.refGenome = refGenome + self.resultDirs = resultDirs + self.mcContextDir = mcContextDir + self.lowerBound = options["lower_bound"] + self.coverageThreshold = options["coverage"] + + if options["only_mcdetection"]: + self.mismapThreshold = options["aln_mismap_prob_thres"] + else: + self.mismapThreshold = None + + if options["local_dir"]: + self.localDir = options["local_dir"] + else: + self.localDir = mcContextDir + + self.chrs = [] + self.refGenomeBuf = [] + self.targetChr = "" + self.targetSeqD = "" + self.targetSeqC = "" + self.targetSeqLen = 0 + + self.mcDetectData = {} + + logging.debug("McDetector.__init__: self.resultDirs: %s" % self.resultDirs) + + + def execute(self, outputFile, numWorkers = 1): + """ + execute mC detection process and output result. + """ + + for line in open(self.refGenome, "r"): + line = line.strip() + if line[0] == ">": + if self.targetChr != "": + self.processOneChr() + self.chrs.append(self.targetChr) + self.targetChr = self.chrnoFromFastaDescription(line) + else: + self.refGenomeBuf.append(line.upper()) + self.processOneChr() + self.chrs.append(self.targetChr) + + self.output(outputFile) + + + def processOneChr(self, chrNo = None): + """ + execute mC detection process with specified chromosome number. + """ + + if chrNo: + self.targetChr = chrNo + + local_dir = "%s/%s" % (self.localDir, self.targetChr) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + if self.refGenomeBuf: + self.targetSeqD = "".join(self.refGenomeBuf) + del self.refGenomeBuf[:] + else: + self.targetSeqD = self.oneChrGenomeSeq(self.refGenome, chrNo) + + self.targetSeqD = self.targetSeqD.upper() + self.targetSeqLen = len(self.targetSeqD) + self.targetSeqC = self.complementSeq(self.targetSeqD).upper() + + logging.info("mC detection process start: %s (%d)" % (self.targetChr, self.targetSeqLen)) + logging.debug("processOneChr: self.resultDirs: %s" % self.resultDirs) + + for result_dir in self.resultDirs: + logging.debug("processOneChr: result_dir: %s" % result_dir) + for root, dirs, files in os.walk(result_dir): + for filename in files: + self.processMappingResultFile(os.path.join(root, filename)) + + logging.info("mC detection process done: %s (%d)" % (self.targetChr, self.targetSeqLen)) + + logging.info("Output result file start.") + self.outputOneChr(self.targetChr) + logging.info("Output result file done.") + + if self.mcContextDir != self.localDir: + copy(self.mcCotextLocalFilePath(self.targetChr), self.mcContextDir) + + self.targetSeqD = "" + self.targetSeqC = "" + self.targetSeqLen = 0 + + + def output(self, outputFile): + """ + output mC detection result. + """ + + popen_args = ['cat'] + for chr in sorted(self.chrs, self.chrSort): + result_file = self.mcContextGlobalFilePath(chr) + if os.path.exists(result_file) and os.path.getsize(result_file) > 0: + popen_args.append(result_file) + + if len(popen_args) == 1: + if outputFile: + f = open(outputFile, "w") + f.close() + else: + if outputFile: + output_fh = open(outputFile, "w") + else: + output_fh = None + subprocess.check_call(popen_args, stdout = output_fh) + if output_fh: + output_fh.close() + + + def outputOneChr(self, chrNo): + """ + output mC detection result with specified chromosome number. + """ + + logging.debug("McDetector.outputOneChr: chr = %s" % chrNo) + ctx_dir = "%s/%s" % (self.localDir, chrNo) + if os.path.isdir(ctx_dir): + for fname in sorted(os.listdir(ctx_dir)): + self.outputByOneMcContextFile(chrNo, "%s/%s" % (ctx_dir, fname)) + + + def outputByOneMcContextFile(self, chrNo, fpath): + """ + output mC detection result with specified mC context file. + """ + + mc_contexts = {} + f = open(fpath, "r") + for line in f: + try: + ctx_pos, strand, mc_ctx, base = line.strip().split("\t") + ctx_pos = int(ctx_pos) + if not ctx_pos in mc_contexts: + mc_contexts[ctx_pos] = {} + if not strand in mc_contexts[ctx_pos]: + mc_contexts[ctx_pos][strand] = {} + if not mc_ctx in mc_contexts[ctx_pos][strand]: + mc_contexts[ctx_pos][strand][mc_ctx] = [] + mc_contexts[ctx_pos][strand][mc_ctx].append(base) + except ValueError, e: + logging.warning("ValueError: %s: %s -> %s" % (fpath, line.strip(), e.args[0])) + f.close() + + out_f = open(self.mcCotextLocalFilePath(chrNo), "a") + for pos in sorted(mc_contexts.keys()): + for strand in mc_contexts[pos].keys(): + for mc_ctx in mc_contexts[pos][strand].keys(): + coverage, mc_ratio = self.calcCoverage(mc_contexts[pos][strand][mc_ctx]) + if coverage >= self.coverageThreshold and mc_ratio >= self.lowerBound: + out_f.write("%s\t%d\t%s\t%s\t%.02f\t%d\n" % (chrNo, pos, strand, mc_ctx, mc_ratio, coverage)) + out_f.close() + + self.gzipFile(fpath, False) + + + def processMappingResultFile(self, resultFile): + """ + read mapping result file, and output mC context data file. + """ + + logging.info("Process MAF file start: %s" % resultFile) + + chr_line = True + chr = "" + f = None + has_mismap = True + mismap_value = None + skip = False + try: + if self.isGzippedFile(resultFile): + f = gzip.open(resultFile, "rb") + else: + f = open(resultFile, "r") + + counter = 1 + for line in f: + tag = line[0] + if tag != "s" and tag != "a": + continue + + if tag == "s" and skip: + continue + + line = line.strip() + + if tag == "a": + if self.mismapThreshold is not None: + skip = False + mismap_value = self.mafMismapValue(line) + if mismap_value is None: + logging.debug("mismap probability: None") + has_mismap = False + else: + logging.debug("mismap probability: %f" % mismap_value) + if mismap_value - self.mismapThreshold > 0: + skip = True + continue + + tag, name, start, aln_size, strand, seq_size, alignment = line.split() + if chr_line: + if name != self.targetChr: + continue + chr = name + gl_alignment = self.clearGap(alignment) + ref_subseqlen = len(gl_alignment) + ref_start = int(start) + ref_seqlen = int(seq_size) + ref_seq = alignment + else: + if chr == "": + continue + if strand == "-": + ref_start = self.complementStartPosition(ref_seqlen, ref_start, ref_subseqlen) + ref_seq = self.complementSeq(ref_seq) + alignment = self.complementSeq(alignment) + read_seq = alignment.replace("t", "C") + self.extractMcContextsByOneRead(chr, strand, ref_seq.upper(), ref_start, ref_seqlen, read_seq) + counter += 1 + if counter > 500000: + self.outputMcContextData() + counter = 1 + + chr_line = not chr_line + f.close() + + self.outputMcContextData() + + if self.mismapThreshold and not has_mismap: + logging.warning("MAF file has no mis-mapping probability.") + logging.info("Process MAF file done: %s" % resultFile) + except Exception, e: + logging.fatal(resultFile) + logging.fatal(" %s" % str(type(e))) + logging.fatal(" %s" % str(e)) + if f: + f.close() + + + def extractMcContextsByOneRead(self, chr, strand, refSeq, refStart, refSeqLen, readSeq): + """ + extract mC context by one read. + """ + + logging.debug("extractMcContextsByOneRead(%s, %s, %s, %d, %d, %s)" % (chr, strand, refSeq, refStart, refSeqLen, readSeq)) + + nogap_refseq = self.clearGap(refSeq) + bases = list(refSeq) + last_pos = len(nogap_refseq) - 1 + pos = -1 + while True: + try: + pos = bases.index("C", pos + 1) + num_gaps = refSeq.count("-", 0, pos) + real_pos = pos - num_gaps + ctx_type = self.mcContextType(nogap_refseq, real_pos) + ctx_pos = refStart + real_pos + if ctx_type == None: + if strand == "+": + ctx_type = self.mcContextType(self.targetSeqD, ctx_pos) + elif strand == "-": + ctx_type = self.mcContextType(self.targetSeqC, ctx_pos) + if strand == "-": + ctx_pos = refSeqLen - ctx_pos - 1 + line = "%d\t%s\t%s\t%s\n" % (ctx_pos, strand, ctx_type, readSeq[pos]) + base_name = self.mcContextFileBaseName(ctx_pos) + if base_name not in self.mcDetectData: + self.mcDetectData[base_name] = [] + self.mcDetectData[base_name].append(line) + except IndexError: + logging.debug("extractMcContextsByOneRead#IndexError: %s %d %s %s %s %d" % (chr, ctx_pos, strand, ctx_type, readSeq, pos)) + except ValueError: + break + + + def outputMcContextData(self): + """ + output mC context data to file. + """ + + logging.info("Output mC detection data start.") + + for base_name in self.mcDetectData.keys(): + fpath = self.mcContextFilePathByName(base_name) + logging.debug("%s: %d" % (fpath, len(self.mcDetectData[base_name]))) + fo = open(fpath, "a") + fo.write("".join(self.mcDetectData[base_name])) + fo.close() + self.mcDetectData.clear() + + logging.info("Output mC detection data done.") + + + def mcContextHash(self): + h = {} + for strand in self.strands(): + h[strand] = {} + for mc_ctx in self.mcContextTypes(): + h[strand][mc_ctx] = {} + + return h + + + def isC(self, seq): + return seq[0:1].upper() == "C" + + + def isT(self, seq): + return seq[0:1].upper() == "T" + + + def calcCoverage(self, seqAry): + """ + count the number of C and T, calculate mC ratio. + """ + + num_seq = len(seqAry) + c_ary = filter(self.isC, seqAry) + t_ary = filter(self.isT, seqAry) + num_c = len(c_ary) + num_t = len(t_ary) + + if num_c + num_t == 0: + return (num_seq, 0) + else: + return (num_seq, float(num_c) / (float(num_c) + float(num_t))) + + + def mafMismapValue(self, aLine): + """ + get mismap value by maf "a" line. + """ + + mismap_fields = filter((lambda s: s.startswith('mismap=')), aLine.split()) + if len(mismap_fields) > 0: + return float(mismap_fields[0][7:]) + else: + return None + + + def mcContextFilePath(self, pos): + """ + get mC context data file path with specified position. + """ + + return self.mcContextFilePathByName(self.mcContextFileBaseName(pos)) + + + def mcContextFilePathByName(self, name): + """ + get mC context data file path with specified name. + """ + + return "%s/%s/%s.tsv" % (self.localDir, self.targetChr, name) + + + def mcContextFileBaseName(self, pos): + """ + get mC context data file name with specified chromosome number. + """ + + return "%010d" % (int(pos) / 100000) + + + def mcCotextLocalFilePath(self, chrNo): + """ + get local mC context data file path with specified chromosome number. + """ + + return "%s/%s.tsv" % (self.localDir, chrNo) + + + def mcContextGlobalFilePath(self, chrNo): + """ + get global mC context data file path with specified chromosome number. + """ + + return "%s/%s.tsv" % (self.mcContextDir, chrNo) + diff --git a/bsf-call/mapping-p.sh b/bsf-call/mapping-p.sh new file mode 100755 index 0000000..4f6d724 --- /dev/null +++ b/bsf-call/mapping-p.sh @@ -0,0 +1,103 @@ +#!/bin/sh + +date_format="%Y-%m-%d %H:%M:%S" + +echo `date +"$date_format"` $0 start. + +read_dir=$1 +result_dir=$2 +refgenome=$3 +map_probs_opt=$4 +read_name1=$5 +read_format1=$6 +lastal_opt1=$7 +read_name2=$8 +read_format2=$9 +lastal_opt2=${10} + +echo Host: `hostname` +echo Arguments: +echo " Read directory: $read_dir" +echo " Result directory: $result_dir" +echo " Reference genome: $refgenome" +echo " Options for last-pair-probs.py: $map_probs_opt" +echo " Read name (read1): $read_name1" +echo " Read format (read1): $read_format1" +echo " Options for lastal (read1): $lastal_opt1" +echo " Read name (read2): $read_name2" +echo " Read format (read2): $read_format2" +echo " Options for lastal (read2): $lastal_opt2" +echo + +echo `date +"$date_format"` gzip read files start. 1>&2 + +pushd $read_dir +echo `date +"$date_format"` gzip "$read_name1"_1."$read_format1" +gzip "$read_name1"_1."$read_format1" & +pid1=$! + +echo `date +"$date_format"` gzip "$read_name2"_2."$read_format2" +gzip "$read_name2"_2."$read_format2" & +pid2=$! +popd + +wait $pid1 +wait $pid2 + +echo `date +"$date_format"` gzip read files done. 1>&2 +echo `date +"$date_format"` gzip read files done. + +echo `date +"$date_format"` 'lastal (1st read, forward) start.' 1>&2 +echo `date +"$date_format"` zcat $read_dir/"$read_name1"_1."$read_format1".gz '|' lastal -p bisulfite_f.mat -s1 $lastal_opt1 $refgenome.f '- >' $result_dir/"$read_name1"_1_f.maf +zcat $read_dir/"$read_name1"_1."$read_format1".gz | lastal -p bisulfite_f.mat -s1 $lastal_opt1 $refgenome.f - > $result_dir/"$read_name1"_1_f.maf +echo `date +"$date_format"` 'lastal (1st read, forward) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'lastal (1st read, reverse) start.' 1>&2 +echo `date +"$date_format"` zcat $read_dir/"$read_name1"_1."$read_format1".gz '|' lastal -p bisulfite_r.mat -s0 $lastal_opt1 $refgenome.r '- >' $result_dir/"$read_name1"_1_r.maf +zcat $read_dir/"$read_name1"_1."$read_format1".gz | lastal -p bisulfite_r.mat -s0 $lastal_opt1 $refgenome.r - > $result_dir/"$read_name1"_1_r.maf +echo `date +"$date_format"` 'lastal (1st read, reverse) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'last-merge-batches.py (1st read) start.' 1>&2 +echo `date +"$date_format"` last-merge-batches.py $result_dir/"$read_name1"_1_f.maf $result_dir/"$read_name1"_1_r.maf '>' $result_dir/"$read_name1"_1.maf +last-merge-batches.py $result_dir/"$read_name1"_1_f.maf $result_dir/"$read_name1"_1_r.maf > $result_dir/"$read_name1"_1.maf +echo `date +"$date_format"` 'last-merge-batches.py (1st read) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` delete temporary files start. 1>&2 +rm $result_dir/"$read_name1"_1_f.maf $result_dir/"$read_name1"_1_r.maf +echo `date +"$date_format"` delete temporary files done. exit-status: $? 1>&2 + +echo `date +"$date_format"` 'lastal (2nd read, forward) start.' 1>&2 +echo `date +"$date_format"` zcat $read_dir/"$read_name2"_2."$read_format2".gz '|' lastal -p bisulfite_f.mat -s1 $lastal_opt2 $refgenome.f '- >' $result_dir/"$read_name2"_2_f.maf +zcat $read_dir/"$read_name2"_2."$read_format2".gz | lastal -p bisulfite_f.mat -s1 $lastal_opt2 $refgenome.f - > $result_dir/"$read_name2"_2_f.maf +echo `date +"$date_format"` 'lastal (2nd read, forward) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'lastal (2nd read, reverse) start.' 1>&2 +echo `date +"$date_format"` zcat $read_dir/"$read_name2"_2."$read_format2".gz '|' lastal -p bisulfite_r.mat -s0 $lastal_opt2 $refgenome.r '- >' $result_dir/"$read_name2"_2_r.maf +zcat $read_dir/"$read_name2"_2."$read_format2".gz | lastal -p bisulfite_r.mat -s0 $lastal_opt2 $refgenome.r - > $result_dir/"$read_name2"_2_r.maf +echo `date +"$date_format"` 'lastal (2nd read, reverse) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'last-merge-batches.py (2nd read) start.' 1>&2 +echo `date +"$date_format"` last-merge-batches.py $result_dir/"$read_name2"_2_f.maf $result_dir/"$read_name2"_2_r.maf '>' $result_dir/"$read_name2"_2.maf +last-merge-batches.py $result_dir/"$read_name2"_2_f.maf $result_dir/"$read_name2"_2_r.maf > $result_dir/"$read_name2"_2.maf +echo `date +"$date_format"` 'last-merge-batches.py (2nd read) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` delete temporary files start. 1>&2 +rm $result_dir/"$read_name2"_2_f.maf $result_dir/"$read_name2"_2_r.maf +echo `date +"$date_format"` delete temporary files done. exit-status: $? 1>&2 + +echo `date +"$date_format"` last-pair-probs.py start. 1>&2 +echo `date +"$date_format"` last-pair-probs.py $map_probs_opt $result_dir/"$read_name1"_1.maf $result_dir/"$read_name2"_2.maf '>' $result_dir/"$read_name1".maf +last-pair-probs.py $map_probs_opt $result_dir/"$read_name1"_1.maf $result_dir/"$read_name2"_2.maf > $result_dir/"$read_name1".maf +echo `date +"$date_format"` last-pair-probs.py done. exit-status: $? 1>&2 + +echo `date +"$date_format"` delete temporary files start. 1>&2 +rm $result_dir/"$read_name1"_1.maf $result_dir/"$read_name2"_2.maf +echo `date +"$date_format"` delete temporary files done. exit-status: $? 1>&2 + +echo `date +"$date_format"` 'gzip result file ('$result_dir/"$read_name1".maf') start.' 1>&2 +gzip $result_dir/"$read_name1".maf +echo `date +"$date_format"` gzip result file done. exit-status: $? 1>&2 + +echo `date +"$date_format"` $0 done. + +exit 0 diff --git a/bsf-call/mapping-s.sh b/bsf-call/mapping-s.sh new file mode 100755 index 0000000..496e23b --- /dev/null +++ b/bsf-call/mapping-s.sh @@ -0,0 +1,65 @@ +#!/bin/sh + +date_format="%Y-%m-%d %H:%M:%S" + +echo `date +"$date_format"` $0 start. + +read_dir=$1 +result_dir=$2 +refgenome=$3 +map_probs_opt=$4 +read_name=$5 +read_format=$6 +lastal_opt=$7 + +echo Host: `hostname` +echo Arguments: +echo " Read directory: $read_dir" +echo " Result directory: $result_dir" +echo " Reference genome: $refgenome" +echo " Options for last-map-probs.py: $map_probs_opt" +echo " Read name: $read_name" +echo " Read format: $read_format" +echo " Options for lastal: $lastal_opt" +echo + +echo `date +"$date_format"` gzip read file start. 1>&2 +pushd $read_dir +echo `date +"$date_format"` gzip "$read_name"_1."$read_format"; gzip "$read_name"_1."$read_format" +popd +echo `date +"$date_format"` gzip read file done. 1>&2 + + +echo `date +"$date_format"` 'lastal (forward) start.' 1>&2 +echo `date +"$date_format"` zcat "$read_dir/$read_name"_1."$read_format".gz '|' lastal -p bisulfite_f.mat -s1 $lastal_opt "$refgenome".f '- >' "$result_dir/$read_name"_1_f.maf +zcat "$read_dir/$read_name"_1."$read_format".gz | lastal -p bisulfite_f.mat -s1 $lastal_opt "$refgenome".f - > "$result_dir/$read_name"_1_f.maf +echo `date +"$date_format"` 'lastal (forward) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'lastal (reverse) start.' 1>&2 +echo `date +"$date_format"` zcat "$read_dir/$read_name"_1."$read_format".gz '|' lastal -p bisulfite_r.mat -s0 $lastal_opt "$refgenome".r '- >' "$result_dir/$read_name"_1_r.maf +zcat "$read_dir/$read_name"_1."$read_format".gz | lastal -p bisulfite_r.mat -s0 $lastal_opt "$refgenome".r - > "$result_dir/$read_name"_1_r.maf +echo `date +"$date_format"` 'lastal (reverse) done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` 'last-merge-batches.py start.' 1>&2 +echo `date +"$date_format"` last-merge-batches.py "$result_dir/$read_name"_1_f.maf "$result_dir/$read_name"_1_r.maf '>' "$result_dir/$read_name"_1.maf +last-merge-batches.py "$result_dir/$read_name"_1_f.maf "$result_dir/$read_name"_1_r.maf > "$result_dir/$read_name"_1.maf +echo `date +"$date_format"` 'last-merge-batches.py done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` delete temporary files start. 1>&2 +rm "$result_dir/$read_name"_1_f.maf "$result_dir/$read_name"_1_r.maf +echo `date +"$date_format"` delete temporary files done. exit-status: $? 1>&2 + +echo `date +"$date_format"` 'last-map-probs.py start.' 1>&2 +echo `date +"$date_format"` last-map-probs.py $map_probs_opt "$result_dir/$read_name"_1.maf '>' "$result_dir/$read_name".maf +last-map-probs.py $map_probs_opt "$result_dir/$read_name"_1.maf > "$result_dir/$read_name".maf +echo `date +"$date_format"` 'last-map-probs.py done.' exit-status: $? 1>&2 + +echo `date +"$date_format"` delete temporary files start. 1>&2 +rm "$result_dir/$read_name"_1.maf +echo `date +"$date_format"` delete temporary files done. exit-status: $? 1>&2 + +echo `date +"$date_format"` 'gzip result file ('$result_dir/"$read_name".maf') start.' 1>&2 +gzip "$result_dir/$read_name".maf +echo `date +"$date_format"` gzip result file done. exit-status: $? 1>&2 + +echo `date +"$date_format"` $0 done. diff --git a/bsf-call/mc-detector.py b/bsf-call/mc-detector.py new file mode 100755 index 0000000..3a0f15a --- /dev/null +++ b/bsf-call/mc-detector.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +import sys +import os +import logging +import subprocess +import bsfcall + +__version__ = "0.1" + +refgenome = sys.argv[1] +mapping_dirs = sys.argv[2].split(',') +work_dir = sys.argv[3] +chr_no = sys.argv[4] +if sys.argv[5] == "True": + only_mcdetection = True +else: + only_mcdetection = False + +try: + lower_bound = float(sys.argv[6]) +except Exception: + lower_bound = 0.01 + +try: + coverage_threshold = int(sys.argv[7]) +except Exception: + coverage_threshold = 5 + +try: + mismap = float(sys.argv[8]) +except Exception: + mismap = 0.01 + +try: + local_dir = (sys.argv[9]) +except Exception: + local_dir = None + +if not os.path.exists(work_dir): + os.makedirs(work_dir) + +if local_dir and not os.path.exists(local_dir): + os.makedirs(local_dir) + + +log_level = logging.INFO +log_file = "%s/mc-detector-%s.log" % (work_dir, chr_no) +file_logger = logging.FileHandler(filename=log_file) + +file_logger.setLevel(log_level) +file_logger.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) + +logging.getLogger().addHandler(file_logger) +logging.getLogger().setLevel(log_level) + +hostname = subprocess.check_output(['hostname']) + +logging.info("%s start." % sys.argv[0]) +logging.info("Host: %s" % hostname.strip()) +logging.info("Arguments:") +logging.info(" Reference genome: %s" % refgenome) +logging.info(" Mapping result directory: %s" % mapping_dirs) +logging.info(" Only mC detection: %s" % str(only_mcdetection)) +logging.info(" Working directory: %s" % work_dir) +logging.info(" Chromosome: %s" % chr_no) +logging.info(" Threshold of read coverate: %d" % coverage_threshold) +logging.info(" Threshold of mC ratio: %s" % str(lower_bound)) +logging.info(" Threshold of the mismap probability at filtering: %s" % str(mismap)) +logging.info(" Local directory: %s" % local_dir) + +options = {} +options["only_mcdetection"] = only_mcdetection +options["lower_bound"] = lower_bound +options["coverage"] = coverage_threshold +options["aln_mismap_prob_thres"] = mismap +options["local_dir"] = local_dir + +mc_detector = bsfcall.McDetector(refgenome, mapping_dirs, work_dir, options) +mc_detector.processOneChr(chr_no) + +logging.info("%s done." % sys.argv[0]) + +sys.exit(0) diff --git a/bsf-diff/src/__init__.pyc b/bsf-diff/src/__init__.pyc deleted file mode 100644 index cc6d1688c691218c0c2245e87d0f224877a34082..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 135 zcmd1(#LE>MbTS~B0SXv_v;zv@#i>QbF}cOX w$w~P!<@rU~#RZAUsWC~#Y4ItUX=w;Xe0*kJW=VX!UO{CE$W(ifp~WBq0KFm|v;Y7A diff --git a/bsf-diff/src/bsf_diff.pyc b/bsf-diff/src/bsf_diff.pyc deleted file mode 100644 index c74a3118ab6084cd741e4e1d6d4dc48507040cb7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10064 zcmdT~O>7*;a;{nai7Qc}EKw9K%Nk2wYiV(p68*1;`Y}oU#1gU-j&2Ny#rjZjrdP z)!o(A-PP4qUDaFvuc7=u{PTZ4u1NMThwt0?n0FkJP-Gdwt7!XDY^m0F)H_o2`$8~sunkmUiX9g*cDsOpoAL8;|sIgdiWY#f!^Fpp7Lt|y_%=WhpFz*^T`{@XH9%WnkrD6ho~ zHl|&Tqgtr-%`ix#@VfiNBKi3u1SIAN09(?y9@&~>sClGo ze;l+hjUu(r^a9+AAcTB|WR1#%b&Kn*CSiB$wlGYB zc&!mv<4Vw|M2nG0tEaA(uU$R(@0t!tp_TaeLChgze569G{h0x7xbu6M=zRdtK`|U0 zmjxYgUKf&bUZyjz^f-!~xL<80b-xR3c-1JGdcpm8v*>j;@3v=0{gT0A=3qk)4mZq}i3S z>^rhkU}4XZvDuX|+0V&d4-ft)paZs-6TeqB-_a&n1i(t- zpyoG{R8^qf!a=E|DcdXSkVVR#VZD|1+iyP(i=3TD3XhfE00HY>3d*Chq(cBWDcD>e z9ME$D6^5b~r{5Wne&;AYd8a~Nr|~(8k1qS4(Et12PcSWqimdA`EO_csO;g- zk>>-F42ZM2D>N&pK=s&uUUvS~VV@H$gU{^BKRWv`ha&qY6dsYiHzYnS$sMTP05|2i zNb)9Fp!y7{h>WsfbpMEK-gB}Rt!THRrE&AM3_y0a!-Qn8kFT;minU#%bA= zrKQG4(wkn9=XpuS#c}qrkhCOmS@rH1l@Xe#Rg;40b2SZAe9d#lIb2n9^EFBfMIM0p zl%{FhJO;evwoSx!cNSJTYDJ9{$FH{QK7=3Htlfjg7bn_H>6l%R$_NiA911y>UVt;)JPgE6DlgnQ?~ z!s1VO8^Z&6$t{??PTni0D~8)5_46cdC=#^RURZOMc{DX*jbvLZP1>r`&8f{_gD>;dal*5=bS$k>BS@wj z(XO#(jxy0cY8Ta_)nb#Tbd+mR>L*p-T1<8$6P?R^0));+*_t$Hr8)~OMvYK8#>Q{f zw%`-e7VZ>E4!fCDV2d#T2euu1fUyrp=#0G{r=7{DEw*-@Bl*q5q%^D;fEHM}_;JJ6 z>2bfnO^N87G9p&BNwU5#=gyY4Hwc>8o=|05>+Dm@=xovxVA!5n9Qrz24w%(>^foCg zly@BJr0W&Gg)Q^19Vktu^b2S+X&6Q?(0Z^urFbKA_LKVdp z?ZlaPt0g8Xa)BPT+xj#5CeJ0HK)m+|#tG=!ytfHT1Y-p1{N5!-zvg{JaGrq6@+biB z96=d?{VlMrkpt-#$6POE@BwPfw<^#6!6k~;;uLG`j{oiB33&MD^%>{NyF;RHFq8GOtX?9=zIv-=-HU#pP> zeJ#v5+~rGOYYUmLwS|tag-vYx+IGuN`P%R8Ev6;=y|NSQEvA>Zm=$$9wl^FMd^jvQ zAS^E65KZkUvEy2XzX1+628c@m;2R3z9)O=#I3m@fnSK8yY?L5=-o?avWf0}lWr|5TBc0=u_4G;O38TP@$Mx6#euXoD- z-LBK{d-RE(o-;imy#9Oi7WTyP=I-gC93e*2*LBG(uy~n3%?A&lHJRR3R?=^HQv^Ir zY9*>kEreMP8nDpR`lkOPsBJ~whosUI`!O-z>%C72J_9J~0oJi~qG}72SdYB6a`J}{ zHI>1-Gup@vl*~ndLkGHZ@vZc94kfP$Z{kDY=%F0NAUKW?Y@8F=G~8Km2Oh&Q;{_AP zvrobv8qhQQgA(FGgFD-xRC6ff1oz24V(@+*;x)bd;VN#KkO7VguJq64`4PRCW!wkk z!oKJ&clWzoI!yhiF~F$+E1X5#m!$`X78h$AHe90+=_9~LXpf*C;XI;v1n=P~Ns$mh zHedwzq&_F31`DJEkO%-NvH&2(0RtW4e~`Wf^U)zaFCo$ZpuV8gxaxuW9i_Vnmc4D3 zm%51FRic6ty$5C;qRU<57L{mR$HhGm<6h|=cS1=^I%ru*algY9uPW)aZqg|UKj>oo zp;CXO)E_G~7W8uMpL9+1QziXONv|vE?{zc2A>niv^-ZO|rF6Hu*M6sK^v{)OMp?c_ zoU;U&B(@akOecac6&(%(Nw`f{JX=py}6Nguv;4lu@=mw5)IMW}jImKr{8 z*rRHN7kI-y3oAUvkORsY&dX_Bxv7)qBz}vv;UgJM5quzyabz@nETi~#UQ$2-zp{0q zCkuKPPv4g0?-4gGAO#RE%Dp{U_zT3JSllK4GmF1Ne97V$iGOYJOT@pi_`AfHEnXo0 z)Z*_Ee`fK^#8)g{B>t_%$BFwEf1mhw7QX_VF(sF-F(hXH9QNZJLLeNm33WAGB~%hY zE5i(YxDp%&9Nr#DZlSle5oNIrAUl>6yQC8aS&K%(udrkHNsfehtx9@TVkkQ5py3r2 zHv>qNKV*_|Jr!+@tX&B19;7sB}ZJQ&GAh7H3zUDBUU9 zA86aSn}T2wZOxCn=N>V}!{SE~?dsQnRC}yTKd_R`f9rMdSz?HJOVZm6^}q~&&gO4= zmv%}zxCWWCD;L_6#JwRW@#m7xFm8iLf53HM^GV+lt7ldGJ7;a+IowV?%xU%Julkli zzmM=8+i_8nGnn-~iSK7+42(A2+(Y8}%%QllnOt!)agE5%TUkT4(hFcQyV9=&kP~bb zhJz8--A^uJxGyDs$ORCYm%S0$!$BL9A7J|P(pZD0j7a>5(+#VtS0AD3^uembd>+5T z=fa_pzB){idQ2)%(tD$lOoDttfT1p2)1(x451Z)8D4i1*aETE*=BfIeE-LN%>u5p6 z9g+T}&3}i$V=C!$MAhn8c|Aa&TSvU)))2e7iRLOzSp!ndsJ%cE(J&EJFq(8K6p(vc zLx73E)r=dmu{BS#zALUx%3br3rn~lZqN_ADQ7)I;?NbNZwUo6#wO!_wAnUJ%sTK2E zopF}`wb|cJn%FJAk_aBplE(k1uP<$aICQrfh=1L&d3!Yw}ULf;{oP-9YTPDD@ir2`lOu0(DWHh5( zDw%qcr0W`lAvMj8dSgzLR^+FxEgK`FcWUD_OU*cKJ*qVg11{uRUCYINicEH}RYQim zkYOu2r>vkj;`-nP64s2$Z!BVjy}LopL^lmDZ0-4!w&O;KbN~lRcbgHc35;AL)r_Sb z%(oOyaz z!lqOB2ys*MIi=ECTGf@}+uXrG_n63^=&)?Zqf9w%$5=7Alli^HyKa%R?jq6?23ad6 zqcGIC9gFRny0K$X@}-h8D=~ke@w9n_+K0i8^6Van6ioYb_U={_*ZC-@yExU8C}WQr zSq-GmD7oxLJI!(T$_IEHh<^*!)02JU>6OyTt%Krjs)|-?NsxNe7~Jq?|G1>re;f!r zt*j$9&|xd)uG{)$!7nl7+Zss*c2O(m$9nPs;C=C)*_ zX22v3%r0qM><@I5`419avh&{E`OpyGR4#L^okvAv-8B)Lumv;hac0f0GT5V$% zoKLIuY@Zg}->UqyVF0drXGdypea3_nu^Tq6r?Rsdwg?efp-h`wpgdq_+sS9Q2-l#S z6uPUsZYiC>dhP7TVUHryxSZPU)47x<(|XfaZ&UcDP%ngq`-O*v1&>D1Fso2Nl6$>t z=haYR2W-9FcD;`rWRAoe7%&^cck2>OU#Sq{ubl3rh6xF+N1CClnr90y)UjFbyfe%f zpP*7tJyOB+cg+AQLX)x@n1~kJ(+mh%YATy|q*hS3*>-YlSF}Apwchu)?EA6rXYZP} z)9O}?e2YFLGpTy0EA!j*cg*>;n`%j8jkgC)71{JdT#$NJEKmoSo%{UBJ#P-x-WLQ93FZl00{SEGF~L^^3j~V<+$`^B1WN$A z{w740`g+yU43!<+P>&Ra$Ej?G9#!RK;!InhnPF__Q{N!QtJh-ZecsX_`=8X*WJItu z@L+gM;mNOOa^}lD<)h0ouwSWOv(%K~)<)7s_IjQfH+?-<=4$1P_daNp+d}8P-9wdq zf8N}RwxY$rYF|n%L&CIJdCqQHeZW13cdCip=uaROffZwh49#sst+ps{}tMK5=yW{f@x=Q>fQ7#;hG z|HI(MM~aiV%g!jEI7gi^JY?u7a*R(PfTu9Vh%=faKAOveiq9O!@y&;jGdW8&f>xF< z=HA2S?OYxYCX>$lz0#vv;QL&i?-ws(X{uztAL4z@x5n=q)anURVi*BcCNE+&pKw$~ zsBhknEaeIG9P#jYFWPn1RKfZM-Y-~3cf{WZ?ba%49pH6UsrieD_Rn2>-Z;HKYLoju z?r$)v*2rci7iAi3Oqq{GzV|H(UbEdrwd~2fj`rAlboarRb9$BzX`UKN?~mEbGNbJW zXHF8L;4~B`Qcps)HN{bm^a}#DFMm!;K;F`^Wm%)pF!ui^FIc}xGDeu diff --git a/bsf-diff/src/bsfdiffexception.pyc b/bsf-diff/src/bsfdiffexception.pyc deleted file mode 100644 index c6c8c9c279db3cbf307db1c1cc4c0b018e93c63e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 937 zcmb_aO>fgc5FOi)Qlm~XwwleN5)wvl=(Kcs(< z6Z`;X)=d!iSeobA-T8QLcC+6{`SGtmUn_Y0GX8&H3yPrzxMWNK6KE2sQ}_vR5Af|W zg*tqcK|w$Y+gvc^N946-D~647?UXu3|&)7NWnHhIAL$n`Xx?mXzi7`F_RnL zUxm(8IN8qjKSzd*TP5ReL(jN`)^=q{Yw?4Sgrt&AVzS)vC8iAUBZ2#buW{fj0hBQW z>IZ(ZU&{7=nI3ROt+|WV&!yc!G(M(B6)UquttpRzOJA=YFH7N8%0U0oiQXQjS8`Pt qu?4xIQF@r27t)RfuC=e2&LMB?9#72JKan_q_k-k<_*!H-qx3HweBqD) diff --git a/bsf-diff/src/gene.pyc b/bsf-diff/src/gene.pyc deleted file mode 100644 index 43708ec05cf442b62b1b35d27d6f9c25099aca07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3362 zcmb_f&2k$>5bjyYvL(xw{~f0|ETIgl!ev9aP>BP?HlcFBm0G9Dkcw;B?2fFJS6X>z zl-QuX#CeNcxa27~@(#QOC%D7cy|SE2!6k{TY4yzX%yjpB-90Y-HBJGR>03v5u^P5b9reB8F~&Cfpn!re`~{eca5O^`r1P156YY-e82i zhv{s?y?g)Q;ih{xNnBZL+zb-gQyp#A;(zdIDrjowKz2-P56XUf7V19lYRee>PWSZK zbu7ArX0C&1xDI|JHXlxd51A>EchGb&9accNac39J+y@yd8WQGpUPbQfZ%bMUwh`9-FtDScB zR&3hwN$@P#dX{DkD1zqj?mXGuOZW;{|biF#d9>?32 z3c|hRJ$US2*2VD26{lLWygA3fF5)4GdpZj-ARm^MVE71({-0n#T6uw~GDihFa(s*9 zsNe%)xj#fj#K`82_lvM(EC`aLIJEE4;o`V!*jx-{poOduH`a2OXP6*d$JjclP_}W2 zGZ|m&0Beda6sg`KbH@b&L1ljXHFJ2|B8=w2A(V-NejkXFWFat;aARw)F7qw#QDQxa zq7QJ2;iE_`sO!$y#UE6VyNe3`uVv+WaJZcHc%)4dq-WBBfF!3 zmS`2j!wLag7szQS2W3VW4hFhVg_l-A+{abQCn=wz988V!Y0B%A&rm)~d4p@gu*R$M zIpL>;pQp1n4I8p&g-7;|4k=%dy0Rc=I8t?fr_)!|127h@lz=Laq3|N*OEg^M($Dqj z6%CiDgJ-}LSAI5dnb+cbIGm*6GOePhaoqf(xVjZXYx#<>E7X~ylM-b+%2xTk0eN`f zqD0S?96v`x$kgbBC_7R^>H0O9!uT`|=Vj`p0s~2ZnoeJ;(@nDV(NVkvQ!{h{e~i!H zhRER|{0YZ*Fo)py;p{^e=*)6&?+8||a)ItNXt>IbQK!GDLkt`=Ibe#ao8cyrxflV4 zNj2tVB?bAmlu8-B0USW!HWPiyN!(3a?z(Yfe0yQqGqJIFskqN#+jjezej29(<0i0ii+I9l|z4j~kXegjK%igm?eJ@81qr>PuBp0ObLP}6B$m;^ zc}JcP-Z-OL#%`Dd#=tY}H1fVg|2^eZ&Z+BYlFKX3{u|pCuSvcc-;dG|?-<51o`d|) pfxdiidl(~uqCnl_P2bq)ZR;T;WV?7ZeZ{wJc%Y)@oW;gs<6mzly59f* diff --git a/bsf-diff/src/methylc.pyc b/bsf-diff/src/methylc.pyc deleted file mode 100644 index 28a46e9de138ba6e39a0f437f378ea636d94ea9e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12799 zcmc&*O>7)Tdaa%r4ml%Aq(n&+cjZk-{u|kpWII2O1KyC?=OD?_y$t}n=w_I}zu;&GW9D?MKLxAMHSKU2BN#27w z#)|A>RdscB)%Shx`|DEqzmC>>|MB0S1Sb1e#@`QcB{7Q7m^GA+2}`Eum^D=JTQ6gt~**IavgtI1`GG7?;)R@oK(5{;Bs0pWwt|=4FnDCehYeg4^)J-^N!Xri3G+(mX znCH9y21R`%>G)x{)4A-fM4e6)M}xH6?)4tKx1)6Haj%_5q3cDP-QfTO(kMt-yE`2A z+z*Gbd%c_N^>((pY2>c7({{Z%+<6>#H@8yvowwfpHMcqJ@9d>*KGW`D@US=Be2nRm zs2vAeZWAwy;$`>Rt<}31-D|y`t7j88#)$Y~6ke=%{}2CK4F^k2x#;UjX9cUhu^&V` zoHivTS8jU+WjXRXuH-n12$TgW4Y$8mGTy2t3Xb3wuM(~%u4EI%j~w$OV|-Lfs-Rg` z1pn5r&U37Min$`s-Rg{rN;Mj<|C}w6|@?&cw;(+BG`(D{j|nar0qEM zcZNwijaD=WRj=w!+#ZA))DQd^Jn`Q?(0V83c5p9A!!UY?y9e9$erMPV{a~v-*o^dw z9sgmww-==qJlh%$qLl9n`?22*8+=|jlBn15NGOk#_Bht#x;+l|xKeM11$W@pS@6*Y zE?XqOf!d|5VL!UmO}4t*?MLlPkB0GfveOQtOS$xoDt(DvaB=5x%6WaiJLsmqF9MzB ztWNC(KMgJ}dtlJ+N4}qOExz9$hI>6Uy*V^Tt5WUjFU_Nrad`CHX79_qs`%9z!c*h~M|0>l)ZDZa$MGbqHi0pB)kW5VR)e z7I7t%IOELXC*U$Eg=BjZSMpO7*yIEJ0=SNO>KKRZlG>%BT~@nn+m3kvC}T*)Y*$SD zF%kKsVg^+MD4+tAU=Fb9X~`tLTn)^a!FTj7KpYR2P5cuBG=hnE=^r{~AdS8%k!(q6 z`Q60tbmJuTx1bxNIC%rB9Yl{@OIts9J;|yr!R>YvH-Ku3+hI3={%r_WC-ICB+TPiL za_~-}?GX_v3qaa$KaAq`X5`yhIg>bfHwMN@tPk70#3FyIowU<5_DEI_muU#nWSP|~ z7KFZ4Ot}OM1nrdkcxFSB;ImX#I}Ees2WRn`gqt?bynaqNwNllY1N-NkLe?qo5&&u< zD*#r-CO6RGa_?@pS9EfRDLD10n?zQeNvg>T zcDjTq3qsnpQBv%>&f})EizyYGEr5L#Hvw$!SE8O<6%LR^>_r2Qw43mXPhd()+F-B0 z5yk$n<0mlv7tm{$YWb3bQ+xp$s9&DQ3_mHE{d?K6SJuC4p1`C!W;S_;wiZ}LW80Yc z{j7$U_Sq`g;Jyv9CUhc?U9M%)i%Y%|@uj8;N= z8t24l@!WvezmAG!|NixY@$YH_l8Eh+iQndCz2a>}%e~vDRclMA?j9SCvW z8P=%d1Tv@$xOCiTJC56rWxnKvW>!o1gZtW$OG$2}1kT8oWe$k<5^r&k=7EWNW|(zK zJfdc7*kB@@eb&+v%+}@pE?$>$xmKEY7E5!b{BMEiGRBC@bcrF+5cd6VXV6|*pU+qf z{w`V=P}(8P8TK7>DhE0qH^RK^zjuyJPHt5i{qJP2?`JOA3oU^K~;Q ztb89UDR%Iv+dQ&Hp10?+_Ny#jW$(NF44FxH>Vk3nEp|XUB{LDBZ?@G9T7N2@R5bD&ntA(*QU%`)zB%oj{A&> z;Qz^Y2U6DRYRZFW6LFt);t|1HCq9qc*AT5+Cmu1n!uB<&y1Lx@<5|?cc7mb*7g)>! z1AbOcGR$Y?6oY$KPMdIEl|=>vqc)^3~#IKDyi!= zb-@C$tE{f~)CFgQ;5=xFICt|Ssk7$I{O?WeS;T1GY2Cf?>0KXVh}5Equ?O2X@|7NOY{J&jGwV|5)@0Rz_a;iDE)K@e`T-#0Nn}IvMJ3v(=cPYO3#z? z8is&SR%=;`(t2lImdLNkglAor$nTM~t0AljByCmkNXz?e%;kN+g0l*<_XgX8;iG}o zcLb|PCVFI|Rf!*RB>g_F`WvH+CI@Lip*_4Sqs*LBL2#n06c7mqjQekKWi~*S8HKfl zvWFQ>QOf>;v9Qlj8k@>EF*cysd*|j0?%X_Iyw@r-<-so7fEn)@*KqCB)jpU@gS%fm7z}NM zjHjY@h#1>8-LdhC4~hp`HZG=*>)j;x=)ZJ&Lj=#%m)$|?UBTZmpN0CSAh%_V$M{IH zn-NT~9ovLLj(jn?N$F}@H9av*GqfJ9Os4zyu`sI`XU0NmDKOsQt7!)EC^OTLOBTH9 zMb{E9o9##vJW&u2VK=?)LX6@l-HVYsMo2zBrr#YrKS&|?_^3S#`(rf9U;gZW5<4%@Y>f$w~OEGNBppX|h+PEh_TrM~by7x8<2AyM$ zB8_h@b~cm8*&rhmk-3wYYdkBJzv(Z~pHM#=hxDrr#=3w_sLco=H;g z9_Pb#*jQWP(_dkp7ZVKDVC1)v!@o%EXiP?!wL*44lsKE!cLy=#N&S7yyBspH#I7_Q zB_vaee~Zbp9B%#FGYB*24Tnq9Q3=FAhJ3bl7JCr$A*YRV{$r;`Y5WfKd1lj)$w$ii9yxX@f4v$})f z;6mO%GF}$86gZ1rLe9eWHfl03@o*2>34vO1Fym?lbePN!>D3`&^FHA;-{TA% zP0wdEnyq~FB&4!PINRL?+2r_I7<|d468WZL(^_?>jA;ErueH!c`vKG2_s#6qxWDaA*ttOUeJ-Ff%fHz{PwO{VZ# z8Whm~C%hzKh78}SkO!zcXRM3NrSWRwLb_8f@;F+zp?KfO_K$5X*$kgdYY4N!OD>cg%tJv%PaH9L9DIVYFzjBKQxXLBvSCe;7p`f~iIgudK1-$aCh{{$F zhaFTQ;-m@}|iL5;$v!q#Ml$3utuQ$-oKHp#{tB407q0P{c zkc?9B#SuvxsoRm8KuX(OB!^jRxr!&53fCu0MIVYU4(lOF*ZE3R_&>w+HV}6fN@txj z$gzJ=Mm|zraFLJ9o(P8l4hfKmHd%EYwNWa;yTK-1J9#NVubm_$;xJSk=+PVUR#`B( z^M02FQ{CRDEPlWOCrOOgX0gE{U=gu^svmO>CI5#=aFObKL2?cSZSI*;tul9H?nvzj za-B2ynW`PH)oRna%KD^#+vYuQw+HxK9CKy*MM$Cjc5zWXt&qh##de( zC2FV0bu!< zCYdD5k_07Gy{jlvVl~r4f#NFf+w5j8$WMkbzv%OgLWn(D!^b9}2qm%qiODhrZjZ_o zIbJTY$-A$kz%H!zrsmV8cya<>vvsxTW{qu;3$|w&Xe;ce7p0xN9o(_5RsnmfBDi0x zcmnBs+E1GinZ#Btl}G0V{NR7^zAVj)YzFtr1@Y3JZ=zzAH1>Il;lr;}*IRxfIKqal zhq?H)gimItbE4?$8rV;Q?7jh~lb@J5d-lPVy3re=5VWpeTfMRBe{#d~@3n5-_4>Sd zj>T7z$%Cpl(u|X!ipLn~e{p33aMXF(Iq3vs#SAX4RmAuqDzq_>NQrvEbB09ICTWi7 zLSqN}hkghQ;wFVj=y3*kewz6Sq6Gp3)2FcN>C^$q;kyL@ZIm2 zL8Us1eY_U?UST0cdxy0m*708FU|ff`B82RL1X&r#&Vue*X%1@jgtK6Uigcve5FP&> zmHjU>EUm2nwZhVlzS_x0SXuv64n$eUi9kjLqW1eO;!u8~bs%hKMPE>IT3;8g^J^=H zezJSr;P*c08owl81;j6#E4wcl{NCTcz<3cp33viFKubeLCA*jt)twk`m1R!LvHzGfbdN|r?T>(pbN$VDddEa6ot41XzftJ+_5TvFu=5f@AJ2B#! zSe(W5HmyXGMeSfqxe+-5%Fm-GSChhyT}-FjgC>tckG$4g?8%b^6X7=9Mf)0m<8eOo zEICeBf#;2u!0}ncG_&~2Z}rEWUm2+sl26?KJ}L*)DAY*~r;$2gXtykLvER^eF>Fa+ z<2vgfcuAq zjQ0q0e2X+=U}&7V@)AxooW$pku{f~63G`=1spNZR|GfibT3K(5jmfQ*&Y?f7^aApg#1rSP*W)l64ORx|CnAt);!i?>jhdeG9}vgv=s&C!0W zro$a_NKUKw#~e@PIRd9Vz;5y((|bzO*~sY5#sg*R8yI|8RKjim8VLK|A#cudMrbb_ ze?T;Zd{pTabX(1-;vD6obF!pU=wl}}dKm6KWHHA=_LvTZx6k4aSP=EzPgwjE#c0W@ z9U_bwVzz(BrQ~6?j5U`KvQ!kd9IG9x)t`BlZO(8ZTJY|-4>yd? nYkLHgbvO97XD0mmYqvPJ2hHCn{pB~%e$1B^jxGGo!k_#<6#YU^oR7VzpB5Y>T~;jb8Os7eXB7$o^zd1n#voh-(xbXhndtZf{MYlh23C|=sHWzu(Im30y7L${51n;>XtyB+V&Qd{r4NUDM-F|MU z*A;}k=qVoi2E+hD;3WhH-~l@_RU%3T&{dTIW>d=4hDh!K(wff95?*euOFFAeL!zB# z9k;hCAfsoKGS@+z$ax@_ft-uPpq{BnhI!0XrQ;b0hXHEdc5|8s>HQpl(RFce(2Yw# zKr_k2?JPIe&2iW|D2pz9+HyMf!=g-0*`zvd3dEYc?H0g{&c=CWT@w?c#C`*Al^Cq^O*W_@D+09vmXdtWUlpIy0?8@=CLd38G zK|Ial7c=6pZ-zXA_^2x7Z{iTpGrS+SwkxuNwVwk$i3QwuF-!30%irZ3+_ato54~sQ zu+B-G2=E16ArD~*0mzv9_28U8=m+yySx{M^ zD(LWIbM<1nm~;4Z6uc>E|Sq1fL!DgOIpw9C@rYF z2PSM6d0w1ky-t{CJricV@U{&z{P)6>vcPn^D7%TfqpQfleN%GzcA9Bs@y4}`(U>s? zGe%x>5kzC~S(01hSQ-qDni8co(#+=GqL0+($htD?d#;~nF4EY>yhJk~aGglf$L3TE zLe_Jd&%5Qo=%IFABrclgR+^rqDa%8iwIHp@DD*k83@)=+O44BzYk^Q?x|+YE!k`1% z%bMe^<6LV{vP%rl^%BHCE#Mwiubr7ZwGk6vb?g_`Q8MK9^MQ?FVFLAOPOCrzIQ4aE zm_~#}WwwfUz^G#;m9vR z8GbiF8`aM%TEyrYKAU*#8p!FV!%B6C5-&*SW#%$s73q!qVU8Rx4Fi+0vfboytz3%S zyCff=inD+~KeWbp|12J2P|XtUo=&X92aGcjtS}N-pBnc$3qC>qU~ez0p^GLFZwb}P57w(%n#pK zyt_u-ga&$_(Doy9X0;7#;!H|q=HFEZDJ=S|dh#)*E#=F9=*xuh`i?YJV5#Kf75o6V zQbAwj2 zYkzzG;`pAtKE5$b65hIPZ-5p9hd0TvXHLTLYn^skZS9YIXt6qK8N>@|*EXq`x=q6^ z`hu%O(U+Rmq@2lo_4>uB*&lT9=F={=FlpKHC74EWsRs*VmtLh1Yb3z(UR~KyBepGz zEML&p$8UD%L*B>QQq=FW*fl8&2ujSt9LGOUxDUammT=06W0v_i_V1fm>I`SY*e#D)+zMFe56+PO4bdD)m}@x*pc2@TuZ6v+adxIC$IA uq6PK&69O-<{JmpFKOrK_NpzP)apl#M)K#0=Ll4RsIGs0R*A| diff --git a/bsf-diff/src/significance_test.pyc b/bsf-diff/src/significance_test.pyc deleted file mode 100644 index 6dd4bf9bbae682b1c448dc024bb92da84d697a2b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5347 zcmcIo-)|dP6+Sa|oY-;dG;Pv!7m63Pi$P1$Zg;h4w_VyMDcg#sWn!oZEu-noIGMzr zab_lM12#wmA)e6w1%Qxv<1bjTFFYXr0v?d~3lI+o-r@Vs^|(n3`xLCs>HT@`z2~0q zJLjhK&zah@zyIrjFQZ=x|KGruu_X}MgtR2ErE5vgmQ7nM)=SbYNv|xMWm7LpcS3p< z*{n!!QZ^?|>x6Wtq*s;AD(V#psuE1fPel$zzPnkIU`jSxH7@pRh}x5^;|5Wu^L}$R z>~z93jPuCrb`P5O!hHLn>*Zn4bi%DDi7_A#{hXzCl60H5leBp&%J#ec?I;hMt6uKa zR+9cf8f|Uo&9^UKx!PPwdj0*};~cMx!AUpSI)J(?^iqGjxq`*Q^sVO2I~(n#=FM)m zsdKYtiV^A4Fj%TZ|HiLnVW;^VMB7WZ!rbiVftO;G>6=C0t*o<(gRDLG!#-DR*_>5} zSiR@0cySN~I89DlqruFh;xy;%jc6;5I+5?ie%KDPoHwDrGUP3M*$*H>+>Irn%}YA8 zQdKH_Hl z3dA%%9&H6*b`HmgM25I*$VnMY$PU~^WVa;gWyx*XDUBMjGYp7Dp8r_}CD|#{Emj}@ zNrqEepOC>6w?C}Ppe#G%YE1^V3`BM+vTIBBE;>^8Dx`Soqy6 zCo|;RY3KzH0CF`|4@{5*`4oCGuh;K}ZtrL%R%MA|F4K;nmx5FLT&-iyG+Nj7lQ{RH zI84>(YG$=ZVc6cWTHL~#q9Xe`O3T~0k>x1cj&{9g-tw~~-Oc)*A1)Ux_M(s-!Lv&n zEcFl616ZazWUo=TU9l@x!(P`9Z5fti zSeD@gK?gtqJwQyZBs;9h^C=tXS$!<_a9VPpsZJ2WCxmrr9!>9kp;|-7E=pb?vLl&J6dMy>3za@y7I$^*ISs=VGAsS!iQ&McvgnT|DAEaoc^P0bW-ZDgtZRYSmIVJE8qxzFUV~XSG$yTstKrr3KlMmu z7?M*v-47j&TEtpE$s)!vMM?%3VCwSwhiXR2=0I4h1(x z+1|dFhHfWGU7ssBj8+m-zGU6^?ki+Y8K2LFh zf&}Vt`k3+b8p#&N7*81#mINp2>5*WVM5JJ1MgwzGK6v$LEmocy--@{lXNfED93vx; zuzAP0_AMwlR*iG+M|+K3FyQ5uSWntXx&f}ui(y{911KBa$;ooC!+3X*+y5p+$$H(c zS~K8aNNdij;49m6cHOF!=Bz5(FQTnn`ie?rYGvz&^%|Ia9<6n2(W*gyU>KXsPVlU7 zg})&$B4dN1Ut#PKS8Eyq>INB-w|`+FH(7oBGYh;8mfk@c1GtwoBdQA+1|TL^eP8z8 zA*G-SN*_@jC>&isSK0%4Ljy<>3(rV=wHTx6Q~Em{gGBAv)=w85FtnC7J{V$R

Uup!u6L&P{b}pmc;jI|jk7D;(6y)2=;S6)#Fy2Cyj%cDDasZb>j`8aXC%;Xh z9(5SwMkg|U_C8A*vKVDFqj2Fb8h@dYW#>bVVXE!iptwnKi(-Xh72<@w z=r&Y-j873Op!9i!0FpYXy=s%%eV&Ne=Er?x#104?UcuL%X4be6Qy}A(v=<3ziNua1 z>(H`rv*$-wp(pkeV9L)k2J-%rrUxxLp~F?o>*PsvB*`cc7ANtjsQ09)jy^CH&d@yJuZ!k2MXDP1t8qPh5~smL&&VXESR%<<;r^rl=7G=2@6^J*Hw zBuD-Er8A!J!AQHm#g{QHlaji#CR(l=dw7()YJ1o1CBZ)4qpsc;a>A|95V&$T^B9wM zvgCs&GoB;~oG-9PA);}BK4>_f>%KUOmQLc~>CmybiHwU@>_xjV-*~U_Zllqttef@N ztA6RaLE^iv^EEU$_)OA_2QL3g7Z!iqGBJt1J90)7cX*A;F=_umuTxk4Kb%M}Q-z|Bc5!!4og=eRGMOR-is4XYF`yf6VO6d^2;6KNmZ{zWD25CZ|sW|6gLb zZ$WaA9ne7XP|84dz%(RpOWu^{A}1o>?}U=iNZygWHJQSeS;^-mpPNiIDPg<2K*7I( zbe_6ik{7++XVIqV^^7$`U!-Mu6n$fS|ENs8$)nip7S#|NyvaPdM^#luUsX0*FWf;n z>KEQbo2gGb8`bE@7Q1~PeSGJmPoj-#e{|qeo=Hn=uF7in2;!VcZPt%Apv>6S=>FH+ zk8Vfz%QDilF0$BRe>VB;PVo<(EzkQtO6{)k^<5@j3b3$_;ckH#cnO{nT1$Tf$}}Y2 z?uJ;-W^=-AUob##mp7Aon z+;hV)-8Wrs(l=+yUd+=m7028da~HMO#C;BKtzYe%wZip_z4S0$JFM)U8>N|9I}6d2 zfkL<1YI4 zYNo4sp3Zs6{%PPZ8gtF|G|!iz!cn;d*MeK&#Og_aRZ=GdbAj{hx)*@)< z?Bc;#O=^NG3+JZ>slAtI1B)ulNYzU;!{5RCsWDfA`A`d3Okl@t63w4C$T97v_fYPl z4vm?tOBi|qBvBJ-L;#;pqlc+QU&t`44}G}*elxHv43O0ZIL&h`g1!%dSV)}SS093>cl`?M< zu22c6qWxaK2@?I#lyOVOvofBOaa)ejrbYHbvA?r(3|cZkS8h?_4PqTS26L1ba21;y zbqa_aAwNPHbi~igaZAtzk7v0?1}AXsww&kgJl-E{>a&`&D}jY?>XC)!>rD@k@%-qle( z?Z)&;GCbH-fJKHiFe~QTgQt(x2g-ravHvL@HC~;DUaY#}qfc=H(QDHzAW1Q+s+V xd_{cyMMXCzNnT}W=?sOoUo9UokH2!}!tR@c%)9kB#1{&9PPD>>aA|(&+P{mCfb0MO diff --git a/bsf-diff/src/threadpool.pyc b/bsf-diff/src/threadpool.pyc deleted file mode 100644 index 68a6326934cdba22fabf578398b7607f92ba7661..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5130 zcmcIoU2hx56`fsDlqg%aWT&mt7R4AwTP9JZqAiL7X&XCA3^+&$W92}JkY=-54#kz0 zyVTCmqzt!Hd6H4N23sDfTY^_EqyslHO`OQk+JKzm6UrH-4b zAE;hy-m%0*_BtzAF#9h)-A;h2TjDOcF(bE%+T%fGlHQ}T%rA)YE(d*r0t+8tRg@u@ z!=~IYluA=U7{0+yBF{lf^;YC>Nqq%zQ5SIz+RCi*50dl!H5Ki3+T7yLC?4cynjPjw zJCQ|?AdF7)#M*7GcOjlLRww>-*{?_0EmTags_{wE6sNJ;CH4g6AaQz<<+(mIx|i8e z;?jZXX;*TFF3i;FKu z%X+L>)`KLs#(nqlfXseSW(5Vgfi8^j67QDd!gZ(x5y_goFIKrYZU=oz3i}(>b_V6p z>|}P3og|ZFhq?g3so6QS{kWI)`-_S{JB?|3bmr&kP$6VWRcl0fkBv|r3;e;;4e4yo@Z1NW^?)VXNAR(0!TSW^zhNc){?wVONOCg7UNiDa7X1EqKf{pNohp~aTd<4els7&7>OwQve z{&&(;)qUkcVg`47E|p`rctbr4#X<28lu9H|8)|?0s~1*)T}%aX@MEA}KlZTG2IR%> zPTyDiV9LD?MX`TJvG+}QZ)sTZg#Mw+N(04VRwRzi+MDP2WqmMMYOGDFGg!&P8RTPY zbXnwQZ)@=8cnHRP!CrK>Fs6aYd)paAQr(V#PZ@JlMB#y;~lKNEBJ4i}B%${BL_EC6!fZ|RbcYLO72phERT3|Jl&D#z0oYurMpjhi}QwR4+Z6>yYR4*wX7-$TPvYSp|M zG-1o0++Gh>!WXUiA%p8; z%oCrDNXjIj^s90x1}?1`UJt-o;Ga1_7E)S1QibO3%3X)_6lCtyul3`=e z5t*pl^|W*eCP9J-!#b53m7{OTkr-d#NL{3gMC1{gK3`a(7Xg*{050WJy_%K^$S)YSXkvU=~2x7Bla zXk+GJe`=`HfvgAOg+V}qeE?s3?P~3Wu!N3ajCGaXMc_uT0go-v^nO*xAEV)1l9Vug zmgL-SEl?gYmBffM4l5@gdQ_+!lsQ9Y5B3_** zO~xW0l_>c!vPhq@2!-;McUHRQB*`3Xk&s!*GiNWn9AG8@bX7h}0--eb^ZYYZ?GI6? zR=d#(ZUq|++R0kL)_SlStTxudZb#UIM~vel85(5jJailn%icI=lS{^NuT0}OB9}{^ z6Nv(Tz~&tmn#ElfqBkCAaU`j|QlcEqf~DoYt!681Uu$o)*W0T%*V{KDW?d;B`EcmF zcc{wwaUGBzE4+CbhHcQ_EMj}K@(UKDNM(jb!ZoZW0bZK&sx z5R+gelRa5c)dT);0A(QJlbDbagM7eb?2GFpVA0`|0<`N_ObF_`&>;lS9~Cak!3()u zgK*JtTfpI^A%_r|sY%Cf5nwo+k$grutV^Lcy7kA}nkBHEFy7(DucUg!Rp`yf? z9diKBZF~Gtw zOS@n}Bu~cU@q2G(Tz!v*AD>^-ly+ah|65)*Vv`ZAh)O{VK_U~hlC+SNNivd(fO0`u zK<`9vM30LOnJ#4=dY06jl0ZriC!%gI&%!Q4h63PcY-Yt%UF1cY)Tx;p>mt|TD7;@W z$h|}UB^yVO!77-rRRJd;H47m+N|C)^6Wjt`#zdYo3VGMkdm?MfybU4luGvhk;(Kd0 zHm=|8!lCa~ zPY!||R+SXB*5L^sIW&LYfU!NAnuav@*DO8Cwl18{KyT)#BC*CZD!c?g1@QRswQ*=y WDDNP)Z3LQg17iU;5(9bBAITq|e0vH2 diff --git a/bsf-diff/test/__init__.pyc b/bsf-diff/test/__init__.pyc deleted file mode 100644 index 823342dd466177ee84c1da7e9b286239b42c9f25..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 131 zcmd1(#LKl|-pPPu1}IlIX%fK0Rp8CVP=0EUVj A+yDRo diff --git a/bsf-diff/test/testGene.pyc b/bsf-diff/test/testGene.pyc deleted file mode 100644 index 79b8b03510f05a50e20bad0e16708d9ea1b90a2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1617 zcmb_c&2G~`5T3Q;r0EYoEg*t~+8aSaB}uu!1w=sQ08wQM5>X_}jn}Q)#17uADx~&Q zo&)hDyaZRCf;))$#!k~#Jz{4!v)|0l|9Jd+X+C`XYimg5coRmlL{*sq@#G7TG#Z~sA z?-08N!Y%z`pV=WG9{HvHA?{;AahI3jprI#GlsHquB{qC2npx+mOUFb(b;`X`%$ys}Qbu`+3m_VO_5mFEMJxeW&Noo5imDXV&ZDl#^6c=q%w!p5S^g~uDp^EUdQ$4) zUfxdlgf=DOQzyCk@vm!^`DARE+qEoS>%9wF8=cP6cJgCH6?21mI*l%J9)AC`U~XT& z)oQm}ttXvM@^h6I&C11#Q)Y94aeHZ8(Hj#?t~8r3-HaLE<=#1)af?89wGFa3VBfQG z39tyq^)@l~JkSvU#zKQH!t2k6;izarL~32;a9lEIh1FY(_(+A1$Yvd1^x1S2A`z`cN2XP%alzL+jg$3d)t3C=WTDSRrH{=>HK>j_z qXcL%{p#1y5AznzVf`UDNjkj3{MZPew&#}p;xvuJ};WX|pIsX8Z0vgf) diff --git a/bsf-diff/test/testSampleData.pyc b/bsf-diff/test/testSampleData.pyc deleted file mode 100644 index ff166e2761e6fcc15b979790af9dd2f965492d32..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1257 zcmaJ=O>Yx15FPI(X;Vl9MMCPC%Oa$R9^nFN+G~VddLW8qxx3!7&F(r{I~1gHO8*-F zf?vTeV4gQ0w5o`-lks@`=K0OI`ZGBA_4fCtl-9qP|3C8QDmEF>oM_rQf~JBpLGy^p zn5Hp_ETVab$}Ua2yol+Jb94UAICkRl(<0B$zNcp43tMk#(;dwngpHi>zpB zT^Ya6zEXKn8l^&pGiB}%Im$RUjjgX#y{uAOE~`4UN}~Kub;&F8T{h16i^UT-t!&%t z{g4KiV_^ diff --git a/bsf-diff/test/testThread.pyc b/bsf-diff/test/testThread.pyc deleted file mode 100644 index f1f1eb4029adcd16dbb5691cadc486a45722bc72..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1271 zcmb_c!EVz)5S_J?v`Je)1Q7gzkx*}3xKIcZT!^AgdccZgx$%a$alCGJqY|l{!ioRm zNB9EfjZ>lq4jk~xligX*n>TM~v-o+^dH(&!U_#p`MF6{W)F6XXFCuV%%rb(7hr(s(fwb z1aaunmi}%-%4Wr#>CtmnE3-1A)XlY^2%s&EM?YNJynIs>($ylNK4R~_jzB@KLArG2 zF)tjbkUCaf9yJ4alB8r!l0@hv$--uJ!%OfSe`OyVOVNyR0nSgIj`=vhYb(q?J3krFoN7F0K3;#i{cQ zUOxu(HI{n;;*pD{fau17AY?M(GG62oo%ZQrG^H6AcPCixdk~msh(A~{p#bk%9m3Uh zC|4qJhebQpd`7-YD})o<#3ycVwJXYMp5({psqJr27J!sagI^^6y)qxltnk#@tSXXu z>Nmm1@hr>JIMx#`&1ZUo2=v8(BbxNFYT}3BPJWoYMRD9@?%edeoO~*)3%5w7>Ezta zVw=ro6S`0m!_o2Lv*)e2)W2QOpJRYx{;LXsv??r*rc24FhHZ%=(mF5!pC;8$<)>l` zeA6o(ajDlFF6vsNpjpTg|7zq4rC;a!;}*59x2QX|0ZX54%9Y;bA)Xk{;nm?z%uUUf zSUBmlXtu>}h}~?_?1+WKty`LHS>f!oXn@lbyW64x&Ysx)77cLji9Kj)@JPcN2i*M@ z4RF2@`=CVwoQGl`wP=9zt=L1==EEN&MB;-hkQt|Y9Ya8%IH}q($Hl#sT-hnj8_P<+N+&~&MkOnPCga4;9%>Mk{ zw5Bu6UiHGd05?E}4Ul0A$gtl-*#3?Y#m-uAT4}3B+7;V8Urkrj&2tc93dZFuM(NAB zf5Zb?zqr!T80Q(rieA^