From 5c567de7c7120b93ea447df60a4a5ed37e7bb85d Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 17 Jul 2024 14:17:22 +0200 Subject: [PATCH] print estimated ram --- .clang-format | 2 +- Makefile | 4 +- readme.org | 10 +- src/fastphase.cpp | 40 ++- src/io.hpp | 12 +- src/vcfpp.h | 641 ++++++++++++++++++++++++++++++---------------- 6 files changed, 461 insertions(+), 248 deletions(-) diff --git a/.clang-format b/.clang-format index 0d3a09b..e925990 100644 --- a/.clang-format +++ b/.clang-format @@ -27,7 +27,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true -ColumnLimit: 120 +ColumnLimit: 110 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false diff --git a/Makefile b/Makefile index 4f88490..32a66a3 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CXX = g++ # CXXFLAGS = -std=c++17 -Wall -O3 -g -fsanitize=address # CXXFLAGS = -std=c++17 -Wall -O3 -march=native -DNDEBUG -CXXFLAGS = -std=c++17 -Wall -O3 -mavx2 +CXXFLAGS = -std=c++17 -Wall -O3 -DNDEBUG INC = -I./src -I./inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = $(HTSDIR)/libhts.a -llzma -lbz2 -lm -lz -lpthread @@ -23,7 +23,7 @@ all: $(BINS) %.o: %.cpp ${CXX} ${CXXFLAGS} -o $@ -c $< ${INC} -$(BINS): $(OBJS) htslib +$(BINS): $(OBJS) ${CXX} ${CXXFLAGS} -o $@ $(OBJS) ${INC} $(LIBS) $(LDFLAGS) htslib: diff --git a/readme.org b/readme.org index 8d0520f..d113c8c 100644 --- a/readme.org +++ b/readme.org @@ -19,19 +19,13 @@ Firstly, the imputation model is in the spirit of [[https://www.ncbi.nlm.nih.gov * Build -Assume you have =htslib= installed in your system path. - #+begin_src shell git clone https://github.com/Zilong-Li/phaseless -make -j4 -#+end_src - -If you follow [[https://github.com/samtools/htslib][htslib]] installation guide and have it in your customized path -#+begin_src shell -make HTSLIB=/path/to/your/htslib -j4 +make -j6 #+end_src * Usage + =phaseless= owns subcommands. please use =phaseless -h= to check it out. ** Imputation diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 7877085..9ea0c8c 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -43,7 +43,14 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ R = er2R(er); } -void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_, bool nQ, bool nP, bool nF, bool nR) +void FastPhaseK2::setFlags(double tol_p, + double tol_f, + double tol_q, + bool debug_, + bool nQ, + bool nP, + bool nF, + bool nR) { alleleEmitThreshold = tol_p; clusterFreqThreshold = tol_f; @@ -182,7 +189,8 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i start = grid_chunk[ic]; nsize = nGrids; } - MyArr2D emit_grid = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); + MyArr2D emit_grid = + get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk[ic], S)); const auto [alpha, beta, cs] = forward_backwards_diploid(emit_grid, R.middleCols(start, nsize), PI.middleCols(start, nsize)); @@ -222,7 +230,8 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i alphatmp += PI.col(gg) * R(2, gg) * 1.0; // inner alpha.col(s-1).sum == 1 beta_mult_emit = emit_grid.col(g) * beta.col(g); // C2 for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1, g) = cs(g) * (PI(z1, gg) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); + ind_post_zj(z1, g) = + cs(g) * (PI(z1, gg) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); } { // sum over all samples for updates std::scoped_lock lock(mutex_it); @@ -280,30 +289,41 @@ int run_impute_main(Options & opts) std::unique_ptr genome = std::make_unique(); init_bigass(genome, opts); + cao.print(tim.date(), "parsing input -> C =", genome->C, ", N =", genome->nsamples, + ", M =", genome->nsnps, ", nchunks =", genome->nchunks, ", B =", opts.gridsize, + ", G =", genome->G, ", seed =", opts.seed); + // estimate the RAM + // input data : N x M x 3 x float + // output data : N x M x 3 x float + // model :C x M x 4 + (CxCxMx3 + C x M x 4) x threads + double ram = (double)(genome->nsamples * genome->nsnps * 6 + + genome->C * genome->chunksize * 4 * (opts.nthreads + 1) + + genome->C * genome->C * genome->chunksize * 3 * opts.nthreads) + * sizeof(MyFloat) * 8 / (1024 * 1024 * 1024); + cao.print(tim.date(), "roughly estimated RAM usage would be ", ram, " Gbs"); vector> res; FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); faith.initRecombination(genome->pos, opts.in_rfile, opts.gridsize); genome->G = faith.G; - cao.print(tim.date(), "parsing input -> C =", genome->C, ", N =", genome->nsamples, ", M =", genome->nsnps, - ", nchunks =", genome->nchunks, ", B =", opts.gridsize, ", G =", genome->G, ", seed =", opts.seed); double loglike, diff, prevlike{std::numeric_limits::lowest()}; for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { tim.clock(); - if(opts.refillHaps && it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(opts.refillHaps); + if(opts.refillHaps && it > 4 && it < opts.nimpute / 2 && it % 4 == 1) + faith.refillHaps(opts.refillHaps); faith.initIteration(); for(int i = 0; i < faith.N; i++) - res.emplace_back( - pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, it == opts.nimpute)); + res.emplace_back(pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, + it == opts.nimpute)); loglike = 0; for(auto && ll : res) loglike += ll.get(); res.clear(); // clear future and renew faith.updateIteration(); diff = it ? loglike - prevlike : NAN; prevlike = loglike; - cao.print(tim.date(), "run whole genome, iteration", it, ", likelihoods =", loglike, ", diff =", diff, ", time", - tim.reltime(), " sec"); + cao.print(tim.date(), "run whole genome, iteration", it, ", likelihoods =", loglike, ", diff =", diff, + ", time", tim.reltime(), " sec"); } // reuse Ezj for AE if(opts.eHap) diff --git a/src/io.hpp b/src/io.hpp index 7057bbd..089640e 100644 --- a/src/io.hpp +++ b/src/io.hpp @@ -390,9 +390,9 @@ inline void chunk_beagle_genotype_likelihoods(const std::unique_ptr & ge { for(j = 0; j < im; j++) { - gl[i * im * 3 + 0 * im + j] = glchunk[j][i * 3 + 0]; - gl[i * im * 3 + 1 * im + j] = glchunk[j][i * 3 + 1]; - gl[i * im * 3 + 2 * im + j] = glchunk[j][i * 3 + 2]; + gl[(size_t) i * im * 3 + 0 * im + j] = glchunk[j][i * 3 + 0]; + gl[(size_t) i * im * 3 + 1 * im + j] = glchunk[j][i * 3 + 1]; + gl[(size_t) i * im * 3 + 2 * im + j] = glchunk[j][i * 3 + 2]; } } glchunk.clear(); @@ -427,9 +427,9 @@ inline void chunk_beagle_genotype_likelihoods(const std::unique_ptr & ge { for(j = 0; j < im; j++) { - gl[i * im * 3 + 0 * im + j] = glchunk[j][i * 3 + 0]; - gl[i * im * 3 + 1 * im + j] = glchunk[j][i * 3 + 1]; - gl[i * im * 3 + 2 * im + j] = glchunk[j][i * 3 + 2]; + gl[(size_t)i * im * 3 + 0 * im + j] = glchunk[j][i * 3 + 0]; + gl[(size_t)i * im * 3 + 1 * im + j] = glchunk[j][i * 3 + 1]; + gl[(size_t)i * im * 3 + 2 * im + j] = glchunk[j][i * 3 + 2]; } } glchunk.clear(); diff --git a/src/vcfpp.h b/src/vcfpp.h index 479796d..9627cd4 100644 --- a/src/vcfpp.h +++ b/src/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.3.3 + * @version v0.4.0 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -79,7 +79,7 @@ using isScalar = typename std::enable_if::value || std::is_ bool>::type; template -using isString = typename std::enable_if::value, void>::type; +using isString = typename std::enable_if::value, bool>::type; template using isValidGT = typename std::enable_if>::value @@ -118,6 +118,15 @@ inline bool isEndWith(std::string const & s, std::string const & e) } } +// determinate the mode for read/write the compressed/uncompressed VCF/BCF +inline std::string getMode(std::string const & fname, std::string mode = "r") +{ + if(isEndWith(fname, "bcf.gz")) mode += "b"; + if(isEndWith(fname, "bcf")) mode += "bu"; + if(isEndWith(fname, "vcf.gz")) mode += "z"; + return mode; +} + // string split by separator inline std::vector split_string(const std::string & s, const std::string & separators) { @@ -139,6 +148,33 @@ inline std::vector split_string(const std::string & s, const std::s return ret; } +// deleter for htsFile +struct htsFile_close +{ + void operator()(htsFile * x) + { + if(x) hts_close(x); + } +}; + +// deleter for variant +struct variant_close +{ + void operator()(bcf1_t * x) + { + if(x) bcf_destroy(x); + } +}; + +// deleter for bcf header +struct bcf_hdr_close +{ + void operator()(bcf_hdr_t * x) + { + if(x) bcf_hdr_destroy(x); + } +}; + /** * @class BcfHeader * @brief Object represents header of the VCF, offering methods to access and modify the tags @@ -157,7 +193,11 @@ class BcfHeader public: BcfHeader() {} - ~BcfHeader() {} + ~BcfHeader() + { + if(hrec) bcf_hrec_destroy(hrec); + if(hdr) bcf_hdr_destroy(hdr); + } /** @brief stream out the header */ friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h) @@ -243,9 +283,15 @@ class BcfHeader { kstring_t s = {0, 0, NULL}; // kstring if(bcf_hdr_format(hdr, 0, &s) == 0) // append header string to s.s! append! - return std::string(s.s, s.l); + { + std::string out(s.s, s.l); + free(s.s); + return out; + } else + { throw std::runtime_error("failed to convert formatted header to string"); + } } /** @brief return all samples in a string vector */ @@ -270,7 +316,7 @@ class BcfHeader { vec.push_back(std::string(seqs[i])); } - // TODO: return uninitialized vec may be undefined. + free(seqs); return vec; } @@ -284,19 +330,26 @@ class BcfHeader int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str()); if(tag_id < 0) return 0; if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) + { return 1; + } else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) + { return 2; + } else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) + { return 3; + } else + { return 0; + } } /** @brief remove a contig tag from header */ inline void removeContig(std::string tag) const { - bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str()); } @@ -358,9 +411,9 @@ class BcfRecord friend class BcfWriter; private: - BcfHeader header; - bcf1_t * line = bcf_init(); // current bcf record - bcf_hdr_t * hdr_d; // a dup header by bcf_hdr_dup(header->hdr) + BcfHeader * header; + std::shared_ptr line = std::shared_ptr(bcf_init(), variant_close()); // variant + bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr) bcf_fmt_t * fmt = NULL; bcf_info_t * info = NULL; int32_t * gts = NULL; @@ -379,24 +432,33 @@ class BcfRecord BcfRecord() {} /// constructor with a given BcfHeader object - BcfRecord(const BcfHeader & h) : header(h) + BcfRecord(BcfHeader & h) { - nsamples = header.nSamples(); - typeOfGT.resize(nsamples); - gtPhase.resize(nsamples, 0); + initHeader(h); } - ~BcfRecord() {} + ~BcfRecord() + { + if(gts) free(gts); + if(hdr_d) bcf_hdr_destroy(hdr_d); + } - /// initilize a BcfRecord object using a given BcfHeader object - void init(const BcfHeader & h) + /// initilize the header associated with BcfRecord object by pointing to another BcfHeader object + void initHeader(BcfHeader & h) { - header = h; - nsamples = header.nSamples(); + header = &h; + if(!header->hdr) throw std::runtime_error("please initial header first\n"); + nsamples = header->nSamples(); typeOfGT.resize(nsamples); gtPhase.resize(nsamples, 0); } + /// reset the header associated with BcfRecord object by pointing to another BcfHeader object + void resetHeader(BcfHeader & h) + { + header = &h; + } + /** @brief stream out the variant */ friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v) { @@ -408,10 +470,16 @@ class BcfRecord inline std::string asString() const { kstring_t s = {0, 0, NULL}; // kstring - if(vcf_format(header.hdr, line, &s) == 0) - return std::string(s.s, s.l); + if(vcf_format(header->hdr, line.get(), &s) == 0) + { + std::string out(s.s, s.l); + free(s.s); + return out; + } else + { throw std::runtime_error("couldn't format current record into a string.\n"); + } } /** @@ -426,8 +494,14 @@ class BcfRecord isValidGT getGenotypes(T & v) { ndst = 0; - ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); - if(ret <= 0) throw std::runtime_error("genotypes not present"); + ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst); + if(ret <= 0) + { +# if defined(VERBOSE) + std::cerr << "GT not present for current site. did you initilize the variant object?\n"; +# endif + return false; + } // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as // max(nploidy) // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid @@ -446,7 +520,7 @@ class BcfRecord isGenoMissing.assign(nsamples, 0); int i, j, nphased = 0; noneMissing = true; - fmt = bcf_get_fmt(header.hdr, line, "GT"); + fmt = bcf_get_fmt(header->hdr, line.get(), "GT"); int nploidy_cur = ret / nsamples; // requires nploidy_cur <= nploidy for(i = 0; i < nsamples; i++) { @@ -473,9 +547,13 @@ class BcfRecord } } if(nphased == nsamples) + { isAllPhased = true; + } else + { isAllPhased = false; + } return true; } @@ -483,16 +561,20 @@ class BcfRecord * @brief fill in the input vector with genotype values, 0, 1 or -9 (missing). * @param v valid input is vector type * @return bool - * @note this function provides full capability to handl all kinds of genotypes in multi-ploidy data with - * the cost of more spae than the other function. missing allele is set as -9. + * @note this function provides full capability to handle all kinds of genotypes + * in multi-ploidy data costing more spae than the other function. missing allele is set as -9. * */ bool getGenotypes(std::vector & v) { ndst = 0; - ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); + ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst); if(ret <= 0) - throw std::runtime_error( - "genotypes not present. make sure you initilized the variant object first\n"); + { +# if defined(VERBOSE) + std::cerr << "GT not present for current site. did you initilize the variant object?\n"; +# endif + return false; + } v.resize(ret); isGenoMissing.assign(nsamples, 0); nploidy = ret / nsamples; @@ -524,9 +606,13 @@ class BcfRecord } } if(nphased == nsamples) + { isAllPhased = true; + } else + { isAllPhased = false; + } return true; } @@ -539,29 +625,43 @@ class BcfRecord template isFormatVector getFORMAT(std::string tag, T & v) { - fmt = bcf_get_fmt(header.hdr, line, tag.c_str()); - if(!fmt) throw std::runtime_error("there is no " + tag + " in FORMAT of this variant.\n"); + fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str()); + if(!fmt) + { + throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n"); + } nvalues = fmt->n; ndst = 0; S * dst = NULL; - int tagid = header.getFormatType(tag); + int tagid = header->getFormatType(tag); if(tagid == 1) - ret = bcf_get_format_int32(header.hdr, line, tag.c_str(), &dst, &ndst); + { + ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst); + } else if(tagid == 2) - ret = bcf_get_format_float(header.hdr, line, tag.c_str(), &dst, &ndst); + { + ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst); + } else if(tagid == 3) - ret = bcf_get_format_char(header.hdr, line, tag.c_str(), &dst, &ndst); + { + ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst); + } else - throw std::runtime_error("can not find the type of " + tag + " in the header file.\n"); + { + ret = -1; + } + if(ret >= 0) { // user have to check if there is missing in the return v; v = std::vector(dst, dst + ret); + free(dst); return true; } else { - throw std::runtime_error("failed to parse the " + tag + " format of this variant.\n"); + free(dst); + return false; } } @@ -573,14 +673,17 @@ class BcfRecord * */ bool getFORMAT(std::string tag, std::vector & v) { - fmt = bcf_get_fmt(header.hdr, line, tag.c_str()); - if(!fmt) throw std::runtime_error("there is no " + tag + " in FORMAT for this variant of ID=" + ID()); + fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str()); + if(!fmt) + { + throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n"); + } nvalues = fmt->n; // if ndst < (fmt->n+1)*nsmpl; then realloc is involved ret = -1, ndst = 0; char ** dst = NULL; - if(header.getFormatType(tag) == 3) - ret = bcf_get_format_string(header.hdr, line, tag.c_str(), &dst, &ndst); + if(header->getFormatType(tag) == 3) + ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst); if(ret > 0) { v.clear(); @@ -589,11 +692,15 @@ class BcfRecord // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. v.emplace_back(dst[i]); }; + free(dst[0]); + free(dst); return true; } else { - throw std::runtime_error("couldn't parse the " + tag + " format of this variant.\n"); + free(dst[0]); + free(dst); + return false; } } @@ -606,24 +713,33 @@ class BcfRecord template isInfoVector getINFO(std::string tag, T & v) { - info = bcf_get_info(header.hdr, line, tag.c_str()); - if(!info) throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); + info = bcf_get_info(header->hdr, line.get(), tag.c_str()); + if(!info) + { + throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n"); + } S * dst = NULL; ndst = 0; ret = -1; if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32) { - ret = bcf_get_info_int32(header.hdr, line, tag.c_str(), &dst, &ndst); + ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst); } else if(info->type == BCF_BT_FLOAT) { - ret = bcf_get_info_float(header.hdr, line, tag.c_str(), &dst, &ndst); + ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst); } if(ret >= 0) + { v = std::vector(dst, dst + ret); // user have to check if there is missing in the return v; + free(dst); + return true; + } else - throw std::runtime_error("couldn't parse the " + tag + " format of this variant.\n"); - return true; + { + free(dst); + return false; + } } /** @@ -635,8 +751,11 @@ class BcfRecord template isScalar getINFO(std::string tag, T & v) { - info = bcf_get_info(header.hdr, line, tag.c_str()); - if(!info) throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); + info = bcf_get_info(header->hdr, line.get(), tag.c_str()); + if(!info) + { + throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n"); + } // scalar value if(info->len == 1) { @@ -652,7 +771,11 @@ class BcfRecord } else { - throw std::runtime_error(tag + " has multiple values. please feed a vector instead.\n"); +# if defined(VERBOSE) + std::cerr << "there are multiple values for " + tag + + " in INFO for current site. please use vector instead\n"; +# endif + return false; } } @@ -665,12 +788,20 @@ class BcfRecord template isString getINFO(std::string tag, T & v) { - info = bcf_get_info(header.hdr, line, tag.c_str()); - if(!info) throw std::runtime_error("there is no " + tag + " tag in INFO of this variant.\n"); + info = bcf_get_info(header->hdr, line.get(), tag.c_str()); + if(!info) + { + throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n"); + } if(info->type == BCF_BT_CHAR) + { v = std::string(reinterpret_cast(info->vptr), info->vptr_len); + return true; + } else - throw std::runtime_error(tag + " has to be of string type\n"); + { + return false; + } } /** @@ -682,64 +813,91 @@ class BcfRecord template isScalar setINFO(std::string tag, const T & v) { - ret = -1; // bcf_hrec_set_val // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1); - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_info_int32(header.hdr, line, tag.c_str(), &v, 1); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) { - float v2 = static_cast(v); - ret = bcf_update_info_float(header.hdr, line, tag.c_str(), &v2, 1); + ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1); } - if(ret < 0) + else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) { - throw std::runtime_error("couldn't set " + tag - + " for this variant.\nplease add the tag in header first.\n"); + float v2 = static_cast(v); + ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1); } else { - return true; + ret = -1; + } + if(ret < 0) + { +# if defined(VERBOSE) + std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n"; +# endif + return false; } + return true; } /** * @brief set tag value for INFO * @param tag valid tag name in INFO column declared in the VCF header * @param v valid input include vector vector std::string - * @return boolean + * @return bool * */ template isValidInfo setINFO(std::string tag, const T & v) { - ret = -1; // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1); - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_info_int32(header.hdr, line, tag.c_str(), v.data(), v.size()); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) - ret = bcf_update_info_float(header.hdr, line, tag.c_str(), v.data(), v.size()); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff)) - ret = bcf_update_info_string(header.hdr, line, tag.c_str(), v.data()); - if(ret < 0) - throw std::runtime_error("couldn't set " + tag - + " for this variant.\nplease add the tag in header first.\n"); + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) + { + ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size()); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) + { + ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size()); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff)) + { + ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data()); + } else - return true; + { + ret = -1; + } + + if(ret < 0) + { +# if defined(VERBOSE) + std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n"; +# endif + return false; + } + return true; } /// remove the given tag from INFO of the variant void removeINFO(std::string tag) { - ret = -1; - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_info_int32(header.hdr, line, tag.c_str(), NULL, 0); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) - ret = bcf_update_info_float(header.hdr, line, tag.c_str(), NULL, 0); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff)) - ret = bcf_update_info_string(header.hdr, line, tag.c_str(), NULL); + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff)) + { + ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff)) + { + ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff)) + { + ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL); + } + else + { + ret = -1; + } + if(ret < 0) { throw std::runtime_error("couldn't remove " + tag + " for this variant.\n"); @@ -756,24 +914,36 @@ class BcfRecord // bcf_gt_type int i, j, k; nploidy = v.size() / nsamples; - gts = (int32_t *)malloc(v.size() * sizeof(int32_t)); + int32_t * gt = (int32_t *)malloc(v.size() * sizeof(int32_t)); for(i = 0; i < nsamples; i++) { for(j = 0; j < nploidy; j++) { k = i * nploidy + j; if(v[k] == -9 || v[k] == bcf_int32_missing) - gts[k] = bcf_gt_missing; + { + gt[k] = bcf_gt_missing; + } else if(gtPhase[i]) - gts[k] = bcf_gt_phased(v[k]); + { + gt[k] = bcf_gt_phased(v[k]); + } else - gts[k] = bcf_gt_unphased(v[k]); + { + gt[k] = bcf_gt_unphased(v[k]); + } } } - if(bcf_update_genotypes(header.hdr, line, gts, v.size()) < 0) - throw std::runtime_error("couldn't set genotypes correctly.\n"); - else - return true; + if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0) + { + free(gt); +# if defined(VERBOSE) + std::cerr << "couldn't set genotypes correctly.\n"; +# endif + return false; + } + free(gt); + return true; } /** @@ -790,13 +960,20 @@ class BcfRecord void removeFORMAT(std::string tag) { ret = -1; - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_format_int32(header.hdr, line, tag.c_str(), NULL, 0); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) - ret = bcf_update_format_char(header.hdr, line, tag.c_str(), NULL, 0); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) - ret = bcf_update_format_float(header.hdr, line, tag.c_str(), NULL, 0); + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) + { + ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) + { + ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) + { + ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0); + } + if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n"); } @@ -809,15 +986,31 @@ class BcfRecord template isValidFMT setFORMAT(std::string tag, const T & v) { - ret = -1; - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_format_int32(header.hdr, line, tag.c_str(), v.data(), v.size()); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) - ret = bcf_update_format_char(header.hdr, line, tag.c_str(), v.data(), v.size()); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) - ret = bcf_update_format_float(header.hdr, line, tag.c_str(), v.data(), v.size()); - if(ret < 0) throw std::runtime_error("couldn't set format " + tag + " correctly.\n"); + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) + { + ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size()); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) + { + ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size()); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) + { + ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size()); + } + else + { + ret = -1; + } + + if(ret < 0) + { +# if defined(VERBOSE) + std::cerr << "couldn't set format " + tag + " corectly.\n"; +# endif + return false; + } return true; } @@ -826,40 +1019,53 @@ class BcfRecord * one sample in the vcf * @param tag valid tag name in FORMAT column declared in the VCF header * @param v valid input include int, float or double - * @return bool + * @return void * */ template isScalar setFORMAT(std::string tag, const T & v) { - ret = -1; float v2 = v; - int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); - if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) - ret = bcf_update_format_int32(header.hdr, line, tag.c_str(), &v, 1); - else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) - ret = bcf_update_format_float(header.hdr, line, tag.c_str(), &v2, 1); - if(ret < 0) throw std::runtime_error("couldn't set format " + tag + " correctly.\n"); + int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) + { + ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1); + } + else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) + { + ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1); + } + else + { + ret = -1; + } + if(ret < 0) + { +# if defined(VERBOSE) + std::cerr << "couldn't set format " + tag + " corectly.\n"; +# endif + return false; + } return true; } /** @brief add one variant record from given string*/ void addLineFromString(const std::string & vcfline) { - std::vector str(vcfline.begin(), vcfline.end()); - str.push_back('\0'); // don't forget string has no \0; - kstring_t s = {vcfline.length(), vcfline.length(), &str[0]}; // kstring - ret = vcf_parse(&s, header.hdr, line); + kstring_t s = {0, 0, NULL}; + kputsn(vcfline.c_str(), vcfline.length(), &s); + ret = vcf_parse1(&s, header->hdr, line.get()); + free(s.s); if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n"); if(line->errcode == BCF_ERR_CTG_UNDEF) { - std::string contig(bcf_hdr_id2name(header.hdr, line->rid)); - hdr_d = bcf_hdr_dup(header.hdr); - header.hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid); - if(header.hrec == NULL) + std::string contig(bcf_hdr_id2name(header->hdr, line->rid)); + hdr_d = bcf_hdr_dup(header->hdr); + header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid); + if(header->hrec == NULL) throw std::runtime_error("contig" + contig + " unknow and not found in the header.\n"); - ret = bcf_hdr_add_hrec(header.hdr, header.hrec); + ret = bcf_hdr_add_hrec(header->hdr, header->hrec); if(ret < 0) throw std::runtime_error("error adding contig " + contig + " to header.\n"); - ret = bcf_hdr_sync(header.hdr); + ret = bcf_hdr_sync(header->hdr); } } @@ -872,10 +1078,14 @@ class BcfRecord /** @brief return boolean value indicates if current variant is Structual Variant or not */ inline bool isSV() const { - if(bcf_get_info(header.hdr, line, "SVTYPE") == NULL) + if(bcf_get_info(header->hdr, line.get(), "SVTYPE") == NULL) + { return false; + } else + { return true; + } } /** @brief return boolean value indicates if current variant is exclusively INDEL */ @@ -933,7 +1143,7 @@ class BcfRecord */ inline bool hasSNP() const { - int type = bcf_has_variant_types(line, VCF_SNP, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -942,7 +1152,7 @@ class BcfRecord /// return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1.16) inline bool hasINDEL() const { - int type = bcf_has_variant_types(line, VCF_INDEL, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -951,7 +1161,7 @@ class BcfRecord /// return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1.16) inline bool hasINS() const { - int type = bcf_has_variant_types(line, VCF_INS, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -960,7 +1170,7 @@ class BcfRecord /// return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1.16) inline bool hasDEL() const { - int type = bcf_has_variant_types(line, VCF_DEL, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -969,7 +1179,7 @@ class BcfRecord /// return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1.16) inline bool hasMNP() const { - int type = bcf_has_variant_types(line, VCF_MNP, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -978,7 +1188,7 @@ class BcfRecord /// return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1.16) inline bool hasBND() const { - int type = bcf_has_variant_types(line, VCF_BND, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -987,7 +1197,7 @@ class BcfRecord /// return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1.16) inline bool hasOTHER() const { - int type = bcf_has_variant_types(line, VCF_OTHER, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -996,7 +1206,7 @@ class BcfRecord /// return boolean value indicates if current variant has OVERLAP type defined in vcf.h (htslib>=1.16) inline bool hasOVERLAP() const { - int type = bcf_has_variant_types(line, VCF_OVERLAP, bcf_match_overlap); + int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap); if(type < 0) throw std::runtime_error("something wrong with variant type\n"); if(type == 0) return false; return true; @@ -1005,7 +1215,7 @@ class BcfRecord /** @brief return CHROM name */ inline std::string CHROM() const { - return std::string(bcf_hdr_id2name(header.hdr, line->rid)); + return std::string(bcf_hdr_id2name(header->hdr, line->rid)); } /** @brief return ID field */ @@ -1023,7 +1233,7 @@ class BcfRecord /** @brief modify CHROM value */ inline void setCHR(const char * chr) { - line->rid = bcf_hdr_name2id(header.hdr, chr); + line->rid = bcf_hdr_name2id(header->hdr, chr); } /** @brief modify position given 1-based value */ @@ -1035,13 +1245,13 @@ class BcfRecord /** @brief update ID */ inline void setID(const char * s) { - bcf_update_id(header.hdr, line, s); + bcf_update_id(header->hdr, line.get(), s); } /** @brief set REF and ALT alleles given a string seperated by comma */ inline void setRefAlt(const char * alleles_string) { - bcf_update_alleles_str(header.hdr, line, alleles_string); + bcf_update_alleles_str(header->hdr, line.get(), alleles_string); } /** @brief return 0-base start of the variant (can be any type) */ @@ -1103,14 +1313,14 @@ class BcfRecord } else if(line->d.n_flt == 1) { - return std::string(bcf_hdr_int2id(header.hdr, BCF_DT_ID, line->d.flt[0])); + return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0])); } else { std::string s; for(int i = 1; i < line->d.n_flt; i++) { - s += std::string(bcf_hdr_int2id(header.hdr, BCF_DT_ID, line->d.flt[i])) + ","; + s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) + ","; } s.pop_back(); return s; @@ -1118,9 +1328,9 @@ class BcfRecord } /** @brief return raw INFO column as string. recommend to use getINFO for specific tag. */ - inline std::string INFO() + inline std::string allINFO() { - int32_t max_dt_id = header.hdr->n[BCF_DT_ID]; + int32_t max_dt_id = header->hdr->n[BCF_DT_ID]; kstring_t * s = (kstring_t *)calloc(1, sizeof(kstring_t)); if(line->n_info) { @@ -1131,9 +1341,9 @@ class BcfRecord if(!z->vptr) continue; if(!first) kputc(';', s); first = 0; - if(z->key < 0 || z->key >= max_dt_id || header.hdr->id[BCF_DT_ID][z->key].key == NULL) + if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL) throw std::runtime_error("Invalid BCF and wrong INFO tag"); - kputs(header.hdr->id[BCF_DT_ID][z->key].key, s); + kputs(header->hdr->id[BCF_DT_ID][z->key].key, s); if(z->len <= 0) continue; kputc('=', s); if(z->len == 1) @@ -1232,10 +1442,10 @@ class BcfRecord class BcfReader { private: - htsFile * fp = NULL; // hts file + std::shared_ptr fp; // hts file hts_idx_t * hidx = NULL; // hts index file tbx_t * tidx = NULL; // .tbi .csi index file for vcf files - hts_itr_t * itr = NULL; // hts records iterator + hts_itr_t * itr = NULL; // tabix iterator kstring_t s = {0, 0, NULL}; // kstring std::string fname; bool isBcf; // if the input file is bcf or vcf; @@ -1289,48 +1499,41 @@ class BcfReader if(!samples.empty()) setSamples(samples); } - /// return a BcfHeader object - const BcfHeader & getHeader() const - { - return header; - } - - /// open a VCF/BCF/STDIN file for streaming in - void open(const std::string & file) + ~BcfReader() { - fname = file; - fp = hts_open(file.c_str(), "r"); - header.hdr = bcf_hdr_read(fp); - nsamples = bcf_hdr_nsamples(header.hdr); - SamplesName = header.getSamples(); - } - - /// return if the file is opened successfully - bool isOpen() const - { - if(fp != NULL) - return true; - else - return false; + close(); } /// close the BcfReader object. void close() { - if(fp) hts_close(fp); + if(fp) fp.reset(); if(itr) hts_itr_destroy(itr); + if(hidx) hts_idx_destroy(hidx); + if(tidx) tbx_destroy(tidx); + if(s.s) free(s.s); } - ~BcfReader() + /// open a VCF/BCF/STDIN file for streaming in + void open(const std::string & file) { - if(fp) hts_close(fp); - if(itr) hts_itr_destroy(itr); + fname = file; + fp = std::shared_ptr(hts_open(file.c_str(), "r"), htsFile_close()); + header.hdr = bcf_hdr_read(fp.get()); + nsamples = bcf_hdr_nsamples(header.hdr); + SamplesName = header.getSamples(); } /** @brief set the number of threads to use */ inline int setThreads(int n) { - return hts_set_threads(fp, n); + return hts_set_threads(fp.get(), n); + } + + /// return a BcfHeader object + const BcfHeader & getHeader() const + { + return header; } /** @brief get the number of records of given region */ @@ -1396,26 +1599,26 @@ class BcfReader { if(isBcf) { - ret = bcf_itr_next(fp, itr, r.line); - bcf_unpack(r.line, BCF_UN_ALL); + ret = bcf_itr_next(fp.get(), itr, r.line.get()); + bcf_unpack(r.line.get(), BCF_UN_ALL); return (ret >= 0); } else { - int slen = tbx_itr_next(fp, tidx, itr, &s); + int slen = tbx_itr_next(fp.get(), tidx, itr, &s); if(slen > 0) { - ret = vcf_parse(&s, r.header.hdr, r.line); // ret > 0, error - bcf_unpack(r.line, BCF_UN_ALL); + ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error + bcf_unpack(r.line.get(), BCF_UN_ALL); } return (ret <= 0) && (slen > 0); } } else { - ret = bcf_read(fp, r.header.hdr, r.line); + ret = bcf_read(fp.get(), r.header->hdr, r.line.get()); // unpack record immediately. not lazy - bcf_unpack(r.line, BCF_UN_ALL); + bcf_unpack(r.line.get(), BCF_UN_ALL); return (ret == 0); } } @@ -1428,12 +1631,11 @@ class BcfReader class BcfWriter { private: - htsFile * fp = NULL; // hts file + std::shared_ptr fp; // hts file + std::shared_ptr b = std::shared_ptr(bcf_init(), variant_close()); // variant int ret; - bcf1_t * b = bcf_init(); - kstring_t s = {0, 0, NULL}; // kstring bool isHeaderWritten = false; - bool isClosed = false; + const BcfHeader * hp; public: /// header object initialized by initalHeader @@ -1451,6 +1653,7 @@ class BcfWriter { open(fname); initalHeader(version); + initalHeader(header); } /** @@ -1478,6 +1681,7 @@ class BcfWriter { open(fname, mode); initalHeader(version); + initalHeader(header); } /** @@ -1496,10 +1700,7 @@ class BcfWriter initalHeader(h); } - ~BcfWriter() - { - if(!isClosed) close(); - } + ~BcfWriter() {} /** * @brief Open VCF/BCF file for writing. The format is infered from file's suffix @@ -1507,11 +1708,8 @@ class BcfWriter */ void open(const std::string & fname) { - std::string mode{"w"}; - if(isEndWith(fname, "bcf.gz")) mode += "b"; - if(isEndWith(fname, "bcf")) mode += "bu"; - if(isEndWith(fname, "vcf.gz")) mode += "z"; - fp = hts_open(fname.c_str(), mode.c_str()); + auto mode = getMode(fname, "w"); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); } /** @@ -1525,16 +1723,15 @@ class BcfWriter */ void open(const std::string & fname, const std::string & mode) { - fp = hts_open(fname.c_str(), mode.c_str()); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); } /// close the BcfWriter object. void close() { if(!isHeaderWritten) writeHeader(); - if(b) bcf_destroy(b); - if(fp) hts_close(fp); // be careful of double free - isClosed = true; + if(b) b.reset(); + if(fp) fp.reset(); } /// initial a VCF header using the internal template given a specific version. VCF4.1 is the default @@ -1544,51 +1741,53 @@ class BcfWriter header.setVersion(version); } - /// initial a VCF header by refering to another vcf header + /// initial a VCF header by pointing to header of another VCF void initalHeader(const BcfHeader & h) { - header.hdr = h.hdr; // point to another header - if(header.hdr == NULL) throw std::runtime_error("couldn't copy the header from another vcf.\n"); + hp = &h; + } + + /// copy header of given VCF + void copyHeader(const std::string & vcffile) + { + htsFile * fp2 = hts_open(vcffile.c_str(), "r"); + header.hdr = bcf_hdr_read(fp2); + hts_close(fp2); + initalHeader(header); } - /// write a string to a vcf line + /// copy a string to a vcf line void writeLine(const std::string & vcfline) { - if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header out\n"); - std::vector line(vcfline.begin(), vcfline.end()); - line.push_back('\0'); // don't forget string has no \0; - s.s = &line[0]; - s.l = vcfline.length(); - s.m = vcfline.length(); - ret = vcf_parse(&s, header.hdr, b); + if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header\n"); + kstring_t s = {0, 0, NULL}; + kputsn(vcfline.c_str(), vcfline.length(), &s); + ret = vcf_parse1(&s, hp->hdr, b.get()); + free(s.s); if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n"); if(b->errcode == BCF_ERR_CTG_UNDEF) { - throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(header.hdr, b->rid) + throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid) + " not found in the header. please run header->AddContig() first.\n"); } - ret = bcf_write(fp, header.hdr, b); + ret = bcf_write(fp.get(), hp->hdr, b.get()); if(ret != 0) throw std::runtime_error("error writing: " + vcfline + "\n"); } /// streams out the header bool writeHeader() { - ret = bcf_hdr_write(fp, header.hdr); - if(ret == 0) - return isHeaderWritten = true; - else - return false; + ret = bcf_hdr_write(fp.get(), hp->hdr); + if(ret == 0) return isHeaderWritten = true; + return false; } /// streams out the given variant of BcfRecord type inline bool writeRecord(BcfRecord & v) { if(!isHeaderWritten) writeHeader(); - if(bcf_write(fp, v.header.hdr, v.line) < 0) - return false; - else - return true; + if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false; + return true; } };