From 0d95968f8f09f3f88365d4e15a0e7f8dac3d28ef Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 16:54:43 +0100 Subject: [PATCH 01/67] debug cluster likelihoods --- src/admixture.cpp | 13 ++-- src/common.hpp | 170 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 156 insertions(+), 27 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index e61e412..e023329 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -12,7 +12,6 @@ using namespace std; double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & genome) { MyArr2D kapa, Ekg; - MyArr2D alpha, beta, ae; MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); double norm = 0, llike = 0, tmp = 0; @@ -20,13 +19,9 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { const int nsnps = genome->pos[ic].size(); - const int nGrids = genome->B > 1 ? (nsnps + genome->B - 1) / genome->B : nsnps; - alpha.setZero(C * C, nGrids); - beta.setZero(C * C, nGrids); - get_cluster_probability(ind, nsnps, alpha, beta, genome->gls[ic], genome->R[ic], genome->PI[ic], - genome->F[ic]); // return gamma - ae.setZero(C * C, nGrids); - get_cluster_frequency(ae, genome->R[ic], genome->PI[ic]); + auto cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], + genome->F[ic]); + const int nGrids = cl.cols(); kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, m++) @@ -37,7 +32,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & for(tmp = 0, c2 = 0; c2 < C; c2++) { c12 = c1 * C + c2; - double xz = alpha(c12, s) * beta(c12, s) / ae(c12, s); + double xz = cl(c12, s); double zy = Hz(c1) * Hz(c2); tmp += xz * zy; } diff --git a/src/common.hpp b/src/common.hpp index ffcaaaa..45a9170 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -145,7 +145,6 @@ struct Pars String1D sampleids; }; - //****************************************************************************** // STRING UTILS //****************************************************************************** @@ -287,6 +286,55 @@ inline auto get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double mi return emitDip; } +/* +** @param gli genotype likelihoods of current individual i, (M, 3) +** @param F cluster-specific allele frequence (M, C) +** @return emission probability (M, C2) +*/ +inline auto get_emission_by_grid(const MyFloat1D & GL, + const MyFloat1D & F, + int ind, + int M, + int B, + double minEmission = 1e-10) +{ + const int C = F.size() / M; + const int C2 = C * C; + const int nGrids = B > 1 ? (M + B - 1) / B : M; + MyArr2D emitGrid = MyArr2D::Ones(C2, nGrids); + int z1, z2, z12, i, s, e, g, g1, g2; + int igs = ind * M * 3; + for(g = 0; g < nGrids; g++) + { + s = g * B; + e = g == nGrids - 1 ? M - 1 : B * (g + 1) - 1; + for(z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + for(i = s; i <= e; i++) + { + double emit = 0; + for(g1 = 0; g1 <= 1; g1++) + { + for(g2 = 0; g2 <= 1; g2++) + { + emit += GL[igs + (g1 + g2) * M + i] * (g1 * F[z1 * M + i] + (1 - g1) * (1 - F[z1 * M + i])) + * (g2 * F[z2 * M + i] + (1 - g2) * (1 - F[z2 * M + i])); + } + } + emitGrid(z12, g) *= emit; + } + } + } + // apply bounding + // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); + // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); + } + return emitGrid; +} + /* ** @param alpha forward probability, (C2,M) ** @param beta backwards probability (C2,M) @@ -363,14 +411,14 @@ inline auto forward_backwards_diploid(MyArr2D & alpha, } inline auto get_cluster_probability(int ind, - const int M, - MyArr2D & alpha, - MyArr2D & beta, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, - const double minEmission = 1e-6) + const int M, + MyArr2D & alpha, + MyArr2D & beta, + const MyFloat1D & GL, + const MyFloat1D & R, + const MyFloat1D & PI, + const MyFloat1D & F, + const double minEmission = 1e-6) { const int C = F.size() / M; const int C2 = alpha.rows(); @@ -408,8 +456,8 @@ inline auto get_cluster_probability(int ind, } } } - emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); - emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding + // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); + // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding for(z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) @@ -449,8 +497,8 @@ inline auto get_cluster_probability(int ind, } } } - emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); - emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding + // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); + // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding for(z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) @@ -539,8 +587,6 @@ inline auto get_cluster_probability(int ind, * (g2 * F[z2 * M + s] + (1 - g2) * (1 - F[z2 * M + s])); } } - // emit(k12, s) = emit(k12, s) < minEmission ? minEmission : - // emit(k12, s); alpha(z12, s) = emitSnp(z12, s) * (alpha(z12, s - 1) * R[s * 3 + 0] + PI[s * C + z1] * sumTmp1(z2) + PI[s * C + z2] * sumTmp1(z1) + PI[s * C + z1] * PI[s * C + z2] * constTmp); @@ -612,10 +658,98 @@ inline auto get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFl // (1.0 - ae.colwise().sum()).abs() < 1e-2 is OK. this may be due to // rounding error if we want to colsum equals 1.0. then normlize it // TODO: cluster frequency can be zero for certain cluster. - const double tol = 1e-6; - ae = (ae < tol).select(tol, ae); - ae = (ae > 1 - tol).select(1 - tol, ae); + // const double tol = 1e-6; + // ae = (ae < tol).select(tol, ae); + // ae = (ae > 1 - tol).select(1 - tol, ae); + // ae.rowwise() /= ae.colwise().sum(); +} + +inline auto get_cluster_likelihoods(int ind, + const int M, + const int B, + const MyFloat1D & GL, + const MyFloat1D & R, + const MyFloat1D & PI, + const MyFloat1D & F, + const double minEmission = 1e-10) +{ + const int C = F.size() / M; + const int C2 = C * C; + MyArr2D emitGrid = get_emission_by_grid(GL, F, ind, M, B, minEmission); + const int nGrids = emitGrid.cols(); + MyArr2D alpha(C2, nGrids), beta(C2, nGrids), ae(C2, nGrids); + int z1, z2, z12; + MyArr1D sumTmp1(C); // store sum over internal loop for alpha + MyArr1D sumTmp2(C); // store sum over internal loop for ae + MyArr1D cs = MyArr1D::Zero(nGrids); + double constTmp; + // ======== forward and backward recursion =========== + int g = 0; + for(z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + alpha(z12, g) = emitGrid(z12, g) * PI[g * C + z1] * PI[g * C + z2]; + ae(z12, g) = PI[g * C + z1] * PI[g * C + z2]; + cs(g) += alpha(z12, g); + } + } + cs(g) = 1 / cs(g); + alpha.col(g) *= cs(g); // normalize it + // now get the rest + for(g = 1; g < nGrids; g++) + { + sumTmp1 = alpha.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; + sumTmp2 = ae.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; + constTmp = R[g * 3 + 2]; // since alpha.col(g).sum()==1 + for(z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + alpha(z12, g) = emitGrid(z12, g) + * (alpha(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp1(z2) + + PI[g * C + z2] * sumTmp1(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); + cs(g) += alpha(z12, g); + ae(z12, g) = (ae(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp2(z2) + + PI[g * C + z2] * sumTmp2(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); + } + } + cs(g) = 1 / cs(g); + alpha.col(g) *= cs(g); // normalize it + } + // const double tol = 1e-6; + // ae = (ae < tol).select(tol, ae); + // ae = (ae > 1 - tol).select(1 - tol, ae); ae.rowwise() /= ae.colwise().sum(); + // next backwards + g = nGrids - 1; + beta.col(g).setConstant(1.0); + for(g = nGrids - 2; g >= 0; g--) + { + auto beta_mult_emit = emitGrid.col(g + 1) * beta.col(g + 1); + sumTmp1.setZero(); + for(constTmp = 0, z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + sumTmp1(z1) += beta_mult_emit(z12) * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 1]; + constTmp += beta_mult_emit(z12) * PI[(g + 1) * C + z1] * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 2]; + } + } + for(z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + beta(z12, g) = + (beta_mult_emit(z12) * R[(g + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(g + 1); + } + } + } + return alpha * beta / ae; } inline auto calc_cluster_info(const int N, const MyArr2D & GZP1, const MyArr2D & GZP2) From 204b17dd2b0a3a9dd902912cd48e2e1d5bbb4b1c Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 17:12:57 +0100 Subject: [PATCH 02/67] debug cluster likelihoods --- src/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.hpp b/src/common.hpp index 45a9170..e511f0c 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -722,7 +722,7 @@ inline auto get_cluster_likelihoods(int ind, // const double tol = 1e-6; // ae = (ae < tol).select(tol, ae); // ae = (ae > 1 - tol).select(1 - tol, ae); - ae.rowwise() /= ae.colwise().sum(); + // ae.rowwise() /= ae.colwise().sum(); // next backwards g = nGrids - 1; beta.col(g).setConstant(1.0); From f80f64455d7f0b24b5837a740c52fb1e09939d04 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 20:55:04 +0100 Subject: [PATCH 03/67] norm cluster likelihoods --- src/admixture.cpp | 19 +++++++++---------- src/common.hpp | 16 ++++++++-------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index e023329..0edbf44 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -19,8 +19,9 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { const int nsnps = genome->pos[ic].size(); - auto cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], - genome->F[ic]); + MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], + genome->F[ic]); + cl.rowwise() /= cl.colwise().sum(); const int nGrids = cl.cols(); kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); @@ -60,7 +61,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & genome) { MyArr2D w((C * C + C) / 2, K * K); - MyArr2D Ekg, iEkc, alpha, beta, ae; + MyArr2D Ekg, iEkc; double norm = 0, llike = 0; int c1, c2, c12, cc; int k1, k2, k12, s; @@ -68,12 +69,10 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { const int nsnps = genome->pos[ic].size(); - const int nGrids = genome->B > 1 ? (nsnps + genome->B - 1) / genome->B : nsnps; - alpha.setZero(C * C, nGrids); - beta.setZero(C * C, nGrids); - get_cluster_probability(ind, nsnps, alpha, beta, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); - ae.setZero(C * C, nGrids); - get_cluster_frequency(ae, genome->R[ic], genome->PI[ic]); + MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], + genome->F[ic]); + cl.rowwise() /= cl.colwise().sum(); + const int nGrids = cl.cols(); iEkc.setZero(C * K, nGrids); Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, m++) @@ -83,7 +82,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g for(c2 = c1; c2 < C; c2++) { c12 = c1 * C + c2; - double xz = alpha(c12, s) * beta(c12, s) / ae(c12, s); + double xz = cl(c12, s); for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) diff --git a/src/common.hpp b/src/common.hpp index e511f0c..7980403 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -664,14 +664,14 @@ inline auto get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFl // ae.rowwise() /= ae.colwise().sum(); } -inline auto get_cluster_likelihoods(int ind, - const int M, - const int B, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, - const double minEmission = 1e-10) +inline MyArr2D get_cluster_likelihoods(int ind, + const int M, + const int B, + const MyFloat1D & GL, + const MyFloat1D & R, + const MyFloat1D & PI, + const MyFloat1D & F, + const double minEmission = 1e-10) { const int C = F.size() / M; const int C2 = C * C; From 978e062e4c88d426ffbde7f4fc85a57617a21d1a Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 21:03:24 +0100 Subject: [PATCH 04/67] robust code --- src/common.hpp | 60 +++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 7980403..f505f9c 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -214,7 +214,7 @@ inline bool starts_with(std::string const & str, std::string const & ending) // RECOMBINATION //****************************************************************************** -inline auto calc_position_distance(const Int1D & markers) +inline Int1D calc_position_distance(const Int1D & markers) { Int1D dl(markers.size()); dl[0] = 0; @@ -262,7 +262,7 @@ inline MyArr2D calc_transRate_diploid(const Int1D & dl, double nGen, double expR ** @param F cluster-specific allele frequence (M, C) ** @return emission probability (M, C2) */ -inline auto get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double minEmission = 1e-10) +inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double minEmission = 1e-10) { int k1, k2, g1, g2; const int M = F.rows(); @@ -291,12 +291,12 @@ inline auto get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double mi ** @param F cluster-specific allele frequence (M, C) ** @return emission probability (M, C2) */ -inline auto get_emission_by_grid(const MyFloat1D & GL, - const MyFloat1D & F, - int ind, - int M, - int B, - double minEmission = 1e-10) +inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, + const MyFloat1D & F, + int ind, + int M, + int B, + double minEmission = 1e-10) { const int C = F.size() / M; const int C2 = C * C; @@ -344,11 +344,11 @@ inline auto get_emission_by_grid(const MyFloat1D & GL, ** @param PI cluster frequency (C,M) ** @return individual log likelihood */ -inline auto forward_backwards_diploid(MyArr2D & alpha, - MyArr2D & beta, - const MyArr2D & E, - const MyArr2D & R, - const MyArr2D & PI) +inline MyArr1D forward_backwards_diploid(MyArr2D & alpha, + MyArr2D & beta, + const MyArr2D & E, + const MyArr2D & R, + const MyArr2D & PI) { const int M = alpha.cols(); const int C = PI.rows(); @@ -410,15 +410,15 @@ inline auto forward_backwards_diploid(MyArr2D & alpha, return cs; } -inline auto get_cluster_probability(int ind, - const int M, - MyArr2D & alpha, - MyArr2D & beta, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, - const double minEmission = 1e-6) +inline MyArr1D get_cluster_probability(int ind, + const int M, + MyArr2D & alpha, + MyArr2D & beta, + const MyFloat1D & GL, + const MyFloat1D & R, + const MyFloat1D & PI, + const MyFloat1D & F, + const double minEmission = 1e-6) { const int C = F.size() / M; const int C2 = alpha.rows(); @@ -627,7 +627,7 @@ inline auto get_cluster_probability(int ind, return cs; } -inline auto get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFloat1D & PI_) +inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFloat1D & PI_) { const int C2 = ae.rows(); const int M = ae.cols(); @@ -764,7 +764,7 @@ inline auto calc_cluster_info(const int N, const MyArr2D & GZP1, const MyArr2D & } // @params GL genotype likelihoods, N x M x 3 -inline auto estimate_af_by_gl(const MyFloat1D & GL, int N, int M, int niter = 100, double tol = 1e-4) +inline Arr1D estimate_af_by_gl(const MyFloat1D & GL, int N, int M, int niter = 100, double tol = 1e-4) { Arr1D af_est = Arr1D::Constant(M, 0.25); Arr1D af_tmp = Arr1D::Zero(M); @@ -793,7 +793,7 @@ inline auto estimate_af_by_gl(const MyFloat1D & GL, int N, int M, int niter = 10 return af_est; } -inline auto divide_pos_into_grid(const Int1D & pos, int B) +inline Int2D divide_pos_into_grid(const Int1D & pos, int B) { int M = pos.size(); int G = (M + B - 1) / B; @@ -808,7 +808,7 @@ inline auto divide_pos_into_grid(const Int1D & pos, int B) return grids; } -inline auto divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) +inline Int2D divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) { assert(pos.size() == collapse.size()); Int2D grids; @@ -830,7 +830,7 @@ inline auto divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) return grids; } -inline auto find_chunk_to_collapse(const MyArr2D & R, double tol_r = 1e-6) +inline Bool1D find_chunk_to_collapse(const MyArr2D & R, double tol_r = 1e-6) { Bool1D collapse(R.cols(), false); // M sites for(auto i = 0; i < R.cols(); i++) @@ -844,7 +844,7 @@ inline auto find_chunk_to_collapse(const MyArr2D & R, double tol_r = 1e-6) ** @params pos snp position, first dim is each grid, second dim is snps in *that grid */ -inline auto calc_grid_distance(const Int2D & pos) +inline Int1D calc_grid_distance(const Int2D & pos) { Int1D dl(pos.size()); dl[0] = 0; @@ -858,7 +858,7 @@ inline auto calc_grid_distance(const Int2D & pos) /* ** @param E original size of emission, full SNPs x C2 */ -inline auto collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, double minEmission = 1e-6) +inline MyArr2D collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, double minEmission = 1e-6) { const int C2 = E.rows(); const int G = grids.size(); @@ -879,7 +879,7 @@ inline auto collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, do return EG; } -inline auto cat_stdvec_of_eigen(const std::vector & arr3) +inline MyArr2D cat_stdvec_of_eigen(const std::vector & arr3) { int K = arr3.size(); int C = arr3[0].rows(); From f51f702ae3da34b7fbe6390be5bcfcc04e1a5e81 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 21:19:03 +0100 Subject: [PATCH 05/67] reset alpha if acceleration not working better --- src/admixture.cpp | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 0edbf44..6b049ed 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -214,11 +214,11 @@ int run_admix_main(Options & opts) vector> llike; if(!opts.noaccel) { - MyArr2D F0, Q0, F1, Q1; + MyArr2D F0, Q0, F1, Q1, F2, Q2, Ft, Qt; const int istep{4}; double alpha{std::numeric_limits::lowest()}, qdiff, ldiff, stepMax{4}, alphaMax{1280}; - double prevlike{std::numeric_limits::lowest()}; - for(int it = 0; SIG_COND && (it < opts.nadmix / 3); it++) + double prevlike{std::numeric_limits::lowest()}, logcheck{0}, loglike{0}; + for(int it = 0; SIG_COND && (it < opts.nadmix / 4); it++) { // first accel iteration admixer.initIteration(); @@ -230,20 +230,20 @@ int run_admix_main(Options & opts) llike.clear(); // clear future and renew admixer.updateIteration(); // second accel iteration + tim.clock(); + admixer.initIteration(); F1 = admixer.F; Q1 = admixer.Q; qdiff = (Q1 - Q0).square().sum(); - tim.clock(); - admixer.initIteration(); for(int i = 0; i < genome->nsamples; i++) llike.emplace_back(poolit.enqueue(&Admixture::runOptimalWithBigAss, &admixer, i, std::ref(genome))); - double loglike = 0; + loglike = 0; for(auto && ll : llike) loglike += ll.get(); llike.clear(); // clear future and renew admixer.updateIteration(); ldiff = it ? loglike - prevlike : NAN; prevlike = loglike; - cao.print(tim.date(), "SqS3 iteration", it * 3 + 1, ", diff(Q) =", std::scientific, qdiff, + cao.print(tim.date(), "SqS3 iteration", it * 4 + 1, ", diff(Q) =", std::scientific, qdiff, ", alpha=", alpha, ", likelihoods =", std::fixed, loglike, ", diff(likelihoods)=", ldiff, ", elapsed", tim.reltime(), " sec"); if(ldiff < opts.ltol) @@ -252,6 +252,9 @@ int run_admix_main(Options & opts) opts.ltol); break; } + // save for later comparison + Ft = admixer.F; + Qt = admixer.Q; // accel iteration with steplen alpha = ((F1 - F0).square().sum() + (Q1 - Q0).square().sum()) / ((admixer.F - 2 * F1 + F0).square().sum() + (admixer.Q - 2 * Q1 + Q0).square().sum()); @@ -262,15 +265,42 @@ int run_admix_main(Options & opts) alpha = min(stepMax, alphaMax); stepMax = min(stepMax * istep, alphaMax); } + // third accel iter + // update Q and F using the second em iter admixer.F = F0 + 2 * alpha * (F1 - F0) + alpha * alpha * (admixer.F - 2 * F1 + F0); admixer.Q = Q0 + 2 * alpha * (Q1 - Q0) + alpha * alpha * (admixer.Q - 2 * Q1 + Q0); admixer.protectPars(); admixer.initIteration(); for(int i = 0; i < genome->nsamples; i++) llike.emplace_back(poolit.enqueue(&Admixture::runOptimalWithBigAss, &admixer, i, std::ref(genome))); - for(auto && ll : llike) ll.get(); + loglike = 0; + for(auto && ll : llike) loglike += ll.get(); llike.clear(); // clear future and renew admixer.updateIteration(); + // save current pars + F2 = admixer.F; + Q2 = admixer.Q; + // check if normal third iter is better + admixer.Q = Qt; + admixer.F = Ft; + admixer.initIteration(); + for(int i = 0; i < genome->nsamples; i++) + llike.emplace_back(poolit.enqueue(&Admixture::runOptimalWithBigAss, &admixer, i, std::ref(genome))); + logcheck = 0; + for(auto && ll : llike) logcheck += ll.get(); + llike.clear(); // clear future and renew + admixer.updateIteration(); + if(logcheck - loglike > 0.1) + { + stepMax = istep; + cao.warn(tim.date(), "reset stepMax to 4, normal EM yields better likelihoods than the accelerated EM.", + logcheck, " -", loglike, " > 0.1"); + } + else + { + admixer.Q = Q2; + admixer.F = F2; + } } } else From 484ca5bc765200fe533e4a6c83501ecbe74a322e Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 22 Jan 2024 21:33:05 +0100 Subject: [PATCH 06/67] update makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f811440..5f442a8 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CXX = g++ # CXXFLAGS = -std=c++17 -Wall -O3 -g -fsanitize=address # CXXFLAGS = -std=c++17 -Wall -O3 -march=native -DNDEBUG -CXXFLAGS = -std=c++17 -Wall -O3 -march=native -fPIC -DNDEBUG +CXXFLAGS = -std=c++17 -Wall -O3 -mavx2 -fPIC -DNDEBUG INC = -I./src -I./inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = $(HTSDIR)/libhts.a -llzma -lbz2 -lm -lz -lpthread From 380ecd76e8800aa394408901d0432b1f4970da29 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 23 Jan 2024 09:54:29 +0100 Subject: [PATCH 07/67] cleanup messy code --- src/common.hpp | 224 +++++++++------------------------------------- src/phaseless.cpp | 4 +- src/phaseless.hpp | 4 +- 3 files changed, 46 insertions(+), 186 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index f505f9c..a21ff2b 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -243,8 +243,8 @@ inline void protect_er(MyArr1D & er) inline MyArr1D calc_er(const Int1D & dl, double nGen, double expRate = 0.5) { MyArr1D er(dl.size()); - // for(size_t i = 1; i < dl.size(); i++) distRate(i) = std::exp(-dl[i] / 1e6); - for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] * expRate * nGen / 1e8); + for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] / 1e6); + // for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] * expRate * nGen / 1e8); protect_er(er); return er; } @@ -282,7 +282,7 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double } } // emitDip = emitDip.colwise() / emitDip.rowwise().maxCoeff(); // normalize - emitDip = (emitDip < minEmission).select(minEmission, emitDip); + // emitDip = (emitDip < minEmission).select(minEmission, emitDip); return emitDip; } @@ -418,212 +418,75 @@ inline MyArr1D get_cluster_probability(int ind, const MyFloat1D & R, const MyFloat1D & PI, const MyFloat1D & F, - const double minEmission = 1e-6) + const double minEmission = 1e-10) { const int C = F.size() / M; - const int C2 = alpha.rows(); const int nGrids = alpha.cols(); const int B = (M + nGrids - 1) / nGrids; - int g1, g2, z1, z2, z12; - int igs = ind * M * 3; + int z1, z2, z12; MyArr1D sumTmp1(C); // store sum over internal loop MyArr1D cs = MyArr1D::Zero(nGrids); double constTmp; // ======== forward and backward recursion =========== - if(nGrids < M) + MyArr2D emitGrid = get_emission_by_grid(GL, F, ind, M, B, minEmission); + int g{0}; + for(z1 = 0; z1 < C; z1++) { - MyArr2D emitGrid = MyArr2D::Ones(C2, nGrids); - int i, s, e, g{0}; - s = g * B; - e = g == nGrids - 1 ? M - 1 : B * (g + 1) - 1; - for(z1 = 0; z1 < C; z1++) + for(z2 = 0; z2 < C; z2++) { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - for(i = s; i <= e; i++) - { - double emit = 0; - for(g1 = 0; g1 <= 1; g1++) - { - for(g2 = 0; g2 <= 1; g2++) - { - emit += GL[igs + (g1 + g2) * M + i] * (g1 * F[z1 * M + i] + (1 - g1) * (1 - F[z1 * M + i])) - * (g2 * F[z2 * M + i] + (1 - g2) * (1 - F[z2 * M + i])); - } - } - emitGrid(z12, g) *= emit; - } - } + z12 = z1 * C + z2; + alpha(z12, g) = emitGrid(z12, g) * PI[g * C + z1] * PI[g * C + z2]; + cs(g) += alpha(z12, g); } - // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); - // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding + } + cs(g) = 1 / cs(g); + alpha.col(g) *= cs(g); // normalize it + // now get the rest + for(g = 1; g < nGrids; g++) + { + sumTmp1 = alpha.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; + constTmp = alpha.col(g - 1).sum() * R[g * 3 + 2]; for(z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) { z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) * PI[g * C + z1] * PI[g * C + z2]; + alpha(z12, g) = emitGrid(z12, g) + * (alpha(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp1(z2) + + PI[g * C + z2] * sumTmp1(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); cs(g) += alpha(z12, g); } } cs(g) = 1 / cs(g); alpha.col(g) *= cs(g); // normalize it - // now get the rest - for(g = 1; g < nGrids; g++) - { - sumTmp1 = alpha.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; - constTmp = alpha.col(g - 1).sum() * R[g * 3 + 2]; - s = g * B; - e = g == nGrids - 1 ? M - 1 : B * (g + 1) - 1; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - for(i = s; i <= e; i++) - { - double emit = 0; - for(g1 = 0; g1 <= 1; g1++) - { - for(g2 = 0; g2 <= 1; g2++) - { - emit += GL[igs + (g1 + g2) * M + i] - * (g1 * F[z1 * M + i] + (1 - g1) * (1 - F[z1 * M + i])) - * (g2 * F[z2 * M + i] + (1 - g2) * (1 - F[z2 * M + i])); - } - } - emitGrid(z12, g) *= emit; - } - } - } - // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); - // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); // apply bounding - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) - * (alpha(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp1(z2) - + PI[g * C + z2] * sumTmp1(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); - cs(g) += alpha(z12, g); - } - } - cs(g) = 1 / cs(g); - alpha.col(g) *= cs(g); // normalize it - } - // next backwards - g = nGrids - 1; - beta.col(g).setConstant(1.0); - for(g = nGrids - 2; g >= 0; g--) - { - auto beta_mult_emit = emitGrid.col(g + 1) * beta.col(g + 1); - sumTmp1.setZero(); - for(constTmp = 0, z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - sumTmp1(z1) += beta_mult_emit(z12) * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 1]; - constTmp += beta_mult_emit(z12) * PI[(g + 1) * C + z1] * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 2]; - } - } - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - beta(z12, g) = - (beta_mult_emit(z12) * R[(g + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(g + 1); - } - } - } } - else if(nGrids == M) + // next backwards + g = nGrids - 1; + beta.col(g).setConstant(1.0); + for(g = nGrids - 2; g >= 0; g--) { - MyArr2D emitSnp(C2, M); - int s{0}; - for(z1 = 0; z1 < C; z1++) + auto beta_mult_emit = emitGrid.col(g + 1) * beta.col(g + 1); + sumTmp1.setZero(); + for(constTmp = 0, z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) { z12 = z1 * C + z2; - emitSnp(z12, s) = 0; - for(g1 = 0; g1 <= 1; g1++) - { - for(g2 = 0; g2 <= 1; g2++) - { - emitSnp(z12, s) += GL[igs + (g1 + g2) * M + s] - * (g1 * F[z1 * M + s] + (1 - g1) * (1 - F[z1 * M + s])) - * (g2 * F[z2 * M + s] + (1 - g2) * (1 - F[z2 * M + s])); - } - } - // emit(k12, s) = emit(k12, s) < minEmission ? minEmission : - // emit(k12, s); - alpha(z12, s) = emitSnp(z12, s) * PI[s * C + z1] * PI[s * C + z2]; - cs(s) += alpha(z12, s); - } - } - cs(s) = 1 / cs(s); - alpha.col(s) *= cs(s); // normalize it - - for(s = 1; s < M; s++) - { - sumTmp1 = alpha.col(s - 1).reshaped(C, C).rowwise().sum() * R[s * 3 + 1]; - constTmp = alpha.col(s - 1).sum() * R[s * 3 + 2]; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - emitSnp(z12, s) = 0; - for(g1 = 0; g1 <= 1; g1++) - { - for(g2 = 0; g2 <= 1; g2++) - { - emitSnp(z12, s) += GL[igs + (g1 + g2) * M + s] - * (g1 * F[z1 * M + s] + (1 - g1) * (1 - F[z1 * M + s])) - * (g2 * F[z2 * M + s] + (1 - g2) * (1 - F[z2 * M + s])); - } - } - alpha(z12, s) = emitSnp(z12, s) - * (alpha(z12, s - 1) * R[s * 3 + 0] + PI[s * C + z1] * sumTmp1(z2) - + PI[s * C + z2] * sumTmp1(z1) + PI[s * C + z1] * PI[s * C + z2] * constTmp); - cs(s) += alpha(z12, s); - } + sumTmp1(z1) += beta_mult_emit(z12) * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 1]; + constTmp += beta_mult_emit(z12) * PI[(g + 1) * C + z1] * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 2]; } - cs(s) = 1 / cs(s); - alpha.col(s) *= cs(s); // normalize it } - // double indLike = LikeForwardInd.col(M - 1).sum(); // just 1 - // ======== backward recursion =========== - s = M - 1; - beta.col(s).setConstant(1.0); - for(s = M - 2; s >= 0; s--) + for(z1 = 0; z1 < C; z1++) { - auto beta_mult_emit = emitSnp.col(s + 1) * beta.col(s + 1); - sumTmp1.setZero(); - for(constTmp = 0, z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - sumTmp1(z1) += beta_mult_emit(z12) * PI[(s + 1) * C + z2] * R[(s + 1) * 3 + 1]; - constTmp += beta_mult_emit(z12) * PI[(s + 1) * C + z1] * PI[(s + 1) * C + z2] * R[(s + 1) * 3 + 2]; - } - } - for(z1 = 0; z1 < C; z1++) + for(z2 = 0; z2 < C; z2++) { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - beta(z12, s) = - (beta_mult_emit(z12) * R[(s + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(s + 1); - } + z12 = z1 * C + z2; + beta(z12, g) = + (beta_mult_emit(z12) * R[(g + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(g + 1); } } } + return cs; } @@ -719,10 +582,7 @@ inline MyArr2D get_cluster_likelihoods(int ind, cs(g) = 1 / cs(g); alpha.col(g) *= cs(g); // normalize it } - // const double tol = 1e-6; - // ae = (ae < tol).select(tol, ae); - // ae = (ae > 1 - tol).select(1 - tol, ae); - // ae.rowwise() /= ae.colwise().sum(); + // TODO: cluster frequency ae can be zero for certain cluster. // next backwards g = nGrids - 1; beta.col(g).setConstant(1.0); @@ -858,7 +718,7 @@ inline Int1D calc_grid_distance(const Int2D & pos) /* ** @param E original size of emission, full SNPs x C2 */ -inline MyArr2D collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, double minEmission = 1e-6) +inline MyArr2D collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, double minEmission = 1e-10) { const int C2 = E.rows(); const int G = grids.size(); @@ -871,8 +731,8 @@ inline MyArr2D collapse_emission_by_grid(const MyArr2D & E, const Int2D & grids, e = snp + grids[g].size() - 1; snp = e + 1; for(c = s; c <= e; c++) EG.col(g) *= E.col(c); - EG.col(g) /= EG.col(g).maxCoeff(); // rescale by maximum - EG.col(g) = (EG.col(g) < minEmission).select(minEmission, EG.col(g)); // apply bounding + // EG.col(g) /= EG.col(g).maxCoeff(); // rescale by maximum + // EG.col(g) = (EG.col(g) < minEmission).select(minEmission, EG.col(g)); // apply bounding } assert(snp == E.cols()); diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 400c4cb..0305399 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -13,7 +13,7 @@ using namespace std; -void Phaseless::initRecombination(const Int1D & pos, std::string rfile, double Ne, int B) +void Phaseless::initRecombination(const Int1D & pos, std::string rfile, int B, double Ne) { nGen = 4 * Ne / C; dist = calc_position_distance(pos); @@ -26,7 +26,7 @@ void Phaseless::initRecombination(const Int1D & pos, std::string rfile, double N R = er2R(er); } -void Phaseless::initRecombination(const Int2D & pos, std::string rfile, double Ne, int B) +void Phaseless::initRecombination(const Int2D & pos, std::string rfile, int B, double Ne) { nGen = 4 * Ne / C; int nchunks = pos.size(); diff --git a/src/phaseless.hpp b/src/phaseless.hpp index 98c89e5..2c55d8b 100644 --- a/src/phaseless.hpp +++ b/src/phaseless.hpp @@ -53,8 +53,8 @@ class Phaseless void setStartPoint(std::string, std::string); void setStartPoint(const std::unique_ptr &); - void initRecombination(const Int1D & pos, std::string rfile = "", double Ne = 20000, int B = 1); - void initRecombination(const Int2D & pos, std::string rfile = "", double Ne = 20000, int B = 1); + void initRecombination(const Int1D & pos, std::string rfile = "", int B = 1, double Ne = 20000); + void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); void protectPars(); void initIteration(); From 16d69a702f30084b0861ccee858a4a865868813c Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 23 Jan 2024 10:20:09 +0100 Subject: [PATCH 08/67] norm cl inside --- src/admixture.cpp | 2 -- src/common.hpp | 5 ++++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 6b049ed..6cfc5fe 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -21,7 +21,6 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & const int nsnps = genome->pos[ic].size(); MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); - cl.rowwise() /= cl.colwise().sum(); const int nGrids = cl.cols(); kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); @@ -71,7 +70,6 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g const int nsnps = genome->pos[ic].size(); MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); - cl.rowwise() /= cl.colwise().sum(); const int nGrids = cl.cols(); iEkc.setZero(C * K, nGrids); Ekg.setZero(K, nGrids); diff --git a/src/common.hpp b/src/common.hpp index a21ff2b..a42aa1e 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -609,7 +609,10 @@ inline MyArr2D get_cluster_likelihoods(int ind, } } } - return alpha * beta / ae; + // reuse emitGrids for cluster likelihoods + emitGrid = alpha * beta / ae; + emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it + return emitGrid; } inline auto calc_cluster_info(const int N, const MyArr2D & GZP1, const MyArr2D & GZP2) From a63864777c9506117b273ff8f07a07177e51ab12 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 23 Jan 2024 10:43:51 +0100 Subject: [PATCH 09/67] reset cl to 0 if cf < 0.01 --- src/admixture.cpp | 11 ++++++----- src/common.hpp | 22 +++++++++++++--------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 6cfc5fe..8eab3b2 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -14,13 +14,13 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr2D kapa, Ekg; MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); - double norm = 0, llike = 0, tmp = 0; + double norm = 0, llike = 0, tmp = 0, tol = 0.01; int c1, k1, s, c2, c12; for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { const int nsnps = genome->pos[ic].size(); - MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], - genome->F[ic]); + const auto [cl, cf] = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], + genome->PI[ic], genome->F[ic]); const int nGrids = cl.cols(); kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); @@ -33,6 +33,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & { c12 = c1 * C + c2; double xz = cl(c12, s); + if(cf(c1, s) < tol || cf(c2, s) < tol) xz = 0.0; double zy = Hz(c1) * Hz(c2); tmp += xz * zy; } @@ -68,8 +69,8 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { const int nsnps = genome->pos[ic].size(); - MyArr2D cl = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], genome->PI[ic], - genome->F[ic]); + const auto [cl, cf] = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], + genome->PI[ic], genome->F[ic]); const int nGrids = cl.cols(); iEkc.setZero(C * K, nGrids); Ekg.setZero(K, nGrids); diff --git a/src/common.hpp b/src/common.hpp index a42aa1e..128ac19 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -527,14 +527,14 @@ inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFl // ae.rowwise() /= ae.colwise().sum(); } -inline MyArr2D get_cluster_likelihoods(int ind, - const int M, - const int B, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, - const double minEmission = 1e-10) +inline auto get_cluster_likelihoods(int ind, + const int M, + const int B, + const MyFloat1D & GL, + const MyFloat1D & R, + const MyFloat1D & PI, + const MyFloat1D & F, + const double minEmission = 1e-10) { const int C = F.size() / M; const int C2 = C * C; @@ -612,7 +612,11 @@ inline MyArr2D get_cluster_likelihoods(int ind, // reuse emitGrids for cluster likelihoods emitGrid = alpha * beta / ae; emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it - return emitGrid; + // reuse alpha for cluster frequency + alpha.setZero(C, nGrids); + for(g = 0; g < nGrids; g++) alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); + alpha.rowwise() /= alpha.colwise().sum(); // norm it + return std::tuple(emitGrid, alpha); } inline auto calc_cluster_info(const int N, const MyArr2D & GZP1, const MyArr2D & GZP2) From affd9e28274ce01bc05f67ba926ed8896cbfd618 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 24 Jan 2024 10:07:20 +0100 Subject: [PATCH 10/67] norm cluster freqs --- src/common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.hpp b/src/common.hpp index 128ac19..4a8473e 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -614,8 +614,8 @@ inline auto get_cluster_likelihoods(int ind, emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it // reuse alpha for cluster frequency alpha.setZero(C, nGrids); + ae.rowwise() /= ae.colwise().sum(); // norm it for(g = 0; g < nGrids; g++) alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); - alpha.rowwise() /= alpha.colwise().sum(); // norm it return std::tuple(emitGrid, alpha); } From f8ba496cb0104f2406533fdea89c1228c7e4730f Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 24 Jan 2024 14:34:44 +0100 Subject: [PATCH 11/67] fix fastphase --- src/admixture.cpp | 2 +- src/common.hpp | 2 +- src/fastphase.cpp | 40 ++++++++++++++++++++-------------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 8eab3b2..ea0afb2 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -33,7 +33,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & { c12 = c1 * C + c2; double xz = cl(c12, s); - if(cf(c1, s) < tol || cf(c2, s) < tol) xz = 0.0; + // if(cf(c1, s) < tol || cf(c2, s) < tol) xz = 0.0; double zy = Hz(c1) * Hz(c2); tmp += xz * zy; } diff --git a/src/common.hpp b/src/common.hpp index 4a8473e..5e882c2 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -609,12 +609,12 @@ inline auto get_cluster_likelihoods(int ind, } } } + ae.rowwise() /= ae.colwise().sum(); // norm it // reuse emitGrids for cluster likelihoods emitGrid = alpha * beta / ae; emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it // reuse alpha for cluster frequency alpha.setZero(C, nGrids); - ae.rowwise() /= ae.colwise().sum(); // norm it for(g = 0; g < nGrids; g++) alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); return std::tuple(emitGrid, alpha); } diff --git a/src/fastphase.cpp b/src/fastphase.cpp index b768d6b..c300ef6 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -73,28 +73,28 @@ void FastPhaseK2::updateIteration() // update PI(C, M) except the first snp // first we normalize Ezj so that each col sum to 1 - Ezj = (Ezj < clusterFreqThreshold).select(clusterFreqThreshold, Ezj); // reset to Ezj.rowwise() /= Ezj.colwise().sum(); - Ezj.col(0) = pi / pi.sum(); // now update the first SNP - - // if(Ezj.isNaN().any() || (Ezj < clusterFreqThreshold).any()) - // { - // // std::cerr << "reset values below threshold\n"; - // Ezj = (Ezj < clusterFreqThreshold).select(0, Ezj); // reset to 0 first - // for(int i = 0; i < G; i++) - // { - // // for columns with an entry below 0 - // // each 0 entry becomes threshold - // // then rest re-scaled so whole thing has sum 1 - // if(auto c = (Ezj.col(i) == 0).count() > 0) - // { - // double xsum = 1 - c * clusterFreqThreshold; - // double csum = Ezj.col(i).sum(); - // Ezj.col(i) = (Ezj.col(i) > 0).select(Ezj.col(i) * xsum / csum, clusterFreqThreshold); - // } - // } - // } + Ezj = (Ezj < clusterFreqThreshold).select(clusterFreqThreshold, Ezj); // reset to + + if(Ezj.isNaN().any() || (Ezj < clusterFreqThreshold).any()) + { + cao.warn("reset cluster frequency to clusterFreqThreshold"); + Ezj = (Ezj < clusterFreqThreshold).select(0, Ezj); // reset to 0 first + for(int i = 0; i < G; i++) + { + // for columns with an entry 0, each entry becomes threshold + // then rest re-scaled so whole thing has sum 1 + if(auto c = (Ezj.col(i) == 0).count() > 0) + { + double xsum = 1 - c * clusterFreqThreshold; + double csum = Ezj.col(i).sum(); + Ezj.col(i) = (Ezj.col(i) > 0).select(Ezj.col(i) * xsum / csum, clusterFreqThreshold); + } + } + } + // now update the first SNP + Ezj.col(0) = pi / pi.sum(); // pi = gammaK.col(0).reshaped(C, C).colwise().sum(); PI = Ezj; if(Ezj.isNaN().any()) cao.error(Ezj, "NaN in PI from FastPhaseK2\n"); From e2cfaba2457ff34c46a15c01fe5d376494b9932f Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 24 Jan 2024 14:40:57 +0100 Subject: [PATCH 12/67] fix fastphase --- src/fastphase.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index c300ef6..a84d5fd 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -74,8 +74,6 @@ void FastPhaseK2::updateIteration() // update PI(C, M) except the first snp // first we normalize Ezj so that each col sum to 1 Ezj.rowwise() /= Ezj.colwise().sum(); - Ezj = (Ezj < clusterFreqThreshold).select(clusterFreqThreshold, Ezj); // reset to - if(Ezj.isNaN().any() || (Ezj < clusterFreqThreshold).any()) { cao.warn("reset cluster frequency to clusterFreqThreshold"); From f51804cf10c42b61c13e5aa308151327d326e8ef Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 24 Jan 2024 19:22:14 +0100 Subject: [PATCH 13/67] fix cl --- src/common.hpp | 17 +++++++++++++---- src/fastphase.cpp | 3 +-- src/fastphase.hpp | 4 ++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 5e882c2..e0c36e2 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -609,13 +609,22 @@ inline auto get_cluster_likelihoods(int ind, } } } - ae.rowwise() /= ae.colwise().sum(); // norm it // reuse emitGrids for cluster likelihoods - emitGrid = alpha * beta / ae; - emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it + emitGrid = alpha * beta; // reuse alpha for cluster frequency alpha.setZero(C, nGrids); - for(g = 0; g < nGrids; g++) alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); + for(g = 0; g < nGrids; g++) + { + alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); + alpha.col(g) /= alpha.col(g).sum(); + for(z1 = 0; z1 < C; z1++) + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + emitGrid(z12, g) /= (alpha(z1, g) * alpha(z2, g)); + } + } + emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it return std::tuple(emitGrid, alpha); } diff --git a/src/fastphase.cpp b/src/fastphase.cpp index a84d5fd..6fe97e3 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -34,7 +34,6 @@ void FastPhaseK2::setFlags(bool d, bool r) cao.warn("flags: debug=", debug, ", NR=", NR); } - void FastPhaseK2::initIteration() { // initial temp variables @@ -76,7 +75,7 @@ void FastPhaseK2::updateIteration() Ezj.rowwise() /= Ezj.colwise().sum(); if(Ezj.isNaN().any() || (Ezj < clusterFreqThreshold).any()) { - cao.warn("reset cluster frequency to clusterFreqThreshold"); + cao.warn("reset cluster frequency to clusterFreqThreshold:", clusterFreqThreshold); Ezj = (Ezj < clusterFreqThreshold).select(0, Ezj); // reset to 0 first for(int i = 0; i < G; i++) { diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 00d33fa..b366e82 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -31,8 +31,8 @@ class FastPhaseK2 // BOUNDING double minRate{0.1}, maxRate{100}; // threshold for R - double clusterFreqThreshold{1e-4}; // threshold for PI - double alleleEmitThreshold{1e-4}; // threshold for F(P) + double clusterFreqThreshold{1e-6}; // threshold for PI + double alleleEmitThreshold{1e-6}; // threshold for F(P) // FLAGS bool debug{0}, NR{0}; From cbc6ff0cdcf1c13027577e92d032e9b29c17ea72 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 26 Jan 2024 13:30:13 +0100 Subject: [PATCH 14/67] fix pi --- src/fastphase.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 6fe97e3..92c516b 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -93,6 +93,7 @@ void FastPhaseK2::updateIteration() // now update the first SNP Ezj.col(0) = pi / pi.sum(); // pi = gammaK.col(0).reshaped(C, C).colwise().sum(); PI = Ezj; + PI.rowwise() /= PI.colwise().sum(); // normalize it per site if(Ezj.isNaN().any()) cao.error(Ezj, "NaN in PI from FastPhaseK2\n"); if(debug && !((1 - PI.colwise().sum()).abs() < 1e-3).all()) From cf8f51da83658fd5734e8eca439b9d1633a0c424 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Jan 2024 21:00:51 +0100 Subject: [PATCH 15/67] refactor 0.4.0 --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/plot_haplotypes.R | 27 ++ src/common.hpp | 30 +- src/fastphase.cpp | 768 +++++++--------------------------------- src/fastphase.hpp | 58 ++- src/main.cpp | 2 +- src/parse-phaseless.cpp | 11 +- src/phaseless.cpp | 5 +- 9 files changed, 199 insertions(+), 705 deletions(-) create mode 100644 R/plot_haplotypes.R diff --git a/DESCRIPTION b/DESCRIPTION index c43e637..3d17f80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: phaseless Title: Admixture and imputation for low coverage sequencing data in one goal -Version: 0.3.0 +Version: 0.4.0 Authors@R: person("Zilong", "Li", , "zilong.dk@gmail.com", role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0001-5859-2078")) diff --git a/NAMESPACE b/NAMESPACE index 4f3148e..4dfd139 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,5 +4,6 @@ export(parse_impute_opt) export(parse_impute_par) export(parse_joint_par) export(parse_joint_post) +export(plot_gamma) importFrom(Rcpp,sourceCpp) useDynLib(phaseless, .registration = TRUE) diff --git a/R/plot_haplotypes.R b/R/plot_haplotypes.R new file mode 100644 index 0000000..669d2ae --- /dev/null +++ b/R/plot_haplotypes.R @@ -0,0 +1,27 @@ +#' @export +plot_gamma <- function(gammaC, sites = NULL, title="") { + N <- length(gammaC) + C <- nrow(gammaC[[1]]) + M <- ncol(gammaC[[1]]) + if(!is.null(sites) & is.vector(sites) & length(sites) < M) { + M <- length(sites) + } else { + sites <- 1:M + } + plot(0, 0, col = "white", axes=FALSE, xlim = c(0, M), ylim = c(1, N + 1), + xlab = "", ylab = "", + cex.lab = 1.5, cex.main = 2.0, main = title) + d <- 1 + xleft <- 1:M - d + xright <- 1:M - d + for (i in seq(N)) { + ytop <- i + array(0, M) + ybottom <- i + array(0, M) + for(c in 1:C) { + ytop <- ytop + gammaC[[i]][c, sites] + rect(xleft = xleft - d, xright = xright + d, ybottom = ybottom, ytop = ytop, col = c, lwd = 0, border = NA) + ybottom <- ytop + } + } +} + diff --git a/src/common.hpp b/src/common.hpp index e0c36e2..1d8c65b 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -336,32 +336,25 @@ inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, } /* -** @param alpha forward probability, (C2,M) -** @param beta backwards probability (C2,M) -** @param E emission probability with individual genotype -*likelihood,(C2,M) -** @param R transition probability (3,M) +** @param emit emission probability(C2,M) +** @param R jump probability (3,M) ** @param PI cluster frequency (C,M) -** @return individual log likelihood +** @return alpha, beta and scaling vector */ -inline MyArr1D forward_backwards_diploid(MyArr2D & alpha, - MyArr2D & beta, - const MyArr2D & E, - const MyArr2D & R, - const MyArr2D & PI) +inline auto forward_backwards_diploid(const MyArr2D & emit, const MyArr2D & R, const MyArr2D & PI) { - const int M = alpha.cols(); + const int M = emit.cols(); + const int C2 = emit.rows(); const int C = PI.rows(); + MyArr2D alpha(C2, M), beta(C2, M); MyArr1D sumTmp1(C), cs(M); // store sum over internal loop double constTmp; // ======== forward recursion =========== int z1, z2, z12; int s{0}; - alpha.col(s) = E.col(s) * (PI.col(s).matrix() * PI.col(s).transpose().matrix()).reshaped().array(); + alpha.col(s) = emit.col(s) * (PI.col(s).matrix() * PI.col(s).transpose().matrix()).reshaped().array(); cs(s) = 1.0 / alpha.col(s).sum(); alpha.col(s) *= cs(s); // normalize it - // alpha_s = emit * (alpha_(s-1) * R + pi(z1) * tmp1(z2) + pi(z2) * tmp2(z1) - // + P(switch into z1) * P(switch into z2) * constTmp) for(s = 1; s < M; s++) { sumTmp1 = alpha.col(s - 1).reshaped(C, C).rowwise().sum() * R(1, s); @@ -372,7 +365,7 @@ inline MyArr1D forward_backwards_diploid(MyArr2D & alpha, for(z2 = 0; z2 < C; z2++) { z12 = z1 * C + z2; - alpha(z12, s) = E(z12, s) + alpha(z12, s) = emit(z12, s) * (alpha(z12, s - 1) * R(0, s) + PI(z1, s) * sumTmp1(z2) + PI(z2, s) * sumTmp1(z1) + PI(z1, s) * PI(z2, s) * constTmp); } @@ -386,7 +379,7 @@ inline MyArr1D forward_backwards_diploid(MyArr2D & alpha, beta.col(s).setConstant(1.0); for(s = M - 2; s >= 0; s--) { - auto beta_mult_emit = E.col(s + 1) * beta.col(s + 1); + auto beta_mult_emit = emit.col(s + 1) * beta.col(s + 1); sumTmp1.setZero(); for(constTmp = 0, z1 = 0; z1 < C; z1++) { @@ -407,7 +400,8 @@ inline MyArr1D forward_backwards_diploid(MyArr2D & alpha, } } } - return cs; + + return std::tuple(alpha, beta, cs); } inline MyArr1D get_cluster_probability(int ind, diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 92c516b..1d77b8c 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -8,539 +8,154 @@ using namespace std; -void FastPhaseK2::initRecombination(const Int1D & pos, int B_, double Ne) +void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B, double Ne) { - B = B_; - G = B > 1 ? (pos.size() + B - 1) / B : M; - PI = MyArr2D::Ones(C, G); - PI.rowwise() /= PI.colwise().sum(); // normalize it per site - if(B > 1) - { - grids = divide_pos_into_grid(pos, B); - dist = calc_grid_distance(grids); - } - else - { - dist = calc_position_distance(pos); - } nGen = 4 * Ne / C; - R = calc_transRate_diploid(dist, nGen); + int nchunks = pos.size(); + pos_chunk.resize(nchunks + 1); + int i{0}, ss{0}; + dist.reserve(M); + for(i = 0; i < nchunks; i++) + { + pos_chunk[i] = ss; + auto tmp = calc_position_distance(pos[i]); + dist.insert(dist.end(), tmp.begin(), tmp.end()); + R.middleCols(ss, pos[i].size()) = calc_transRate_diploid(tmp, nGen); + ss += pos[i].size(); + } + pos_chunk[nchunks] = ss; // add sentinel + if(!rfile.empty()) load_csv(R, rfile, true); + er = R.row(0).sqrt(); + protect_er(er); + R = er2R(er); } -void FastPhaseK2::setFlags(bool d, bool r) +void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_, bool nQ, bool nP, bool nF, bool nR) { - debug = d; - NR = r; - cao.warn("flags: debug=", debug, ", NR=", NR); + alleleEmitThreshold = tol_p; + clusterFreqThreshold = tol_f; + admixtureThreshold = tol_q; + debug = debug_; + NQ = nQ; + NP = nP; + NF = nF; + NR = nR; } void FastPhaseK2::initIteration() { // initial temp variables - pi.setZero(C); // reset pi at first SNP - Ezj.setZero(C, G); // reset post(Z,j) + Ezj.setZero(C, M); // reset post(Z,j) Ezg1.setZero(C, M); // reset pos(Z,g) Ezg2.setZero(C, M); // reset pos(Z,g) } -void FastPhaseK2::updateIteration() +void FastPhaseK2::protectPars() { + // protect F + if(!NP) + { + if(F.isNaN().any()) + { + if(debug) cao.warn("NaN in F in FastPhaseK2 model. will fill it with AF"); + if(AF.size() == 0) cao.error("AF is not assigned!\n"); + for(int i = 0; i < M; i++) F.row(i) = F.row(i).isNaN().select(AF(i), F.row(i)); + } + // map F to domain but no normalization + F = (F < alleleEmitThreshold).select(alleleEmitThreshold, F); // lower bound + F = (F > 1 - alleleEmitThreshold).select(1 - alleleEmitThreshold, F); // upper bound + } + // protect R if(!NR) { - MyArr1D er = 1.0 - Ezj.colwise().sum() / N; for(int i = 0; i < er.size(); i++) { - double miner = std::exp(-nGen * maxRate * dist[i] / 100 / 1e6); - double maxer = std::exp(-nGen * minRate * dist[i] / 100 / 1e6); + const double miner = std::exp(-nGen * maxRate * dist[i] / 100 / 1e6); + const double maxer = std::exp(-nGen * minRate * dist[i] / 100 / 1e6); er(i) = er(i) < miner ? miner : er(i); er(i) = er(i) > maxer ? maxer : er(i); } + protect_er(er); R = er2R(er); } - - // update F - F = (Ezg2 / (Ezg1 + Ezg2)).transpose(); - if(F.isNaN().any()) + // protect PI + if(!NF) { - if(debug) cao.warn("NaN in F in FastPhaseK2 model. will fill it with AF"); - if(AF.size() == 0) cao.error("AF is not assigned!\n"); - for(int i = 0; i < M; i++) F.row(i) = F.row(i).isNaN().select(AF(i), F.row(i)); + if(PI.isNaN().any()) cao.warn("NaN in PI. reset cluster frequency to ", clusterFreqThreshold); + PI = (PI < clusterFreqThreshold).select(clusterFreqThreshold, PI); + PI = (PI > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, PI); + // re-normalize F per site. hope should work well. otherwise do the complicated. + PI.rowwise() /= PI.colwise().sum(); } - // map F to domain but no normalization - F = (F < alleleEmitThreshold).select(alleleEmitThreshold, F); // lower bound - F = (F > 1 - alleleEmitThreshold).select(1 - alleleEmitThreshold, F); // upper bound - - // update PI(C, M) except the first snp - // first we normalize Ezj so that each col sum to 1 - Ezj.rowwise() /= Ezj.colwise().sum(); - if(Ezj.isNaN().any() || (Ezj < clusterFreqThreshold).any()) - { - cao.warn("reset cluster frequency to clusterFreqThreshold:", clusterFreqThreshold); - Ezj = (Ezj < clusterFreqThreshold).select(0, Ezj); // reset to 0 first - for(int i = 0; i < G; i++) - { - // for columns with an entry 0, each entry becomes threshold - // then rest re-scaled so whole thing has sum 1 - if(auto c = (Ezj.col(i) == 0).count() > 0) - { - double xsum = 1 - c * clusterFreqThreshold; - double csum = Ezj.col(i).sum(); - Ezj.col(i) = (Ezj.col(i) > 0).select(Ezj.col(i) * xsum / csum, clusterFreqThreshold); - } - } - } - - // now update the first SNP - Ezj.col(0) = pi / pi.sum(); // pi = gammaK.col(0).reshaped(C, C).colwise().sum(); - PI = Ezj; - PI.rowwise() /= PI.colwise().sum(); // normalize it per site - - if(Ezj.isNaN().any()) cao.error(Ezj, "NaN in PI from FastPhaseK2\n"); - if(debug && !((1 - PI.colwise().sum()).abs() < 1e-3).all()) - cao.error(PI.colwise().sum(), "\ncolsum of PI is not 1.0!\n"); } -/* -** @param niters number of iterations -** @param GL genotype likelihood of all individuals in snp major form -** @param pos SNP position -** @return likelihood difference between last two iters -*/ -double FastPhaseK2::runWithOneThread(int niters, const MyFloat1D & GL) +void FastPhaseK2::updateIteration() { - double diff{-1}, loglike, prevlike; - for(int it = 0; SIG_COND && it <= niters; it++) + // update R + if(!NR) er = 1.0 - Ezj.colwise().sum() / N; + // update F + if(!NP) F = (Ezg2 / (Ezg1 + Ezg2)).transpose(); + // update PI + if(!NF) { - initIteration(); - loglike = 0; - for(int i = 0; i < N; i++) - { - if(it == niters) - loglike += forwardAndBackwardsLowRam(i, GL, true); - else - loglike += forwardAndBackwardsLowRam(i, GL, false); - } - diff = it ? loglike - prevlike : 0; - prevlike = loglike; - if(it != niters) updateIteration(); + PI = Ezj; + PI.rowwise() /= PI.colwise().sum(); } - return diff; + protectPars(); } /* -** @param ind current individual i ** @param GL genotype likelihood of all individuals in snp major form -** @param finalIter boolean, call genotype of not -** @return individual total likelihood -*/ -double FastPhaseK2::forwardAndBackwardsLowRam(int ind, const MyFloat1D & GL, bool finalIter) -{ - if(G == M) - { - return forwardAndBackwardsLowRamNormal(ind, GL, finalIter); - } - else if(G < M) - { - return forwardAndBackwardsLowRamCollapse(ind, GL, finalIter); - } - else - { - throw std::runtime_error("something wrong!"); - } -} - -/* ** @param ind current individual i -** @param GL genotype likelihood of all individuals in snp major form ** @param finalIter boolean, call genotype of not ** @return individual total likelihood */ -double FastPhaseK2::forwardAndBackwardsLowRamNormal(int ind, const MyFloat1D & GL, bool finalIter) +double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const int ind, bool finalIter) { - Eigen::Map gli(GL.data() + ind * M * 3, M, 3); - MyArr2D alpha(C2, M), beta(C2, M); - const MyArr2D emit = get_emission_by_gl(gli, F).transpose(); // C2 x M - const MyArr1D cs = forward_backwards_diploid(alpha, beta, emit, R, PI); - if(debug && !((1 - ((alpha * beta).colwise().sum())).abs() < 1e-4).all()) - cao.error((alpha * beta).colwise().sum() / cs.transpose(), "\ngamma sum is not 1.0!\n"); - MyArr1D gamma1_ae = (alpha.col(0) * beta.col(0)).reshaped(C, C).colwise().sum(); - MyArr1D ind_post_zj(C); - MyArr1D ind_post_zg1(C); - MyArr1D ind_post_zg2(C); - int z1, s = 0; - if(!finalIter) - { - // now get expectation of post(Z,J) - MyArr1D gamma_div_emit(C2), beta_mult_emit(C2); - s = 0; + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + Eigen::Map gli(GL.data() + ind * S * 3, S, 3); + MyArr2D emit = get_emission_by_gl(gli, F.middleRows(pos_chunk[ic], S)).transpose(); // CC x S + const auto [alpha, beta, cs] = + forward_backwards_diploid(emit, R.middleCols(pos_chunk[ic], S), PI.middleCols(pos_chunk[ic], S)); + if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) + cao.error((alpha * beta).colwise().sum(), "\ngamma sum is not 1.0!\n"); + // now get posterios + MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, S); + MyArr1D gamma_div_emit(CC), beta_mult_emit(CC); + MyArr1D alphatmp(C); + int z1, m, s; + for(s = 0; s < S; s++) + { + m = s + pos_chunk[ic]; gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 for(z1 = 0; z1 < C; z1++) { - - ind_post_zg1(z1) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(s, z1)) - * (gli(s, 0) * (1 - F.row(s)) + gli(s, 1) * F.row(s)).transpose()) - .sum(); - ind_post_zg2(z1) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(s, z1)) - * (gli(s, 1) * (1 - F.row(s)) + gli(s, 2) * F.row(s)).transpose()) - .sum(); - } - { - std::scoped_lock lock(mutex_it); - Ezg1.col(s) += ind_post_zg1; - Ezg2.col(s) += ind_post_zg2; - pi += gamma1_ae; - } - for(s = 1; s < M; s++) - { - beta_mult_emit = emit.col(s) * beta.col(s); // C2 - gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 - MyArr1D alphatmp(C); - for(z1 = 0; z1 < C; z1++) - { - ind_post_zg1(z1) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(s, z1)) - * (gli(s, 0) * (1 - F.row(s)) + gli(s, 1) * F.row(s)).transpose()) - .sum(); - ind_post_zg2(z1) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(s, z1)) - * (gli(s, 1) * (1 - F.row(s)) + gli(s, 2) * F.row(s)).transpose()) - .sum(); - alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), s - 1).sum() * R(1, s); - } - alphatmp += PI.col(s) * R(2, s) * 1.0; // inner alpha.col(s-1).sum == 1 - for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1) = cs(s) * PI(z1, s) * (alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); - { // sum over all samples for updates - std::scoped_lock lock(mutex_it); - Ezj.col(s) += ind_post_zj; - Ezg1.col(s) += ind_post_zg1; - Ezg2.col(s) += ind_post_zg2; - } + ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) + * (gli(s, 0) * (1 - F.row(m)) + gli(s, 1) * F.row(m)).transpose()) + .sum(); + ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(m, z1)) + * (gli(s, 1) * (1 - F.row(m)) + gli(s, 2) * F.row(m)).transpose()) + .sum(); + if(s > 0) alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), s - 1).sum() * R(1, m); + if(s == 0) ind_post_zj(z1, s) = (alpha.col(0) * beta.col(0)).segment(z1 * C, C).sum(); + if(finalIter) callGenoLoopC(ind, s, z1, gli, gamma_div_emit); } - } - else - { - // now we call geno and output gamma ae - alpha *= beta; // alpha is gamma now - MyArr2D ind_post_z_g(M, 4); - int z2, z12, g1, g2, g12; + if(s == 0) continue; + alphatmp += PI.col(m) * R(2, m) * 1.0; // inner alpha.col(s-1).sum == 1 + beta_mult_emit = emit.col(s) * beta.col(s); // C2 for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - for(g1 = 0; g1 < 2; g1++) - { - for(g2 = 0; g2 < 2; g2++) - { - g12 = g1 * 2 + g2; - ind_post_z_g.col(g12) = gli.col(g1 + g2) * (g1 * F.col(z1) + (1 - g1) * (1 - F.col(z1))) - * (g2 * F.col(z2) + (1 - g2) * (1 - F.col(z2))); - } - } - // emit.row(z12) == ind_post_z_g.rowwise().sum(); - ind_post_z_g.colwise() *= alpha.row(z12).transpose() / ind_post_z_g.rowwise().sum(); - for(g1 = 0; g1 < 2; g1++) - { - for(g2 = 0; g2 < 2; g2++) - { - g12 = g1 * 2 + g2; - GP(Eigen::seqN(g1 + g2, M, 3), ind) += ind_post_z_g.col(g12); - } - } - { - // update GammaAE now - std::scoped_lock lock(mutex_it); - Ezj.row(z1) += alpha.row(z12); - } - } - } - } - - return (1 / cs).log().sum(); -} - -/* -** @param ind current individual i -** @param GL genotype likelihood of all individuals in snp major form -** @param finalIter boolean, call genotype of not -** @return individual total likelihood -*/ -double FastPhaseK2::forwardAndBackwardsLowRamCollapse(int ind, const MyFloat1D & GL, bool finalIter) -{ - Eigen::Map gli(GL.data() + ind * M * 3, M, 3); - MyArr2D alpha(C2, G), beta(C2, G); - const MyArr2D emit = get_emission_by_gl(gli, F).transpose(); // C2 x M - MyArr2D emitGrids = collapse_emission_by_grid(emit, grids); // - const MyArr1D cs = forward_backwards_diploid(alpha, beta, emitGrids, R, PI); - if(debug && !((1 - ((alpha * beta).colwise().sum())).abs() < 1e-4).all()) - cao.error((alpha * beta).colwise().sum() / cs.transpose(), "\ngamma sum is not 1.0!\n"); - MyArr1D gamma = alpha.col(0) * beta.col(0); // gamma in first snp - MyArr1D gamma1_sum = gamma.reshaped(C, C).colwise().sum(); - MyArr1D ind_post_zj(C); - int snp, z1, e, i, g{0}, s{0}; - e = s + grids[g].size() - 1; - snp = e + 1; - // now get expectation of post(Z,J) - MyArr2D ind_post_zg1(C, grids[g].size()); - MyArr2D ind_post_zg2(C, grids[g].size()); - for(z1 = 0; z1 < C; z1++) - { - for(i = s; i <= e; i++) - { - auto gamma_div_emit = gamma / emit.col(i); - ind_post_zg1(z1, i - s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(i, z1)) - * (gli(i, 0) * (1 - F.row(i)) + gli(i, 1) * F.row(i)).transpose()) - .sum(); - ind_post_zg2(z1, i - s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(i, z1)) - * (gli(i, 1) * (1 - F.row(i)) + gli(i, 2) * F.row(i)).transpose()) - .sum(); - if(finalIter) callGenoLoopC(ind, i, z1, gli, gamma_div_emit); - } + ind_post_zj(z1, s) = cs(s) * (PI(z1, m) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); } - { + { // sum over all samples for updates std::scoped_lock lock(mutex_it); - pi += gamma1_sum; - for(int i = s; i <= e; i++) - { - Ezg1.col(i) += ind_post_zg1.col(i - s); - Ezg2.col(i) += ind_post_zg2.col(i - s); - } - } - - for(g = 1; g < G; g++) - { - s = snp; - e = snp + grids[g].size() - 1; - snp = e + 1; - gamma = alpha.col(g) * beta.col(g); - auto beta_mult_emit = emitGrids.col(g) * beta.col(g); // C2 - MyArr1D alphatmp(C); - MyArr2D ind_post_zg1(C, grids[g].size()); - MyArr2D ind_post_zg2(C, grids[g].size()); - for(z1 = 0; z1 < C; z1++) - { - alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), g - 1).sum() * R(1, g); - for(i = s; i <= e; i++) - { - auto gamma_div_emit = gamma / emit.col(i); // C2 - ind_post_zg1(z1, i - s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(i, z1)) - * (gli(i, 0) * (1 - F.row(i)) + gli(i, 1) * F.row(i)).transpose()) - .sum(); - ind_post_zg2(z1, i - s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(i, z1)) - * (gli(i, 1) * (1 - F.row(i)) + gli(i, 2) * F.row(i)).transpose()) - .sum(); - if(finalIter) callGenoLoopC(ind, i, z1, gli, gamma_div_emit); - } - } - alphatmp += PI.col(g) * R(2, g) * 1.0; - for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1) = cs(g) * (PI(z1, g) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); - { // sum over all samples for updates - std::scoped_lock lock(mutex_it); - Ezj.col(g) += ind_post_zj; - for(int i = s; i <= e; i++) - { - Ezg1.col(i) += ind_post_zg1.col(i - s); - Ezg2.col(i) += ind_post_zg2.col(i - s); - } - } + Ezj.middleCols(pos_chunk[ic], S) += ind_post_zj; + Ezg1.middleCols(pos_chunk[ic], S) += ind_post_zg1; + Ezg2.middleCols(pos_chunk[ic], S) += ind_post_zg2; } return (1 / cs).log().sum(); } -/* -** @param ind current individual i -** @param GL genotype likelihood of all individuals in snp major form -** @param finalIter boolean, call genotype of not -** @return individual total likelihood -*/ -fbd_res1 FastPhaseK2::forwardAndBackwardsHighRam(int ind, const MyFloat1D & GL, bool finalIter) -{ - - if(G == M) - { - return forwardAndBackwardsHighRamNormal(ind, GL, finalIter); - } - else if(G < M) - { - return forwardAndBackwardsHighRamCollapse(ind, GL, finalIter); - } - else - { - throw std::runtime_error("something wrong!"); - } -} - -fbd_res1 FastPhaseK2::forwardAndBackwardsHighRamCollapse(int ind, const MyFloat1D & GL, bool finalIter) -{ - Eigen::Map gli(GL.data() + ind * M * 3, M, 3); - MyArr2D alpha(C2, G), beta(C2, G); - MyArr2D emit = get_emission_by_gl(gli, F).transpose(); // C2 x M - MyArr2D emitGrids = collapse_emission_by_grid(emit, grids); - auto cs = forward_backwards_diploid(alpha, beta, emitGrids, R, PI); - if(debug && !((1 - ((alpha * beta).colwise().sum())).abs() < 1e-4).all()) - cao.cerr((alpha * beta).colwise().sum() / cs.transpose(), "\ngamma sum is not 1.0!\n"); - - MyArr1D gamma = alpha.col(0) * beta.col(0); // gamma in first snp - MyArr1D gamma_ae = gamma.reshaped(C, C).colwise().sum(); - MyArr2D ind_post_zj = MyArr2D::Zero(C, G); - MyArr2D ind_post_zg1 = MyArr2D::Zero(C, M); - MyArr2D ind_post_zg2 = MyArr2D::Zero(C, M); - - if(!finalIter) - { - // now get expectation of post(Z,J) - int z1, snp, s, e, i, g; - g = 0, s = 0; - e = s + grids[g].size() - 1; - snp = e + 1; - for(z1 = 0; z1 < C; z1++) - { - for(i = s; i <= e; i++) - { - auto gamma_div_emit = gamma / emit.col(i); - ind_post_zg1(z1, i) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(i, z1)) - * (gli(i, 0) * (1 - F.row(i)) + gli(i, 1) * F.row(i)).transpose()) - .sum(); - ind_post_zg2(z1, i) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(i, z1)) - * (gli(i, 1) * (1 - F.row(i)) + gli(i, 2) * F.row(i)).transpose()) - .sum(); - } - } - for(g = 1; g < G; g++) - { - s = snp; - e = snp + grids[g].size() - 1; - snp = e + 1; - gamma = (alpha.col(g) * beta.col(g)); // C2 - MyArr1D beta_mult_emit = emitGrids.col(g) * beta.col(g); // C2 - MyArr1D alphatmp(C); - for(z1 = 0; z1 < C; z1++) - { - alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), g - 1).sum() * R(1, g); - for(i = s; i <= e; i++) - { - auto gamma_div_emit = gamma / emit.col(i); - ind_post_zg1(z1, i) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(i, z1)) - * (gli(i, 0) * (1 - F.row(i)) + gli(i, 1) * F.row(i)).transpose()) - .sum(); - ind_post_zg2(z1, i) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(i, z1)) - * (gli(i, 1) * (1 - F.row(i)) + gli(i, 2) * F.row(i)).transpose()) - .sum(); - } - } - alphatmp += PI.col(g) * R(2, g) * 1.0; - for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1, g) = cs(g) * (PI(z1, g) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); - } - } - else - { - // now we call geno and output gamma ae - for(int g = 0; g < G; g++) - { - ind_post_zj.col(g) += (alpha.col(g) * beta.col(g)).reshaped(C, C).colwise().sum(); - } - } - double indLogLike = (1 / cs).log().sum(); - return std::tuple(indLogLike, ind_post_zj, ind_post_zg1, ind_post_zg2, gamma_ae); -} - -/* -** @param ind current individual i -** @param GL genotype likelihood of all individuals in snp major form -** @param finalIter boolean, call genotype of not -** @return individual total likelihood -*/ -fbd_res1 FastPhaseK2::forwardAndBackwardsHighRamNormal(int ind, const MyFloat1D & GL, bool finalIter) -{ - Eigen::Map gli(GL.data() + ind * M * 3, M, 3); - MyArr2D alpha(C2, M), beta(C2, M); - MyArr2D emit = get_emission_by_gl(gli, F).transpose(); // C2 x M - auto cs = forward_backwards_diploid(alpha, beta, emit, R, PI); - if(debug && !((1 - ((alpha * beta).colwise().sum())).abs() < 1e-4).all()) - cao.error((alpha * beta).colwise().sum() / cs.transpose(), "\ngamma sum is not 1.0!\n"); - MyArr1D gamma_ae = (alpha.col(0) * beta.col(0)).reshaped(C, C).colwise().sum(); - MyArr2D ind_post_zj = MyArr2D::Zero(C, M); - MyArr2D ind_post_zg1 = MyArr2D::Zero(C, M); - MyArr2D ind_post_zg2 = MyArr2D::Zero(C, M); - - int z1, s; - if(!finalIter) - { - MyArr1D gamma_div_emit(C2), beta_mult_emit(C2); - // now get expectation of post(Z,J) - s = 0; - gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 - for(z1 = 0; z1 < C; z1++) - { - - ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(s, z1)) - * (gli(s, 0) * (1 - F.row(s)) + gli(s, 1) * F.row(s)).transpose()) - .sum(); - ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(s, z1)) - * (gli(s, 1) * (1 - F.row(s)) + gli(s, 2) * F.row(s)).transpose()) - .sum(); - } - for(s = 1; s < M; s++) - { - beta_mult_emit = emit.col(s) * beta.col(s); // C2 - gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 - MyArr1D alphatmp(C); - for(z1 = 0; z1 < C; z1++) - { - alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), s - 1).sum() * R(1, s); - ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(s, z1)) - * (gli(s, 0) * (1 - F.row(s)) + gli(s, 1) * F.row(s)).transpose()) - .sum(); - ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(s, z1)) - * (gli(s, 1) * (1 - F.row(s)) + gli(s, 2) * F.row(s)).transpose()) - .sum(); - } - alphatmp += PI.col(s) * R(2, s) * 1.0; - for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1, s) = cs(s) * (PI(z1, s) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); - } - } - else - { - // now we call geno and output gamma ae - alpha *= beta; // alpha is gamma now - MyArr2D ind_post_z_g(M, 4); - int z2, z12, g1, g2, g12; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - for(g1 = 0; g1 < 2; g1++) - { - for(g2 = 0; g2 < 2; g2++) - { - g12 = g1 * 2 + g2; - ind_post_z_g.col(g12) = gli.col(g1 + g2) * (g1 * F.col(z1) + (1 - g1) * (1 - F.col(z1))) - * (g2 * F.col(z2) + (1 - g2) * (1 - F.col(z2))); - } - } - // emit.row(z12) == ind_post_z_g.rowwise().sum(); - ind_post_z_g.colwise() *= alpha.row(z12).transpose() / ind_post_z_g.rowwise().sum(); - for(g1 = 0; g1 < 2; g1++) - { - for(g2 = 0; g2 < 2; g2++) - { - g12 = g1 * 2 + g2; - GP(Eigen::seqN(g1 + g2, M, 3), ind) += ind_post_z_g.col(g12); - } - } - // update gamma ae now - ind_post_zj.row(z1) += alpha.row(z12); - } - } - } - double indLogLike = (1 / cs).log().sum(); - return std::tuple(indLogLike, ind_post_zj, ind_post_zg1, ind_post_zg2, gamma_ae); -} - void FastPhaseK2::callGenoLoopC(int ind, int s, int z1, const MyArr2D & gli, const MyArr1D & gamma_div_emit) { MyArr1D tmp_zg(4); @@ -557,45 +172,15 @@ void FastPhaseK2::callGenoLoopC(int ind, int s, int z1, const MyArr2D & gli, con } } -void FastPhaseK2::collapse_and_resize(const Int1D & pos, double tol_r) -{ - collapse = find_chunk_to_collapse(R, tol_r); - grids = divide_pos_into_grid(pos, collapse); - G = grids.size(); - MyArr2D PInew(C, G), Rnew(3, G); - int g, s, e, snp = 0; - for(g = 0; g < G; g++) - { - s = snp; - e = snp + grids[g].size() - 1; - snp = e + 1; - PInew.col(g) = PI.col(s); // choose the first snp in this collapsing block - Rnew.col(g) = R.col(s); // choose the first snp in this collapsing block - } - PI = PInew; - R = Rnew; - Ezj = PI; -} - -fbd_res2 make_input_per_chunk(const std::unique_ptr & genome, - const int ic, - const int niters, - const int seed, - const bool collapse, - const double tol_pi, - const double tol_r) +double FastPhaseK2::runAllChunks(const MyFloat2D & GL, const int ind, bool finalIter) { - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, genome->C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - faith.runWithOneThread(niters, genome->gls[ic]); - if(collapse) + if(pos_chunk.size() == 0) cao.error("please run initRecombination first"); + double loglike{0}; + for(size_t ic = 0; ic < GL.size(); ic++) { - faith.collapse_and_resize(genome->pos[ic], tol_r); - faith.runWithOneThread(2, genome->gls[ic]); // FIXME update iterations + loglike += hmmIterWithJumps(GL[ic], ic, ind, finalIter); } - return std::tuple(MyFloat1D(faith.GP.data(), faith.GP.data() + faith.GP.size()), faith.R, faith.PI, faith.F, - faith.Ezj); + return loglike; } int run_impute_main(Options & opts) @@ -607,133 +192,34 @@ int run_impute_main(Options & opts) int allthreads = std::thread::hardware_concurrency(); opts.nthreads = opts.nthreads < allthreads ? opts.nthreads : allthreads; cao.print(tim.date(), allthreads, " concurrent threads are available. use", opts.nthreads, " threads"); - ThreadPool poolit(opts.nthreads); + ThreadPool pool(opts.nthreads); std::unique_ptr genome = std::make_unique(); init_bigass(genome, opts); - auto bw = make_bcfwriter(opts.out + ".vcf.gz", genome->chrs, genome->sampleids); - std::ofstream orecomb(opts.out + ".recomb"); - std::ofstream opi(opts.out + ".pi"); - std::ofstream oae(opts.out + ".cluster.freq"); - std::ofstream op(opts.out + ".P"); Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); - if(opts.single_chunk) - { - vector> res; - for(int ic = 0; ic < genome->nchunks; ic++) - { - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, opts.C, opts.seed); - faith.initRecombination(genome->pos[ic], genome->B); - faith.setFlags(opts.debug, opts.nR); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - for(int it = 0; SIG_COND && it <= opts.nimpute; it++) - { - tim.clock(); - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == opts.nimpute) - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - double loglike = 0; - for(auto && ll : res) - { - const auto [l, zj, zg1, zg2, gamma1_ae] = ll.get(); - loglike += l; - faith.Ezj += zj; - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1_ae; - } - res.clear(); // clear future and renew - cao.print(tim.date(), "run single chunk", ic, ", iteration", it, ", likelihoods =", loglike, ",", - tim.reltime(), " sec"); - if(it != opts.nimpute) faith.updateIteration(); - } - tim.clock(); - write_bigass_to_bcf(bw, faith.GP.data(), genome->chrs[ic], genome->pos[ic]); - genome->R.emplace_back(MyFloat1D(faith.R.data(), faith.R.data() + faith.R.size())); - genome->PI.emplace_back(MyFloat1D(faith.PI.data(), faith.PI.data() + faith.PI.size())); - genome->F.emplace_back(MyFloat1D(faith.F.data(), faith.F.data() + faith.F.size())); - opi << faith.PI.transpose().format(fmt) << "\n"; - op << faith.F.format(fmt) << "\n"; - orecomb << faith.R.transpose().format(fmt) << "\n"; - faith.Ezj.rowwise() /= faith.Ezj.colwise().sum(); // norm gamma ae - oae << faith.Ezj.transpose().format(fmt) << "\n"; - genome->GammaAE.emplace_back(MyFloat1D(faith.Ezj.data(), faith.Ezj.data() + faith.Ezj.size())); - cao.done(tim.date(), "chunk", ic, " done. outputting elapsed", tim.reltime(), " secs"); - if(opts.collapse) - { - cao.warn(tim.date(), "start collapsing!"); - faith.collapse_and_resize(genome->pos[ic], opts.tol_r); - std::ofstream orecomb2(opts.out + ".recomb2"); - std::ofstream opi2(opts.out + ".pi2"); - std::ofstream oae2(opts.out + ".cluster.freq2"); - std::ofstream oclp(opts.out + ".collapse"); - opts.nimpute = 2; - for(int it = 0; it <= opts.nimpute; it++) - { - tim.clock(); - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == opts.nimpute) - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - double loglike = 0; - for(auto && ll : res) - { - const auto [l, zj, zg1, zg2, gamma1] = ll.get(); - loglike += l; - faith.Ezj += zj; // gamma ae - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1; - } - res.clear(); // clear future and renew - cao.print(tim.date(), "run single chunk", ic, ", iteration", it, ", likelihoods =", loglike, ",", - tim.reltime(), " sec"); - if(it != opts.nimpute) faith.updateIteration(); - } - orecomb2 << faith.R.transpose().format(fmt) << "\n"; - opi2 << faith.PI.transpose().format(fmt) << "\n"; - faith.Ezj.rowwise() /= faith.Ezj.colwise().sum(); // norm gamma ae - oae2 << faith.Ezj.transpose().format(fmt) << "\n"; - for(auto cl : faith.collapse) oclp << cl << "\n"; - } - } - } - else - { - if(genome->nchunks < opts.nthreads) - cao.warn(tim.date(), "nchunks < nthreads. only", genome->nchunks, " threads will be working"); - vector> res; - for(int ic = 0; ic < genome->nchunks; ic++) - res.emplace_back(poolit.enqueue(make_input_per_chunk, std::ref(genome), ic, opts.nimpute, opts.seed, - opts.collapse, opts.tol_pi, opts.tol_r)); - int ic = 0; - for(auto && ll : res) + vector> res; + FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); + faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); + faith.initRecombination(genome->pos, opts.in_rfile); + double loglike, diff, prevlike{std::numeric_limits::lowest()}; + for(int it = 0; SIG_COND && it <= opts.nimpute; it++) + { + tim.clock(); + faith.initIteration(); + for(int i = 0; i < faith.N; i++) + res.emplace_back(pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, false)); + loglike = 0; + for(auto && ll : res) loglike += ll.get(); + res.clear(); // clear future and renew + faith.updateIteration(); + diff = it ? loglike - prevlike : NAN; + prevlike = loglike; + cao.print(tim.date(), "run whole genome, iteration", it, ", likelihoods =", loglike, ", diff =", diff, ", time", + tim.reltime(), " sec"); + if(diff < opts.ltol) { - auto [GP, faithR, faithPI, faithF, faithAE] = ll.get(); - write_bigass_to_bcf(bw, GP.data(), genome->chrs[ic], genome->pos[ic]); - genome->R.emplace_back(MyFloat1D(faithR.data(), faithR.data() + faithR.size())); - genome->PI.emplace_back(MyFloat1D(faithPI.data(), faithPI.data() + faithPI.size())); - genome->F.emplace_back(MyFloat1D(faithF.data(), faithF.data() + faithF.size())); - faithAE.rowwise() /= faithAE.colwise().sum(); // norm gamma ae - genome->GammaAE.emplace_back(MyFloat1D(faithAE.data(), faithAE.data() + faithAE.size())); - orecomb << faithR.transpose().format(fmt) << "\n"; - opi << faithPI.transpose().format(fmt) << "\n"; - op << faithF.format(fmt) << "\n"; - oae << faithAE.transpose().format(fmt) << "\n"; - cao.print(tim.date(), "chunk", ic++, " imputation done and outputting"); + cao.print(tim.date(), "hit stopping criteria, diff =", std::scientific, diff, " <", opts.ltol); + break; } } constexpr auto OPTIONS = alpaca::options::fixed_length_encoding; @@ -742,5 +228,9 @@ int run_impute_main(Options & opts) ofs.close(); assert(std::filesystem::file_size(opts.out + ".pars.bin") == bytes_written); cao.done(tim.date(), "imputation done and outputting.", bytes_written, " bytes written to file"); + std::ofstream orecomb(opts.out + ".recomb"); + std::ofstream opi(opts.out + ".cluster.freq"); + opi << faith.PI.transpose().format(fmt) << "\n"; + orecomb << faith.R.transpose().format(fmt) << "\n"; return 0; } diff --git a/src/fastphase.hpp b/src/fastphase.hpp index b366e82..6357d5d 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -10,72 +10,58 @@ #include #include -using fbd_res1 = std::tuple; -using fbd_res2 = std::tuple; - class FastPhaseK2 { private: std::mutex mutex_it; // in case of race condition + // randon engine + std::default_random_engine rng = std::default_random_engine{}; + // BOUNDING + double minRate{0.1}, maxRate{100}; // threshold for R + double alleleEmitThreshold{1e-6}; // threshold for P + double clusterFreqThreshold{1e-6}; // threshold for F + double admixtureThreshold{1e-6}; // threshold for Q public: - FastPhaseK2(int m, int n, int c, int seed) : M(m), N(n), C(c), C2(c * c) + FastPhaseK2(int n, int m, int c, int seed) : N(n), M(m), C(c), CC(c * c) { - auto rng = std::default_random_engine{}; rng.seed(seed); - F = RandomUniform(M, C, rng, alleleEmitThreshold, - 1 - alleleEmitThreshold); + F = RandomUniform(M, C, rng, alleleEmitThreshold, 1 - alleleEmitThreshold); + PI = MyArr2D::Ones(C, M); + PI.rowwise() /= PI.colwise().sum(); // normalize it per site + R = MyArr2D(3, M); GP.setZero(M * 3, N); } ~FastPhaseK2() {} - // BOUNDING - double minRate{0.1}, maxRate{100}; // threshold for R - double clusterFreqThreshold{1e-6}; // threshold for PI - double alleleEmitThreshold{1e-6}; // threshold for F(P) - // FLAGS - bool debug{0}, NR{0}; + bool debug{0}, local{0}, post{1}, NQ{0}, NF{0}, NP{1}, NR{1}; // SHARED VARIBALES - const int M, N, C, C2; // C2 = C x C + const int N, M, C, CC; // CC = C x C int G, B; // G: number of grids after collapsing block MyArr2D GP; // N x (M x 3), genotype probabilies for all individuals - MyArr1D pi; // C, PI in first SNP MyArr2D PI; // C x M, cluster frequency MyArr2D F; // M x C, cluster-specific allele frequence + MyArr1D er; // M, jumping rate MyArr2D R; // 3 x M, jumping / recombination rate MyArr2D Ezj; // C x M, E(Z=z,J=1|X,par), expectation of switch into state k MyArr2D Ezg1, Ezg2; // C x M + double nGen; Int1D dist; // physical position distance between two markers + Int1D pos_chunk; // store the start pos of each chunk in the full scale MyArr1D AF; - Int2D grids; - Bool1D collapse; - double nGen; - void setFlags(bool, bool); - void initRecombination(const Int1D & pos, int B_ = 1, double Ne = 20000); - void collapse_and_resize(const Int1D & pos, double tol_r = 1e-6); + void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); + void setFlags(double, double, double, bool, bool, bool, bool, bool); + void protectPars(); void initIteration(); void updateIteration(); void callGenoLoopC(int, int, int, const MyArr2D &, const MyArr1D &); - double runWithOneThread(int, const MyFloat1D &); - double forwardAndBackwardsLowRam(int, const MyFloat1D &, bool); - double forwardAndBackwardsLowRamNormal(int, const MyFloat1D &, bool); - double forwardAndBackwardsLowRamCollapse(int, const MyFloat1D &, bool); - fbd_res1 forwardAndBackwardsHighRam(int, const MyFloat1D &, bool); - fbd_res1 forwardAndBackwardsHighRamNormal(int, const MyFloat1D &, bool); - fbd_res1 forwardAndBackwardsHighRamCollapse(int, const MyFloat1D &, bool); + double hmmIterWithJumps(const MyFloat1D &, const int, const int, bool); + double runAllChunks(const MyFloat2D &, const int, bool); }; -fbd_res2 make_input_per_chunk(const std::unique_ptr & genome, - const int ic, - const int niters, - const int seed, - const bool collapse = false, - const double tol_pi = 0.99, - const double tol_r = 1e-5); - int run_impute_main(Options & opts); #endif // FASTPHASE_H_ diff --git a/src/main.cpp b/src/main.cpp index 0b2fdec..f1725bd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.3.1"}; + const std::string VERSION{"0.4.0"}; // below for catching ctrl+c, and dumping files struct sigaction sa; diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index c25f108..26497a9 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -60,7 +60,6 @@ List parse_joint_post(std::string filename, int chunk = 0) { Eigen::Map gli(par->gls[ic].data() + ind * S * 3, S, 3); MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk - S, S)).transpose(); // CC x S - MyArr2D alpha(CC, S), beta(CC, S); // first get H ie old PI in fastphase MyArr2D H = MyArr2D::Zero(C, S); int z1, y1, s; // m * C + z1 @@ -68,7 +67,7 @@ List parse_joint_post(std::string filename, int chunk = 0) for(z1 = 0; z1 < C; z1++) for(y1 = 0; y1 < par->K; y1++) H(z1, s) += Q(y1, ind) * par->F[y1][(pos_chunk - S + s) * C + z1]; // cs is 1 / colsum(alpha) - auto cs = forward_backwards_diploid(alpha, beta, emit, R.middleCols(pos_chunk - S, S), H); + const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R.middleCols(pos_chunk - S, S), H); MyArr2D gamma = alpha * beta; ret_gamma[ind] = MyFloat1D(gamma.data(), gamma.data() + gamma.size()); ind_post_zg1.setZero(), ind_post_zg2.setZero(), ind_post_y.setZero(), ind_post_zy.setZero(); @@ -171,7 +170,7 @@ List parse_impute_par(std::string filename, int ic = -1) List ret(N); for(auto ind : ids) { - List alphaI(nchunks), betaI(nchunks), aeI(nchunks); + List gammaI(nchunks), aeI(nchunks); for(int c = 0; c < nchunks; c++) { ic = nchunks > 1 ? c : std::max(ic, c); const int iM = genome->pos[ic].size(); @@ -182,12 +181,10 @@ List parse_impute_par(std::string filename, int ic = -1) if(!((1 - (alpha * beta).colwise().sum()).abs() < 1e-6).all()) cao.error("gamma sum is not 1.0!\n"); ae.setZero(genome->C * genome->C, nGrids); get_cluster_frequency(ae, genome->R[ic], genome->PI[ic]); - alphaI[c] = alpha; - betaI[c] = beta; + gammaI[c] = alpha * beta; aeI[c] = ae; } - ret[ind] = List::create(Named("alpha") = alphaI, - Named("beta") = betaI, + ret[ind] = List::create(Named("gamma") = gammaI, Named("ae") = aeI); } return ret; diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 0305399..6fc07fd 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -38,7 +38,7 @@ void Phaseless::initRecombination(const Int2D & pos, std::string rfile, int B, d pos_chunk[i] = ss; auto tmp = calc_position_distance(pos[i]); dist.insert(dist.end(), tmp.begin(), tmp.end()); - R.middleCols(ss, pos[i].size()) = calc_transRate_diploid(dist, nGen); + R.middleCols(ss, pos[i].size()) = calc_transRate_diploid(tmp, nGen); ss += pos[i].size(); } pos_chunk[nchunks] = ss; // add sentinel @@ -238,7 +238,6 @@ double Phaseless::runForwardBackwards(const int ind, const int ic, const MyFloat const int S = pos_chunk[ic + 1] - pos_chunk[ic]; Eigen::Map gli(GL.data() + ind * S * 3, S, 3); MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk[ic], S)).transpose(); // CC x S - MyArr2D alpha(CC, S), beta(CC, S); // first get H ie old PI in fastphase MyArr2D H = MyArr2D::Zero(C, S); int z1, y1, s; @@ -246,7 +245,7 @@ double Phaseless::runForwardBackwards(const int ind, const int ic, const MyFloat for(z1 = 0; z1 < C; z1++) for(y1 = 0; y1 < K; y1++) H(z1, s) += Q(y1, ind) * F[y1](z1, s + pos_chunk[ic]); // cs is 1 / colsum(alpha) - auto cs = forward_backwards_diploid(alpha, beta, emit, R.middleCols(pos_chunk[ic], S), H); + const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R.middleCols(pos_chunk[ic], S), H); // get posterios getPosterios(ind, ic, gli, emit, H, cs, alpha, beta, finalIter); return (1 / cs).log().sum(); From 600b1b6851b9d304e8725d58a2548255f4e59fb7 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Jan 2024 21:54:36 +0100 Subject: [PATCH 16/67] update pars.bin --- src/common.hpp | 14 +++++++------- src/fastphase.cpp | 6 ++++++ src/parse-phaseless.cpp | 14 +++++--------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 1d8c65b..f9dc8fc 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -108,7 +108,7 @@ struct BigAss { int chunksize, nsamples, nsnps, nchunks; int B, G, C; // B: snps in a grid; G: total number of grids in a genome - MyFloat2D PI, F, R, GammaAE; // M x C, 3 x M, fastphase pars + MyFloat2D PI, F, R, AE; // M x C, 3 x M, fastphase pars Int1D ends; // chunk index where each chromo ends String1D sampleids, chrs; Int2D pos; // store position of markers of each chunk @@ -484,13 +484,12 @@ inline MyArr1D get_cluster_probability(int ind, return cs; } -inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFloat1D & PI_) +/// R: 3 x M; PI: C x M +inline auto get_cluster_frequency(const MyArr2D & R, const MyArr2D & PI) { - const int C2 = ae.rows(); - const int M = ae.cols(); - const int C = std::sqrt(C2); - Eigen::Map PI(PI_.data(), C, M); - Eigen::Map R(R_.data(), 3, M); + const int C = PI.rows(); + const int M = R.cols(); + MyArr2D ae(C * C, M); int s{0}; ae.col(s) = (PI.col(s).matrix() * PI.col(s).transpose().matrix()).reshaped().array(); @@ -519,6 +518,7 @@ inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFl // ae = (ae < tol).select(tol, ae); // ae = (ae > 1 - tol).select(1 - tol, ae); // ae.rowwise() /= ae.colwise().sum(); + return ae; } inline auto get_cluster_likelihoods(int ind, diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 1d77b8c..dbf7866 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -222,6 +222,12 @@ int run_impute_main(Options & opts) break; } } + genome->R.emplace_back(MyFloat1D(faith.R.data(), faith.R.data() + faith.R.size())); + genome->PI.emplace_back(MyFloat1D(faith.PI.data(), faith.PI.data() + faith.PI.size())); + genome->F.emplace_back(MyFloat1D(faith.F.data(), faith.F.data() + faith.F.size())); + // reuse Ezj + faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + genome->AE.emplace_back(MyFloat1D(faith.Ezj.data(), faith.Ezj.data() + faith.Ezj.size())); constexpr auto OPTIONS = alpaca::options::fixed_length_encoding; std::ofstream ofs(opts.out + ".pars.bin", std::ios::out | std::ios::binary); auto bytes_written = alpaca::serialize(*genome, ofs); diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index 26497a9..0dddee6 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -141,6 +141,7 @@ List parse_impute_opt(std::string filename) { return List::create(Named("C") = genome->C, Named("B") = genome->B, Named("G") = genome->G, + Named("clusterfreq") = genome->AE, Named("chunksize") = genome->chunksize, Named("nsamples") = genome->nsamples, Named("nsnps") = genome->nsnps, @@ -170,7 +171,7 @@ List parse_impute_par(std::string filename, int ic = -1) List ret(N); for(auto ind : ids) { - List gammaI(nchunks), aeI(nchunks); + List gamma(nchunks); for(int c = 0; c < nchunks; c++) { ic = nchunks > 1 ? c : std::max(ic, c); const int iM = genome->pos[ic].size(); @@ -178,14 +179,9 @@ List parse_impute_par(std::string filename, int ic = -1) alpha.setZero(genome->C * genome->C, nGrids); beta.setZero(genome->C * genome->C, nGrids); get_cluster_probability(ind, iM, alpha, beta, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); - if(!((1 - (alpha * beta).colwise().sum()).abs() < 1e-6).all()) cao.error("gamma sum is not 1.0!\n"); - ae.setZero(genome->C * genome->C, nGrids); - get_cluster_frequency(ae, genome->R[ic], genome->PI[ic]); - gammaI[c] = alpha * beta; - aeI[c] = ae; + gamma[c] = alpha * beta; } - ret[ind] = List::create(Named("gamma") = gammaI, - Named("ae") = aeI); + ret[ind] = gamma; } - return ret; + return List::create(Named("gamma")=ret, Named("ae") = genome->AE); } From 50d02caf044a7d3e43c323b09fdcf53d7543922c Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Jan 2024 22:31:41 +0100 Subject: [PATCH 17/67] update outputs --- src/fastphase.cpp | 15 ++++++++++++--- src/fastphase.hpp | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index dbf7866..3deb64e 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -48,6 +48,7 @@ void FastPhaseK2::initIteration() Ezj.setZero(C, M); // reset post(Z,j) Ezg1.setZero(C, M); // reset pos(Z,g) Ezg2.setZero(C, M); // reset pos(Z,g) + HapSum.setZero(C, M); // reset post(Z,j) } void FastPhaseK2::protectPars() @@ -87,6 +88,8 @@ void FastPhaseK2::protectPars() // re-normalize F per site. hope should work well. otherwise do the complicated. PI.rowwise() /= PI.colwise().sum(); } + // norm HapSum + HapSum.rowwise() /= HapSum.colwise().sum(); } void FastPhaseK2::updateIteration() @@ -120,7 +123,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) cao.error((alpha * beta).colwise().sum(), "\ngamma sum is not 1.0!\n"); // now get posterios - MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, S); + MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, S), gammaC(C, S); MyArr1D gamma_div_emit(CC), beta_mult_emit(CC); MyArr1D alphatmp(C); int z1, m, s; @@ -128,6 +131,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i { m = s + pos_chunk[ic]; gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 + gammaC = (alpha.col(s) * beta.col(s)).reshaped(C, C).colwise().sum(); for(z1 = 0; z1 < C; z1++) { ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) @@ -151,6 +155,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i Ezj.middleCols(pos_chunk[ic], S) += ind_post_zj; Ezg1.middleCols(pos_chunk[ic], S) += ind_post_zg1; Ezg2.middleCols(pos_chunk[ic], S) += ind_post_zg2; + HapSum.middleCols(pos_chunk[ic], S) += gammaC; } return (1 / cs).log().sum(); @@ -235,8 +240,12 @@ int run_impute_main(Options & opts) assert(std::filesystem::file_size(opts.out + ".pars.bin") == bytes_written); cao.done(tim.date(), "imputation done and outputting.", bytes_written, " bytes written to file"); std::ofstream orecomb(opts.out + ".recomb"); - std::ofstream opi(opts.out + ".cluster.freq"); - opi << faith.PI.transpose().format(fmt) << "\n"; orecomb << faith.R.transpose().format(fmt) << "\n"; + std::ofstream opi(opts.out + ".pi"); + opi << faith.PI.transpose().format(fmt) << "\n"; + std::ofstream ohap(opts.out + ".hapsum"); + ohap << faith.HapSum.transpose().format(fmt) << "\n"; + std::ofstream oae(opts.out + ".ae"); + oae << faith.Ezj.transpose().format(fmt) << "\n"; return 0; } diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 6357d5d..039b77c 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -45,8 +45,9 @@ class FastPhaseK2 MyArr2D F; // M x C, cluster-specific allele frequence MyArr1D er; // M, jumping rate MyArr2D R; // 3 x M, jumping / recombination rate - MyArr2D Ezj; // C x M, E(Z=z,J=1|X,par), expectation of switch into state k MyArr2D Ezg1, Ezg2; // C x M + MyArr2D Ezj; // C x M, E(Z=z,J=1|X,par), expectation of switch into state k + MyArr2D HapSum; // C x M, sum(gammaK) for all inds double nGen; Int1D dist; // physical position distance between two markers Int1D pos_chunk; // store the start pos of each chunk in the full scale From 018422cf9136cf4fc4df17fbb201089dcfddac4a Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Jan 2024 22:41:18 +0100 Subject: [PATCH 18/67] update outputs --- src/fastphase.cpp | 2 +- tests/test-fastphase.cpp | 205 --------------------------------------- 2 files changed, 1 insertion(+), 206 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 3deb64e..4fd31b8 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -131,7 +131,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i { m = s + pos_chunk[ic]; gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 - gammaC = (alpha.col(s) * beta.col(s)).reshaped(C, C).colwise().sum(); + gammaC.col(s) = (alpha.col(s) * beta.col(s)).reshaped(C, C).colwise().sum(); for(z1 = 0; z1 < C; z1++) { ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) diff --git a/tests/test-fastphase.cpp b/tests/test-fastphase.cpp index 45e0f30..2545098 100644 --- a/tests/test-fastphase.cpp +++ b/tests/test-fastphase.cpp @@ -9,211 +9,6 @@ using namespace std; using namespace Eigen; -TEST_CASE("fastphasek2 forwardAndBackwardsHighRamCollapse", "[test-fastphasek2]") -{ - int C{10}, seed{1}, chunksize{10000}, niters{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - int ic = 0; - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, genome->C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - ThreadPool pool(4); - vector> llike; - double prevlike{std::numeric_limits::lowest()}, loglike; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) - { - const auto [l, zj, zg1, zg2, gamma1] = ll.get(); - loglike += l; - faith.Ezj += zj; - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1; - } - llike.clear(); // clear future and renew - // REQUIRE(loglike > prevlike); - if(it != niters) faith.updateIteration(); - prevlike = loglike; - } - // start collapsing - faith.collapse_and_resize(genome->pos[ic]); - prevlike = std::numeric_limits::lowest(); - niters = 2; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) - { - const auto [l, zj, zg1, zg2, gamma1] = ll.get(); - loglike += l; - faith.Ezj += zj; - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1; - } - llike.clear(); // clear future and renew - // if(it > 1) REQUIRE(loglike > prevlike); - prevlike = loglike; - if(it != niters) faith.updateIteration(); - } -} - -TEST_CASE("fastphasek2 forwardAndBackwardsLowRamNormal", "[test-fastphasek2]") -{ - int C{10}, seed{1}, chunksize{10000}, niters{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - int ic = 0; - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, genome->C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - ThreadPool pool(4); - vector> llike; - double prevlike{std::numeric_limits::lowest()}, loglike; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) loglike += ll.get(); - llike.clear(); // clear future and renew - // REQUIRE(loglike > prevlike); - if(it != niters) faith.updateIteration(); - prevlike = loglike; - } -} - -TEST_CASE("fastphasek2 forwardAndBackwardsLowRamCollapse", "[test-fastphasek2]") -{ - int C{10}, seed{999}, chunksize{10000}, niters{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - int ic = 0; - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, genome->C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - ThreadPool pool(4); - vector> llike; - double prevlike{std::numeric_limits::lowest()}, loglike; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) loglike += ll.get(); - llike.clear(); // clear future and renew - // REQUIRE(loglike > prevlike); - prevlike = loglike; - if(it != niters) faith.updateIteration(); - } - // start collapsing - faith.collapse_and_resize(genome->pos[ic]); - prevlike = std::numeric_limits::lowest(); - niters = 2; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsLowRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) loglike += ll.get(); - llike.clear(); // clear future and renew - // if(it > 1) REQUIRE(loglike > prevlike); - prevlike = loglike; - if(it != niters) faith.updateIteration(); - } -} - -TEST_CASE("fastphasek2 forwardAndBackwardsHighRamNormal", "[test-fastphasek2]") -{ - int C{10}, seed{1}, chunksize{10000}, niters{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - int ic = 0; - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, genome->C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - ThreadPool pool(4); - vector> llike; - double prevlike{std::numeric_limits::lowest()}, loglike; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - llike.emplace_back(pool.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : llike) - { - const auto [l, zj, zg1, zg2, gamma1] = ll.get(); - loglike += l; - faith.Ezj += zj; - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1; - } - llike.clear(); // clear future and renew - // REQUIRE(loglike > prevlike); - if(it != niters) faith.updateIteration(); - prevlike = loglike; - } -} TEST_CASE("fastphasek4", "[test-fastphasek4]") { From c767b89f6f56c3b7f8c73a7d4461e5a4eba3c4fe Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 30 Jan 2024 11:19:20 +0100 Subject: [PATCH 19/67] cleanup and update tests --- src/admixture.cpp | 22 +++- src/common.hpp | 209 ++++---------------------------- src/fastphase.cpp | 18 ++- src/parse-phaseless.cpp | 17 +-- tests/Makefile | 5 +- tests/test-forward-backward.cpp | 66 ---------- tests/test-phaseless.cpp | 111 ++--------------- 7 files changed, 74 insertions(+), 374 deletions(-) delete mode 100644 tests/test-forward-backward.cpp diff --git a/src/admixture.cpp b/src/admixture.cpp index ea0afb2..3212b96 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -18,9 +18,15 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & int c1, k1, s, c2, c12; for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { - const int nsnps = genome->pos[ic].size(); - const auto [cl, cf] = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], - genome->PI[ic], genome->F[ic]); + const int S = genome->pos[ic].size(); + const int G = genome->B > 1 ? (S + genome->B - 1) / genome->B : S; + assert(S == G); // only test B=1 now + Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); + Eigen::Map P(genome->F[ic].data(), S, C); + Eigen::Map PI(genome->PI[ic].data(), C, S); + Eigen::Map R(genome->R[ic].data(), 3, S); + Eigen::Map AE(genome->AE[ic].data(), C * C, S); + const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE); const int nGrids = cl.cols(); kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); @@ -68,9 +74,13 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g MyArr1D iQ = MyArr1D::Zero(K); for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { - const int nsnps = genome->pos[ic].size(); - const auto [cl, cf] = get_cluster_likelihoods(ind, nsnps, genome->B, genome->gls[ic], genome->R[ic], - genome->PI[ic], genome->F[ic]); + const int S = genome->pos[ic].size(); + Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); + Eigen::Map P(genome->F[ic].data(), S, C); + Eigen::Map PI(genome->PI[ic].data(), C, S); + Eigen::Map R(genome->R[ic].data(), 3, S); + Eigen::Map AE(genome->AE[ic].data(), C * C, S); + const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE); const int nGrids = cl.cols(); iEkc.setZero(C * K, nGrids); Ekg.setZero(K, nGrids); diff --git a/src/common.hpp b/src/common.hpp index f9dc8fc..bad73a9 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -259,14 +259,14 @@ inline MyArr2D calc_transRate_diploid(const Int1D & dl, double nGen, double expR /* ** @param gli genotype likelihoods of current individual i, (M, 3) -** @param F cluster-specific allele frequence (M, C) +** @param P cluster-specific allele frequence (M, C) ** @return emission probability (M, C2) */ -inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double minEmission = 1e-10) +inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double minEmission = 1e-10) { int k1, k2, g1, g2; - const int M = F.rows(); - const int C = F.cols(); + const int M = P.rows(); + const int C = P.cols(); MyArr2D emitDip(M, C * C); // emission probabilies, nsnps x (C x C) for(k1 = 0; k1 < C; k1++) for(k2 = 0; k2 < C; k2++) @@ -276,8 +276,8 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double { for(g2 = 0; g2 <= 1; g2++) { - emitDip.col(k1 * C + k2) += gli.col(g1 + g2) * (g1 * F.col(k1) + (1 - g1) * (1 - F.col(k1))) - * (g2 * F.col(k2) + (1 - g2) * (1 - F.col(k2))); + emitDip.col(k1 * C + k2) += gli.col(g1 + g2) * (g1 * P.col(k1) + (1 - g1) * (1 - P.col(k1))) + * (g2 * P.col(k2) + (1 - g2) * (1 - P.col(k2))); } } } @@ -288,17 +288,17 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & F, double /* ** @param gli genotype likelihoods of current individual i, (M, 3) -** @param F cluster-specific allele frequence (M, C) +** @param P cluster-specific allele frequence (M, C) ** @return emission probability (M, C2) */ inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, - const MyFloat1D & F, + const MyFloat1D & P, int ind, int M, int B, double minEmission = 1e-10) { - const int C = F.size() / M; + const int C = P.size() / M; const int C2 = C * C; const int nGrids = B > 1 ? (M + B - 1) / B : M; MyArr2D emitGrid = MyArr2D::Ones(C2, nGrids); @@ -320,8 +320,8 @@ inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, { for(g2 = 0; g2 <= 1; g2++) { - emit += GL[igs + (g1 + g2) * M + i] * (g1 * F[z1 * M + i] + (1 - g1) * (1 - F[z1 * M + i])) - * (g2 * F[z2 * M + i] + (1 - g2) * (1 - F[z2 * M + i])); + emit += GL[igs + (g1 + g2) * M + i] * (g1 * P[z1 * M + i] + (1 - g1) * (1 - P[z1 * M + i])) + * (g2 * P[z2 * M + i] + (1 - g2) * (1 - P[z2 * M + i])); } } emitGrid(z12, g) *= emit; @@ -404,85 +404,6 @@ inline auto forward_backwards_diploid(const MyArr2D & emit, const MyArr2D & R, c return std::tuple(alpha, beta, cs); } -inline MyArr1D get_cluster_probability(int ind, - const int M, - MyArr2D & alpha, - MyArr2D & beta, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, - const double minEmission = 1e-10) -{ - const int C = F.size() / M; - const int nGrids = alpha.cols(); - const int B = (M + nGrids - 1) / nGrids; - int z1, z2, z12; - MyArr1D sumTmp1(C); // store sum over internal loop - MyArr1D cs = MyArr1D::Zero(nGrids); - double constTmp; - // ======== forward and backward recursion =========== - MyArr2D emitGrid = get_emission_by_grid(GL, F, ind, M, B, minEmission); - int g{0}; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) * PI[g * C + z1] * PI[g * C + z2]; - cs(g) += alpha(z12, g); - } - } - cs(g) = 1 / cs(g); - alpha.col(g) *= cs(g); // normalize it - // now get the rest - for(g = 1; g < nGrids; g++) - { - sumTmp1 = alpha.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; - constTmp = alpha.col(g - 1).sum() * R[g * 3 + 2]; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) - * (alpha(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp1(z2) - + PI[g * C + z2] * sumTmp1(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); - cs(g) += alpha(z12, g); - } - } - cs(g) = 1 / cs(g); - alpha.col(g) *= cs(g); // normalize it - } - // next backwards - g = nGrids - 1; - beta.col(g).setConstant(1.0); - for(g = nGrids - 2; g >= 0; g--) - { - auto beta_mult_emit = emitGrid.col(g + 1) * beta.col(g + 1); - sumTmp1.setZero(); - for(constTmp = 0, z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - sumTmp1(z1) += beta_mult_emit(z12) * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 1]; - constTmp += beta_mult_emit(z12) * PI[(g + 1) * C + z1] * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 2]; - } - } - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - beta(z12, g) = - (beta_mult_emit(z12) * R[(g + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(g + 1); - } - } - } - - return cs; -} /// R: 3 x M; PI: C x M inline auto get_cluster_frequency(const MyArr2D & R, const MyArr2D & PI) @@ -521,105 +442,19 @@ inline auto get_cluster_frequency(const MyArr2D & R, const MyArr2D & PI) return ae; } -inline auto get_cluster_likelihoods(int ind, - const int M, - const int B, - const MyFloat1D & GL, - const MyFloat1D & R, - const MyFloat1D & PI, - const MyFloat1D & F, +inline auto get_cluster_likelihoods(const MyArr2D & gli, + const MyArr2D & P, + const MyArr2D & R, + const MyArr2D & PI, + const MyArr2D & AE, const double minEmission = 1e-10) { - const int C = F.size() / M; - const int C2 = C * C; - MyArr2D emitGrid = get_emission_by_grid(GL, F, ind, M, B, minEmission); - const int nGrids = emitGrid.cols(); - MyArr2D alpha(C2, nGrids), beta(C2, nGrids), ae(C2, nGrids); - int z1, z2, z12; - MyArr1D sumTmp1(C); // store sum over internal loop for alpha - MyArr1D sumTmp2(C); // store sum over internal loop for ae - MyArr1D cs = MyArr1D::Zero(nGrids); - double constTmp; - // ======== forward and backward recursion =========== - int g = 0; - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) * PI[g * C + z1] * PI[g * C + z2]; - ae(z12, g) = PI[g * C + z1] * PI[g * C + z2]; - cs(g) += alpha(z12, g); - } - } - cs(g) = 1 / cs(g); - alpha.col(g) *= cs(g); // normalize it - // now get the rest - for(g = 1; g < nGrids; g++) - { - sumTmp1 = alpha.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; - sumTmp2 = ae.col(g - 1).reshaped(C, C).rowwise().sum() * R[g * 3 + 1]; - constTmp = R[g * 3 + 2]; // since alpha.col(g).sum()==1 - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - alpha(z12, g) = emitGrid(z12, g) - * (alpha(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp1(z2) - + PI[g * C + z2] * sumTmp1(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); - cs(g) += alpha(z12, g); - ae(z12, g) = (ae(z12, g - 1) * R[g * 3 + 0] + PI[g * C + z1] * sumTmp2(z2) - + PI[g * C + z2] * sumTmp2(z1) + PI[g * C + z1] * PI[g * C + z2] * constTmp); - } - } - cs(g) = 1 / cs(g); - alpha.col(g) *= cs(g); // normalize it - } - // TODO: cluster frequency ae can be zero for certain cluster. - // next backwards - g = nGrids - 1; - beta.col(g).setConstant(1.0); - for(g = nGrids - 2; g >= 0; g--) - { - auto beta_mult_emit = emitGrid.col(g + 1) * beta.col(g + 1); - sumTmp1.setZero(); - for(constTmp = 0, z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - sumTmp1(z1) += beta_mult_emit(z12) * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 1]; - constTmp += beta_mult_emit(z12) * PI[(g + 1) * C + z1] * PI[(g + 1) * C + z2] * R[(g + 1) * 3 + 2]; - } - } - for(z1 = 0; z1 < C; z1++) - { - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - beta(z12, g) = - (beta_mult_emit(z12) * R[(g + 1) * 3 + 0] + sumTmp1(z1) + sumTmp1(z2) + constTmp) * cs(g + 1); - } - } - } - // reuse emitGrids for cluster likelihoods - emitGrid = alpha * beta; - // reuse alpha for cluster frequency - alpha.setZero(C, nGrids); - for(g = 0; g < nGrids; g++) - { - alpha.col(g) = ae.col(g).reshaped(C, C).colwise().sum(); - alpha.col(g) /= alpha.col(g).sum(); - for(z1 = 0; z1 < C; z1++) - for(z2 = 0; z2 < C; z2++) - { - z12 = z1 * C + z2; - emitGrid(z12, g) /= (alpha(z1, g) * alpha(z2, g)); - } - } - emitGrid.rowwise() /= emitGrid.colwise().sum(); // norm it - return std::tuple(emitGrid, alpha); + MyArr2D emit = get_emission_by_gl(gli, P).transpose(); // CC x S + const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R, PI); + // reuse emit + emit = (alpha * beta) / AE; + emit.rowwise() /= emit.colwise().sum(); // norm it + return emit; } inline auto calc_cluster_info(const int N, const MyArr2D & GZP1, const MyArr2D & GZP2) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 4fd31b8..62add83 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -227,12 +227,20 @@ int run_impute_main(Options & opts) break; } } - genome->R.emplace_back(MyFloat1D(faith.R.data(), faith.R.data() + faith.R.size())); - genome->PI.emplace_back(MyFloat1D(faith.PI.data(), faith.PI.data() + faith.PI.size())); - genome->F.emplace_back(MyFloat1D(faith.F.data(), faith.F.data() + faith.F.size())); - // reuse Ezj + // reuse Ezj for AE faith.Ezj = get_cluster_frequency(faith.R, faith.PI); - genome->AE.emplace_back(MyFloat1D(faith.Ezj.data(), faith.Ezj.data() + faith.Ezj.size())); + for(int ic = 0; ic < genome->nchunks; ic++) + { + const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; + MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], S); + genome->AE.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.R.middleCols(faith.pos_chunk[ic], S); + genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.PI.middleCols(faith.pos_chunk[ic], S); + genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.F.middleRows(faith.pos_chunk[ic], S); + genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + } constexpr auto OPTIONS = alpaca::options::fixed_length_encoding; std::ofstream ofs(opts.out + ".pars.bin", std::ios::out | std::ios::binary); auto bytes_written = alpaca::serialize(*genome, ofs); diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index 0dddee6..7e77a54 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -163,25 +163,28 @@ List parse_impute_par(std::string filename, int ic = -1) std::unique_ptr genome = std::make_unique(alpaca::deserialize(ifs, filesize, ec)); ifs.close(); assert((bool)ec == false); - MyArr2D alpha, beta, ae; Int1D ids; for(int ind = 0; ind < genome->nsamples; ind++) ids.push_back(ind); int nchunks = ic < 0 ? genome->nchunks : 1; int N = ids.size(); List ret(N); + const int C = genome->C; for(auto ind : ids) { List gamma(nchunks); for(int c = 0; c < nchunks; c++) { ic = nchunks > 1 ? c : std::max(ic, c); - const int iM = genome->pos[ic].size(); - const int nGrids = genome->B > 1 ? (iM + genome->B - 1) / genome->B : iM; - alpha.setZero(genome->C * genome->C, nGrids); - beta.setZero(genome->C * genome->C, nGrids); - get_cluster_probability(ind, iM, alpha, beta, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); + const int S = genome->pos[ic].size(); + const int nGrids = genome->B > 1 ? (S + genome->B - 1) / genome->B : S; + Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); + Eigen::Map P(genome->F[ic].data(), S, C); + Eigen::Map PI(genome->PI[ic].data(), C, S); + Eigen::Map R(genome->R[ic].data(), 3, S); + MyArr2D emit = get_emission_by_gl(gli, P).transpose(); // CC x S + const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R, PI); gamma[c] = alpha * beta; } ret[ind] = gamma; } - return List::create(Named("gamma")=ret, Named("ae") = genome->AE); + return List::create(Named("gamma")=ret); } diff --git a/tests/Makefile b/tests/Makefile index 0a29d5c..bd77db5 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -5,9 +5,9 @@ INC = -I. -I../src -I../inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = -lhts -llzma -lbz2 -lm -lz -lpthread -OBJS = test-main.o test-joint.o test-phaseless.o test-fastphase.o test-forward-backward.o +OBJS = test-main.o test-joint.o test-phaseless.o test-fastphase.o -BINS = test-joint.bin test-phaseless.bin test-fastphase.bin test-forward-backward.bin +BINS = test-joint.bin test-phaseless.bin test-fastphase.bin DEPS = ../src/phaseless.o ../src/fastphase.o ../src/admixture.o @@ -27,7 +27,6 @@ test: $(BINS) ./test-joint.bin --success ./test-phaseless.bin --success ./test-fastphase.bin --success - ./test-forward-backward.bin --success clean: rm -f *.o *.bin diff --git a/tests/test-forward-backward.cpp b/tests/test-forward-backward.cpp deleted file mode 100644 index 20d1285..0000000 --- a/tests/test-forward-backward.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#define _DECLARE_TOOLBOX_HERE - -#include "../src/fastphase.hpp" -#include "../src/io.hpp" -#include "../src/threadpool.hpp" -#include "catch.hh" - -using namespace std; -using namespace Eigen; - -TEST_CASE("reconstruct alpha and beta from saved pars.bin", "[test-forward-backward]") -{ - int C{10}, seed{1}, chunksize{10000}, niters{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - ThreadPool poolit(4); - vector> res; - for(int ic = 0; ic < genome->nchunks; ic++) - { - FastPhaseK2 faith(genome->pos[ic].size(), genome->nsamples, C, seed); - faith.initRecombination(genome->pos[ic]); - faith.AF = - estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - double prevlike{std::numeric_limits::lowest()}, loglike; - for(int it = 0; it <= niters; it++) - { - faith.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - { - if(it == niters) - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), true)); - else - res.emplace_back(poolit.enqueue(&FastPhaseK2::forwardAndBackwardsHighRam, &faith, i, - std::ref(genome->gls[ic]), false)); - } - loglike = 0; - for(auto && ll : res) - { - const auto [l, zj, zg1, zg2, gamma1] = ll.get(); - loglike += l; - faith.Ezj += zj; - faith.Ezg1 += zg1; - faith.Ezg2 += zg2; - faith.pi += gamma1; - } - res.clear(); // clear future and renew - REQUIRE(loglike > prevlike); // might only work for double not float precision - prevlike = loglike; - if(it != niters) faith.updateIteration(); - } - MyArr2D alpha, beta; - const int iM = genome->pos[ic].size(); - for(int ind = 0; ind < genome->nsamples; ind++) - { - alpha.setZero(genome->C * genome->C, iM); - beta.setZero(genome->C * genome->C, iM); - Eigen::Map gli(genome->gls[ic].data() + ind * iM * 3, iM, 3); - MyArr2D emit = get_emission_by_gl(gli, faith.F).transpose(); // C2 x M - forward_backwards_diploid(alpha, beta, emit, faith.R, faith.PI); - alpha *= beta; - REQUIRE(((alpha.colwise().sum() - 1.0).abs() < 1e-6).all()); - } - } -} diff --git a/tests/test-phaseless.cpp b/tests/test-phaseless.cpp index 62d265c..ac1e624 100644 --- a/tests/test-phaseless.cpp +++ b/tests/test-phaseless.cpp @@ -11,72 +11,26 @@ using namespace std; using namespace Eigen; TEST_CASE("phaseless naive vs dump dataset 1", "[test-phaseless]") -{ - int K{3}, C{10}, seed{1}, nadmix{10}, chunksize{10000}, nimpute{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - ThreadPool poolit(genome->nchunks); - vector> res; - filesystem::path outdir{"test.dir"}; - filesystem::create_directories(outdir); - for(int ic = 0; ic < genome->nchunks; ic++) - { - res.emplace_back(poolit.enqueue(make_input_per_chunk, std::ref(genome), ic, nimpute, seed, false, 0.99, 1e-5)); - } - for(auto && ll : res) - { - auto [GP, R, PI, F, AE] = ll.get(); - AE.rowwise() /= AE.colwise().sum(); // norm gamma ae - genome->GammaAE.emplace_back(MyFloat1D(AE.data(), AE.data() + AE.size())); - genome->PI.emplace_back(MyFloat1D(PI.data(), PI.data() + PI.size())); - genome->F.emplace_back(MyFloat1D(F.data(), F.data() + F.size())); - genome->R.emplace_back(MyFloat1D(R.data(), R.data() + R.size())); - } - res.clear(); // clear future and renew - double llike1, llike2; - Admixture admixer1(genome->nsamples, genome->nsnps, genome->C, K, seed); - Admixture admixer2(genome->nsamples, genome->nsnps, genome->C, K, seed); - for(int it = 0; it < nadmix; it++) - { - admixer1.initIteration(); - llike1 = 0; - for(int i = 0; i < genome->nsamples; i++) llike1 += admixer1.runOptimalWithBigAss(i, genome); - admixer1.updateIteration(); - admixer2.initIteration(); - llike2 = 0; - for(int i = 0; i < genome->nsamples; i++) llike2 += admixer2.runNativeWithBigAss(i, genome); - admixer2.updateIteration(); - cerr << std::setprecision(6) << "llike2: " << llike2 << "\tllike1: " << llike1 << "\n"; - REQUIRE(abs(llike1 - llike2) < 1e-10); - } - REQUIRE(((admixer1.Q - admixer2.Q).abs() < 1e-6).all()); -} - -TEST_CASE("phaseless naive vs dump dataset 2", "[test-phaseless]") { int K{3}, C{10}, seed{1}, nadmix{10}, chunksize{10000}, nimpute{40}; std::unique_ptr genome = std::make_unique(); genome->chunksize = chunksize, genome->C = C; chunk_beagle_genotype_likelihoods(genome, "../data/all.bgl.gz"); - ThreadPool poolit(genome->nchunks); - vector> res; + ThreadPool pool(genome->nchunks); + vector> res; filesystem::path outdir{"test.dir"}; filesystem::create_directories(outdir); - for(int ic = 0; ic < genome->nchunks; ic++) + FastPhaseK2 faith(genome->nsamples, genome->nsnps, C, seed); + for(int it = 0; it <= nimpute; it++) { - res.emplace_back(poolit.enqueue(make_input_per_chunk, std::ref(genome), ic, nimpute, seed, false, 0.99, 1e-5)); + faith.initIteration(); + for(int i = 0; i < faith.N; i++) + res.emplace_back(pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, false)); + double loglike = 0; + for(auto && ll : res) loglike += ll.get(); + res.clear(); // clear future and renew + faith.updateIteration(); } - for(auto && ll : res) - { - auto [GP, R, PI, F, AE] = ll.get(); - AE.rowwise() /= AE.colwise().sum(); // norm gamma ae - genome->GammaAE.emplace_back(MyFloat1D(AE.data(), AE.data() + AE.size())); - genome->PI.emplace_back(MyFloat1D(PI.data(), PI.data() + PI.size())); - genome->F.emplace_back(MyFloat1D(F.data(), F.data() + F.size())); - genome->R.emplace_back(MyFloat1D(R.data(), R.data() + R.size())); - } - res.clear(); // clear future and renew double llike1, llike2; Admixture admixer1(genome->nsamples, genome->nsnps, genome->C, K, seed); Admixture admixer2(genome->nsamples, genome->nsnps, genome->C, K, seed); @@ -95,46 +49,3 @@ TEST_CASE("phaseless naive vs dump dataset 2", "[test-phaseless]") } REQUIRE(((admixer1.Q - admixer2.Q).abs() < 1e-6).all()); } - -TEST_CASE("phaseless normal iteration with make_input_per_chunk", "[test-phaseless]") -{ - int K{3}, C{5}, seed{1}, nadmix{10}, chunksize{10000}, nimpute{40}; - std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/bgl.gz"); - ThreadPool poolit(genome->nchunks); - vector> res; - filesystem::path outdir{"test.dir"}; - filesystem::create_directories(outdir); - for(int ic = 0; ic < genome->nchunks; ic++) - { - res.emplace_back(poolit.enqueue(make_input_per_chunk, std::ref(genome), ic, nimpute, seed, false, 0.99, 1e-5)); - } - for(auto && ll : res) - { - auto [GP, R, PI, F, AE] = ll.get(); - AE.rowwise() /= AE.colwise().sum(); // norm gamma ae - genome->GammaAE.emplace_back(MyFloat1D(AE.data(), AE.data() + AE.size())); - genome->PI.emplace_back(MyFloat1D(PI.data(), PI.data() + PI.size())); - genome->F.emplace_back(MyFloat1D(F.data(), F.data() + F.size())); - genome->R.emplace_back(MyFloat1D(R.data(), R.data() + R.size())); - } - res.clear(); // clear future and renew - Admixture admixer(genome->nsamples, genome->nsnps, genome->C, K, seed); - vector> llike; - double loglike, diff, prevlike{std::numeric_limits::lowest()}; - for(int it = 0; it < nadmix; it++) - { - admixer.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - llike.emplace_back(poolit.enqueue(&Admixture::runNativeWithBigAss, &admixer, i, std::ref(genome))); - loglike = 0; - for(auto && ll : llike) loglike += ll.get(); - llike.clear(); // clear future and renew - diff = it ? loglike - prevlike : 0; - REQUIRE(loglike > prevlike); - if(diff > 0 && diff < 0.1) break; - prevlike = loglike; - admixer.updateIteration(); - } -} From a40ef4219cde86621921c637e71d1bf350a21715 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 30 Jan 2024 12:43:50 +0100 Subject: [PATCH 20/67] update tests --- src/io.hpp | 4 ++-- tests/test-phaseless.cpp | 29 ++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/io.hpp b/src/io.hpp index 2d4d7db..5ea5508 100644 --- a/src/io.hpp +++ b/src/io.hpp @@ -446,10 +446,10 @@ inline void update_bigass_inplace(const std::unique_ptr & genome) for(ic = 0; ic < genome->nchunks; ic++) { if(ic == 0) continue; // assume the first chunksize is not greater than the defined - if(genome->pos[ic - 1].size() < genome->chunksize) + if((int)genome->pos[ic - 1].size() < genome->chunksize) { ndiff = genome->chunksize - genome->pos[ic - 1].size(); - if(genome->pos[ic].size() >= ndiff) + if((int) genome->pos[ic].size() >= ndiff) { genome->pos[ic - 1].insert(genome->pos[ic - 1].end(), genome->pos[ic].begin(), genome->pos[ic].begin() + ndiff); diff --git a/tests/test-phaseless.cpp b/tests/test-phaseless.cpp index ac1e624..a483b52 100644 --- a/tests/test-phaseless.cpp +++ b/tests/test-phaseless.cpp @@ -13,14 +13,15 @@ using namespace Eigen; TEST_CASE("phaseless naive vs dump dataset 1", "[test-phaseless]") { int K{3}, C{10}, seed{1}, nadmix{10}, chunksize{10000}, nimpute{40}; + Options opts; + opts.C = C, opts.chunksize = chunksize, opts.gridsize = 1, opts.seed = seed; + opts.in_beagle = "../data/bgl.gz"; std::unique_ptr genome = std::make_unique(); - genome->chunksize = chunksize, genome->C = C; - chunk_beagle_genotype_likelihoods(genome, "../data/all.bgl.gz"); - ThreadPool pool(genome->nchunks); + init_bigass(genome, opts); + ThreadPool pool(4); vector> res; - filesystem::path outdir{"test.dir"}; - filesystem::create_directories(outdir); FastPhaseK2 faith(genome->nsamples, genome->nsnps, C, seed); + faith.initRecombination(genome->pos, opts.in_rfile); for(int it = 0; it <= nimpute; it++) { faith.initIteration(); @@ -31,17 +32,31 @@ TEST_CASE("phaseless naive vs dump dataset 1", "[test-phaseless]") res.clear(); // clear future and renew faith.updateIteration(); } + // reuse Ezj for AE + faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + for(int ic = 0; ic < genome->nchunks; ic++) + { + const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; + MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], S); + genome->AE.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.R.middleCols(faith.pos_chunk[ic], S); + genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.PI.middleCols(faith.pos_chunk[ic], S); + genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.F.middleRows(faith.pos_chunk[ic], S); + genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + } double llike1, llike2; Admixture admixer1(genome->nsamples, genome->nsnps, genome->C, K, seed); Admixture admixer2(genome->nsamples, genome->nsnps, genome->C, K, seed); for(int it = 0; it < nadmix; it++) { - admixer1.initIteration(); llike1 = 0; + admixer1.initIteration(); for(int i = 0; i < genome->nsamples; i++) llike1 += admixer1.runOptimalWithBigAss(i, genome); admixer1.updateIteration(); - admixer2.initIteration(); llike2 = 0; + admixer2.initIteration(); for(int i = 0; i < genome->nsamples; i++) llike2 += admixer2.runNativeWithBigAss(i, genome); admixer2.updateIteration(); cerr << std::setprecision(6) << "llike2: " << llike2 << "\tllike1: " << llike1 << "\n"; From 04deb35523b1261ee504d7d90152072a9beee46b Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 30 Jan 2024 14:20:42 +0100 Subject: [PATCH 21/67] fastphase start with estimated af --- src/common.hpp | 5 ++--- src/fastphase.cpp | 24 +++++++++++++++++++++++- src/fastphase.hpp | 1 + src/main.cpp | 10 ++++++++++ src/parse-phaseless.cpp | 1 - tests/Makefile | 2 +- 6 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index bad73a9..30047a9 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -97,7 +97,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eAF{1}, eHap{1}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; @@ -106,8 +106,7 @@ struct Options // all the genome info I need from fastphase struct BigAss { - int chunksize, nsamples, nsnps, nchunks; - int B, G, C; // B: snps in a grid; G: total number of grids in a genome + int chunksize, nsamples, nsnps, nchunks, B, C, G; MyFloat2D PI, F, R, AE; // M x C, 3 x M, fastphase pars Int1D ends; // chunk index where each chromo ends String1D sampleids, chrs; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 62add83..fdecaa8 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -42,6 +42,16 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ NR = nR; } +void FastPhaseK2::setStartPoint(const std::unique_ptr & genome) +{ + for(size_t ic = 0; ic < genome->pos.size(); ic++) + { + MyArr1D af = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + for(int s = 0; s < S; s++) F.row(pos_chunk[ic] + s) = af(s); + } +} + void FastPhaseK2::initIteration() { // initial temp variables @@ -206,6 +216,7 @@ int run_impute_main(Options & opts) FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); faith.initRecombination(genome->pos, opts.in_rfile); + if(opts.eAF) faith.setStartPoint(genome); double loglike, diff, prevlike{std::numeric_limits::lowest()}; for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { @@ -228,7 +239,18 @@ int run_impute_main(Options & opts) } } // reuse Ezj for AE - faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + if(opts.eHap) + { + cao.print("use hapsum"); + faith.Ezj.setZero(faith.CC, faith.HapSum.cols()); + for(int m = 0; m < faith.HapSum.cols(); m++) + faith.Ezj.col(m) = + (faith.HapSum.col(m).matrix() * faith.HapSum.col(m).transpose().matrix()).reshaped().array(); + } + else + { + faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + } for(int ic = 0; ic < genome->nchunks; ic++) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 039b77c..4bbfd6b 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -55,6 +55,7 @@ class FastPhaseK2 void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); + void setStartPoint(const std::unique_ptr & genome); void protectPars(); void initIteration(); void updateIteration(); diff --git a/src/main.cpp b/src/main.cpp index f1725bd..d3061a5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -176,6 +176,14 @@ int main(int argc, char * argv[]) .help("seed for reproducibility") .default_value(999) .scan<'i', int>(); + cmd_impute.add_argument("--no-eaf") + .help("do not start with eaf") + .default_value(true) + .implicit_value(true); + cmd_impute.add_argument("--write-hapsum") + .help("write Hapsum instead of AE into parse.bin") + .default_value(false) + .implicit_value(true); cmd_impute.add_argument("--minRecombRate") .help("min recombination rate to determine if a SNP should be collapsed") .default_value(1e-4) @@ -287,6 +295,8 @@ int main(int argc, char * argv[]) opts.seed = cmd_impute.get("--seed"); opts.chunksize = cmd_impute.get("--chunksize"); opts.single_chunk = cmd_impute.get("--single-chunk"); + opts.eAF = cmd_impute.get("--no-eaf"); + opts.eHap = cmd_impute.get("--write-hapsum"); opts.collapse = cmd_impute.get("--collapse"); opts.tol_r = cmd_impute.get("--minRecombRate"); if(opts.single_chunk) opts.chunksize = INT_MAX; diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index 7e77a54..703f673 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -140,7 +140,6 @@ List parse_impute_opt(std::string filename) { assert((bool)ec == false); return List::create(Named("C") = genome->C, Named("B") = genome->B, - Named("G") = genome->G, Named("clusterfreq") = genome->AE, Named("chunksize") = genome->chunksize, Named("nsamples") = genome->nsamples, diff --git a/tests/Makefile b/tests/Makefile index bd77db5..8f6f322 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,6 +1,6 @@ HTSDIR = ../inst/include/htslib-1.18 CXX = g++ -CXXFLAGS = -std=c++17 -Wall -O3 -march=native +CXXFLAGS = -std=c++17 -Wall -O3 -mavx2 INC = -I. -I../src -I../inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = -lhts -llzma -lbz2 -lm -lz -lpthread From 45aba9506e2091ab6846e3c104ccf8dc1f506b70 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 30 Jan 2024 14:29:11 +0100 Subject: [PATCH 22/67] update cli --- src/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d3061a5..31487be 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -178,7 +178,7 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_impute.add_argument("--no-eaf") .help("do not start with eaf") - .default_value(true) + .default_value(false) .implicit_value(true); cmd_impute.add_argument("--write-hapsum") .help("write Hapsum instead of AE into parse.bin") @@ -295,7 +295,7 @@ int main(int argc, char * argv[]) opts.seed = cmd_impute.get("--seed"); opts.chunksize = cmd_impute.get("--chunksize"); opts.single_chunk = cmd_impute.get("--single-chunk"); - opts.eAF = cmd_impute.get("--no-eaf"); + opts.eAF = !cmd_impute.get("--no-eaf"); opts.eHap = cmd_impute.get("--write-hapsum"); opts.collapse = cmd_impute.get("--collapse"); opts.tol_r = cmd_impute.get("--minRecombRate"); From 0fcf704f01f24892cd42ff6f56d14ebabe8e874b Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 31 Jan 2024 10:41:05 +0100 Subject: [PATCH 23/67] start with eaf get stuck at local optimal --- Makefile | 2 +- src/common.hpp | 2 +- src/fastphase.cpp | 11 ----------- src/fastphase.hpp | 1 - src/main.cpp | 5 ----- 5 files changed, 2 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 5f442a8..4f88490 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CXX = g++ # CXXFLAGS = -std=c++17 -Wall -O3 -g -fsanitize=address # CXXFLAGS = -std=c++17 -Wall -O3 -march=native -DNDEBUG -CXXFLAGS = -std=c++17 -Wall -O3 -mavx2 -fPIC -DNDEBUG +CXXFLAGS = -std=c++17 -Wall -O3 -mavx2 INC = -I./src -I./inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = $(HTSDIR)/libhts.a -llzma -lbz2 -lm -lz -lpthread diff --git a/src/common.hpp b/src/common.hpp index 30047a9..a82bb8c 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -97,7 +97,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eAF{1}, eHap{1}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index fdecaa8..2e10ab3 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -42,16 +42,6 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ NR = nR; } -void FastPhaseK2::setStartPoint(const std::unique_ptr & genome) -{ - for(size_t ic = 0; ic < genome->pos.size(); ic++) - { - MyArr1D af = estimate_af_by_gl(genome->gls[ic], genome->nsamples, genome->pos[ic].size()).cast(); - const int S = pos_chunk[ic + 1] - pos_chunk[ic]; - for(int s = 0; s < S; s++) F.row(pos_chunk[ic] + s) = af(s); - } -} - void FastPhaseK2::initIteration() { // initial temp variables @@ -216,7 +206,6 @@ int run_impute_main(Options & opts) FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); faith.initRecombination(genome->pos, opts.in_rfile); - if(opts.eAF) faith.setStartPoint(genome); double loglike, diff, prevlike{std::numeric_limits::lowest()}; for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 4bbfd6b..039b77c 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -55,7 +55,6 @@ class FastPhaseK2 void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); - void setStartPoint(const std::unique_ptr & genome); void protectPars(); void initIteration(); void updateIteration(); diff --git a/src/main.cpp b/src/main.cpp index 31487be..d4669a1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -176,10 +176,6 @@ int main(int argc, char * argv[]) .help("seed for reproducibility") .default_value(999) .scan<'i', int>(); - cmd_impute.add_argument("--no-eaf") - .help("do not start with eaf") - .default_value(false) - .implicit_value(true); cmd_impute.add_argument("--write-hapsum") .help("write Hapsum instead of AE into parse.bin") .default_value(false) @@ -295,7 +291,6 @@ int main(int argc, char * argv[]) opts.seed = cmd_impute.get("--seed"); opts.chunksize = cmd_impute.get("--chunksize"); opts.single_chunk = cmd_impute.get("--single-chunk"); - opts.eAF = !cmd_impute.get("--no-eaf"); opts.eHap = cmd_impute.get("--write-hapsum"); opts.collapse = cmd_impute.get("--collapse"); opts.tol_r = cmd_impute.get("--minRecombRate"); From c76a1f7eaa263be29bcac2b8fe6fe7bd89a7fac6 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 31 Jan 2024 22:02:20 +0100 Subject: [PATCH 24/67] test refill haps 0.4.1 --- R/plot_haplotypes.R | 32 ++++++++++++++++++++--- src/fastphase.cpp | 63 ++++++++++++++++++++++++++++++--------------- src/fastphase.hpp | 2 ++ src/main.cpp | 2 +- 4 files changed, 73 insertions(+), 26 deletions(-) diff --git a/R/plot_haplotypes.R b/R/plot_haplotypes.R index 669d2ae..a7b2454 100644 --- a/R/plot_haplotypes.R +++ b/R/plot_haplotypes.R @@ -1,5 +1,5 @@ #' @export -plot_gamma <- function(gammaC, sites = NULL, title="") { +plotGamma <- function(gammaC, sites = NULL, ...) { N <- length(gammaC) C <- nrow(gammaC[[1]]) M <- ncol(gammaC[[1]]) @@ -8,9 +8,7 @@ plot_gamma <- function(gammaC, sites = NULL, title="") { } else { sites <- 1:M } - plot(0, 0, col = "white", axes=FALSE, xlim = c(0, M), ylim = c(1, N + 1), - xlab = "", ylab = "", - cex.lab = 1.5, cex.main = 2.0, main = title) + plot(0, 0, col = "white", axes=FALSE, xlim = c(0, M), ylim = c(1, N + 1),...) d <- 1 xleft <- 1:M - d xright <- 1:M - d @@ -25,3 +23,29 @@ plot_gamma <- function(gammaC, sites = NULL, title="") { } } +#' @export +plotHapFreqWithPhysicalPos <- function(K, + pos, + hapfreq, + ...) { + ## + colStore <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") + nCols <- length(colStore) + nGrids <- length(pos) + sum <- array(0, nGrids) + xlim <- range(pos) + ylim <- c(0, 1) + ## OK so if there are grids, use the grid points + plot(x = 0, y = 0, xlim = xlim, ylim = ylim, axes = FALSE, ...) + x <- c(pos[1], pos, pos[length(pos):1]) + m <- array(0, c(nGrids, K + 1)) + for(i in 1:K) { + m[, i + 1] <- m[, i] + hapfreq[i, ] + } + for(i in K:1) { + polygon( + x = x, y = c(m[1, i], m[, i + 1], m[nGrids:1, i]), + xlim = xlim, ylim = ylim, col = colStore[(i %% nCols) + 1] + ) + } +} diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 2e10ab3..7ff08fc 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -42,6 +42,26 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ NR = nR; } +void FastPhaseK2::refillHaps() +{ + // bin hapsum per 100 snps ? + for(int c = 0; c < C; c++) + { + for(int m = 0; m < M; m++) + { + if(HapSum(c, m) >= minHapfreq) continue; + MyArr1D h = HapSum.col(m); + h(c) = 0; // do not re-sample current + h /= h.sum(); + MyFloat1D p(h.data(), h.data() + h.size()); + std::discrete_distribution distribution{p.begin(), p.end()}; + int choice = distribution(rng); + assert(choice != c); + F(m, c) = F(m, choice); + } + } +} + void FastPhaseK2::initIteration() { // initial temp variables @@ -51,6 +71,21 @@ void FastPhaseK2::initIteration() HapSum.setZero(C, M); // reset post(Z,j) } +void FastPhaseK2::updateIteration() +{ + // update R + if(!NR) er = 1.0 - Ezj.colwise().sum() / N; + // update F + if(!NP) F = (Ezg2 / (Ezg1 + Ezg2)).transpose(); + // update PI + if(!NF) + { + PI = Ezj; + PI.rowwise() /= PI.colwise().sum(); + } + protectPars(); +} + void FastPhaseK2::protectPars() { // protect F @@ -92,21 +127,6 @@ void FastPhaseK2::protectPars() HapSum.rowwise() /= HapSum.colwise().sum(); } -void FastPhaseK2::updateIteration() -{ - // update R - if(!NR) er = 1.0 - Ezj.colwise().sum() / N; - // update F - if(!NP) F = (Ezg2 / (Ezg1 + Ezg2)).transpose(); - // update PI - if(!NF) - { - PI = Ezj; - PI.rowwise() /= PI.colwise().sum(); - } - protectPars(); -} - /* ** @param GL genotype likelihood of all individuals in snp major form ** @param ind current individual i @@ -221,11 +241,12 @@ int run_impute_main(Options & opts) prevlike = loglike; cao.print(tim.date(), "run whole genome, iteration", it, ", likelihoods =", loglike, ", diff =", diff, ", time", tim.reltime(), " sec"); - if(diff < opts.ltol) - { - cao.print(tim.date(), "hit stopping criteria, diff =", std::scientific, diff, " <", opts.ltol); - break; - } + // if(diff < opts.ltol) + // { + // cao.print(tim.date(), "hit stopping criteria, diff =", std::scientific, diff, " <", opts.ltol); + // break; + // } + if(it > 4 && it < 30 && it % 4 == 1) faith.refillHaps(); } // reuse Ezj for AE if(opts.eHap) @@ -262,7 +283,7 @@ int run_impute_main(Options & opts) orecomb << faith.R.transpose().format(fmt) << "\n"; std::ofstream opi(opts.out + ".pi"); opi << faith.PI.transpose().format(fmt) << "\n"; - std::ofstream ohap(opts.out + ".hapsum"); + std::ofstream ohap(opts.out + ".hapfreq"); ohap << faith.HapSum.transpose().format(fmt) << "\n"; std::ofstream oae(opts.out + ".ae"); oae << faith.Ezj.transpose().format(fmt) << "\n"; diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 039b77c..ae40f2e 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -21,6 +21,7 @@ class FastPhaseK2 double alleleEmitThreshold{1e-6}; // threshold for P double clusterFreqThreshold{1e-6}; // threshold for F double admixtureThreshold{1e-6}; // threshold for Q + double minHapfreq{0.01}; // min haplotype frequency, or min(1/(10*C), 1/100) public: FastPhaseK2(int n, int m, int c, int seed) : N(n), M(m), C(c), CC(c * c) @@ -55,6 +56,7 @@ class FastPhaseK2 void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); + void refillHaps(); // re-sample F for sites with hapfreq < minHapfreq void protectPars(); void initIteration(); void updateIteration(); diff --git a/src/main.cpp b/src/main.cpp index d4669a1..1a5eca2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.4.0"}; + const std::string VERSION{"0.4.1"}; // below for catching ctrl+c, and dumping files struct sigaction sa; From 591735f10287336b5a52c3086d0144c620675830 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 31 Jan 2024 22:19:31 +0100 Subject: [PATCH 25/67] update settings for refill --- src/fastphase.cpp | 2 +- src/fastphase.hpp | 5 +++-- src/main.cpp | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 7ff08fc..e607dc0 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -246,7 +246,7 @@ int run_impute_main(Options & opts) // cao.print(tim.date(), "hit stopping criteria, diff =", std::scientific, diff, " <", opts.ltol); // break; // } - if(it > 4 && it < 30 && it % 4 == 1) faith.refillHaps(); + if(it > 4 && it < opts.nimpute && it % 4 == 1) faith.refillHaps(); } // reuse Ezj for AE if(opts.eHap) diff --git a/src/fastphase.hpp b/src/fastphase.hpp index ae40f2e..f25f3a0 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -18,8 +18,8 @@ class FastPhaseK2 std::default_random_engine rng = std::default_random_engine{}; // BOUNDING double minRate{0.1}, maxRate{100}; // threshold for R - double alleleEmitThreshold{1e-6}; // threshold for P - double clusterFreqThreshold{1e-6}; // threshold for F + double alleleEmitThreshold{1e-4}; // threshold for P + double clusterFreqThreshold{1e-4}; // threshold for F double admixtureThreshold{1e-6}; // threshold for Q double minHapfreq{0.01}; // min haplotype frequency, or min(1/(10*C), 1/100) @@ -32,6 +32,7 @@ class FastPhaseK2 PI.rowwise() /= PI.colwise().sum(); // normalize it per site R = MyArr2D(3, M); GP.setZero(M * 3, N); + minHapfreq = std::min(1.0 / (10 * C), minHapfreq); } ~FastPhaseK2() {} diff --git a/src/main.cpp b/src/main.cpp index 1a5eca2..d0dfafa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -48,11 +48,11 @@ int main(int argc, char * argv[]) .scan<'g', double>(); program.add_argument("-P", "--ptol") .help("lower boundary for P") - .default_value(1e-6) + .default_value(1e-4) .scan<'g', double>(); program.add_argument("-F", "--ftol") .help("lower boundary for F") - .default_value(1e-6) + .default_value(1e-4) .scan<'g', double>(); program.add_argument("-Q", "--qtol") .help("lower boundary for Q") From 1a70c851fb05996af1fbf01011e695224e9f73cf Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 1 Feb 2024 12:30:53 +0100 Subject: [PATCH 26/67] update call geno --- src/fastphase.cpp | 43 ++++++++++++++++++++++++------------------- src/fastphase.hpp | 2 +- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index e607dc0..bc50b43 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -44,9 +44,10 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ void FastPhaseK2::refillHaps() { - // bin hapsum per 100 snps ? + int s{0}; for(int c = 0; c < C; c++) { + // bin hapsum per 100 snps ? for(int m = 0; m < M; m++) { if(HapSum(c, m) >= minHapfreq) continue; @@ -57,9 +58,12 @@ void FastPhaseK2::refillHaps() std::discrete_distribution distribution{p.begin(), p.end()}; int choice = distribution(rng); assert(choice != c); + h.maxCoeff(&choice); // if no binning, this may be better F(m, c) = F(m, choice); + s++; } } + cao.warn("refill ", 100 * s / (C * M), "% infrequently used haps"); } void FastPhaseK2::initIteration() @@ -93,7 +97,7 @@ void FastPhaseK2::protectPars() { if(F.isNaN().any()) { - if(debug) cao.warn("NaN in F in FastPhaseK2 model. will fill it with AF"); + cao.warn("NaN in F in FastPhaseK2 model. will fill it with AF"); if(AF.size() == 0) cao.error("AF is not assigned!\n"); for(int i = 0; i < M; i++) F.row(i) = F.row(i).isNaN().select(AF(i), F.row(i)); } @@ -120,7 +124,7 @@ void FastPhaseK2::protectPars() if(PI.isNaN().any()) cao.warn("NaN in PI. reset cluster frequency to ", clusterFreqThreshold); PI = (PI < clusterFreqThreshold).select(clusterFreqThreshold, PI); PI = (PI > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, PI); - // re-normalize F per site. hope should work well. otherwise do the complicated. + // re-normalize F per site. hope should work okay. otherwise do the complicated. PI.rowwise() /= PI.colwise().sum(); } // norm HapSum @@ -162,7 +166,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i .sum(); if(s > 0) alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), s - 1).sum() * R(1, m); if(s == 0) ind_post_zj(z1, s) = (alpha.col(0) * beta.col(0)).segment(z1 * C, C).sum(); - if(finalIter) callGenoLoopC(ind, s, z1, gli, gamma_div_emit); + if(finalIter) callGenoLoopC(z1, m, ind, gli.row(s), F.row(m), gamma_div_emit); } if(s == 0) continue; alphatmp += PI.col(m) * R(2, m) * 1.0; // inner alpha.col(s-1).sum == 1 @@ -181,19 +185,24 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i return (1 / cs).log().sum(); } -void FastPhaseK2::callGenoLoopC(int ind, int s, int z1, const MyArr2D & gli, const MyArr1D & gamma_div_emit) +void FastPhaseK2::callGenoLoopC(int z1, + int m, + int ind, + const MyArr1D & gli, + const MyArr1D & p, + const MyArr1D & gamma_div_emit) { MyArr1D tmp_zg(4); for(int z2 = 0; z2 < C; z2++) { int z12 = z1 * C + z2; - tmp_zg(0) = gli(s, 0) * (1 - F(s, z1)) * (1 - F(s, z2)); - tmp_zg(1) = gli(s, 1) * (1 - F(s, z1)) * F(s, z2); - tmp_zg(2) = gli(s, 1) * F(s, z1) * (1 - F(s, z2)); - tmp_zg(3) = gli(s, 2) * F(s, z1) * F(s, z2); - GP(3 * s + 0, ind) += gamma_div_emit(z12) * tmp_zg(0); - GP(3 * s + 1, ind) += gamma_div_emit(z12) * (tmp_zg(1) + tmp_zg(2)); - GP(3 * s + 2, ind) += gamma_div_emit(z12) * tmp_zg(3); + tmp_zg(0) = gli(0) * (1 - p(z1)) * (1 - p(z2)); + tmp_zg(1) = gli(1) * (1 - p(z1)) * p(z2); + tmp_zg(2) = gli(1) * p(z1) * (1 - p(z2)); + tmp_zg(3) = gli(2) * p(z1) * p(z2); + GP(3 * m + 0, ind) += gamma_div_emit(z12) * tmp_zg(0); + GP(3 * m + 1, ind) += gamma_div_emit(z12) * (tmp_zg(1) + tmp_zg(2)); + GP(3 * m + 2, ind) += gamma_div_emit(z12) * tmp_zg(3); } } @@ -230,9 +239,11 @@ int run_impute_main(Options & opts) for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { tim.clock(); + if(it > 4 && it < opts.nimpute && it % 4 == 1) faith.refillHaps(); faith.initIteration(); for(int i = 0; i < faith.N; i++) - res.emplace_back(pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, false)); + res.emplace_back( + pool.enqueue(&FastPhaseK2::runAllChunks, &faith, std::ref(genome->gls), i, it == opts.nimpute)); loglike = 0; for(auto && ll : res) loglike += ll.get(); res.clear(); // clear future and renew @@ -241,12 +252,6 @@ int run_impute_main(Options & opts) prevlike = loglike; cao.print(tim.date(), "run whole genome, iteration", it, ", likelihoods =", loglike, ", diff =", diff, ", time", tim.reltime(), " sec"); - // if(diff < opts.ltol) - // { - // cao.print(tim.date(), "hit stopping criteria, diff =", std::scientific, diff, " <", opts.ltol); - // break; - // } - if(it > 4 && it < opts.nimpute && it % 4 == 1) faith.refillHaps(); } // reuse Ezj for AE if(opts.eHap) diff --git a/src/fastphase.hpp b/src/fastphase.hpp index f25f3a0..68ef61b 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -61,7 +61,7 @@ class FastPhaseK2 void protectPars(); void initIteration(); void updateIteration(); - void callGenoLoopC(int, int, int, const MyArr2D &, const MyArr1D &); + void callGenoLoopC(int, int, int, const MyArr1D &, const MyArr1D &, const MyArr1D &); double hmmIterWithJumps(const MyFloat1D &, const int, const int, bool); double runAllChunks(const MyFloat2D &, const int, bool); }; From 1b80bdb39d98d0ef2190d23ad748fabcbf25a396 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 1 Feb 2024 12:41:24 +0100 Subject: [PATCH 27/67] write vcf --- src/fastphase.cpp | 5 +- src/main.cpp | 2 +- src/vcfpp.h | 168 +++++++++++++++++++++++----------------------- 3 files changed, 88 insertions(+), 87 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index bc50b43..961cc79 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -239,7 +239,7 @@ int run_impute_main(Options & opts) for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { tim.clock(); - if(it > 4 && it < opts.nimpute && it % 4 == 1) faith.refillHaps(); + if(it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(); faith.initIteration(); for(int i = 0; i < faith.N; i++) res.emplace_back( @@ -266,6 +266,7 @@ int run_impute_main(Options & opts) { faith.Ezj = get_cluster_frequency(faith.R, faith.PI); } + auto bw = make_bcfwriter(opts.out + ".vcf.gz", genome->chrs, genome->sampleids); for(int ic = 0; ic < genome->nchunks; ic++) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; @@ -277,6 +278,8 @@ int run_impute_main(Options & opts) genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); out = faith.F.middleRows(faith.pos_chunk[ic], S); genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.GP.middleRows(faith.pos_chunk[ic], S * 3); + write_bigass_to_bcf(bw, out.data(), genome->chrs[ic], genome->pos[ic]); } constexpr auto OPTIONS = alpaca::options::fixed_length_encoding; std::ofstream ofs(opts.out + ".pars.bin", std::ios::out | std::ios::binary); diff --git a/src/main.cpp b/src/main.cpp index d0dfafa..d637396 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -156,7 +156,7 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_impute.add_argument("-n", "--threads") .help("number of threads") - .default_value(1) + .default_value(10) .scan<'i', int>(); cmd_impute.add_argument("-o", "--out") .help("output prefix") diff --git a/src/vcfpp.h b/src/vcfpp.h index 11e0d0a..479796d 100644 --- a/src/vcfpp.h +++ b/src/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.3.1 + * @version v0.3.3 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -22,7 +22,7 @@ * \section install_sec Installation * * - include "vcfpp.h" to your program and compile it by g++ my.cpp -std=c++11 -Wall -I. -lhts - * -lz -lm -lbz2 -llzma -lcurl + * - * - make sure you have https://github.com/samtools/htslib installed on your system and the it is in your * environment. * @@ -86,12 +86,6 @@ using isValidGT = typename std::enable_if>::va || std::is_same>::value, bool>::type; -template -using isGtVector = typename std::enable_if< - std::is_same>::value || std::is_same>::value - || std::is_same::value || std::is_same>::value, - bool>::type; - template using isFormatVector = typename std::enable_if>::value || std::is_same>::value @@ -147,8 +141,8 @@ inline std::vector split_string(const std::string & s, const std::s /** * @class BcfHeader - * @brief Object represents the header in VCF - * @note nothing important + * @brief Object represents header of the VCF, offering methods to access and modify the tags + * @note BcfHeader has 3 friends, BcfReader, BcfWriter and BcfRecord. **/ class BcfHeader { @@ -165,7 +159,7 @@ class BcfHeader ~BcfHeader() {} - /** @brief print out the header */ + /** @brief stream out the header */ friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h) { out << h.asString(); @@ -354,8 +348,8 @@ class BcfHeader /** * @class BcfRecord - * @brief Object represents a record in VCF - * @note the object is constructed using a BcfHeader object and needs to be filled in by calling + * @brief Object represents a variant record in the VCF, offering methods to access and modify fields. + * @note BcfRecord has to be associated with a BcfHeader object and needs to be filled in by calling *BcfReader.getNextVariant function. **/ class BcfRecord @@ -381,7 +375,10 @@ class BcfRecord std::vector isGenoMissing; public: - /** @brief initilize a BcfRecord object using a given BcfHeader object. */ + /// empty constructor. call init() afterwards + BcfRecord() {} + + /// constructor with a given BcfHeader object BcfRecord(const BcfHeader & h) : header(h) { nsamples = header.nSamples(); @@ -391,6 +388,15 @@ class BcfRecord ~BcfRecord() {} + /// initilize a BcfRecord object using a given BcfHeader object + void init(const BcfHeader & h) + { + header = h; + nsamples = header.nSamples(); + typeOfGT.resize(nsamples); + gtPhase.resize(nsamples, 0); + } + /** @brief stream out the variant */ friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v) { @@ -409,13 +415,12 @@ class BcfRecord } /** - * @brief fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. genotypes with -missing allele is coded as heterozygous - * @param v valid input are vector vector type + * @brief fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with + * missing allele is coded as heterozygous + * @param v valid input includes vector and vector type * @return bool - * @note user can use isNoneMissing() to check if there is genotype with missingness. then one can decide -if the default behaviour of this function is desired. Alternatively, user can use vector as the input -type as noted in the other overloading function. + * @note use isNoneMissing() to check if all genotypes are with no missingness. Alternatively, one can + * use vector as the input type as noted in the other overloading function getGenotypes(). * */ template isValidGT getGenotypes(T & v) @@ -475,7 +480,7 @@ type as noted in the other overloading function. } /** - * @brief fill in the input vector with genotyps, 0, 1 or -9 (missing). + * @brief fill in the input vector with genotype values, 0, 1 or -9 (missing). * @param v valid input is vector type * @return bool * @note this function provides full capability to handl all kinds of genotypes in multi-ploidy data with @@ -485,7 +490,9 @@ type as noted in the other overloading function. { ndst = 0; ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); - if(ret <= 0) throw std::runtime_error("genotypes not present"); + if(ret <= 0) + throw std::runtime_error( + "genotypes not present. make sure you initilized the variant object first\n"); v.resize(ret); isGenoMissing.assign(nsamples, 0); nploidy = ret / nsamples; @@ -567,7 +574,7 @@ type as noted in the other overloading function. bool getFORMAT(std::string tag, std::vector & v) { fmt = bcf_get_fmt(header.hdr, line, tag.c_str()); - if(!fmt) throw std::runtime_error("there is no " + tag + " in FORMAT of this variant.\n"); + if(!fmt) throw std::runtime_error("there is no " + tag + " in FORMAT for this variant of ID=" + ID()); nvalues = fmt->n; // if ndst < (fmt->n+1)*nsmpl; then realloc is involved ret = -1, ndst = 0; @@ -722,7 +729,7 @@ type as noted in the other overloading function. return true; } - /** remove the given tag from INFO*/ + /// remove the given tag from INFO of the variant void removeINFO(std::string tag) { ret = -1; @@ -740,23 +747,24 @@ type as noted in the other overloading function. } /** - * @brief set genotypes from scratch assume genotype not present - * @param v valid input includevector, vector, vector, std::string + * @brief set genotypes from scratch even if genotypes not present + * @param v the genotypes of vector type * @return bool * */ - template - isGtVector setGenotypes(const T & v) + bool setGenotypes(const std::vector & v) { // bcf_gt_type int i, j, k; nploidy = v.size() / nsamples; - gts = (int *)malloc(nsamples * nploidy * sizeof(int)); + gts = (int32_t *)malloc(v.size() * sizeof(int32_t)); for(i = 0; i < nsamples; i++) { for(j = 0; j < nploidy; j++) { k = i * nploidy + j; - if(gtPhase[i]) + if(v[k] == -9 || v[k] == bcf_int32_missing) + gts[k] = bcf_gt_missing; + else if(gtPhase[i]) gts[k] = bcf_gt_phased(v[k]); else gts[k] = bcf_gt_unphased(v[k]); @@ -768,48 +776,30 @@ type as noted in the other overloading function. return true; } - /** - * @brief update genotypes for current record, assume genotypes present - * @param v valid input includevector, vector, vector, std::string - * @return bool - * */ - template - isGtVector updateGenotypes(const T & v) - { - // bcf_gt_type - ndst = 0; - ret = bcf_get_genotypes(header.hdr, line, >s, &ndst); - if(ret <= 0) throw std::runtime_error("genotypes not present for current record.\n"); - assert(ret == v.size()); - nploidy = ret / nsamples; - int i, j, k; - for(i = 0; i < nsamples; i++) - { - for(j = 0; j < nploidy; j++) - { - k = i * nploidy + j; - if(gtPhase[i]) - gts[k] = bcf_gt_phased(v[k]); - else - gts[k] = bcf_gt_unphased(v[k]); - } - } - if(bcf_update_genotypes(header.hdr, line, gts, ret) < 0) - throw std::runtime_error("couldn't set genotypes correctly.\n"); - else - return true; - } - /** * @brief set phasing status for all diploid samples using given vector * @param v valid input includes vector * */ void setPhasing(const std::vector & v) { - assert(v.size() == nsamples); + assert((int)v.size() == nsamples); gtPhase = v; } + /// remove the given tag from FORMAT of the variant + void removeFORMAT(std::string tag) + { + ret = -1; + int tag_id = bcf_hdr_id2int(header.hdr, BCF_DT_ID, tag.c_str()); + if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff)) + ret = bcf_update_format_int32(header.hdr, line, tag.c_str(), NULL, 0); + else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff)) + ret = bcf_update_format_char(header.hdr, line, tag.c_str(), NULL, 0); + else if(bcf_hdr_id2type(header.hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff)) + ret = bcf_update_format_float(header.hdr, line, tag.c_str(), NULL, 0); + if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n"); + } + /** * @brief set tag values for all samples in FORMAT using given vector * @param tag valid tag name in FORMAT column declared in the VCF header @@ -1030,12 +1020,6 @@ type as noted in the other overloading function. return line->pos + 1; } - /** @brief modify position given 1-based value */ - inline void setAlleleStr(const char * alleles_string) - { - bcf_update_alleles_str(header.hdr, line, alleles_string); - } - /** @brief modify CHROM value */ inline void setCHR(const char * chr) { @@ -1048,6 +1032,18 @@ type as noted in the other overloading function. line->pos = p - 1; } + /** @brief update ID */ + inline void setID(const char * s) + { + bcf_update_id(header.hdr, line, s); + } + + /** @brief set REF and ALT alleles given a string seperated by comma */ + inline void setRefAlt(const char * alleles_string) + { + bcf_update_alleles_str(header.hdr, line, alleles_string); + } + /** @brief return 0-base start of the variant (can be any type) */ inline int64_t Start() const { @@ -1212,12 +1208,6 @@ type as noted in the other overloading function. nploidy = v; } - /// return the shape of current tag in FORMAT (nsamples x nvalues) - inline std::tuple shapeOfQuery() const - { - return std::make_tuple(nsamples, nvalues); - } - /** * @brief vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA, * GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). @@ -1237,8 +1227,7 @@ type as noted in the other overloading function. /** * @class BcfReader - * @brief Stream in variants from vcf/bcf file or stdin - * @note nothing important + * @brief Stream in variants from compressed/uncompressed VCF/BCF file or stdin **/ class BcfReader { @@ -1252,11 +1241,11 @@ class BcfReader bool isBcf; // if the input file is bcf or vcf; public: - /** @brief a BcfHeader object */ + /// a BcfHeader object BcfHeader header; - /** @brief number of samples in the VCF */ + /// number of samples in the VCF int nsamples; - /** @brief number of samples in the VCF */ + /// number of samples in the VCF std::vector SamplesName; /// Construct an empty BcfReader @@ -1296,10 +1285,8 @@ class BcfReader BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file) { open(file); - header.setSamples(samples); - nsamples = bcf_hdr_nsamples(header.hdr); if(!region.empty()) setRegion(region); - SamplesName = header.getSamples(); + if(!samples.empty()) setSamples(samples); } /// return a BcfHeader object @@ -1355,6 +1342,18 @@ class BcfReader return c; } + /** + * @brief explicitly stream to specific samples + * @param samples the string is bcftools-like format, which is comma separated list of samples to include + * (or exclude with "^" prefix). + * */ + void setSamples(const std::string & samples) + { + header.setSamples(samples); + nsamples = bcf_hdr_nsamples(header.hdr); + SamplesName = header.getSamples(); + } + /** * @brief explicitly stream to specific region * @param region the string is samtools-like format which is chr:start-end @@ -1424,8 +1423,7 @@ class BcfReader /** * @class BcfWriter - * @brief Stream out variants to vcf/bcf file or stdout - * @note nothing important + * @brief Stream out variants to compressed/uncompressed VCF/BCF file or stdout **/ class BcfWriter { From 780eac9144f63cddaee89dd232b0424c8bb859d8 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 1 Feb 2024 15:14:25 +0100 Subject: [PATCH 28/67] add --refill-haps option --- src/common.hpp | 2 +- src/fastphase.cpp | 2 +- src/main.cpp | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index a82bb8c..6e26a80 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -97,7 +97,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, refillHaps{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 961cc79..544f2ce 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -239,7 +239,7 @@ int run_impute_main(Options & opts) for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { tim.clock(); - if(it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(); + if(opts.refillHaps && it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(); faith.initIteration(); for(int i = 0; i < faith.N; i++) res.emplace_back( diff --git a/src/main.cpp b/src/main.cpp index d637396..8500ead 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -180,6 +180,10 @@ int main(int argc, char * argv[]) .help("write Hapsum instead of AE into parse.bin") .default_value(false) .implicit_value(true); + cmd_impute.add_argument("--refill-haps") + .help("refill infrequently used haplotype clusters") + .default_value(false) + .implicit_value(true); cmd_impute.add_argument("--minRecombRate") .help("min recombination rate to determine if a SNP should be collapsed") .default_value(1e-4) @@ -293,6 +297,7 @@ int main(int argc, char * argv[]) opts.single_chunk = cmd_impute.get("--single-chunk"); opts.eHap = cmd_impute.get("--write-hapsum"); opts.collapse = cmd_impute.get("--collapse"); + opts.refillHaps = cmd_impute.get("--refill-haps"); opts.tol_r = cmd_impute.get("--minRecombRate"); if(opts.single_chunk) opts.chunksize = INT_MAX; if((opts.in_beagle.empty() && opts.in_vcf.empty()) || cmd_impute.get("--help")) From 51737c867e23800326fd6737ba76965b076c9f34 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 1 Feb 2024 15:44:55 +0100 Subject: [PATCH 29/67] add --refill-haps values --- src/common.hpp | 4 ++-- src/fastphase.cpp | 19 +++++++++++++++---- src/fastphase.hpp | 2 +- src/main.cpp | 8 ++++---- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 6e26a80..10d3ee8 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -91,13 +91,13 @@ inline MatrixType RandomUniform(const Eigen::Index numRows, struct Options { int ichunk{0}, chunksize{10000}, K{2}, C{10}, nadmix{1000}, nimpute{40}, nthreads{1}, seed{999}; - int gridsize{1}; + int gridsize{1}, refillHaps{0}; double ltol{1e-1}, info{0}, tol_pi{0.99}, tol_r{1e-5}; double ptol{1e-6}; // threshold for P double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, refillHaps{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 544f2ce..caf60fe 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -42,7 +42,7 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ NR = nR; } -void FastPhaseK2::refillHaps() +void FastPhaseK2::refillHaps(int strategy) { int s{0}; for(int c = 0; c < C; c++) @@ -58,8 +58,19 @@ void FastPhaseK2::refillHaps() std::discrete_distribution distribution{p.begin(), p.end()}; int choice = distribution(rng); assert(choice != c); - h.maxCoeff(&choice); // if no binning, this may be better - F(m, c) = F(m, choice); + if(strategy == 1) + { + F(m, c) = alleleEmitThreshold; + } + else if(strategy == 2) + { + h.maxCoeff(&choice); // if no binning, this may be better + F(m, c) = F(m, choice); + } + else + { + F(m, c) = F(m, choice); + } s++; } } @@ -239,7 +250,7 @@ int run_impute_main(Options & opts) for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { tim.clock(); - if(opts.refillHaps && it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(); + if(opts.refillHaps && it > 4 && it < opts.nimpute / 2 && it % 4 == 1) faith.refillHaps(opts.refillHaps); faith.initIteration(); for(int i = 0; i < faith.N; i++) res.emplace_back( diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 68ef61b..60312a1 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -57,7 +57,7 @@ class FastPhaseK2 void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); - void refillHaps(); // re-sample F for sites with hapfreq < minHapfreq + void refillHaps(int); // re-sample F for sites with hapfreq < minHapfreq void protectPars(); void initIteration(); void updateIteration(); diff --git a/src/main.cpp b/src/main.cpp index 8500ead..621760a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -181,9 +181,9 @@ int main(int argc, char * argv[]) .default_value(false) .implicit_value(true); cmd_impute.add_argument("--refill-haps") - .help("refill infrequently used haplotype clusters") - .default_value(false) - .implicit_value(true); + .help("refill infrequently used haplotype clusters.\n 0: disable this;\n 1: reset P to min allele emission probability for that haplotype cluster;\n 2: re-sample P by copying from haplotype with the highest probability;\n 3: re-sample P by copying from others with respect to their probability.") + .default_value(0) + .scan<'i', int>(); cmd_impute.add_argument("--minRecombRate") .help("min recombination rate to determine if a SNP should be collapsed") .default_value(1e-4) @@ -297,7 +297,7 @@ int main(int argc, char * argv[]) opts.single_chunk = cmd_impute.get("--single-chunk"); opts.eHap = cmd_impute.get("--write-hapsum"); opts.collapse = cmd_impute.get("--collapse"); - opts.refillHaps = cmd_impute.get("--refill-haps"); + opts.refillHaps = cmd_impute.get("--refill-haps"); opts.tol_r = cmd_impute.get("--minRecombRate"); if(opts.single_chunk) opts.chunksize = INT_MAX; if((opts.in_beagle.empty() && opts.in_vcf.empty()) || cmd_impute.get("--help")) From bd9acc4432ccecd60991f0e70220915d6f571e26 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 2 Feb 2024 13:25:52 +0100 Subject: [PATCH 30/67] update argparse to 0.3.0 --- inst/include/argparse/argparse.hpp | 648 +++++++++++++++++++++++++---- src/main.cpp | 46 +- 2 files changed, 587 insertions(+), 107 deletions(-) diff --git a/inst/include/argparse/argparse.hpp b/inst/include/argparse/argparse.hpp index b77787b..0c85127 100644 --- a/inst/include/argparse/argparse.hpp +++ b/inst/include/argparse/argparse.hpp @@ -29,6 +29,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #pragma once + +#ifndef ARGPARSE_MODULE_USE_STD_MODULE #include #include #include @@ -53,6 +55,7 @@ SOFTWARE. #include #include #include +#endif namespace argparse { @@ -72,7 +75,7 @@ struct HasContainerTraits< decltype(std::declval().size())>> : std::true_type {}; template -static constexpr bool IsContainer = HasContainerTraits::value; +inline constexpr bool IsContainer = HasContainerTraits::value; template struct HasStreamableTraits : std::false_type {}; @@ -84,7 +87,7 @@ struct HasStreamableTraits< : std::true_type {}; template -static constexpr bool IsStreamable = HasStreamableTraits::value; +inline constexpr bool IsStreamable = HasStreamableTraits::value; constexpr std::size_t repr_max_container_size = 5; @@ -145,6 +148,7 @@ constexpr bool standard_unsigned_integer = true; } // namespace +constexpr int radix_2 = 2; constexpr int radix_8 = 8; constexpr int radix_10 = 10; constexpr int radix_16 = 16; @@ -180,12 +184,28 @@ constexpr bool starts_with(std::basic_string_view prefix, } enum class chars_format { - scientific = 0x1, - fixed = 0x2, - hex = 0x4, + scientific = 0xf1, + fixed = 0xf2, + hex = 0xf4, + binary = 0xf8, general = fixed | scientific }; +struct ConsumeBinaryPrefixResult { + bool is_binary; + std::string_view rest; +}; + +constexpr auto consume_binary_prefix(std::string_view s) + -> ConsumeBinaryPrefixResult { + if (starts_with(std::string_view{"0b"}, s) || + starts_with(std::string_view{"0B"}, s)) { + s.remove_prefix(2); + return {true, s}; + } + return {false, s}; +} + struct ConsumeHexPrefixResult { bool is_hexadecimal; std::string_view rest; @@ -211,13 +231,14 @@ inline auto do_from_chars(std::string_view s) -> T { if (ptr == last) { return x; } - throw std::invalid_argument{"pattern does not match to the end"}; + throw std::invalid_argument{"pattern '" + std::string(s) + + "' does not match to the end"}; } if (ec == std::errc::invalid_argument) { - throw std::invalid_argument{"pattern not found"}; + throw std::invalid_argument{"pattern '" + std::string(s) + "' not found"}; } if (ec == std::errc::result_out_of_range) { - throw std::range_error{"not representable"}; + throw std::range_error{"'" + std::string(s) + "' not representable"}; } return x; // unreachable } @@ -228,25 +249,97 @@ template struct parse_number { } }; -template struct parse_number { +template struct parse_number { auto operator()(std::string_view s) -> T { - if (auto [ok, rest] = consume_hex_prefix(s); ok) { - return do_from_chars(rest); + if (auto [ok, rest] = consume_binary_prefix(s); ok) { + return do_from_chars(rest); } throw std::invalid_argument{"pattern not found"}; } }; +template struct parse_number { + auto operator()(std::string_view s) -> T { + if (starts_with("0x"sv, s) || starts_with("0X"sv, s)) { + if (auto [ok, rest] = consume_hex_prefix(s); ok) { + try { + return do_from_chars(rest); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } + } + } else { + // Allow passing hex numbers without prefix + // Shape 'x' already has to be specified + try { + return do_from_chars(s); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } + } + + throw std::invalid_argument{"pattern '" + std::string(s) + + "' not identified as hexadecimal"}; + } +}; + template struct parse_number { auto operator()(std::string_view s) -> T { auto [ok, rest] = consume_hex_prefix(s); if (ok) { - return do_from_chars(rest); + try { + return do_from_chars(rest); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as hexadecimal: " + err.what()); + } + } + + auto [ok_binary, rest_binary] = consume_binary_prefix(s); + if (ok_binary) { + try { + return do_from_chars(rest_binary); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as binary: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as binary: " + err.what()); + } } + if (starts_with("0"sv, s)) { - return do_from_chars(rest); + try { + return do_from_chars(rest); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as octal: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as octal: " + err.what()); + } + } + + try { + return do_from_chars(rest); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + std::string(s) + + "' as decimal integer: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + std::string(s) + + "' as decimal integer: " + err.what()); } - return do_from_chars(rest); } }; @@ -261,7 +354,7 @@ template <> inline const auto generic_strtod = strtold; template inline auto do_strtod(std::string const &s) -> T { if (isspace(static_cast(s[0])) || s[0] == '+') { - throw std::invalid_argument{"pattern not found"}; + throw std::invalid_argument{"pattern '" + s + "' not found"}; } auto [first, last] = pointer_range(s); @@ -273,10 +366,11 @@ template inline auto do_strtod(std::string const &s) -> T { if (ptr == last) { return x; } - throw std::invalid_argument{"pattern does not match to the end"}; + throw std::invalid_argument{"pattern '" + s + + "' does not match to the end"}; } if (errno == ERANGE) { - throw std::range_error{"not representable"}; + throw std::range_error{"'" + s + "' not representable"}; } return x; // unreachable } @@ -287,8 +381,20 @@ template struct parse_number { throw std::invalid_argument{ "chars_format::general does not parse hexfloat"}; } + if (auto r = consume_binary_prefix(s); r.is_binary) { + throw std::invalid_argument{ + "chars_format::general does not parse binfloat"}; + } - return do_strtod(s); + try { + return do_strtod(s); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + s + + "' as number: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + s + + "' as number: " + err.what()); + } } }; @@ -297,6 +403,31 @@ template struct parse_number { if (auto r = consume_hex_prefix(s); !r.is_hexadecimal) { throw std::invalid_argument{"chars_format::hex parses hexfloat"}; } + if (auto r = consume_binary_prefix(s); r.is_binary) { + throw std::invalid_argument{"chars_format::hex does not parse binfloat"}; + } + + try { + return do_strtod(s); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + s + + "' as hexadecimal: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + s + + "' as hexadecimal: " + err.what()); + } + } +}; + +template struct parse_number { + auto operator()(std::string const &s) -> T { + if (auto r = consume_hex_prefix(s); r.is_hexadecimal) { + throw std::invalid_argument{ + "chars_format::binary does not parse hexfloat"}; + } + if (auto r = consume_binary_prefix(s); !r.is_binary) { + throw std::invalid_argument{"chars_format::binary parses binfloat"}; + } return do_strtod(s); } @@ -308,12 +439,24 @@ template struct parse_number { throw std::invalid_argument{ "chars_format::scientific does not parse hexfloat"}; } + if (auto r = consume_binary_prefix(s); r.is_binary) { + throw std::invalid_argument{ + "chars_format::scientific does not parse binfloat"}; + } if (s.find_first_of("eE") == std::string::npos) { throw std::invalid_argument{ "chars_format::scientific requires exponent part"}; } - return do_strtod(s); + try { + return do_strtod(s); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + s + + "' as scientific notation: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + s + + "' as scientific notation: " + err.what()); + } } }; @@ -323,12 +466,24 @@ template struct parse_number { throw std::invalid_argument{ "chars_format::fixed does not parse hexfloat"}; } + if (auto r = consume_binary_prefix(s); r.is_binary) { + throw std::invalid_argument{ + "chars_format::fixed does not parse binfloat"}; + } if (s.find_first_of("eE") != std::string::npos) { throw std::invalid_argument{ "chars_format::fixed does not parse exponent part"}; } - return do_strtod(s); + try { + return do_strtod(s); + } catch (const std::invalid_argument &err) { + throw std::invalid_argument("Failed to parse '" + s + + "' as fixed notation: " + err.what()); + } catch (const std::range_error &err) { + throw std::range_error("Failed to parse '" + s + + "' as fixed notation: " + err.what()); + } } }; @@ -347,6 +502,65 @@ std::string join(StrIt first, StrIt last, const std::string &separator) { return value.str(); } +template struct can_invoke_to_string { + template + static auto test(int) + -> decltype(std::to_string(std::declval()), std::true_type{}); + + template static auto test(...) -> std::false_type; + + static constexpr bool value = decltype(test(0))::value; +}; + +template struct IsChoiceTypeSupported { + using CleanType = typename std::decay::type; + static const bool value = std::is_integral::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; + +template +std::size_t get_levenshtein_distance(const StringType &s1, + const StringType &s2) { + std::vector> dp( + s1.size() + 1, std::vector(s2.size() + 1, 0)); + + for (std::size_t i = 0; i <= s1.size(); ++i) { + for (std::size_t j = 0; j <= s2.size(); ++j) { + if (i == 0) { + dp[i][j] = j; + } else if (j == 0) { + dp[i][j] = i; + } else if (s1[i - 1] == s2[j - 1]) { + dp[i][j] = dp[i - 1][j - 1]; + } else { + dp[i][j] = 1 + std::min({dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]}); + } + } + } + + return dp[s1.size()][s2.size()]; +} + +template +std::string_view +get_most_similar_string(const std::map &map, + const std::string_view input) { + std::string_view most_similar{}; + std::size_t min_distance = std::numeric_limits::max(); + + for (const auto &entry : map) { + std::size_t distance = get_levenshtein_distance(entry.first, input); + if (distance < min_distance) { + min_distance = distance; + most_similar = entry.first; + } + } + + return most_similar; +} + } // namespace details enum class nargs_pattern { optional, any, at_least_one }; @@ -404,7 +618,15 @@ class Argument { } template Argument &default_value(T &&value) { + m_num_args_range = NArgsRange{0, m_num_args_range.get_max()}; m_default_value_repr = details::repr(value); + + if constexpr (std::is_convertible_v) { + m_default_value_str = std::string{std::string_view{value}}; + } else if constexpr (details::can_invoke_to_string::value) { + m_default_value_str = std::to_string(value); + } + m_default_value = std::forward(value); return *this; } @@ -424,8 +646,18 @@ class Argument { return *this; } + // This is shorthand for: + // program.add_argument("foo") + // .default_value(false) + // .implicit_value(true) + Argument &flag() { + default_value(false); + implicit_value(true); + return *this; + } + template - auto action(F &&callable, Args &&... bound_args) + auto action(F &&callable, Args &&...bound_args) -> std::enable_if_t, Argument &> { using action_type = std::conditional_t< @@ -465,6 +697,9 @@ class Argument { } else if constexpr (is_one_of(Shape, 'u') && details::standard_unsigned_integer) { action(details::parse_number()); + } else if constexpr (is_one_of(Shape, 'b') && + details::standard_unsigned_integer) { + action(details::parse_number()); } else if constexpr (is_one_of(Shape, 'o') && details::standard_unsigned_integer) { action(details::parse_number()); @@ -506,10 +741,12 @@ class Argument { m_num_args_range = NArgsRange{0, 1}; break; case nargs_pattern::any: - m_num_args_range = NArgsRange{0, (std::numeric_limits::max)()}; + m_num_args_range = + NArgsRange{0, (std::numeric_limits::max)()}; break; case nargs_pattern::at_least_one: - m_num_args_range = NArgsRange{1, (std::numeric_limits::max)()}; + m_num_args_range = + NArgsRange{1, (std::numeric_limits::max)()}; break; } return *this; @@ -520,6 +757,82 @@ class Argument { return nargs(nargs_pattern::any); } + template void add_choice(T &&choice) { + static_assert(details::IsChoiceTypeSupported::value, + "Only string or integer type supported for choice"); + static_assert(std::is_convertible_v || + details::can_invoke_to_string::value, + "Choice is not convertible to string_type"); + if (!m_choices.has_value()) { + m_choices = std::vector{}; + } + + if constexpr (std::is_convertible_v) { + m_choices.value().push_back( + std::string{std::string_view{std::forward(choice)}}); + } else if constexpr (details::can_invoke_to_string::value) { + m_choices.value().push_back(std::to_string(std::forward(choice))); + } + } + + Argument &choices() { + if (!m_choices.has_value()) { + throw std::runtime_error("Zero choices provided"); + } + return *this; + } + + template + Argument &choices(T &&first, U &&...rest) { + add_choice(std::forward(first)); + choices(std::forward(rest)...); + return *this; + } + + void find_default_value_in_choices_or_throw() const { + + const auto &choices = m_choices.value(); + + if (m_default_value.has_value()) { + if (std::find(choices.begin(), choices.end(), m_default_value_str) == + choices.end()) { + // provided arg not in list of allowed choices + // report error + + std::string choices_as_csv = + std::accumulate(choices.begin(), choices.end(), std::string(), + [](const std::string &a, const std::string &b) { + return a + (a.empty() ? "" : ", ") + b; + }); + + throw std::runtime_error( + std::string{"Invalid default value "} + m_default_value_repr + + " - allowed options: {" + choices_as_csv + "}"); + } + } + } + + template + void find_value_in_choices_or_throw(Iterator it) const { + + const auto &choices = m_choices.value(); + + if (std::find(choices.begin(), choices.end(), *it) == choices.end()) { + // provided arg not in list of allowed choices + // report error + + std::string choices_as_csv = + std::accumulate(choices.begin(), choices.end(), std::string(), + [](const std::string &a, const std::string &b) { + return a + (a.empty() ? "" : ", ") + b; + }); + + throw std::runtime_error(std::string{"Invalid argument "} + + details::repr(*it) + " - allowed options: {" + + choices_as_csv + "}"); + } + } + template Iterator consume(Iterator start, Iterator end, std::string_view used_name = {}) { @@ -529,6 +842,14 @@ class Argument { m_is_used = true; m_used_name = used_name; + if (m_choices.has_value()) { + // Check each value in (start, end) and make sure + // it is in the list of allowed choices/options + for (auto it = start; it != end; ++it) { + find_value_in_choices_or_throw(it); + } + } + const auto num_args_max = m_num_args_range.get_max(); const auto num_args_min = m_num_args_range.get_min(); std::size_t dist = 0; @@ -599,6 +920,34 @@ class Argument { throw_nargs_range_validation_error(); } } + + if (m_choices.has_value()) { + // Make sure the default value (if provided) + // is in the list of choices + find_default_value_in_choices_or_throw(); + } + } + + std::string get_names_csv(char separator = ',') const { + return std::accumulate( + m_names.begin(), m_names.end(), std::string{""}, + [&](const std::string &result, const std::string &name) { + return result.empty() ? name : result + separator + name; + }); + } + + std::string get_usage_full() const { + std::stringstream usage; + + usage << get_names_csv('/'); + const std::string metavar = !m_metavar.empty() ? m_metavar : "VAR"; + if (m_num_args_range.get_max() > 0) { + usage << " " << metavar; + if (m_num_args_range.get_max() > 1) { + usage << "..."; + } + } + return usage.str(); } std::string get_inline_usage() const { @@ -673,7 +1022,36 @@ class Argument { name_stream << " " << argument.m_metavar; } } - stream << name_stream.str() << "\t" << argument.m_help; + + // align multiline help message + auto stream_width = stream.width(); + auto name_padding = std::string(name_stream.str().size(), ' '); + auto pos = std::string::size_type{}; + auto prev = std::string::size_type{}; + auto first_line = true; + auto hspace = " "; // minimal space between name and help message + stream << name_stream.str(); + std::string_view help_view(argument.m_help); + while ((pos = argument.m_help.find('\n', prev)) != std::string::npos) { + auto line = help_view.substr(prev, pos - prev + 1); + if (first_line) { + stream << hspace << line; + first_line = false; + } else { + stream.width(stream_width); + stream << name_padding << hspace << line; + } + prev += pos - prev + 1; + } + if (first_line) { + stream << hspace << argument.m_help; + } else { + auto leftover = help_view.substr(prev, argument.m_help.size() - prev); + if (!leftover.empty()) { + stream.width(stream_width); + stream << name_padding << hspace << leftover; + } + } // print nargs spec if (!argument.m_help.empty()) { @@ -706,8 +1084,7 @@ class Argument { using ValueType = typename T::value_type; auto lhs = get(); return std::equal(std::begin(lhs), std::end(lhs), std::begin(rhs), - std::end(rhs), - [](const auto &a, const auto &b) { + std::end(rhs), [](const auto &a, const auto &b) { return std::any_cast(a) == b; }); } @@ -1031,7 +1408,10 @@ class Argument { std::string m_metavar; std::any m_default_value; std::string m_default_value_repr; + std::optional + m_default_value_str; // used for checking default_value against choices std::any m_implicit_value; + std::optional> m_choices{std::nullopt}; using valued_action = std::function; using void_action = std::function; std::variant m_action{ @@ -1052,14 +1432,19 @@ class ArgumentParser { public: explicit ArgumentParser(std::string program_name = {}, std::string version = "1.0", - default_arguments add_args = default_arguments::all) + default_arguments add_args = default_arguments::all, + bool exit_on_default_arguments = true, + std::ostream &os = std::cout) : m_program_name(std::move(program_name)), m_version(std::move(version)), + m_exit_on_default_arguments(exit_on_default_arguments), m_parser_path(m_program_name) { if ((add_args & default_arguments::help) == default_arguments::help) { add_argument("-h", "--help") .action([&](const auto & /*unused*/) { - std::cout << help().str(); - std::exit(0); + os << help().str(); + if (m_exit_on_default_arguments) { + std::exit(0); + } }) .default_value(false) .help("shows help message and exits") @@ -1069,8 +1454,10 @@ class ArgumentParser { if ((add_args & default_arguments::version) == default_arguments::version) { add_argument("-v", "--version") .action([&](const auto & /*unused*/) { - std::cout << m_version << std::endl; - std::exit(0); + os << m_version << std::endl; + if (m_exit_on_default_arguments) { + std::exit(0); + } }) .default_value(false) .help("prints version information and exits") @@ -1079,51 +1466,25 @@ class ArgumentParser { } } - ArgumentParser(ArgumentParser &&) noexcept = default; - ArgumentParser &operator=(ArgumentParser &&) = default; - - ArgumentParser(const ArgumentParser &other) - : m_program_name(other.m_program_name), m_version(other.m_version), - m_description(other.m_description), m_epilog(other.m_epilog), - m_prefix_chars(other.m_prefix_chars), - m_assign_chars(other.m_assign_chars), m_is_parsed(other.m_is_parsed), - m_positional_arguments(other.m_positional_arguments), - m_optional_arguments(other.m_optional_arguments), - m_parser_path(other.m_parser_path), m_subparsers(other.m_subparsers) { - for (auto it = std::begin(m_positional_arguments); - it != std::end(m_positional_arguments); ++it) { - index_argument(it); - } - for (auto it = std::begin(m_optional_arguments); - it != std::end(m_optional_arguments); ++it) { - index_argument(it); - } - for (auto it = std::begin(m_subparsers); it != std::end(m_subparsers); - ++it) { - m_subparser_map.insert_or_assign(it->get().m_program_name, it); - m_subparser_used.insert_or_assign(it->get().m_program_name, false); - } - } - ~ArgumentParser() = default; - ArgumentParser &operator=(const ArgumentParser &other) { - auto tmp = other; - std::swap(*this, tmp); - return *this; - } + // ArgumentParser is meant to be used in a single function. + // Setup everything and parse arguments in one place. + // + // ArgumentParser internally uses std::string_views, + // references, iterators, etc. + // Many of these elements become invalidated after a copy or move. + ArgumentParser(const ArgumentParser &other) = delete; + ArgumentParser &operator=(const ArgumentParser &other) = delete; + ArgumentParser(ArgumentParser &&) noexcept = delete; + ArgumentParser &operator=(ArgumentParser &&) = delete; explicit operator bool() const { - auto arg_used = std::any_of(m_argument_map.cbegin(), - m_argument_map.cend(), - [](auto &it) { - return it.second->m_is_used; - }); - auto subparser_used = std::any_of(m_subparser_used.cbegin(), - m_subparser_used.cend(), - [](auto &it) { - return it.second; - }); + auto arg_used = std::any_of(m_argument_map.cbegin(), m_argument_map.cend(), + [](auto &it) { return it.second->m_is_used; }); + auto subparser_used = + std::any_of(m_subparser_used.cbegin(), m_subparser_used.cend(), + [](auto &it) { return it.second; }); return m_is_parsed && (arg_used || subparser_used); } @@ -1145,10 +1506,47 @@ class ArgumentParser { return *argument; } + class MutuallyExclusiveGroup { + friend class ArgumentParser; + + public: + MutuallyExclusiveGroup() = delete; + + explicit MutuallyExclusiveGroup(ArgumentParser &parent, + bool required = false) + : m_parent(parent), m_required(required), m_elements({}) {} + + MutuallyExclusiveGroup(const MutuallyExclusiveGroup &other) = delete; + MutuallyExclusiveGroup & + operator=(const MutuallyExclusiveGroup &other) = delete; + + MutuallyExclusiveGroup(MutuallyExclusiveGroup &&other) noexcept + : m_parent(other.m_parent), m_required(other.m_required), + m_elements(std::move(other.m_elements)) { + other.m_elements.clear(); + } + + template Argument &add_argument(Targs... f_args) { + auto &argument = m_parent.add_argument(std::forward(f_args)...); + m_elements.push_back(&argument); + return argument; + } + + private: + ArgumentParser &m_parent; + bool m_required{false}; + std::vector m_elements{}; + }; + + MutuallyExclusiveGroup &add_mutually_exclusive_group(bool required = false) { + m_mutually_exclusive_groups.emplace_back(*this, required); + return m_mutually_exclusive_groups.back(); + } + // Parameter packed add_parents method // Accepts a variadic number of ArgumentParser objects template - ArgumentParser &add_parents(const Targs &... f_args) { + ArgumentParser &add_parents(const Targs &...f_args) { for (const ArgumentParser &parent_parser : {std::ref(f_args)...}) { for (const auto &argument : parent_parser.m_positional_arguments) { auto it = m_positional_arguments.insert( @@ -1177,8 +1575,7 @@ class ArgumentParser { /* Getter for arguments and subparsers. * @throws std::logic_error in case of an invalid argument or subparser name */ - template - T& at(std::string_view name) { + template T &at(std::string_view name) { if constexpr (std::is_same_v) { return (*this)[name]; } else { @@ -1211,6 +1608,43 @@ class ArgumentParser { for ([[maybe_unused]] const auto &[unused, argument] : m_argument_map) { argument->validate(); } + + // Check each mutually exclusive group and make sure + // there are no constraint violations + for (const auto &group : m_mutually_exclusive_groups) { + auto mutex_argument_used{false}; + Argument *mutex_argument_it{nullptr}; + for (Argument *arg : group.m_elements) { + if (!mutex_argument_used && arg->m_is_used) { + mutex_argument_used = true; + mutex_argument_it = arg; + } else if (mutex_argument_used && arg->m_is_used) { + // Violation + throw std::runtime_error("Argument '" + arg->get_usage_full() + + "' not allowed with '" + + mutex_argument_it->get_usage_full() + "'"); + } + } + + if (!mutex_argument_used && group.m_required) { + // at least one argument from the group is + // required + std::string argument_names{}; + std::size_t i = 0; + std::size_t size = group.m_elements.size(); + for (Argument *arg : group.m_elements) { + if (i + 1 == size) { + // last + argument_names += "'" + arg->get_usage_full() + "' "; + } else { + argument_names += "'" + arg->get_usage_full() + "' or "; + } + i += 1; + } + throw std::runtime_error("One of the arguments " + argument_names + + "is required"); + } + } } /* Call parse_known_args_internal - which does all the work @@ -1289,7 +1723,7 @@ class ArgumentParser { } /* Indexing operator. Return a reference to an Argument object - * Used in conjuction with Argument.operator== e.g., parser["foo"] == true + * Used in conjunction with Argument.operator== e.g., parser["foo"] == true * @throws std::logic_error in case of an invalid argument name */ Argument &operator[](std::string_view arg_name) const { @@ -1350,12 +1784,20 @@ class ArgumentParser { stream << argument; } - if (!parser.m_subparser_map.empty()) { + bool has_visible_subcommands = std::any_of( + parser.m_subparser_map.begin(), parser.m_subparser_map.end(), + [](auto &p) { return !p.second->get().m_suppress; }); + + if (has_visible_subcommands) { stream << (parser.m_positional_arguments.empty() ? (parser.m_optional_arguments.empty() ? "" : "\n") : "\n") << "Subcommands:\n"; for (const auto &[command, subparser] : parser.m_subparser_map) { + if (subparser->get().m_suppress) { + continue; + } + stream << std::setw(2) << " "; stream << std::setw(static_cast(longest_arg_length - 2)) << command; @@ -1400,7 +1842,11 @@ class ArgumentParser { if (!m_subparser_map.empty()) { stream << " {"; std::size_t i{0}; - for (const auto &[command, unused] : m_subparser_map) { + for (const auto &[command, subparser] : m_subparser_map) { + if (subparser->get().m_suppress) { + continue; + } + if (i == 0) { stream << command; } else { @@ -1430,6 +1876,8 @@ class ArgumentParser { m_subparser_used.insert_or_assign(parser.m_program_name, false); } + void set_suppress(bool suppress) { m_suppress = suppress; } + private: bool is_valid_prefix_char(char c) const { return m_prefix_chars.find(c) != std::string::npos; @@ -1534,8 +1982,41 @@ class ArgumentParser { unprocessed_arguments); } - throw std::runtime_error( - "Maximum number of positional arguments exceeded"); + if (m_positional_arguments.empty()) { + + // Ask the user if they argument they provided was a typo + // for some sub-parser, + // e.g., user provided `git totes` instead of `git notes` + if (!m_subparser_map.empty()) { + throw std::runtime_error( + "Failed to parse '" + current_argument + "', did you mean '" + + std::string{details::get_most_similar_string( + m_subparser_map, current_argument)} + + "'"); + } + + // Ask the user if they meant to use a specific optional argument + if (!m_optional_arguments.empty()) { + for (const auto &opt : m_optional_arguments) { + if (!opt.m_implicit_value.has_value()) { + // not a flag, requires a value + if (!opt.m_is_used) { + throw std::runtime_error( + "Zero positional arguments expected, did you mean " + + opt.get_usage_full()); + } + } + } + + throw std::runtime_error("Zero positional arguments expected"); + } else { + throw std::runtime_error("Zero positional arguments expected"); + } + } else { + throw std::runtime_error("Maximum number of positional arguments " + "exceeded, failed to parse '" + + current_argument + "'"); + } } auto argument = positional_argument_it++; it = argument->consume(it, end); @@ -1654,7 +2135,8 @@ class ArgumentParser { } std::size_t max_size = 0; for ([[maybe_unused]] const auto &[unused, argument] : m_argument_map) { - max_size = std::max(max_size, argument->get_arguments_length()); + max_size = + std::max(max_size, argument->get_arguments_length()); } for ([[maybe_unused]] const auto &[command, unused] : m_subparser_map) { max_size = std::max(max_size, command.size()); @@ -1663,6 +2145,7 @@ class ArgumentParser { } using argument_it = std::list::iterator; + using mutex_group_it = std::vector::iterator; using argument_parser_it = std::list>::iterator; @@ -1676,6 +2159,7 @@ class ArgumentParser { std::string m_version; std::string m_description; std::string m_epilog; + bool m_exit_on_default_arguments = true; std::string m_prefix_chars{"-"}; std::string m_assign_chars{"="}; bool m_is_parsed = false; @@ -1686,6 +2170,8 @@ class ArgumentParser { std::list> m_subparsers; std::map m_subparser_map; std::map m_subparser_used; + std::vector m_mutually_exclusive_groups; + bool m_suppress = false; }; } // namespace argparse diff --git a/src/main.cpp b/src/main.cpp index 621760a..3441521 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -30,18 +30,17 @@ int main(int argc, char * argv[]) // clang-format off ArgumentParser program("phaseless", VERSION, default_arguments::version); + program.add_epilog("This project is still under development!\n" + "Contact: zilong.dk@gmail.com"); program.add_argument("-D","--debug") .help("enable debug mode") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-S", "--no-stdout") .help("disable print log to screen") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-a", "--no-accel") .help("disable accelerated EM") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-l", "--ltol") .help("convergence tolerance of difference in log likelihoods") .default_value(1e-1) @@ -68,12 +67,10 @@ int main(int argc, char * argv[]) .implicit_value(true); program.add_argument("-r", "--NR") .help("disable updating R") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-f","--NF") .help("disable updating F") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("--qfile") .help("read Q file as the start point") .default_value(std::string{""}); @@ -114,16 +111,13 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_joint.add_argument("-S", "--single-chunk") .help("treat input as big single chunk") - .default_value(false) - .implicit_value(true); + .flag(); cmd_joint.add_argument("-V", "--vcf") .help("output the VCF file") - .default_value(false) - .implicit_value(true); + .flag(); cmd_joint.add_argument("-Q", "--aQ") .help("aphla is accelarated with Q only") - .default_value(false) - .implicit_value(true); + .flag(); cmd_joint.add_argument("-d","--seed") .help("seed for reproducibility") .default_value(999) @@ -138,8 +132,7 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_impute.add_argument("-C", "--collapse") .help("collapse SNPs in a reasonable window") - .default_value(false) - .implicit_value(true); + .flag(); cmd_impute.add_argument("-B", "--grid-size") .help("number of SNPs (>1) in each grid. 1 disables collapsing") .default_value(1) @@ -149,6 +142,7 @@ int main(int argc, char * argv[]) .default_value(std::string{""}); cmd_impute.add_argument("-g", "--beagle") .help("gziped beagle format as input") + .required() .default_value(std::string{""}); cmd_impute.add_argument("-i", "--iterations") .help("number of EM iterations") @@ -170,18 +164,20 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_impute.add_argument("-S", "--single-chunk") .help("treat input as big single chunk") - .default_value(false) - .implicit_value(true); + .flag(); cmd_impute.add_argument("-d","--seed") .help("seed for reproducibility") .default_value(999) .scan<'i', int>(); cmd_impute.add_argument("--write-hapsum") .help("write Hapsum instead of AE into parse.bin") - .default_value(false) - .implicit_value(true); + .flag(); cmd_impute.add_argument("--refill-haps") - .help("refill infrequently used haplotype clusters.\n 0: disable this;\n 1: reset P to min allele emission probability for that haplotype cluster;\n 2: re-sample P by copying from haplotype with the highest probability;\n 3: re-sample P by copying from others with respect to their probability.") + .help("refill infrequently used haplotype clusters.\n" + "1: reset P to min allele emission probability for that haplotype cluster\n" + "2: re-sample P by copying from haplotype with the highest probability\n" + "3: re-sample P by copying from others with respect to their probability\n" + "0: disable this") .default_value(0) .scan<'i', int>(); cmd_impute.add_argument("--minRecombRate") @@ -226,8 +222,7 @@ int main(int argc, char * argv[]) .default_value(std::string{"convert"}); cmd_convert.add_argument("-p", "--plink2beagle") .help("use plink1 file as input without .bed") - .default_value(false) - .implicit_value(true); + .flag(); cmd_convert.add_argument("-n", "--threads") .help("number of threads") .default_value(4) @@ -327,7 +322,6 @@ int main(int argc, char * argv[]) } else { - cao.cerr("Contact: Zilong Li (zilong.dk@gmail.com)\n"); cao.cerr(program.help().str()); std::exit(1); } From b2bfdaf49e1d08e52c6d4307b98c4c6afd582622 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 2 Feb 2024 13:59:52 +0100 Subject: [PATCH 31/67] do not show nargs range if just one args --- inst/include/argparse/argparse.hpp | 3 ++- src/main.cpp | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/inst/include/argparse/argparse.hpp b/inst/include/argparse/argparse.hpp index 0c85127..d5a5959 100644 --- a/inst/include/argparse/argparse.hpp +++ b/inst/include/argparse/argparse.hpp @@ -1128,7 +1128,8 @@ class Argument { if (range.m_max == (std::numeric_limits::max)()) { stream << "[nargs: " << range.m_min << " or more] "; } else { - stream << "[nargs=" << range.m_min << ".." << range.m_max << "] "; + if (!(range.m_min == 0 && range.m_max == 1)) + stream << "[nargs=" << range.m_min << ".." << range.m_max << "] "; } } return stream; diff --git a/src/main.cpp b/src/main.cpp index 3441521..d0d1788 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -59,12 +59,10 @@ int main(int argc, char * argv[]) .scan<'g', double>(); program.add_argument("-q","--NQ") .help("disable updating Q") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-p", "--NP") .help("disable updating P") - .default_value(false) - .implicit_value(true); + .flag(); program.add_argument("-r", "--NR") .help("disable updating R") .flag(); From 9f542977185e0b5d82f9a7e8c161ff62e04e0c3b Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 2 Feb 2024 16:18:22 +0100 Subject: [PATCH 32/67] constrain F with P instead of norm F --- src/admixture.cpp | 24 +++++++++++++++++++----- src/admixture.hpp | 5 +++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 3212b96..d8b296e 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -162,19 +162,33 @@ void Admixture::protectPars() } if(F.isNaN().any()) cao.error("NaN in F\n"); - F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound - F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound + // F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound + // F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound normalizeF(); } void Admixture::normalizeF() { - for(int k = 0; k < K; k++) // normalize F per snp per k + for(int k = 0; k < K; k++) + { + for(int c = 0; c < C; c++) + for(int m = 0; m < M; m++) + if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); + } } -void Admixture::setStartPoint(std::string qfile) +void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile) { + P = MyArr2D(C, M); + for(int ic = 0, m = 0; ic < genome->nchunks; ic++) + { + const int S = genome->pos[ic].size(); + Eigen::Map AE(genome->AE[ic].data(), C * C, S); + for(int s = 0; s < S; s++) P.col(m + s) = AE.col(s).reshaped(C, C).colwise().sum(); + m += S; + } + if(!qfile.empty()) load_csv(Q, qfile); } @@ -219,7 +233,7 @@ int run_admix_main(Options & opts) cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); admixer.setFlags(opts.debug, opts.nQ); - admixer.setStartPoint(opts.in_qfile); + admixer.setStartPoint(genome, opts.in_qfile); vector> llike; if(!opts.noaccel) { diff --git a/src/admixture.hpp b/src/admixture.hpp index 1cc14f0..50bc364 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -29,7 +29,7 @@ class Admixture Q.rowwise() /= Q.colwise().sum(); // normalize Q per individual F = RandomUniform(C * K, M, rng, clusterFreqThreshold, 1 - clusterFreqThreshold); - normalizeF(); + for(int k = 0; k < K; k++) F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } ~Admixture() {} @@ -37,6 +37,7 @@ class Admixture // SHARED VARIBALES const int N, M, C, K; // M: number of grids in total, C2 = C x C MyArr2D F; // (C x K) x M + MyArr2D P; // C x M, for each k, F <= P MyArr2D Q; // K x N MyArr2D Ekc; // (C * K) x M, expected number of alleles per c per k MyArr2D NormF; // K x M @@ -46,7 +47,7 @@ class Admixture void protectPars(); void normalizeF(); void setFlags(bool, bool); - void setStartPoint(std::string qfile); + void setStartPoint(const std::unique_ptr & genome, std::string qfile); void writeQ(std::string out); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); double runOptimalWithBigAss(int ind, const std::unique_ptr & genome); From e87c7af46573b649f6aad4ba6648c3a1fc4ea0e3 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 2 Feb 2024 16:19:11 +0100 Subject: [PATCH 33/67] test constrained F v0.5.0 --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index d0d1788..88934e5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.4.1"}; + const std::string VERSION{"0.5.0"}; // below for catching ctrl+c, and dumping files struct sigaction sa; From 2140a62b78769b1ffab82f7ff51c7bed21e246dd Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 2 Feb 2024 17:21:59 +0100 Subject: [PATCH 34/67] output F --- src/admixture.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/admixture.cpp b/src/admixture.cpp index d8b296e..7d755b5 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -354,6 +354,9 @@ int run_admix_main(Options & opts) } cao.done(tim.date(), "admixture done and outputting"); admixer.writeQ(opts.out + ".Q"); + Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); + std::ofstream of(opts.out + ".F"); + of << admixer.F.transpose().format(fmt) << "\n"; cao.done(tim.date(), "-> good job. have a nice day, bye!"); return 0; From 816c7bcf269d7da820a3e2fcf4516390e52d48e1 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 5 Feb 2024 13:26:11 +0100 Subject: [PATCH 35/67] add --output-F option --- src/admixture.cpp | 12 +++++++----- src/admixture.hpp | 2 +- src/common.hpp | 5 +++-- src/fastphase.cpp | 1 - src/main.cpp | 4 ++++ src/phaseless.cpp | 1 - 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 7d755b5..8763288 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -164,10 +164,10 @@ void Admixture::protectPars() if(F.isNaN().any()) cao.error("NaN in F\n"); // F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound // F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound - normalizeF(); + constrainF(); } -void Admixture::normalizeF() +void Admixture::constrainF() { for(int k = 0; k < K; k++) { @@ -354,9 +354,11 @@ int run_admix_main(Options & opts) } cao.done(tim.date(), "admixture done and outputting"); admixer.writeQ(opts.out + ".Q"); - Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); - std::ofstream of(opts.out + ".F"); - of << admixer.F.transpose().format(fmt) << "\n"; + if(opts.oF) + { + std::ofstream of(opts.out + ".F"); + of << admixer.F.transpose().format(fmt) << "\n"; + } cao.done(tim.date(), "-> good job. have a nice day, bye!"); return 0; diff --git a/src/admixture.hpp b/src/admixture.hpp index 50bc364..535b8fb 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -45,7 +45,7 @@ class Admixture void initIteration(); void updateIteration(); void protectPars(); - void normalizeF(); + void constrainF(); void setFlags(bool, bool); void setStartPoint(const std::unique_ptr & genome, std::string qfile); void writeQ(std::string out); diff --git a/src/common.hpp b/src/common.hpp index 10d3ee8..b39ba1d 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -76,6 +76,8 @@ using MyMat1D = Eigen::Matrix; using MyArr2D = Eigen::Array; using MyArr1D = Eigen::Array; +inline Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); + template inline MatrixType RandomUniform(const Eigen::Index numRows, const Eigen::Index numCols, @@ -97,7 +99,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; @@ -403,7 +405,6 @@ inline auto forward_backwards_diploid(const MyArr2D & emit, const MyArr2D & R, c return std::tuple(alpha, beta, cs); } - /// R: 3 x M; PI: C x M inline auto get_cluster_frequency(const MyArr2D & R, const MyArr2D & PI) { diff --git a/src/fastphase.cpp b/src/fastphase.cpp index caf60fe..bc65096 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -241,7 +241,6 @@ int run_impute_main(Options & opts) std::unique_ptr genome = std::make_unique(); init_bigass(genome, opts); - Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); vector> res; FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); diff --git a/src/main.cpp b/src/main.cpp index 88934e5..2bb78e9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -69,6 +69,9 @@ int main(int argc, char * argv[]) program.add_argument("-f","--NF") .help("disable updating F") .flag(); + program.add_argument("--output-F") + .help("output F") + .flag(); program.add_argument("--qfile") .help("read Q file as the start point") .default_value(std::string{""}); @@ -255,6 +258,7 @@ int main(int argc, char * argv[]) opts.nP = program.get("--NP"); opts.nR = program.get("--NR"); opts.nF = program.get("--NF"); + opts.oF = program.get("--output-F"); opts.ltol = program.get("--ltol"); opts.noaccel = program.get("--no-accel"); diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 6fc07fd..38d8756 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -276,7 +276,6 @@ int run_phaseless_main(Options & opts) std::unique_ptr genome = std::make_unique(); init_bigass(genome, opts); - Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); vector> res; std::ofstream oanc(opts.out + ".Q"); std::ofstream op(opts.out + ".P"); From d13d2861a95ad8ec34989b0c776c2e3855bab7d9 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 5 Feb 2024 14:05:36 +0100 Subject: [PATCH 36/67] update tests --- tests/Makefile | 6 +++--- tests/{test-phaseless.cpp => test-admixture.cpp} | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) rename tests/{test-phaseless.cpp => test-admixture.cpp} (96%) diff --git a/tests/Makefile b/tests/Makefile index 8f6f322..4c34b95 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -5,9 +5,9 @@ INC = -I. -I../src -I../inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = -lhts -llzma -lbz2 -lm -lz -lpthread -OBJS = test-main.o test-joint.o test-phaseless.o test-fastphase.o +OBJS = test-main.o test-joint.o test-admixture.o test-fastphase.o -BINS = test-joint.bin test-phaseless.bin test-fastphase.bin +BINS = test-joint.bin test-admixture.bin test-fastphase.bin DEPS = ../src/phaseless.o ../src/fastphase.o ../src/admixture.o @@ -25,7 +25,7 @@ all: $(BINS) $(OBJS) test: $(BINS) ./test-joint.bin --success - ./test-phaseless.bin --success + ./test-admixture.bin --success ./test-fastphase.bin --success clean: diff --git a/tests/test-phaseless.cpp b/tests/test-admixture.cpp similarity index 96% rename from tests/test-phaseless.cpp rename to tests/test-admixture.cpp index a483b52..4138fe2 100644 --- a/tests/test-phaseless.cpp +++ b/tests/test-admixture.cpp @@ -48,7 +48,9 @@ TEST_CASE("phaseless naive vs dump dataset 1", "[test-phaseless]") } double llike1, llike2; Admixture admixer1(genome->nsamples, genome->nsnps, genome->C, K, seed); + admixer1.setStartPoint(genome, opts.in_qfile); Admixture admixer2(genome->nsamples, genome->nsnps, genome->C, K, seed); + admixer2.setStartPoint(genome, opts.in_qfile); for(int it = 0; it < nadmix; it++) { llike1 = 0; From 9104a2ca879835194af6ba4304389472ef710ccb Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 5 Feb 2024 21:23:49 +0100 Subject: [PATCH 37/67] add --constrain-F option and R scripts --- R/plot_admixture.R | 83 ++++++++++++++++++++++++++++++++++++++++++++++ src/admixture.cpp | 14 +++++--- src/admixture.hpp | 3 +- src/common.hpp | 2 +- src/main.cpp | 45 +++++++++++++------------ 5 files changed, 119 insertions(+), 28 deletions(-) create mode 100644 R/plot_admixture.R diff --git a/R/plot_admixture.R b/R/plot_admixture.R new file mode 100644 index 0000000..c80c895 --- /dev/null +++ b/R/plot_admixture.R @@ -0,0 +1,83 @@ +#' @export +admix.alignKStephens <- function(qlist){ + require(label.switching) + K <- unique(sapply(qlist, ncol)) + if(length(K) > 1) stop("K in qlist should be the same") + # if there is only 1 run, just return it + if(length(qlist)==1) { + qlist1 <- qlist + } else { + # if num of inds or K differ, throw error + ninds <- unique(sapply(qlist, nrow)) + if(length(ninds) > 1) stop("number of inds in qlist should be the same") + # if all runs have K=1, just return it + if(K==1){ + qlist1 <- qlist + } else { + qmat <- lapply(qlist,as.matrix) + p <- aperm(simplify2array(qmat), c(3,1,2)) + perm <- label.switching::stephens(p) + # reorder and rename columns + qlist1 <- lapply(seq_len(dim(p)[1]), + function(i) { + q_perm <- qmat[[i]][, perm$permutations[i,,drop=FALSE],drop=FALSE] + q_perm <- as.data.frame(q_perm) + attributes(q_perm) <- attributes(qlist[[i]]) + q_perm + } + ) + names(qlist1) <- names(qlist) + } + } + return(qlist1) +} + +#' @export +admix.plotQ <- function(qlist, pop, sortind = TRUE, cluster = 1, debug = FALSE, ...) { + N <- length(qlist) + K <- unique(sapply(qlist, ncol)) + nind <- unique(sapply(qlist, nrow)) + par(mfrow=c(N, 1)) + sortQ <- function(Q, pop) { + lapply(split(Q, pop), function(p) { + ord <- order(p[,cluster]) + p[ord,] + }) + } + for(i in seq_along(qlist)){ + Q <- qlist[[i]] + if(sortind) { + s <- sortQ(Q, pop) + ordpop <- order(pop) + namepop <- names(s) + Q <- t(do.call(rbind, s)) + } else { + ordpop <- 1:length(pop) + namepop <- unique(pop) + Q <- t(Q) + colnames(Q) <- pop + } + if(i == N && !debug) { + med<- tapply(1:nind,pop[ordpop],median) + ord <- match(names(med), namepop) + groups <- rep(NA, ncol(Q)) + groups[as.integer(med)] <- namepop[ord] + colnames(Q) <- groups + } else if (N!=1) { + colnames(Q) <- NULL + } + par(mar=c(3.1,5.1,3.1,1.1)) + h <- barplot(Q, col = 1:K, border = NA, space = 0, ylab = "Admixture proportion", main = names(qlist)[i], xaxs = "i", ...) + abline(v=tapply(h,pop[order(pop)],max)+0.5,col="black",lwd=2,lty=2) + } +} + +#' @export +plot.admixQ <- function(qfiles, pop, ...) { + qlist <- lapply(qfiles, read.table) + names(qlist) <- names(qfiles) + pop <- read.table(pop)[,1] + a <- admix.alignKStephens(qlist) + admix.plotQ(a, pop, cex.main = 3, cex.lab=3, cex.names = 3,...) +} + diff --git a/src/admixture.cpp b/src/admixture.cpp index 8763288..6cab08a 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -171,9 +171,12 @@ void Admixture::constrainF() { for(int k = 0; k < K; k++) { - for(int c = 0; c < C; c++) - for(int m = 0; m < M; m++) - if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); + if(cF) + { + for(int c = 0; c < C; c++) + for(int m = 0; m < M; m++) + if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); + } F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } } @@ -201,10 +204,11 @@ void Admixture::writeQ(std::string out) ofs.close(); } -void Admixture::setFlags(bool debug_, bool nonewQ_) +void Admixture::setFlags(bool debug_, bool nonewQ_, bool cF_) { debug = debug_; nonewQ = nonewQ_; + cF = cF_; } int run_admix_main(Options & opts) @@ -232,7 +236,7 @@ int run_admix_main(Options & opts) cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); - admixer.setFlags(opts.debug, opts.nQ); + admixer.setFlags(opts.debug, opts.nQ, opts.cF); admixer.setStartPoint(genome, opts.in_qfile); vector> llike; if(!opts.noaccel) diff --git a/src/admixture.hpp b/src/admixture.hpp index 535b8fb..1add632 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -20,6 +20,7 @@ class Admixture double admixtureThreshold{1e-6}; // threshold for Q bool debug = false; bool nonewQ = false; + bool cF = false; public: Admixture(int n, int m, int c, int k, int seed) : N(n), M(m), C(c), K(k) @@ -46,7 +47,7 @@ class Admixture void updateIteration(); void protectPars(); void constrainF(); - void setFlags(bool, bool); + void setFlags(bool, bool, bool); void setStartPoint(const std::unique_ptr & genome, std::string qfile); void writeQ(std::string out); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); diff --git a/src/common.hpp b/src/common.hpp index b39ba1d..4bec950 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -99,7 +99,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}, cF{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; diff --git a/src/main.cpp b/src/main.cpp index 2bb78e9..583de4d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -41,22 +41,6 @@ int main(int argc, char * argv[]) program.add_argument("-a", "--no-accel") .help("disable accelerated EM") .flag(); - program.add_argument("-l", "--ltol") - .help("convergence tolerance of difference in log likelihoods") - .default_value(1e-1) - .scan<'g', double>(); - program.add_argument("-P", "--ptol") - .help("lower boundary for P") - .default_value(1e-4) - .scan<'g', double>(); - program.add_argument("-F", "--ftol") - .help("lower boundary for F") - .default_value(1e-4) - .scan<'g', double>(); - program.add_argument("-Q", "--qtol") - .help("lower boundary for Q") - .default_value(1e-6) - .scan<'g', double>(); program.add_argument("-q","--NQ") .help("disable updating Q") .flag(); @@ -69,9 +53,25 @@ int main(int argc, char * argv[]) program.add_argument("-f","--NF") .help("disable updating F") .flag(); - program.add_argument("--output-F") + program.add_argument("-F","--write-F") .help("output F") .flag(); + program.add_argument("--ltol") + .help("convergence tolerance for difference in log likelihoods") + .default_value(1e-2) + .scan<'g', double>(); + program.add_argument("--ptol") + .help("lower boundary for P") + .default_value(1e-6) + .scan<'g', double>(); + program.add_argument("--ftol") + .help("lower boundary for F") + .default_value(1e-9) + .scan<'g', double>(); + program.add_argument("--qtol") + .help("lower boundary for Q") + .default_value(1e-9) + .scan<'g', double>(); program.add_argument("--qfile") .help("read Q file as the start point") .default_value(std::string{""}); @@ -198,11 +198,11 @@ int main(int argc, char * argv[]) .scan<'i', int>(); cmd_admix.add_argument("-i", "--iterations") .help("number of maximun EM iterations") - .default_value(1000) + .default_value(2000) .scan<'i', int>(); cmd_admix.add_argument("-n", "--threads") .help("number of threads") - .default_value(4) + .default_value(10) .scan<'i', int>(); cmd_admix.add_argument("-o", "--out") .help("output prefix") @@ -211,7 +211,9 @@ int main(int argc, char * argv[]) .help("seed for reproducibility") .default_value(999) .scan<'i', int>(); - // cmd_admix.add_parents(program); + cmd_admix.add_argument("-F", "--constrain-F") + .help("apply constraint on F so that it is not smaller than cluster frequency in fastphase model") + .flag(); argparse::ArgumentParser cmd_convert("convert", VERSION, default_arguments::help); cmd_convert.add_description("different file format converter"); @@ -258,7 +260,7 @@ int main(int argc, char * argv[]) opts.nP = program.get("--NP"); opts.nR = program.get("--NR"); opts.nF = program.get("--NF"); - opts.oF = program.get("--output-F"); + opts.oF = program.get("--write-F"); opts.ltol = program.get("--ltol"); opts.noaccel = program.get("--no-accel"); @@ -309,6 +311,7 @@ int main(int argc, char * argv[]) opts.K = cmd_admix.get("-k"); opts.nthreads = cmd_admix.get("--threads"); opts.nadmix = cmd_admix.get("--iterations"); + opts.cF = cmd_admix.get("--constrain-F"); if(opts.in_bin.empty() || cmd_admix.get("--help")) throw std::runtime_error(cmd_admix.help().str()); run_admix_main(opts); } From ad376bbcf39d20c2877fa90e4e0121c7b47bbb84 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 5 Feb 2024 21:32:14 +0100 Subject: [PATCH 38/67] if not cF, then apply fixed boundary --- src/admixture.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 6cab08a..da24e9c 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -162,8 +162,11 @@ void Admixture::protectPars() } if(F.isNaN().any()) cao.error("NaN in F\n"); - // F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound - // F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound + if(!cF) + { + F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound + F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound + } constrainF(); } From eaf2368da398bb76f9c17b64b8868c02c2f40af8 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 5 Feb 2024 22:01:18 +0100 Subject: [PATCH 39/67] if cF, constrain init F as well --- src/admixture.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/admixture.cpp b/src/admixture.cpp index da24e9c..201e2f9 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -194,6 +194,16 @@ void Admixture::setStartPoint(const std::unique_ptr & genome, std::strin for(int s = 0; s < S; s++) P.col(m + s) = AE.col(s).reshaped(C, C).colwise().sum(); m += S; } + if(cF) + { + for(int k = 0; k < K; k++) + { + for(int c = 0; c < C; c++) + for(int m = 0; m < M; m++) + if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); + F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); + } + } if(!qfile.empty()) load_csv(Q, qfile); } From 3f108a8ce63b9df0ed3b1e5b79b6ea2bdddb45c2 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 10:02:03 +0100 Subject: [PATCH 40/67] update output precesion for Q --- src/admixture.cpp | 14 +++----------- src/admixture.hpp | 1 - src/common.hpp | 3 ++- src/fastphase.cpp | 8 ++++---- src/phaseless.cpp | 5 ++--- 5 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 201e2f9..c5b1d5d 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -208,15 +208,6 @@ void Admixture::setStartPoint(const std::unique_ptr & genome, std::strin if(!qfile.empty()) load_csv(Q, qfile); } -void Admixture::writeQ(std::string out) -{ - std::ofstream ofs(out); - if(!ofs) cao.error(out, strerror(errno)); - Q = (Q * 1e6).round() / 1e6; - ofs << std::fixed << Q.transpose() << "\n"; - ofs.close(); -} - void Admixture::setFlags(bool debug_, bool nonewQ_, bool cF_) { debug = debug_; @@ -370,11 +361,12 @@ int run_admix_main(Options & opts) } } cao.done(tim.date(), "admixture done and outputting"); - admixer.writeQ(opts.out + ".Q"); + std::ofstream oq(opts.out + ".Q"); + oq << std::fixed << admixer.Q.transpose().format(fmt10) << "\n"; if(opts.oF) { std::ofstream of(opts.out + ".F"); - of << admixer.F.transpose().format(fmt) << "\n"; + of << admixer.F.transpose().format(fmt6) << "\n"; } cao.done(tim.date(), "-> good job. have a nice day, bye!"); diff --git a/src/admixture.hpp b/src/admixture.hpp index 1add632..bb9c508 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -49,7 +49,6 @@ class Admixture void constrainF(); void setFlags(bool, bool, bool); void setStartPoint(const std::unique_ptr & genome, std::string qfile); - void writeQ(std::string out); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); double runOptimalWithBigAss(int ind, const std::unique_ptr & genome); }; diff --git a/src/common.hpp b/src/common.hpp index 4bec950..15e06af 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -76,7 +76,8 @@ using MyMat1D = Eigen::Matrix; using MyArr2D = Eigen::Array; using MyArr1D = Eigen::Array; -inline Eigen::IOFormat fmt(6, Eigen::DontAlignCols, " ", "\n"); +inline Eigen::IOFormat fmt6(6, Eigen::DontAlignCols, " ", "\n"); +inline Eigen::IOFormat fmt10(10, Eigen::DontAlignCols, " ", "\n"); template inline MatrixType RandomUniform(const Eigen::Index numRows, diff --git a/src/fastphase.cpp b/src/fastphase.cpp index bc65096..16254ff 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -298,12 +298,12 @@ int run_impute_main(Options & opts) assert(std::filesystem::file_size(opts.out + ".pars.bin") == bytes_written); cao.done(tim.date(), "imputation done and outputting.", bytes_written, " bytes written to file"); std::ofstream orecomb(opts.out + ".recomb"); - orecomb << faith.R.transpose().format(fmt) << "\n"; + orecomb << faith.R.transpose().format(fmt6) << "\n"; std::ofstream opi(opts.out + ".pi"); - opi << faith.PI.transpose().format(fmt) << "\n"; + opi << faith.PI.transpose().format(fmt6) << "\n"; std::ofstream ohap(opts.out + ".hapfreq"); - ohap << faith.HapSum.transpose().format(fmt) << "\n"; + ohap << faith.HapSum.transpose().format(fmt6) << "\n"; std::ofstream oae(opts.out + ".ae"); - oae << faith.Ezj.transpose().format(fmt) << "\n"; + oae << faith.Ezj.transpose().format(fmt6) << "\n"; return 0; } diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 38d8756..63a8c03 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -409,10 +409,9 @@ int run_phaseless_main(Options & opts) } } } - faith.Q = (faith.Q * 1e6).round() / 1e6; - oanc << std::fixed << faith.Q.transpose().format(fmt) << "\n"; + oanc << std::fixed << faith.Q.transpose().format(fmt10) << "\n"; oanc.close(); - op << std::fixed << faith.P.format(fmt) << "\n"; + op << faith.P.format(fmt6) << "\n"; std::unique_ptr par = std::make_unique(); par->init(faith.K, faith.C, faith.M, faith.N, faith.er, faith.P, faith.Q, faith.F); par->pos = genome->pos; From 8da1e30528a0be3e1ce8ca05798e52dafd7fda07 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 10:59:15 +0100 Subject: [PATCH 41/67] add B init --- src/common.hpp | 1 + src/fastphase.cpp | 35 ++++++++++++++++++++++++++++------- src/fastphase.hpp | 16 +++++++--------- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 15e06af..a83308d 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -47,6 +47,7 @@ inline void handler(int s) using Bool1D = std::vector; using Int1D = std::vector; using Int2D = std::vector; +using Int3D = std::vector; using Float1D = std::vector; using Float2D = std::vector; using Double1D = std::vector; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 16254ff..0e38ba9 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -8,19 +8,39 @@ using namespace std; -void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B, double Ne) +void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_, double Ne) { nGen = 4 * Ne / C; + B = B_; int nchunks = pos.size(); pos_chunk.resize(nchunks + 1); - int i{0}, ss{0}; dist.reserve(M); - for(i = 0; i < nchunks; i++) + Int1D tmpdist; + int i{0}, ss{0}; + for(i = 0, G = 0; i < nchunks; i++) + { + if(B > 1) + G += (pos[i].size() + B - 1) / B; + else + G += pos[i].size(); + } + R = MyArr2D(3, G); + PI = MyArr2D::Ones(C, G); + PI.rowwise() /= PI.colwise().sum(); // normalize it per site + for(i = 0, ss = 0; i < nchunks; i++) { pos_chunk[i] = ss; - auto tmp = calc_position_distance(pos[i]); - dist.insert(dist.end(), tmp.begin(), tmp.end()); - R.middleCols(ss, pos[i].size()) = calc_transRate_diploid(tmp, nGen); + if(B > 1) + { + grids.emplace_back(divide_pos_into_grid(pos[i], B)); + tmpdist = calc_grid_distance(grids[i]); + } + else + { + tmpdist = calc_position_distance(pos[i]); + } + R.middleCols(ss, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); + dist.insert(dist.end(), tmpdist.begin(), tmpdist.end()); ss += pos[i].size(); } pos_chunk[nchunks] = ss; // add sentinel @@ -28,6 +48,7 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B, er = R.row(0).sqrt(); protect_er(er); R = er2R(er); + cao.warn("init done"); } void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_, bool nQ, bool nP, bool nF, bool nR) @@ -150,7 +171,7 @@ void FastPhaseK2::protectPars() */ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const int ind, bool finalIter) { - const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; // could be num of grids if B > 1 Eigen::Map gli(GL.data() + ind * S * 3, S, 3); MyArr2D emit = get_emission_by_gl(gli, F.middleRows(pos_chunk[ic], S)).transpose(); // CC x S const auto [alpha, beta, cs] = diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 60312a1..c365718 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -22,26 +22,24 @@ class FastPhaseK2 double clusterFreqThreshold{1e-4}; // threshold for F double admixtureThreshold{1e-6}; // threshold for Q double minHapfreq{0.01}; // min haplotype frequency, or min(1/(10*C), 1/100) + // FLAGS + bool debug{0}, local{0}, post{1}, NQ{0}, NF{0}, NP{1}, NR{1}; + int G{0}, B{1}; // G: number of grids after collapsing block + double nGen; public: FastPhaseK2(int n, int m, int c, int seed) : N(n), M(m), C(c), CC(c * c) { rng.seed(seed); + minHapfreq = std::min(1.0 / (10 * C), minHapfreq); F = RandomUniform(M, C, rng, alleleEmitThreshold, 1 - alleleEmitThreshold); - PI = MyArr2D::Ones(C, M); - PI.rowwise() /= PI.colwise().sum(); // normalize it per site - R = MyArr2D(3, M); GP.setZero(M * 3, N); - minHapfreq = std::min(1.0 / (10 * C), minHapfreq); } ~FastPhaseK2() {} - // FLAGS - bool debug{0}, local{0}, post{1}, NQ{0}, NF{0}, NP{1}, NR{1}; // SHARED VARIBALES const int N, M, C, CC; // CC = C x C - int G, B; // G: number of grids after collapsing block MyArr2D GP; // N x (M x 3), genotype probabilies for all individuals MyArr2D PI; // C x M, cluster frequency MyArr2D F; // M x C, cluster-specific allele frequence @@ -50,12 +48,12 @@ class FastPhaseK2 MyArr2D Ezg1, Ezg2; // C x M MyArr2D Ezj; // C x M, E(Z=z,J=1|X,par), expectation of switch into state k MyArr2D HapSum; // C x M, sum(gammaK) for all inds - double nGen; Int1D dist; // physical position distance between two markers Int1D pos_chunk; // store the start pos of each chunk in the full scale + Int3D grids; // physical position of each grid for each chunk MyArr1D AF; - void initRecombination(const Int2D & pos, std::string rfile = "", int B = 1, double Ne = 20000); + void initRecombination(const Int2D & pos, std::string rfile = "", int B_ = 1, double Ne = 20000); void setFlags(double, double, double, bool, bool, bool, bool, bool); void refillHaps(int); // re-sample F for sites with hapfreq < minHapfreq void protectPars(); From 1bd98e6976e35330c7ff67219f89d31cd67876f0 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 14:18:43 +0100 Subject: [PATCH 42/67] use indicator for collapsing --- src/common.hpp | 107 ++++++++++++++++++++++++------------------- src/fastphase.cpp | 33 +++++-------- src/fastphase.hpp | 2 + src/main.cpp | 3 +- tests/Makefile | 5 +- tests/test-utils.cpp | 18 ++++++++ 6 files changed, 97 insertions(+), 71 deletions(-) create mode 100644 tests/test-utils.cpp diff --git a/src/common.hpp b/src/common.hpp index a83308d..3fbe6c8 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -44,7 +44,6 @@ inline void handler(int s) } // STD TYPES -using Bool1D = std::vector; using Int1D = std::vector; using Int2D = std::vector; using Int3D = std::vector; @@ -61,6 +60,8 @@ using Mat2D = Eigen::Matrix; using Arr2D = Eigen::Array; using Arr1D = Eigen::Array; +using Bool1D = Eigen::Array; +using Bool2D = std::vector; // MY TYPES #ifdef USE_FLOAT @@ -225,6 +226,35 @@ inline Int1D calc_position_distance(const Int1D & markers) return dl; } +inline Int1D calc_grid_distance(const Int1D & pos, const Bool1D & collapse) +{ + // B = 1 + if((collapse == true).count() == 0) return calc_position_distance(pos); + // B > 1, split pos into grids + const int G = ((collapse == true).count() + 1) / 2; + Int2D gpos(G); + int s, e, g, m = pos.size(); + for(s = 0, e = 1, g = 0; g < G; g++) + { + for(;; e++) + if(collapse[e] == true) break; + if(g == G - 1) e = m - 1; + gpos[g] = Int1D(pos.begin() + s, pos.begin() + e + 1); + // cao.cerr("size of g:", gpos[g].size(), ",s:",s, ",e:",e); + // for(auto j : gpos[g]) cao.cerr(j); + s = e + 1 >= m ? m - 1 : e + 1; + e = s + 1 >= m ? m - 1 : s + 1; + } + Int1D dl(G); + dl[0] = 0; + for(g = 1; g < G; g++) + { + // cao.cerr(g, ":", gpos[g][gpos[g].size() / 2], ",", gpos[g - 1][gpos[g - 1].size() / 2]); + dl[g] = gpos[g][gpos[g].size() / 2] - gpos[g - 1][gpos[g - 1].size() / 2]; + } + return dl; +} + inline MyArr2D er2R(const MyArr1D & er) { MyArr2D R(3, er.size()); @@ -294,23 +324,19 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double ** @param P cluster-specific allele frequence (M, C) ** @return emission probability (M, C2) */ -inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, - const MyFloat1D & P, - int ind, - int M, - int B, +inline MyArr2D get_emission_by_grid(const MyArr2D & gli, + const MyArr2D & P, + const Int2D & grids, double minEmission = 1e-10) { - const int C = P.size() / M; - const int C2 = C * C; - const int nGrids = B > 1 ? (M + B - 1) / B : M; - MyArr2D emitGrid = MyArr2D::Ones(C2, nGrids); - int z1, z2, z12, i, s, e, g, g1, g2; - int igs = ind * M * 3; + const int C = P.cols(); + const int nGrids = grids.size(); + MyArr2D emitGrid = MyArr2D::Ones(C * C, nGrids); + int z1, z2, z12, i, s, e, g, g1, g2, m; for(g = 0; g < nGrids; g++) { - s = g * B; - e = g == nGrids - 1 ? M - 1 : B * (g + 1) - 1; + s = grids[g][0]; + e = grids[g][grids[g].size()]; for(z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) @@ -319,12 +345,13 @@ inline MyArr2D get_emission_by_grid(const MyFloat1D & GL, for(i = s; i <= e; i++) { double emit = 0; + m = i + grids[g].size(); for(g1 = 0; g1 <= 1; g1++) { for(g2 = 0; g2 <= 1; g2++) { - emit += GL[igs + (g1 + g2) * M + i] * (g1 * P[z1 * M + i] + (1 - g1) * (1 - P[z1 * M + i])) - * (g2 * P[z2 * M + i] + (1 - g2) * (1 - P[z2 * M + i])); + emit += gli(m, g1 + g2) * (g1 * P(m, z1) + (1 - g1) * (1 - P(m, z1))) + * (g2 * P(m, z2) + (1 - g2) * (1 - P(m, z2))); } } emitGrid(z12, g) *= emit; @@ -500,19 +527,32 @@ inline Arr1D estimate_af_by_gl(const MyFloat1D & GL, int N, int M, int niter = 1 return af_est; } -inline Int2D divide_pos_into_grid(const Int1D & pos, int B) +// grid size must be >=3 +inline Bool1D find_grid_to_collapse(const Int1D & pos, int B) { int M = pos.size(); - int G = (M + B - 1) / B; - Int2D grids(G); + Bool1D collapse = Bool1D::Constant(M, false); + if(B == 1) return collapse; + int G = (M + B - 1) / B; // get ceiling number int g, s, e; for(g = 0; g < G; g++) { s = g * B; e = g == G - 1 ? M - 1 : B * (g + 1) - 1; - grids[g] = Int1D(pos.begin() + s, pos.begin() + e + 1); + collapse(s) = true; + collapse(e) = true; } - return grids; + return collapse; +} + +inline Bool1D find_grid_to_collapse(const MyArr2D & R, double tol_r = 1e-6) +{ + Bool1D collapse = Bool1D::Constant(R.cols(), false); + for(auto i = 0; i < R.cols(); i++) + { + if(std::sqrt(R(2, i)) < tol_r) collapse(i) = true; + } + return collapse; } inline Int2D divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) @@ -537,31 +577,6 @@ inline Int2D divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) return grids; } -inline Bool1D find_chunk_to_collapse(const MyArr2D & R, double tol_r = 1e-6) -{ - Bool1D collapse(R.cols(), false); // M sites - for(auto i = 0; i < R.cols(); i++) - { - if(std::sqrt(R(2, i)) < tol_r) collapse[i] = true; - } - return collapse; -} - -/* -** @params pos snp position, first dim is each grid, second dim is snps in -*that grid -*/ -inline Int1D calc_grid_distance(const Int2D & pos) -{ - Int1D dl(pos.size()); - dl[0] = 0; - for(auto i = 1; i < pos.size(); i++) - { - dl[i] = pos[i][pos[i].size() / 2] - pos[i - 1][pos[i - 1].size() / 2]; - } - return dl; -} - /* ** @param E original size of emission, full SNPs x C2 */ diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 0e38ba9..349bc64 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -12,43 +12,32 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ { nGen = 4 * Ne / C; B = B_; + if(B == 2) cao.error("-B can not be 2"); int nchunks = pos.size(); pos_chunk.resize(nchunks + 1); - dist.reserve(M); - Int1D tmpdist; - int i{0}, ss{0}; - for(i = 0, G = 0; i < nchunks; i++) - { - if(B > 1) - G += (pos[i].size() + B - 1) / B; - else - G += pos[i].size(); - } + int i{0}, ss{0}, sg{0}; + // how many grids in total + for(i = 0, G = 0; i < nchunks; i++) G += B > 1 ? (pos[i].size() + B - 1) / B : pos[i].size(); R = MyArr2D(3, G); PI = MyArr2D::Ones(C, G); PI.rowwise() /= PI.colwise().sum(); // normalize it per site - for(i = 0, ss = 0; i < nchunks; i++) + dist.reserve(M); + Int1D tmpdist; + for(i = 0, ss = 0, sg = 0; i < nchunks; i++) { pos_chunk[i] = ss; - if(B > 1) - { - grids.emplace_back(divide_pos_into_grid(pos[i], B)); - tmpdist = calc_grid_distance(grids[i]); - } - else - { - tmpdist = calc_position_distance(pos[i]); - } - R.middleCols(ss, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); + collapse.segment(ss, pos[i].size()) = find_grid_to_collapse(pos[i], B); + tmpdist = calc_grid_distance(pos[i], collapse); + R.middleCols(sg, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); dist.insert(dist.end(), tmpdist.begin(), tmpdist.end()); ss += pos[i].size(); + sg += tmpdist.size(); } pos_chunk[nchunks] = ss; // add sentinel if(!rfile.empty()) load_csv(R, rfile, true); er = R.row(0).sqrt(); protect_er(er); R = er2R(er); - cao.warn("init done"); } void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_, bool nQ, bool nP, bool nF, bool nR) diff --git a/src/fastphase.hpp b/src/fastphase.hpp index c365718..cb537b3 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -34,6 +34,7 @@ class FastPhaseK2 minHapfreq = std::min(1.0 / (10 * C), minHapfreq); F = RandomUniform(M, C, rng, alleleEmitThreshold, 1 - alleleEmitThreshold); GP.setZero(M * 3, N); + collapse = Bool1D::Constant(M, false); } ~FastPhaseK2() {} @@ -51,6 +52,7 @@ class FastPhaseK2 Int1D dist; // physical position distance between two markers Int1D pos_chunk; // store the start pos of each chunk in the full scale Int3D grids; // physical position of each grid for each chunk + Bool1D collapse; MyArr1D AF; void initRecombination(const Int2D & pos, std::string rfile = "", int B_ = 1, double Ne = 20000); diff --git a/src/main.cpp b/src/main.cpp index 583de4d..3cfbcc2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -135,7 +135,7 @@ int main(int argc, char * argv[]) .help("collapse SNPs in a reasonable window") .flag(); cmd_impute.add_argument("-B", "--grid-size") - .help("number of SNPs (>1) in each grid. 1 disables collapsing") + .help("number of SNPs (>=3) in each grid. 1 disables collapsing") .default_value(1) .scan<'i', int>(); cmd_impute.add_argument("-f", "--vcf") @@ -180,6 +180,7 @@ int main(int argc, char * argv[]) "3: re-sample P by copying from others with respect to their probability\n" "0: disable this") .default_value(0) + .choices(0, 1, 2, 3) .scan<'i', int>(); cmd_impute.add_argument("--minRecombRate") .help("min recombination rate to determine if a SNP should be collapsed") diff --git a/tests/Makefile b/tests/Makefile index 4c34b95..92451b8 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -5,9 +5,9 @@ INC = -I. -I../src -I../inst/include -I$(HTSDIR) LDFLAGS = -L$(HTSDIR) -Wl,-rpath,$(HTSDIR) LIBS = -lhts -llzma -lbz2 -lm -lz -lpthread -OBJS = test-main.o test-joint.o test-admixture.o test-fastphase.o +OBJS = test-main.o test-joint.o test-admixture.o test-fastphase.o test-utils.o -BINS = test-joint.bin test-admixture.bin test-fastphase.bin +BINS = test-joint.bin test-admixture.bin test-fastphase.bin test-utils.bin DEPS = ../src/phaseless.o ../src/fastphase.o ../src/admixture.o @@ -24,6 +24,7 @@ all: $(BINS) $(OBJS) ${CXX} ${CXXFLAGS} -o $@ $< test-main.o $(DEPS) $(LDFLAGS) $(LIBS) test: $(BINS) + ./test-utils.bin --success ./test-joint.bin --success ./test-admixture.bin --success ./test-fastphase.bin --success diff --git a/tests/test-utils.cpp b/tests/test-utils.cpp new file mode 100644 index 0000000..0400eb9 --- /dev/null +++ b/tests/test-utils.cpp @@ -0,0 +1,18 @@ +#define _DECLARE_TOOLBOX_HERE + +#include "../src/common.hpp" +#include "catch.hh" + +using namespace std; +using namespace Eigen; + +TEST_CASE("calc_grid_distance", "[test-utils]") +{ + // Int1D pos{1,2,3,4,5,6,7,8,9,10,11}; + Int1D pos{1,2,3,4,5,6,7,8,9}; + const auto b = find_grid_to_collapse(pos, 1); + cao.cerr("b:", b); + auto d = calc_grid_distance(pos, b); + // cao.cerr("size:", d.size()); + for(auto i : d) cao.cerr(i); +} From fb14ba8f13a53895c62f049b4d1b4324c87b5c56 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 15:30:47 +0100 Subject: [PATCH 43/67] nail B! --- src/common.hpp | 66 +++++++++++++++++++++++++-------------- src/fastphase.cpp | 78 ++++++++++++++++++++++++++++------------------- src/fastphase.hpp | 2 +- 3 files changed, 91 insertions(+), 55 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 3fbe6c8..0d6a1e0 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -226,30 +226,51 @@ inline Int1D calc_position_distance(const Int1D & markers) return dl; } -inline Int1D calc_grid_distance(const Int1D & pos, const Bool1D & collapse) +// the form is [s,e] +inline Int2D find_grid_start_end(const Bool1D & collapse) { - // B = 1 - if((collapse == true).count() == 0) return calc_position_distance(pos); - // B > 1, split pos into grids const int G = ((collapse == true).count() + 1) / 2; - Int2D gpos(G); - int s, e, g, m = pos.size(); + Int2D gpos(G, Int1D(2)); + int s, e, g, m = collapse.size(); for(s = 0, e = 1, g = 0; g < G; g++) { for(;; e++) if(collapse[e] == true) break; if(g == G - 1) e = m - 1; - gpos[g] = Int1D(pos.begin() + s, pos.begin() + e + 1); + gpos[g][0] = s; + gpos[g][1] = e; // cao.cerr("size of g:", gpos[g].size(), ",s:",s, ",e:",e); // for(auto j : gpos[g]) cao.cerr(j); s = e + 1 >= m ? m - 1 : e + 1; e = s + 1 >= m ? m - 1 : s + 1; } + return gpos; +} + +inline Int2D split_pos_into_grid(const Int1D & pos, const Bool1D & collapse) +{ + const auto se = find_grid_start_end(collapse); + const int G = se.size(); + Int2D gpos(G); + for(int g = 0; g < G; g++) + { + // here we want [s, e) + gpos[g] = Int1D(pos.begin() + se[g][0], pos.begin() + se[g][1] + 1); + } + return gpos; +} + +inline Int1D calc_grid_distance(const Int1D & pos, const Bool1D & collapse) +{ + // B = 1 + if((collapse == true).count() == 0) return calc_position_distance(pos); + // B > 1, split pos into grids + Int2D gpos = split_pos_into_grid(pos, collapse); + const int G = gpos.size(); Int1D dl(G); dl[0] = 0; - for(g = 1; g < G; g++) + for(int g = 1; g < G; g++) { - // cao.cerr(g, ":", gpos[g][gpos[g].size() / 2], ",", gpos[g - 1][gpos[g - 1].size() / 2]); dl[g] = gpos[g][gpos[g].size() / 2] - gpos[g - 1][gpos[g - 1].size() / 2]; } return dl; @@ -316,7 +337,7 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double } // emitDip = emitDip.colwise() / emitDip.rowwise().maxCoeff(); // normalize // emitDip = (emitDip < minEmission).select(minEmission, emitDip); - return emitDip; + return emitDip.transpose(); } /* @@ -326,35 +347,34 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double */ inline MyArr2D get_emission_by_grid(const MyArr2D & gli, const MyArr2D & P, - const Int2D & grids, + const Bool1D & collapse, double minEmission = 1e-10) { + if((collapse == true).count() == 0) return get_emission_by_gl(gli, P, minEmission); const int C = P.cols(); - const int nGrids = grids.size(); - MyArr2D emitGrid = MyArr2D::Ones(C * C, nGrids); - int z1, z2, z12, i, s, e, g, g1, g2, m; + const auto se = find_grid_start_end(collapse); + const int nGrids = se.size(); + MyArr2D emit = MyArr2D::Ones(C * C, nGrids); + int z1, z2, z12, g, g1, g2, m; for(g = 0; g < nGrids; g++) { - s = grids[g][0]; - e = grids[g][grids[g].size()]; for(z1 = 0; z1 < C; z1++) { for(z2 = 0; z2 < C; z2++) { z12 = z1 * C + z2; - for(i = s; i <= e; i++) + for(m = se[g][0]; m <= se[g][1]; m++) { - double emit = 0; - m = i + grids[g].size(); + double iemit = 0; for(g1 = 0; g1 <= 1; g1++) { for(g2 = 0; g2 <= 1; g2++) { - emit += gli(m, g1 + g2) * (g1 * P(m, z1) + (1 - g1) * (1 - P(m, z1))) - * (g2 * P(m, z2) + (1 - g2) * (1 - P(m, z2))); + iemit += gli(m, g1 + g2) * (g1 * P(m, z1) + (1 - g1) * (1 - P(m, z1))) + * (g2 * P(m, z2) + (1 - g2) * (1 - P(m, z2))); } } - emitGrid(z12, g) *= emit; + emit(z12, g) *= iemit; } } } @@ -362,7 +382,7 @@ inline MyArr2D get_emission_by_grid(const MyArr2D & gli, // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); } - return emitGrid; + return emit; } /* diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 349bc64..ee93874 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -15,8 +15,9 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ if(B == 2) cao.error("-B can not be 2"); int nchunks = pos.size(); pos_chunk.resize(nchunks + 1); + grid_chunk.resize(nchunks + 1); int i{0}, ss{0}, sg{0}; - // how many grids in total + // how many grids in total for(i = 0, G = 0; i < nchunks; i++) G += B > 1 ? (pos[i].size() + B - 1) / B : pos[i].size(); R = MyArr2D(3, G); PI = MyArr2D::Ones(C, G); @@ -26,6 +27,7 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ for(i = 0, ss = 0, sg = 0; i < nchunks; i++) { pos_chunk[i] = ss; + grid_chunk[i] = sg; collapse.segment(ss, pos[i].size()) = find_grid_to_collapse(pos[i], B); tmpdist = calc_grid_distance(pos[i], collapse); R.middleCols(sg, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); @@ -34,6 +36,7 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ sg += tmpdist.size(); } pos_chunk[nchunks] = ss; // add sentinel + grid_chunk[nchunks] = sg; // add sentinel if(!rfile.empty()) load_csv(R, rfile, true); er = R.row(0).sqrt(); protect_er(er); @@ -90,10 +93,10 @@ void FastPhaseK2::refillHaps(int strategy) void FastPhaseK2::initIteration() { // initial temp variables - Ezj.setZero(C, M); // reset post(Z,j) Ezg1.setZero(C, M); // reset pos(Z,g) Ezg2.setZero(C, M); // reset pos(Z,g) - HapSum.setZero(C, M); // reset post(Z,j) + Ezj.setZero(C, G); // reset post(Z,j) + HapSum.setZero(C, G); // reset post(Z,j) } void FastPhaseK2::updateIteration() @@ -160,47 +163,59 @@ void FastPhaseK2::protectPars() */ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const int ind, bool finalIter) { - const int S = pos_chunk[ic + 1] - pos_chunk[ic]; // could be num of grids if B > 1 + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + const int nGrids = grid_chunk[ic + 1] - grid_chunk[ic]; Eigen::Map gli(GL.data() + ind * S * 3, S, 3); - MyArr2D emit = get_emission_by_gl(gli, F.middleRows(pos_chunk[ic], S)).transpose(); // CC x S + MyArr2D emit = get_emission_by_grid(gli, F.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); + int start = pos_chunk[ic], nsize = S; + if(nGrids != S) + { + start = grid_chunk[ic]; + nsize = nGrids; + } const auto [alpha, beta, cs] = - forward_backwards_diploid(emit, R.middleCols(pos_chunk[ic], S), PI.middleCols(pos_chunk[ic], S)); + forward_backwards_diploid(emit, R.middleCols(start, nsize), PI.middleCols(start, nsize)); if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) cao.error((alpha * beta).colwise().sum(), "\ngamma sum is not 1.0!\n"); // now get posterios - MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, S), gammaC(C, S); + MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, nGrids), gammaC(C, nGrids); MyArr1D gamma_div_emit(CC), beta_mult_emit(CC); MyArr1D alphatmp(C); - int z1, m, s; - for(s = 0; s < S; s++) + int z1, m, s, g{0}, gg{0}; + const auto se = find_grid_start_end(collapse.segment(pos_chunk[ic], S)); + for(g = 0; g < nGrids; g++) { - m = s + pos_chunk[ic]; - gamma_div_emit = (alpha.col(s) * beta.col(s)) / emit.col(s); // C2 - gammaC.col(s) = (alpha.col(s) * beta.col(s)).reshaped(C, C).colwise().sum(); + gg = g + grid_chunk[ic]; + gamma_div_emit = (alpha.col(g) * beta.col(g)) / emit.col(g); // C2 + gammaC.col(g) = (alpha.col(g) * beta.col(g)).reshaped(C, C).colwise().sum(); for(z1 = 0; z1 < C; z1++) { - ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) - * (gli(s, 0) * (1 - F.row(m)) + gli(s, 1) * F.row(m)).transpose()) - .sum(); - ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(m, z1)) - * (gli(s, 1) * (1 - F.row(m)) + gli(s, 2) * F.row(m)).transpose()) - .sum(); - if(s > 0) alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), s - 1).sum() * R(1, m); - if(s == 0) ind_post_zj(z1, s) = (alpha.col(0) * beta.col(0)).segment(z1 * C, C).sum(); - if(finalIter) callGenoLoopC(z1, m, ind, gli.row(s), F.row(m), gamma_div_emit); + for(s = se[g][0]; s <= se[g][1]; s++) + { + m = s + pos_chunk[ic]; + ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) + * (gli(s, 0) * (1 - F.row(m)) + gli(s, 1) * F.row(m)).transpose()) + .sum(); + ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(m, z1)) + * (gli(s, 1) * (1 - F.row(m)) + gli(s, 2) * F.row(m)).transpose()) + .sum(); + if(finalIter) callGenoLoopC(z1, m, ind, gli.row(s), F.row(m), gamma_div_emit); + } + if(g == 0) ind_post_zj(z1, g) = (alpha.col(g) * beta.col(g)).segment(z1 * C, C).sum(); + if(g > 0) alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), g - 1).sum() * R(1, gg); } - if(s == 0) continue; - alphatmp += PI.col(m) * R(2, m) * 1.0; // inner alpha.col(s-1).sum == 1 - beta_mult_emit = emit.col(s) * beta.col(s); // C2 + if(g == 0) continue; + alphatmp += PI.col(gg) * R(2, gg) * 1.0; // inner alpha.col(s-1).sum == 1 + beta_mult_emit = emit.col(g) * beta.col(g); // C2 for(z1 = 0; z1 < C; z1++) - ind_post_zj(z1, s) = cs(s) * (PI(z1, m) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); + ind_post_zj(z1, g) = cs(g) * (PI(z1, gg) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); } { // sum over all samples for updates std::scoped_lock lock(mutex_it); - Ezj.middleCols(pos_chunk[ic], S) += ind_post_zj; Ezg1.middleCols(pos_chunk[ic], S) += ind_post_zg1; Ezg2.middleCols(pos_chunk[ic], S) += ind_post_zg2; - HapSum.middleCols(pos_chunk[ic], S) += gammaC; + Ezj.middleCols(grid_chunk[ic], nGrids) += ind_post_zj; + HapSum.middleCols(pos_chunk[ic], nGrids) += gammaC; } return (1 / cs).log().sum(); @@ -254,7 +269,7 @@ int run_impute_main(Options & opts) vector> res; FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); - faith.initRecombination(genome->pos, opts.in_rfile); + faith.initRecombination(genome->pos, opts.in_rfile, opts.gridsize); double loglike, diff, prevlike{std::numeric_limits::lowest()}; for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { @@ -290,11 +305,12 @@ int run_impute_main(Options & opts) for(int ic = 0; ic < genome->nchunks; ic++) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; - MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], S); + const int G = faith.grid_chunk[ic + 1] - faith.grid_chunk[ic]; + MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], G); genome->AE.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.R.middleCols(faith.pos_chunk[ic], S); + out = faith.R.middleCols(faith.pos_chunk[ic], G); genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.PI.middleCols(faith.pos_chunk[ic], S); + out = faith.PI.middleCols(faith.pos_chunk[ic], G); genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); out = faith.F.middleRows(faith.pos_chunk[ic], S); genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); diff --git a/src/fastphase.hpp b/src/fastphase.hpp index cb537b3..de8fe40 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -50,7 +50,7 @@ class FastPhaseK2 MyArr2D Ezj; // C x M, E(Z=z,J=1|X,par), expectation of switch into state k MyArr2D HapSum; // C x M, sum(gammaK) for all inds Int1D dist; // physical position distance between two markers - Int1D pos_chunk; // store the start pos of each chunk in the full scale + Int1D pos_chunk, grid_chunk; // store the start pos of each chunk in the full scale Int3D grids; // physical position of each grid for each chunk Bool1D collapse; MyArr1D AF; From 2aa97b8afe25fa94a4d62214a966e58564935524 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 15:40:42 +0100 Subject: [PATCH 44/67] add collapse and rename F for bigass --- src/admixture.cpp | 6 +++--- src/common.hpp | 4 +++- src/fastphase.cpp | 33 +++++++++++++++++---------------- src/fastphase.hpp | 4 ++-- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index c5b1d5d..7879ca4 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -14,7 +14,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr2D kapa, Ekg; MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); - double norm = 0, llike = 0, tmp = 0, tol = 0.01; + double norm = 0, llike = 0, tmp = 0; int c1, k1, s, c2, c12; for(int ic = 0, m = 0; ic < genome->nchunks; ic++) { @@ -22,7 +22,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & const int G = genome->B > 1 ? (S + genome->B - 1) / genome->B : S; assert(S == G); // only test B=1 now Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); - Eigen::Map P(genome->F[ic].data(), S, C); + Eigen::Map P(genome->P[ic].data(), S, C); Eigen::Map PI(genome->PI[ic].data(), C, S); Eigen::Map R(genome->R[ic].data(), 3, S); Eigen::Map AE(genome->AE[ic].data(), C * C, S); @@ -76,7 +76,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { const int S = genome->pos[ic].size(); Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); - Eigen::Map P(genome->F[ic].data(), S, C); + Eigen::Map P(genome->P[ic].data(), S, C); Eigen::Map PI(genome->PI[ic].data(), C, S); Eigen::Map R(genome->R[ic].data(), 3, S); Eigen::Map AE(genome->AE[ic].data(), C * C, S); diff --git a/src/common.hpp b/src/common.hpp index 0d6a1e0..7fbf2d0 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -44,6 +44,7 @@ inline void handler(int s) } // STD TYPES +using Char1D = std::vector; using Int1D = std::vector; using Int2D = std::vector; using Int3D = std::vector; @@ -112,8 +113,9 @@ struct Options struct BigAss { int chunksize, nsamples, nsnps, nchunks, B, C, G; - MyFloat2D PI, F, R, AE; // M x C, 3 x M, fastphase pars + MyFloat2D PI, P, R, AE; // M x C, 3 x M, fastphase pars Int1D ends; // chunk index where each chromo ends + Char1D collapse; // indicators for collapsing String1D sampleids, chrs; Int2D pos; // store position of markers of each chunk MyFloat2D gls; // store gl(N, M*3) of each chunk diff --git a/src/fastphase.cpp b/src/fastphase.cpp index ee93874..fe00008 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -73,16 +73,16 @@ void FastPhaseK2::refillHaps(int strategy) assert(choice != c); if(strategy == 1) { - F(m, c) = alleleEmitThreshold; + P(m, c) = alleleEmitThreshold; } else if(strategy == 2) { h.maxCoeff(&choice); // if no binning, this may be better - F(m, c) = F(m, choice); + P(m, c) = P(m, choice); } else { - F(m, c) = F(m, choice); + P(m, c) = P(m, choice); } s++; } @@ -104,7 +104,7 @@ void FastPhaseK2::updateIteration() // update R if(!NR) er = 1.0 - Ezj.colwise().sum() / N; // update F - if(!NP) F = (Ezg2 / (Ezg1 + Ezg2)).transpose(); + if(!NP) P = (Ezg2 / (Ezg1 + Ezg2)).transpose(); // update PI if(!NF) { @@ -119,15 +119,15 @@ void FastPhaseK2::protectPars() // protect F if(!NP) { - if(F.isNaN().any()) + if(P.isNaN().any()) { cao.warn("NaN in F in FastPhaseK2 model. will fill it with AF"); if(AF.size() == 0) cao.error("AF is not assigned!\n"); - for(int i = 0; i < M; i++) F.row(i) = F.row(i).isNaN().select(AF(i), F.row(i)); + for(int i = 0; i < M; i++) P.row(i) = P.row(i).isNaN().select(AF(i), P.row(i)); } // map F to domain but no normalization - F = (F < alleleEmitThreshold).select(alleleEmitThreshold, F); // lower bound - F = (F > 1 - alleleEmitThreshold).select(1 - alleleEmitThreshold, F); // upper bound + P = (P < alleleEmitThreshold).select(alleleEmitThreshold, P); // lower bound + P = (P > 1 - alleleEmitThreshold).select(1 - alleleEmitThreshold, P); // upper bound } // protect R if(!NR) @@ -166,7 +166,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i const int S = pos_chunk[ic + 1] - pos_chunk[ic]; const int nGrids = grid_chunk[ic + 1] - grid_chunk[ic]; Eigen::Map gli(GL.data() + ind * S * 3, S, 3); - MyArr2D emit = get_emission_by_grid(gli, F.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); + MyArr2D emit = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); int start = pos_chunk[ic], nsize = S; if(nGrids != S) { @@ -193,13 +193,13 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i for(s = se[g][0]; s <= se[g][1]; s++) { m = s + pos_chunk[ic]; - ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - F(m, z1)) - * (gli(s, 0) * (1 - F.row(m)) + gli(s, 1) * F.row(m)).transpose()) + ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - P(m, z1)) + * (gli(s, 0) * (1 - P.row(m)) + gli(s, 1) * P.row(m)).transpose()) .sum(); - ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (F(m, z1)) - * (gli(s, 1) * (1 - F.row(m)) + gli(s, 2) * F.row(m)).transpose()) + ind_post_zg2(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (P(m, z1)) + * (gli(s, 1) * (1 - P.row(m)) + gli(s, 2) * P.row(m)).transpose()) .sum(); - if(finalIter) callGenoLoopC(z1, m, ind, gli.row(s), F.row(m), gamma_div_emit); + if(finalIter) callGenoLoopC(z1, m, ind, gli.row(s), P.row(m), gamma_div_emit); } if(g == 0) ind_post_zj(z1, g) = (alpha.col(g) * beta.col(g)).segment(z1 * C, C).sum(); if(g > 0) alphatmp(z1) = alpha(Eigen::seqN(z1, C, C), g - 1).sum() * R(1, gg); @@ -302,6 +302,7 @@ int run_impute_main(Options & opts) faith.Ezj = get_cluster_frequency(faith.R, faith.PI); } auto bw = make_bcfwriter(opts.out + ".vcf.gz", genome->chrs, genome->sampleids); + genome->collapse = Char1D(faith.collapse.data(), faith.collapse.data() + faith.collapse.size()); for(int ic = 0; ic < genome->nchunks; ic++) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; @@ -312,8 +313,8 @@ int run_impute_main(Options & opts) genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); out = faith.PI.middleCols(faith.pos_chunk[ic], G); genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.F.middleRows(faith.pos_chunk[ic], S); - genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.P.middleRows(faith.pos_chunk[ic], S); + genome->P.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); out = faith.GP.middleRows(faith.pos_chunk[ic], S * 3); write_bigass_to_bcf(bw, out.data(), genome->chrs[ic], genome->pos[ic]); } diff --git a/src/fastphase.hpp b/src/fastphase.hpp index de8fe40..71b4182 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -32,7 +32,7 @@ class FastPhaseK2 { rng.seed(seed); minHapfreq = std::min(1.0 / (10 * C), minHapfreq); - F = RandomUniform(M, C, rng, alleleEmitThreshold, 1 - alleleEmitThreshold); + P = RandomUniform(M, C, rng, alleleEmitThreshold, 1 - alleleEmitThreshold); GP.setZero(M * 3, N); collapse = Bool1D::Constant(M, false); } @@ -43,7 +43,7 @@ class FastPhaseK2 const int N, M, C, CC; // CC = C x C MyArr2D GP; // N x (M x 3), genotype probabilies for all individuals MyArr2D PI; // C x M, cluster frequency - MyArr2D F; // M x C, cluster-specific allele frequence + MyArr2D P; // M x C, cluster-specific allele frequence MyArr1D er; // M, jumping rate MyArr2D R; // 3 x M, jumping / recombination rate MyArr2D Ezg1, Ezg2; // C x M From 46d8a5f2e3f85b97ddb85f387685613a8d500233 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 21:45:10 +0100 Subject: [PATCH 45/67] update admix and emission function --- src/admixture.cpp | 94 +++++++++++++++++++--------------------- src/admixture.hpp | 8 ++-- src/common.hpp | 37 ++++++++++------ src/fastphase.cpp | 3 ++ src/fastphase.hpp | 5 +-- src/io.hpp | 3 -- src/phaseless.cpp | 2 +- tests/test-admixture.cpp | 12 ++--- tests/test-utils.cpp | 10 +++-- 9 files changed, 93 insertions(+), 81 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 7879ca4..275fe21 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -15,24 +15,23 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); double norm = 0, llike = 0, tmp = 0; - int c1, k1, s, c2, c12; - for(int ic = 0, m = 0; ic < genome->nchunks; ic++) + int c1, k1, s, c2, c12, ss, ic, g; + for(ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); - const int G = genome->B > 1 ? (S + genome->B - 1) / genome->B : S; - assert(S == G); // only test B=1 now + const int nGrids = grids[ic]; Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); Eigen::Map P(genome->P[ic].data(), S, C); - Eigen::Map PI(genome->PI[ic].data(), C, S); - Eigen::Map R(genome->R[ic].data(), 3, S); - Eigen::Map AE(genome->AE[ic].data(), C * C, S); - const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE); - const int nGrids = cl.cols(); + Eigen::Map PI(genome->PI[ic].data(), C, nGrids); + Eigen::Map R(genome->R[ic].data(), 3, nGrids); + Eigen::Map AE(genome->AE[ic].data(), C * C, nGrids); + const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE, collapse.segment(ss, S)); + ss += S; kapa.setZero(C * K, nGrids); // C x K x M layout Ekg.setZero(K, nGrids); - for(s = 0; s < nGrids; s++, m++) + for(s = 0; s < nGrids; s++, g++) { - for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), m)).sum(); + for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); for(norm = 0, c1 = 0; c1 < C; c1++) { for(tmp = 0, c2 = 0; c2 < C; c2++) @@ -44,7 +43,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & tmp += xz * zy; } norm += tmp; - kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), m)) * tmp / Hz(c1); + kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); } llike += log(norm); kapa.col(s) /= kapa.col(s).sum(); @@ -53,12 +52,12 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & iQ += Ekg.rowwise().sum(); { // for update F std::scoped_lock lock(mutex_it); // sum over all samples - Ekc.middleCols(m - nGrids, nGrids) += 2 * kapa; - NormF.middleCols(m - nGrids, nGrids) += Ekg; + Ekc.middleCols(g - nGrids, nGrids) += 2 * kapa; + NormF.middleCols(g - nGrids, nGrids) += Ekg; } } // update Q, iQ.sum() should be 2M - if(!nonewQ) Q.col(ind) = iQ / (2 * M); + if(!nonewQ) Q.col(ind) = iQ / (2 * G); return llike; } @@ -70,21 +69,22 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g MyArr2D Ekg, iEkc; double norm = 0, llike = 0; int c1, c2, c12, cc; - int k1, k2, k12, s; + int k1, k2, k12, s, ss, ic, g; MyArr1D iQ = MyArr1D::Zero(K); - for(int ic = 0, m = 0; ic < genome->nchunks; ic++) + for(ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); + const int nGrids = grids[ic]; Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); Eigen::Map P(genome->P[ic].data(), S, C); - Eigen::Map PI(genome->PI[ic].data(), C, S); - Eigen::Map R(genome->R[ic].data(), 3, S); - Eigen::Map AE(genome->AE[ic].data(), C * C, S); - const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE); - const int nGrids = cl.cols(); + Eigen::Map PI(genome->PI[ic].data(), C, nGrids); + Eigen::Map R(genome->R[ic].data(), 3, nGrids); + Eigen::Map AE(genome->AE[ic].data(), C * C, nGrids); + const auto cl = get_cluster_likelihoods(gli, P, R, PI, AE, collapse.segment(ss, S)); + ss += S; iEkc.setZero(C * K, nGrids); Ekg.setZero(K, nGrids); - for(s = 0; s < nGrids; s++, m++) + for(s = 0; s < nGrids; s++, g++) { for(norm = 0, cc = 0, c1 = 0; c1 < C; c1++) { @@ -97,7 +97,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g for(k2 = 0; k2 < K; k2++) { k12 = k1 * K + k2; - w(cc, k12) = xz * F(k1 * C + c1, m) * Q(k1, ind) * F(k2 * C + c2, m) * Q(k2, ind); + w(cc, k12) = xz * F(k1 * C + c1, g) * Q(k1, ind) * F(k2 * C + c2, g) * Q(k2, ind); if(c1 != c2) w(cc, k12) *= 2; norm += w(cc, k12); } @@ -129,20 +129,20 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g iQ += Ekg.rowwise().sum(); { std::scoped_lock lock(mutex_it); // sum over all samples - Ekc.middleCols(m - nGrids, nGrids) += iEkc; - NormF.middleCols(m - nGrids, nGrids) += Ekg; + Ekc.middleCols(g - nGrids, nGrids) += iEkc; + NormF.middleCols(g - nGrids, nGrids) += Ekg; } } // update Q, iQ.sum() should be 2M - if(!nonewQ) Q.col(ind) = iQ / (2 * M); + if(!nonewQ) Q.col(ind) = iQ / (2 * G); return llike; } void Admixture::initIteration() { - Ekc.setZero(C * K, M); - NormF.setZero(K, M); + Ekc.setZero(C * K, G); + NormF.setZero(K, G); } void Admixture::updateIteration() @@ -177,8 +177,8 @@ void Admixture::constrainF() if(cF) { for(int c = 0; c < C; c++) - for(int m = 0; m < M; m++) - if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); + for(int g = 0; g < G; g++) + if(F(k * C + c, g) < P(c, g)) F(k * C + c, g) = P(c, g); } F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } @@ -186,25 +186,22 @@ void Admixture::constrainF() void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile) { - P = MyArr2D(C, M); - for(int ic = 0, m = 0; ic < genome->nchunks; ic++) + P = MyArr2D(C, G); + collapse = Bool1D::Constant(genome->nsnps, false); + int ic{0}, m{0}; + for(auto c : genome->collapse) collapse(ic++) = (c == 1); + grids.resize(genome->nchunks); + for(ic = 0, m = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); - Eigen::Map AE(genome->AE[ic].data(), C * C, S); - for(int s = 0; s < S; s++) P.col(m + s) = AE.col(s).reshaped(C, C).colwise().sum(); - m += S; + const auto se = find_grid_start_end(collapse.segment(m, S)); + const int iG = se.size(); + grids[ic] = iG; + Eigen::Map AE(genome->AE[ic].data(), C * C, iG); + for(int g = 0; g < iG; g++) P.col(m + g) = AE.col(g).reshaped(C, C).colwise().sum(); + m += iG; } - if(cF) - { - for(int k = 0; k < K; k++) - { - for(int c = 0; c < C; c++) - for(int m = 0; m < M; m++) - if(F(k * C + c, m) < P(c, m)) F(k * C + c, m) = P(c, m); - F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); - } - } - + assert(m == G); if(!qfile.empty()) load_csv(Q, qfile); } @@ -236,10 +233,9 @@ int run_admix_main(Options & opts) cao.done(tim.date(), filesize, " bytes deserialized from file. skip imputation, ec", ec); cao.print(tim.date(), "parsing input -> C =", genome->C, ", N =", genome->nsamples, ", M =", genome->nsnps, ", nchunks =", genome->nchunks, ", B =", genome->B, ", G =", genome->G); - assert(opts.K < genome->C); - cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); + cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); admixer.setFlags(opts.debug, opts.nQ, opts.cF); admixer.setStartPoint(genome, opts.in_qfile); vector> llike; diff --git a/src/admixture.hpp b/src/admixture.hpp index bb9c508..62ee4d3 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -23,12 +23,12 @@ class Admixture bool cF = false; public: - Admixture(int n, int m, int c, int k, int seed) : N(n), M(m), C(c), K(k) + Admixture(int n, int m, int c, int k, int seed) : N(n), G(m), C(c), K(k) { rng.seed(seed); Q = RandomUniform(K, N, rng, admixtureThreshold, 1 - admixtureThreshold); Q.rowwise() /= Q.colwise().sum(); // normalize Q per individual - F = RandomUniform(C * K, M, rng, clusterFreqThreshold, + F = RandomUniform(C * K, G, rng, clusterFreqThreshold, 1 - clusterFreqThreshold); for(int k = 0; k < K; k++) F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } @@ -36,12 +36,14 @@ class Admixture ~Admixture() {} // SHARED VARIBALES - const int N, M, C, K; // M: number of grids in total, C2 = C x C + const int N, G, C, K; // M: number of grids in total MyArr2D F; // (C x K) x M MyArr2D P; // C x M, for each k, F <= P MyArr2D Q; // K x N MyArr2D Ekc; // (C * K) x M, expected number of alleles per c per k MyArr2D NormF; // K x M + Bool1D collapse; + Int1D grids; void initIteration(); void updateIteration(); diff --git a/src/common.hpp b/src/common.hpp index 7fbf2d0..7c33b05 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -231,20 +231,30 @@ inline Int1D calc_position_distance(const Int1D & markers) // the form is [s,e] inline Int2D find_grid_start_end(const Bool1D & collapse) { - const int G = ((collapse == true).count() + 1) / 2; - Int2D gpos(G, Int1D(2)); + int G = ((collapse == true).count() + 1) / 2; int s, e, g, m = collapse.size(); + bool b1 = ((collapse == true).count() == 0); + if(b1) G = m; + Int2D gpos(G, Int1D(2)); for(s = 0, e = 1, g = 0; g < G; g++) { - for(;; e++) - if(collapse[e] == true) break; - if(g == G - 1) e = m - 1; - gpos[g][0] = s; - gpos[g][1] = e; - // cao.cerr("size of g:", gpos[g].size(), ",s:",s, ",e:",e); - // for(auto j : gpos[g]) cao.cerr(j); - s = e + 1 >= m ? m - 1 : e + 1; - e = s + 1 >= m ? m - 1 : s + 1; + if(b1) + { + gpos[g][0] = g; + gpos[g][1] = g; + } + else + { + for(;; e++) + if(collapse[e] == true) break; + if(g == G - 1) e = m - 1; + gpos[g][0] = s; + gpos[g][1] = e; + // cao.cerr("size of g:", gpos[g].size(), ",s:",s, ",e:",e); + // for(auto j : gpos[g]) cao.cerr(j); + s = e + 1 >= m ? m - 1 : e + 1; + e = s + 1 >= m ? m - 1 : s + 1; + } } return gpos; } @@ -498,9 +508,10 @@ inline auto get_cluster_likelihoods(const MyArr2D & gli, const MyArr2D & R, const MyArr2D & PI, const MyArr2D & AE, + const Bool1D & collapse, const double minEmission = 1e-10) { - MyArr2D emit = get_emission_by_gl(gli, P).transpose(); // CC x S + MyArr2D emit = get_emission_by_grid(gli, P, collapse); // CC x S const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R, PI); // reuse emit emit = (alpha * beta) / AE; @@ -579,7 +590,7 @@ inline Bool1D find_grid_to_collapse(const MyArr2D & R, double tol_r = 1e-6) inline Int2D divide_pos_into_grid(const Int1D & pos, const Bool1D & collapse) { - assert(pos.size() == collapse.size()); + assert((int)pos.size() == (int)collapse.size()); Int2D grids; for(auto i = 0; i < collapse.size(); i++) { diff --git a/src/fastphase.cpp b/src/fastphase.cpp index fe00008..ef37f5f 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -270,6 +270,9 @@ int run_impute_main(Options & opts) FastPhaseK2 faith(genome->nsamples, genome->nsnps, opts.C, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); faith.initRecombination(genome->pos, opts.in_rfile, opts.gridsize); + genome->G = faith.G; + cao.print(tim.date(), "parsing input -> C =", genome->C, ", N =", genome->nsamples, ", M =", genome->nsnps, + ", nchunks =", genome->nchunks, ", B =", opts.gridsize, ", G =", genome->G, ", seed =", opts.seed); double loglike, diff, prevlike{std::numeric_limits::lowest()}; for(int it = 0; SIG_COND && it <= opts.nimpute; it++) { diff --git a/src/fastphase.hpp b/src/fastphase.hpp index 71b4182..cc217a2 100644 --- a/src/fastphase.hpp +++ b/src/fastphase.hpp @@ -24,8 +24,6 @@ class FastPhaseK2 double minHapfreq{0.01}; // min haplotype frequency, or min(1/(10*C), 1/100) // FLAGS bool debug{0}, local{0}, post{1}, NQ{0}, NF{0}, NP{1}, NR{1}; - int G{0}, B{1}; // G: number of grids after collapsing block - double nGen; public: FastPhaseK2(int n, int m, int c, int seed) : N(n), M(m), C(c), CC(c * c) @@ -40,6 +38,8 @@ class FastPhaseK2 // SHARED VARIBALES + int G{0}, B{1}; // G: number of grids after collapsing block + double nGen; const int N, M, C, CC; // CC = C x C MyArr2D GP; // N x (M x 3), genotype probabilies for all individuals MyArr2D PI; // C x M, cluster frequency @@ -51,7 +51,6 @@ class FastPhaseK2 MyArr2D HapSum; // C x M, sum(gammaK) for all inds Int1D dist; // physical position distance between two markers Int1D pos_chunk, grid_chunk; // store the start pos of each chunk in the full scale - Int3D grids; // physical position of each grid for each chunk Bool1D collapse; MyArr1D AF; diff --git a/src/io.hpp b/src/io.hpp index 5ea5508..042eb34 100644 --- a/src/io.hpp +++ b/src/io.hpp @@ -569,9 +569,6 @@ inline void init_bigass(const std::unique_ptr & genome, const Options & genome->G = G; if(genome->B == 1 && genome->G != genome->nsnps) cao.error("number of grids should be same as snps if B=1"); - cao.print(tim.date(), "parsing input -> C =", genome->C, ", N =", genome->nsamples, - ", M =", genome->nsnps, ", nchunks =", genome->nchunks, ", B =", opts.gridsize, - ", seed =", opts.seed); cao.done(tim.date(), "elapsed time for parsing beagle file", std::fixed, tim.reltime(), " secs"); } diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 63a8c03..6bd33f2 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -237,7 +237,7 @@ double Phaseless::runForwardBackwards(const int ind, const int ic, const MyFloat { const int S = pos_chunk[ic + 1] - pos_chunk[ic]; Eigen::Map gli(GL.data() + ind * S * 3, S, 3); - MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk[ic], S)).transpose(); // CC x S + MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk[ic], S)); // CC x S // first get H ie old PI in fastphase MyArr2D H = MyArr2D::Zero(C, S); int z1, y1, s; diff --git a/tests/test-admixture.cpp b/tests/test-admixture.cpp index 4138fe2..70af5dc 100644 --- a/tests/test-admixture.cpp +++ b/tests/test-admixture.cpp @@ -34,17 +34,19 @@ TEST_CASE("phaseless naive vs dump dataset 1", "[test-phaseless]") } // reuse Ezj for AE faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + genome->collapse = Char1D(faith.collapse.data(), faith.collapse.data() + faith.collapse.size()); for(int ic = 0; ic < genome->nchunks; ic++) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; - MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], S); + const int G = faith.grid_chunk[ic + 1] - faith.grid_chunk[ic]; + MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], G); genome->AE.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.R.middleCols(faith.pos_chunk[ic], S); + out = faith.R.middleCols(faith.pos_chunk[ic], G); genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.PI.middleCols(faith.pos_chunk[ic], S); + out = faith.PI.middleCols(faith.pos_chunk[ic], G); genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.F.middleRows(faith.pos_chunk[ic], S); - genome->F.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); + out = faith.P.middleRows(faith.pos_chunk[ic], S); + genome->P.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); } double llike1, llike2; Admixture admixer1(genome->nsamples, genome->nsnps, genome->C, K, seed); diff --git a/tests/test-utils.cpp b/tests/test-utils.cpp index 0400eb9..70bdc00 100644 --- a/tests/test-utils.cpp +++ b/tests/test-utils.cpp @@ -9,10 +9,12 @@ using namespace Eigen; TEST_CASE("calc_grid_distance", "[test-utils]") { // Int1D pos{1,2,3,4,5,6,7,8,9,10,11}; - Int1D pos{1,2,3,4,5,6,7,8,9}; - const auto b = find_grid_to_collapse(pos, 1); - cao.cerr("b:", b); + Int1D pos{1, 2, 3, 4, 5, 6, 7, 8, 9}; + const auto b = find_grid_to_collapse(pos, 4); + + Char1D cb(b.size()); + for(int i = 0; i < b.size();i++) cb[i] = (b(i)==true); + for(auto i : cb) cao.cerr(i==1); auto d = calc_grid_distance(pos, b); // cao.cerr("size:", d.size()); - for(auto i : d) cao.cerr(i); } From 7999b03b04710cfdf7ccd62131f62c2adcb1bfbd Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 21:54:50 +0100 Subject: [PATCH 46/67] update cli --- src/main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 3cfbcc2..6ba456d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -180,7 +180,6 @@ int main(int argc, char * argv[]) "3: re-sample P by copying from others with respect to their probability\n" "0: disable this") .default_value(0) - .choices(0, 1, 2, 3) .scan<'i', int>(); cmd_impute.add_argument("--minRecombRate") .help("min recombination rate to determine if a SNP should be collapsed") From df5b0fdf784e4e3cb24cf9a4dd81c78338ba3bd9 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 6 Feb 2024 22:36:08 +0100 Subject: [PATCH 47/67] fix out of range --- src/common.hpp | 1 + src/fastphase.cpp | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 7c33b05..dcf9fc5 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -274,6 +274,7 @@ inline Int2D split_pos_into_grid(const Int1D & pos, const Bool1D & collapse) inline Int1D calc_grid_distance(const Int1D & pos, const Bool1D & collapse) { + assert(pos.size() == collapse.size()); // B = 1 if((collapse == true).count() == 0) return calc_position_distance(pos); // B > 1, split pos into grids diff --git a/src/fastphase.cpp b/src/fastphase.cpp index ef37f5f..d57d9f9 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -22,16 +22,16 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ R = MyArr2D(3, G); PI = MyArr2D::Ones(C, G); PI.rowwise() /= PI.colwise().sum(); // normalize it per site - dist.reserve(M); + dist.reserve(G); Int1D tmpdist; for(i = 0, ss = 0, sg = 0; i < nchunks; i++) { pos_chunk[i] = ss; grid_chunk[i] = sg; collapse.segment(ss, pos[i].size()) = find_grid_to_collapse(pos[i], B); - tmpdist = calc_grid_distance(pos[i], collapse); - R.middleCols(sg, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); + tmpdist = calc_grid_distance(pos[i], collapse.segment(ss, pos[i].size())); dist.insert(dist.end(), tmpdist.begin(), tmpdist.end()); + R.middleCols(sg, tmpdist.size()) = calc_transRate_diploid(tmpdist, nGen); ss += pos[i].size(); sg += tmpdist.size(); } @@ -58,13 +58,14 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ void FastPhaseK2::refillHaps(int strategy) { int s{0}; + int nchunks = pos_chunk.size() - 1; for(int c = 0; c < C; c++) { // bin hapsum per 100 snps ? - for(int m = 0; m < M; m++) + for(int ic = 0, g = 0; ic < nchunks; ic++) { - if(HapSum(c, m) >= minHapfreq) continue; - MyArr1D h = HapSum.col(m); + if(HapSum(c, g) >= minHapfreq) continue; + MyArr1D h = HapSum.col(g); h(c) = 0; // do not re-sample current h /= h.sum(); MyFloat1D p(h.data(), h.data() + h.size()); @@ -73,16 +74,16 @@ void FastPhaseK2::refillHaps(int strategy) assert(choice != c); if(strategy == 1) { - P(m, c) = alleleEmitThreshold; + P(g, c) = alleleEmitThreshold; } else if(strategy == 2) { h.maxCoeff(&choice); // if no binning, this may be better - P(m, c) = P(m, choice); + P(g, c) = P(g, choice); } else { - P(m, c) = P(m, choice); + P(g, c) = P(g, choice); } s++; } @@ -166,13 +167,13 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i const int S = pos_chunk[ic + 1] - pos_chunk[ic]; const int nGrids = grid_chunk[ic + 1] - grid_chunk[ic]; Eigen::Map gli(GL.data() + ind * S * 3, S, 3); - MyArr2D emit = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); int start = pos_chunk[ic], nsize = S; if(nGrids != S) { start = grid_chunk[ic]; nsize = nGrids; } + MyArr2D emit = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R.middleCols(start, nsize), PI.middleCols(start, nsize)); if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) @@ -215,7 +216,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i Ezg1.middleCols(pos_chunk[ic], S) += ind_post_zg1; Ezg2.middleCols(pos_chunk[ic], S) += ind_post_zg2; Ezj.middleCols(grid_chunk[ic], nGrids) += ind_post_zj; - HapSum.middleCols(pos_chunk[ic], nGrids) += gammaC; + HapSum.middleCols(grid_chunk[ic], nGrids) += gammaC; } return (1 / cs).log().sum(); @@ -310,11 +311,11 @@ int run_impute_main(Options & opts) { const int S = faith.pos_chunk[ic + 1] - faith.pos_chunk[ic]; const int G = faith.grid_chunk[ic + 1] - faith.grid_chunk[ic]; - MyArr2D out = faith.Ezj.middleCols(faith.pos_chunk[ic], G); + MyArr2D out = faith.Ezj.middleCols(faith.grid_chunk[ic], G); genome->AE.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.R.middleCols(faith.pos_chunk[ic], G); + out = faith.R.middleCols(faith.grid_chunk[ic], G); genome->R.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); - out = faith.PI.middleCols(faith.pos_chunk[ic], G); + out = faith.PI.middleCols(faith.grid_chunk[ic], G); genome->PI.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); out = faith.P.middleRows(faith.pos_chunk[ic], S); genome->P.emplace_back(MyFloat1D(out.data(), out.data() + out.size())); From 738c400c50d02e106f2f4cf5ce158247ff2bb22c Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 10:19:14 +0100 Subject: [PATCH 48/67] fix snp index --- src/admixture.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 275fe21..be4d2c7 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -188,20 +188,21 @@ void Admixture::setStartPoint(const std::unique_ptr & genome, std::strin { P = MyArr2D(C, G); collapse = Bool1D::Constant(genome->nsnps, false); - int ic{0}, m{0}; + int ic{0}, sg{0}, ss{0}; for(auto c : genome->collapse) collapse(ic++) = (c == 1); grids.resize(genome->nchunks); - for(ic = 0, m = 0; ic < genome->nchunks; ic++) + for(ic = 0, sg = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); - const auto se = find_grid_start_end(collapse.segment(m, S)); + const auto se = find_grid_start_end(collapse.segment(ss, S)); const int iG = se.size(); grids[ic] = iG; Eigen::Map AE(genome->AE[ic].data(), C * C, iG); - for(int g = 0; g < iG; g++) P.col(m + g) = AE.col(g).reshaped(C, C).colwise().sum(); - m += iG; + for(int g = 0; g < iG; g++) P.col(sg + g) = AE.col(g).reshaped(C, C).colwise().sum(); + sg += iG; + ss += S; } - assert(m == G); + assert(sg == G); if(!qfile.empty()) load_csv(Q, qfile); } From 6ab88decdab545efffeb41a3fcc0e72ca54f3fde Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 10:38:27 +0100 Subject: [PATCH 49/67] update refillhaps --- src/fastphase.cpp | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index d57d9f9..560f3ea 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -57,35 +57,45 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ void FastPhaseK2::refillHaps(int strategy) { - int s{0}; + int s{0}, ic{0}, g{0}, i{0}; int nchunks = pos_chunk.size() - 1; - for(int c = 0; c < C; c++) + // bin hapsum per 100 snps ? + for(ic = 0; ic < nchunks; ic++) { - // bin hapsum per 100 snps ? - for(int ic = 0, g = 0; ic < nchunks; ic++) + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + const auto se = find_grid_start_end(collapse.segment(pos_chunk[ic], S)); + for(g = 0; g < (int)se.size(); g++) { - if(HapSum(c, g) >= minHapfreq) continue; - MyArr1D h = HapSum.col(g); - h(c) = 0; // do not re-sample current - h /= h.sum(); - MyFloat1D p(h.data(), h.data() + h.size()); - std::discrete_distribution distribution{p.begin(), p.end()}; - int choice = distribution(rng); - assert(choice != c); - if(strategy == 1) + for(int c = 0; c < C; c++) { - P(g, c) = alleleEmitThreshold; - } - else if(strategy == 2) - { - h.maxCoeff(&choice); // if no binning, this may be better - P(g, c) = P(g, choice); - } - else - { - P(g, c) = P(g, choice); + + if(HapSum(c, g) >= minHapfreq) continue; + MyArr1D h = HapSum.col(g); + h(c) = 0; // do not re-sample current + h /= h.sum(); + MyFloat1D p(h.data(), h.data() + h.size()); + std::discrete_distribution distribution{p.begin(), p.end()}; + int choice = distribution(rng); + assert(choice != c); + // now go through all sites in the grid + for(i = se[g][0]; i <= se[g][1]; i++) + { + if(strategy == 1) + { + P(i + pos_chunk[ic], c) = alleleEmitThreshold; + } + else if(strategy == 2) + { + h.maxCoeff(&choice); // if no binning, this may be better + P(i + pos_chunk[ic], c) = P(g, choice); + } + else + { + P(i + pos_chunk[ic], c) = P(g, choice); + } + s++; + } } - s++; } } cao.warn("refill ", 100 * s / (C * M), "% infrequently used haps"); From 8c2df2624f3a2b91b78684bbd3f22cde9270f01d Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 11:22:32 +0100 Subject: [PATCH 50/67] rescale emit if B > 1! --- src/common.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index dcf9fc5..613c84f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -334,23 +334,23 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double int k1, k2, g1, g2; const int M = P.rows(); const int C = P.cols(); - MyArr2D emitDip(M, C * C); // emission probabilies, nsnps x (C x C) + MyArr2D emit(M, C * C); // emission probabilies, nsnps x (C x C) for(k1 = 0; k1 < C; k1++) for(k2 = 0; k2 < C; k2++) { - emitDip.col(k1 * C + k2).setZero(); + emit.col(k1 * C + k2).setZero(); for(g1 = 0; g1 <= 1; g1++) { for(g2 = 0; g2 <= 1; g2++) { - emitDip.col(k1 * C + k2) += gli.col(g1 + g2) * (g1 * P.col(k1) + (1 - g1) * (1 - P.col(k1))) - * (g2 * P.col(k2) + (1 - g2) * (1 - P.col(k2))); + emit.col(k1 * C + k2) += gli.col(g1 + g2) * (g1 * P.col(k1) + (1 - g1) * (1 - P.col(k1))) + * (g2 * P.col(k2) + (1 - g2) * (1 - P.col(k2))); } } } - // emitDip = emitDip.colwise() / emitDip.rowwise().maxCoeff(); // normalize - // emitDip = (emitDip < minEmission).select(minEmission, emitDip); - return emitDip.transpose(); + emit = emit.colwise() / emit.rowwise().maxCoeff(); // normalize + emit = (emit < minEmission).select(minEmission, emit); + return emit.transpose(); } /* @@ -392,8 +392,8 @@ inline MyArr2D get_emission_by_grid(const MyArr2D & gli, } } // apply bounding - // emitGrid.col(g) /= emitGrid.col(g).maxCoeff(); - // emitGrid.col(g) = (emitGrid.col(g) < minEmission).select(minEmission, emitGrid.col(g)); + emit.col(g) /= emit.col(g).maxCoeff(); + emit.col(g) = (emit.col(g) < minEmission).select(minEmission, emit.col(g)); } return emit; } From d16b23ff95a02b2fb1f88f5b6faa4867fa0748ff Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 12:54:31 +0100 Subject: [PATCH 51/67] fix post(z,g) for B>1 --- src/common.hpp | 28 ++++++++++++++++++++++++++-- src/fastphase.cpp | 12 +++++++----- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 613c84f..76cb3f7 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -348,11 +348,35 @@ inline MyArr2D get_emission_by_gl(const MyArr2D & gli, const MyArr2D & P, double } } } - emit = emit.colwise() / emit.rowwise().maxCoeff(); // normalize + // emit = emit.colwise() / emit.rowwise().maxCoeff(); // normalize emit = (emit < minEmission).select(minEmission, emit); return emit.transpose(); } +inline MyArr1D get_emission_by_site(const MyArr1D & gli, const MyArr1D & P, double minEmission = 1e-10) +{ + const int C = P.size(); + MyArr1D emit = MyArr1D::Zero(C * C); + int z1, z2, z12, g1, g2; + for(z1 = 0; z1 < C; z1++) + { + for(z2 = 0; z2 < C; z2++) + { + z12 = z1 * C + z2; + for(g1 = 0; g1 <= 1; g1++) + { + for(g2 = 0; g2 <= 1; g2++) + { + emit(z12) += + gli(g1 + g2) * (g1 * P(z1) + (1 - g1) * (1 - P(z1))) * (g2 * P(z2) + (1 - g2) * (1 - P(z2))); + } + } + } + } + emit = (emit < minEmission).select(minEmission, emit); + return emit; +} + /* ** @param gli genotype likelihoods of current individual i, (M, 3) ** @param P cluster-specific allele frequence (M, C) @@ -392,7 +416,7 @@ inline MyArr2D get_emission_by_grid(const MyArr2D & gli, } } // apply bounding - emit.col(g) /= emit.col(g).maxCoeff(); + emit.col(g) /= emit.col(g).sum(); emit.col(g) = (emit.col(g) < minEmission).select(minEmission, emit.col(g)); } return emit; diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 560f3ea..2e468ad 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -183,27 +183,29 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i start = grid_chunk[ic]; nsize = nGrids; } - MyArr2D emit = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); + MyArr2D emit_grid = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); const auto [alpha, beta, cs] = - forward_backwards_diploid(emit, R.middleCols(start, nsize), PI.middleCols(start, nsize)); + forward_backwards_diploid(emit_grid, R.middleCols(start, nsize), PI.middleCols(start, nsize)); if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) cao.error((alpha * beta).colwise().sum(), "\ngamma sum is not 1.0!\n"); // now get posterios MyArr2D ind_post_zg1(C, S), ind_post_zg2(C, S), ind_post_zj(C, nGrids), gammaC(C, nGrids); - MyArr1D gamma_div_emit(CC), beta_mult_emit(CC); + MyArr1D gamma_div_emit(CC), beta_mult_emit(CC), igamma(CC); MyArr1D alphatmp(C); int z1, m, s, g{0}, gg{0}; const auto se = find_grid_start_end(collapse.segment(pos_chunk[ic], S)); for(g = 0; g < nGrids; g++) { gg = g + grid_chunk[ic]; - gamma_div_emit = (alpha.col(g) * beta.col(g)) / emit.col(g); // C2 gammaC.col(g) = (alpha.col(g) * beta.col(g)).reshaped(C, C).colwise().sum(); + igamma = alpha.col(g) * beta.col(g); + gamma_div_emit = igamma / emit_grid.col(g); // C2 for(z1 = 0; z1 < C; z1++) { for(s = se[g][0]; s <= se[g][1]; s++) { m = s + pos_chunk[ic]; + if(B > 1) gamma_div_emit = igamma / get_emission_by_site(gli.row(s), P.row(m)); ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - P(m, z1)) * (gli(s, 0) * (1 - P.row(m)) + gli(s, 1) * P.row(m)).transpose()) .sum(); @@ -217,7 +219,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i } if(g == 0) continue; alphatmp += PI.col(gg) * R(2, gg) * 1.0; // inner alpha.col(s-1).sum == 1 - beta_mult_emit = emit.col(g) * beta.col(g); // C2 + beta_mult_emit = emit_grid.col(g) * beta.col(g); // C2 for(z1 = 0; z1 < C; z1++) ind_post_zj(z1, g) = cs(g) * (PI(z1, gg) * alphatmp * beta_mult_emit(Eigen::seqN(z1, C, C))).sum(); } From b15001cea9bd75607133f80fadbbc20bbbf554e1 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 13:35:49 +0100 Subject: [PATCH 52/67] speedup but more ram --- src/fastphase.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 2e468ad..0cb6397 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -184,6 +184,7 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i nsize = nGrids; } MyArr2D emit_grid = get_emission_by_grid(gli, P.middleRows(pos_chunk[ic], S), collapse.segment(pos_chunk[ic], S)); + MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk[ic], S)); const auto [alpha, beta, cs] = forward_backwards_diploid(emit_grid, R.middleCols(start, nsize), PI.middleCols(start, nsize)); if(!((1 - ((alpha * beta).colwise().sum())).abs() < 1e-9).all()) @@ -199,13 +200,14 @@ double FastPhaseK2::hmmIterWithJumps(const MyFloat1D & GL, const int ic, const i gg = g + grid_chunk[ic]; gammaC.col(g) = (alpha.col(g) * beta.col(g)).reshaped(C, C).colwise().sum(); igamma = alpha.col(g) * beta.col(g); - gamma_div_emit = igamma / emit_grid.col(g); // C2 + if(B == 1) gamma_div_emit = igamma / emit_grid.col(g); for(z1 = 0; z1 < C; z1++) { for(s = se[g][0]; s <= se[g][1]; s++) { m = s + pos_chunk[ic]; - if(B > 1) gamma_div_emit = igamma / get_emission_by_site(gli.row(s), P.row(m)); + // if(B > 1) gamma_div_emit = igamma / get_emission_by_site(gli.row(s), P.row(m)); + if(B > 1) gamma_div_emit = igamma / emit.col(s); ind_post_zg1(z1, s) = (gamma_div_emit(Eigen::seqN(z1, C, C)) * (1 - P(m, z1)) * (gli(s, 0) * (1 - P.row(m)) + gli(s, 1) * P.row(m)).transpose()) .sum(); From 6dd93419adf0b3c2902417f6b8b6f294f5266160 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Wed, 7 Feb 2024 15:05:10 +0100 Subject: [PATCH 53/67] update r package --- NAMESPACE | 6 +++- R/plot_haplotypes.R | 63 ++++++++++++++++++++++------------------- src/parse-phaseless.cpp | 22 ++++++++------ 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4dfd139..3a05a58 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,13 @@ # Generated by roxygen2: do not edit by hand +S3method(plot,admixQ) +S3method(plot,gamma) +S3method(plot,hapfreq) +export(admix.alignKStephens) +export(admix.plotQ) export(parse_impute_opt) export(parse_impute_par) export(parse_joint_par) export(parse_joint_post) -export(plot_gamma) importFrom(Rcpp,sourceCpp) useDynLib(phaseless, .registration = TRUE) diff --git a/R/plot_haplotypes.R b/R/plot_haplotypes.R index a7b2454..f762845 100644 --- a/R/plot_haplotypes.R +++ b/R/plot_haplotypes.R @@ -1,8 +1,9 @@ #' @export -plotGamma <- function(gammaC, sites = NULL, ...) { - N <- length(gammaC) - C <- nrow(gammaC[[1]]) - M <- ncol(gammaC[[1]]) +plot.gamma <- function(gamma, sites = NULL, ...) { + stopifnot(is.list(gamma)) + N <- length(gamma) + C <- nrow(gamma[[1]]) + M <- ncol(gamma[[1]]) if(!is.null(sites) & is.vector(sites) & length(sites) < M) { M <- length(sites) } else { @@ -16,36 +17,40 @@ plotGamma <- function(gammaC, sites = NULL, ...) { ytop <- i + array(0, M) ybottom <- i + array(0, M) for(c in 1:C) { - ytop <- ytop + gammaC[[i]][c, sites] + ytop <- ytop + gamma[[i]][c, sites] rect(xleft = xleft - d, xright = xright + d, ybottom = ybottom, ytop = ytop, col = c, lwd = 0, border = NA) ybottom <- ytop } } } + #' @export -plotHapFreqWithPhysicalPos <- function(K, - pos, - hapfreq, - ...) { - ## - colStore <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") - nCols <- length(colStore) - nGrids <- length(pos) - sum <- array(0, nGrids) - xlim <- range(pos) - ylim <- c(0, 1) - ## OK so if there are grids, use the grid points - plot(x = 0, y = 0, xlim = xlim, ylim = ylim, axes = FALSE, ...) - x <- c(pos[1], pos, pos[length(pos):1]) - m <- array(0, c(nGrids, K + 1)) - for(i in 1:K) { - m[, i + 1] <- m[, i] + hapfreq[i, ] - } - for(i in K:1) { - polygon( - x = x, y = c(m[1, i], m[, i + 1], m[nGrids:1, i]), - xlim = xlim, ylim = ylim, col = colStore[(i %% nCols) + 1] - ) - } +plot.hapfreq <- function(hapfreq, + pos, + recomb = NULL, + colors = c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"), + ...) { + stopifnot(is.matrix(hapfreq), is.vector(pos)) + nCols <- length(colors) + nGrids <- length(pos) + K <- nrow(hapfreq) + sum <- array(0, nGrids) + xlim <- range(pos) + ylim <- c(0, 1) + ## OK so if there are grids, use the grid points + plot(x = 0, y = 0, xlim = xlim, ylim = ylim, axes = FALSE, ...) + x <- c(pos[1], pos, pos[length(pos):1]) + m <- array(0, c(nGrids, K + 1)) + for(i in 1:K) { + m[, i + 1] <- m[, i] + hapfreq[i, ] + } + for(i in K:1) { + polygon( + x = x, y = c(m[1, i], m[, i + 1], m[nGrids:1, i]), + xlim = xlim, ylim = ylim, col = colors[(i %% nCols) + 1] + ) + } + if(!is.null(recomb)) + lines(pos[-1], recomb, type = "l", col = "red") } diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index 703f673..5f9f165 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -59,7 +59,7 @@ List parse_joint_post(std::string filename, int chunk = 0) for(int ind = 0; ind < par->N; ind++) { Eigen::Map gli(par->gls[ic].data() + ind * S * 3, S, 3); - MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk - S, S)).transpose(); // CC x S + MyArr2D emit = get_emission_by_gl(gli, P.middleRows(pos_chunk - S, S)); // first get H ie old PI in fastphase MyArr2D H = MyArr2D::Zero(C, S); int z1, y1, s; // m * C + z1 @@ -166,22 +166,28 @@ List parse_impute_par(std::string filename, int ic = -1) for(int ind = 0; ind < genome->nsamples; ind++) ids.push_back(ind); int nchunks = ic < 0 ? genome->nchunks : 1; int N = ids.size(); - List ret(N); const int C = genome->C; + Bool1D collapse = Bool1D::Constant(genome->nsnps, false); + int j{0}; + for(auto c : genome->collapse) collapse(j++) = (c == 1); + List ret(N); for(auto ind : ids) { List gamma(nchunks); - for(int c = 0; c < nchunks; c++) { + for(int c = 0, ss = 0; c < nchunks; c++) { ic = nchunks > 1 ? c : std::max(ic, c); const int S = genome->pos[ic].size(); - const int nGrids = genome->B > 1 ? (S + genome->B - 1) / genome->B : S; + const auto se = find_grid_start_end(collapse.segment(ss, S)); + const int G = se.size(); Eigen::Map gli(genome->gls[ic].data() + ind * S * 3, S, 3); - Eigen::Map P(genome->F[ic].data(), S, C); - Eigen::Map PI(genome->PI[ic].data(), C, S); - Eigen::Map R(genome->R[ic].data(), 3, S); - MyArr2D emit = get_emission_by_gl(gli, P).transpose(); // CC x S + Eigen::Map P(genome->P[ic].data(), S, C); + Eigen::Map PI(genome->PI[ic].data(), C, G); + Eigen::Map R(genome->R[ic].data(), 3, G); + // Eigen::Map AE(genome->AE[ic].data(), C, G); + MyArr2D emit = get_emission_by_grid(gli, P, collapse.segment(ss, S)); const auto [alpha, beta, cs] = forward_backwards_diploid(emit, R, PI); gamma[c] = alpha * beta; + ss += S; } ret[ind] = gamma; } From 834fbfebe6138fc5dad2035f8def7b2e99892fe6 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 8 Feb 2024 10:26:23 +0100 Subject: [PATCH 54/67] update version --- src/admixture.cpp | 18 ++++++++++-------- src/main.cpp | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index be4d2c7..78db76f 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -162,24 +162,26 @@ void Admixture::protectPars() } if(F.isNaN().any()) cao.error("NaN in F\n"); - if(!cF) + if(cF) + { + constrainF(); + } + else { F = (F < clusterFreqThreshold).select(clusterFreqThreshold, F); // lower bound F = (F > 1 - clusterFreqThreshold).select(1 - clusterFreqThreshold, F); // upper bound + for(int k = 0; k < K; k++) F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } - constrainF(); } +// need to fix the cluster ordering void Admixture::constrainF() { for(int k = 0; k < K; k++) { - if(cF) - { - for(int c = 0; c < C; c++) - for(int g = 0; g < G; g++) - if(F(k * C + c, g) < P(c, g)) F(k * C + c, g) = P(c, g); - } + for(int c = 0; c < C; c++) + for(int g = 0; g < G; g++) + if(F(k * C + c, g) < P(c, g)) F(k * C + c, g) = P(c, g); F.middleRows(k * C, C).rowwise() /= F.middleRows(k * C, C).colwise().sum(); } } diff --git a/src/main.cpp b/src/main.cpp index 6ba456d..3b0f6a1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.5.0"}; + const std::string VERSION{"0.5.1"}; // below for catching ctrl+c, and dumping files struct sigaction sa; From 9c752b3ee8c56418a640e7c8ff624e36cb71ad83 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 9 Feb 2024 10:27:52 +0100 Subject: [PATCH 55/67] update private var --- src/admixture.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 78db76f..d8e2ec9 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -15,8 +15,8 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); double norm = 0, llike = 0, tmp = 0; - int c1, k1, s, c2, c12, ss, ic, g; - for(ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) + int c1, k1, s, c2, c12; + for(int ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); const int nGrids = grids[ic]; @@ -69,9 +69,9 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g MyArr2D Ekg, iEkc; double norm = 0, llike = 0; int c1, c2, c12, cc; - int k1, k2, k12, s, ss, ic, g; + int k1, k2, k12, s; MyArr1D iQ = MyArr1D::Zero(K); - for(ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) + for(int ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); const int nGrids = grids[ic]; From f8be6a4e84cc60fa8d78322ebe6d65a190dc7945 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 9 Feb 2024 10:54:59 +0100 Subject: [PATCH 56/67] new option --- src/admixture.cpp | 60 ++++++++++++++++++++++++++--------------------- src/common.hpp | 4 ++-- src/main.cpp | 4 ++++ 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index d8e2ec9..80d1de7 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -245,7 +245,7 @@ int run_admix_main(Options & opts) if(!opts.noaccel) { MyArr2D F0, Q0, F1, Q1, F2, Q2, Ft, Qt; - const int istep{4}; + const double istep{4}; double alpha{std::numeric_limits::lowest()}, qdiff, ldiff, stepMax{4}, alphaMax{1280}; double prevlike{std::numeric_limits::lowest()}, logcheck{0}, loglike{0}; for(int it = 0; SIG_COND && (it < opts.nadmix / 4); it++) @@ -282,9 +282,12 @@ int run_admix_main(Options & opts) opts.ltol); break; } - // save for later comparison - Ft = admixer.F; - Qt = admixer.Q; + if(!opts.force) + { + // save for later comparison + Ft = admixer.F; + Qt = admixer.Q; + } // accel iteration with steplen alpha = ((F1 - F0).square().sum() + (Q1 - Q0).square().sum()) / ((admixer.F - 2 * F1 + F0).square().sum() + (admixer.Q - 2 * Q1 + Q0).square().sum()); @@ -307,29 +310,32 @@ int run_admix_main(Options & opts) for(auto && ll : llike) loglike += ll.get(); llike.clear(); // clear future and renew admixer.updateIteration(); - // save current pars - F2 = admixer.F; - Q2 = admixer.Q; - // check if normal third iter is better - admixer.Q = Qt; - admixer.F = Ft; - admixer.initIteration(); - for(int i = 0; i < genome->nsamples; i++) - llike.emplace_back(poolit.enqueue(&Admixture::runOptimalWithBigAss, &admixer, i, std::ref(genome))); - logcheck = 0; - for(auto && ll : llike) logcheck += ll.get(); - llike.clear(); // clear future and renew - admixer.updateIteration(); - if(logcheck - loglike > 0.1) - { - stepMax = istep; - cao.warn(tim.date(), "reset stepMax to 4, normal EM yields better likelihoods than the accelerated EM.", - logcheck, " -", loglike, " > 0.1"); - } - else - { - admixer.Q = Q2; - admixer.F = F2; + if(!opts.force) + { // save current pars + F2 = admixer.F; + Q2 = admixer.Q; + // check if normal third iter is better + admixer.Q = Qt; + admixer.F = Ft; + admixer.initIteration(); + for(int i = 0; i < genome->nsamples; i++) + llike.emplace_back(poolit.enqueue(&Admixture::runOptimalWithBigAss, &admixer, i, std::ref(genome))); + logcheck = 0; + for(auto && ll : llike) logcheck += ll.get(); + llike.clear(); // clear future and renew + admixer.updateIteration(); + if(logcheck - loglike > 0.1) + { + stepMax = istep; + cao.warn(tim.date(), + "reset stepMax to 4, normal EM yields better likelihoods than the accelerated EM.", + logcheck, " -", loglike, " > 0.1"); + } + else + { + admixer.Q = Q2; + admixer.F = F2; + } } } } diff --git a/src/common.hpp b/src/common.hpp index 76cb3f7..c9749be 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -103,7 +103,7 @@ struct Options double ftol{1e-6}; // threshold for F double qtol{1e-6}; // threshold for Q bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; - bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}, cF{0}; + bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}, cF{0}, force{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; @@ -274,7 +274,7 @@ inline Int2D split_pos_into_grid(const Int1D & pos, const Bool1D & collapse) inline Int1D calc_grid_distance(const Int1D & pos, const Bool1D & collapse) { - assert(pos.size() == collapse.size()); + assert((int)pos.size() == (int)collapse.size()); // B = 1 if((collapse == true).count() == 0) return calc_position_distance(pos); // B > 1, split pos into grids diff --git a/src/main.cpp b/src/main.cpp index 3b0f6a1..28dc050 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -211,6 +211,9 @@ int main(int argc, char * argv[]) .help("seed for reproducibility") .default_value(999) .scan<'i', int>(); + cmd_admix.add_argument("-f", "--force-accept") + .help("always accept the acceleration solution") + .flag(); cmd_admix.add_argument("-F", "--constrain-F") .help("apply constraint on F so that it is not smaller than cluster frequency in fastphase model") .flag(); @@ -312,6 +315,7 @@ int main(int argc, char * argv[]) opts.nthreads = cmd_admix.get("--threads"); opts.nadmix = cmd_admix.get("--iterations"); opts.cF = cmd_admix.get("--constrain-F"); + opts.force = cmd_admix.get("--force-accept"); if(opts.in_bin.empty() || cmd_admix.get("--help")) throw std::runtime_error(cmd_admix.help().str()); run_admix_main(opts); } From b520950ca27015f5455bd060413e1b600f85915d Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Fri, 9 Feb 2024 11:08:33 +0100 Subject: [PATCH 57/67] update log --- src/admixture.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 80d1de7..2d2a1b1 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -248,7 +248,7 @@ int run_admix_main(Options & opts) const double istep{4}; double alpha{std::numeric_limits::lowest()}, qdiff, ldiff, stepMax{4}, alphaMax{1280}; double prevlike{std::numeric_limits::lowest()}, logcheck{0}, loglike{0}; - for(int it = 0; SIG_COND && (it < opts.nadmix / 4); it++) + for(int it = 0; SIG_COND && (it < opts.nadmix / 3); it++) { // first accel iteration admixer.initIteration(); @@ -273,7 +273,7 @@ int run_admix_main(Options & opts) admixer.updateIteration(); ldiff = it ? loglike - prevlike : NAN; prevlike = loglike; - cao.print(tim.date(), "SqS3 iteration", it * 4 + 1, ", diff(Q) =", std::scientific, qdiff, + cao.print(tim.date(), "SqS3 iteration", it * 3 + 1, ", diff(Q) =", std::scientific, qdiff, ", alpha=", alpha, ", likelihoods =", std::fixed, loglike, ", diff(likelihoods)=", ldiff, ", elapsed", tim.reltime(), " sec"); if(ldiff < opts.ltol) From f08e02ed6439254d2a9aab7e3ab3afbfab22d7b9 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 12 Feb 2024 11:14:57 +0100 Subject: [PATCH 58/67] add --min-P --- src/admixture.cpp | 10 +++++++--- src/admixture.hpp | 3 ++- src/main.cpp | 5 +++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 2d2a1b1..c6454e8 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -38,7 +38,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & { c12 = c1 * C + c2; double xz = cl(c12, s); - // if(cf(c1, s) < tol || cf(c2, s) < tol) xz = 0.0; + if(AE(c1, s) < magicTol || AE(c2, s) < magicTol) xz = 0.0; double zy = Hz(c1) * Hz(c2); tmp += xz * zy; } @@ -92,6 +92,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); + if(AE(c1, s) < magicTol || AE(c2, s) < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) @@ -208,8 +209,11 @@ void Admixture::setStartPoint(const std::unique_ptr & genome, std::strin if(!qfile.empty()) load_csv(Q, qfile); } -void Admixture::setFlags(bool debug_, bool nonewQ_, bool cF_) +void Admixture::setFlags(double cftol, double Ftol, double Qtol, bool debug_, bool nonewQ_, bool cF_) { + magicTol = cftol; + clusterFreqThreshold = Ftol; + admixtureThreshold = Qtol; debug = debug_; nonewQ = nonewQ_; cF = cF_; @@ -239,7 +243,7 @@ int run_admix_main(Options & opts) Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); - admixer.setFlags(opts.debug, opts.nQ, opts.cF); + admixer.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.cF); admixer.setStartPoint(genome, opts.in_qfile); vector> llike; if(!opts.noaccel) diff --git a/src/admixture.hpp b/src/admixture.hpp index 62ee4d3..14202c0 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -18,6 +18,7 @@ class Admixture // BOUNDING double clusterFreqThreshold{1e-6}; // threshold for F double admixtureThreshold{1e-6}; // threshold for Q + double magicTol{1e-2}; // threshold for cluster frequency estimated from fastphase bool debug = false; bool nonewQ = false; bool cF = false; @@ -49,7 +50,7 @@ class Admixture void updateIteration(); void protectPars(); void constrainF(); - void setFlags(bool, bool, bool); + void setFlags(double, double, double, bool, bool, bool); void setStartPoint(const std::unique_ptr & genome, std::string qfile); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); double runOptimalWithBigAss(int ind, const std::unique_ptr & genome); diff --git a/src/main.cpp b/src/main.cpp index 28dc050..d9d6769 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -217,6 +217,10 @@ int main(int argc, char * argv[]) cmd_admix.add_argument("-F", "--constrain-F") .help("apply constraint on F so that it is not smaller than cluster frequency in fastphase model") .flag(); + cmd_admix.add_argument("-P", "--min-P") + .help("set cluster likelihood to zeros if P (in fastphase) < min-P") + .default_value(0.0) + .scan<'g', double>(); argparse::ArgumentParser cmd_convert("convert", VERSION, default_arguments::help); cmd_convert.add_description("different file format converter"); @@ -308,6 +312,7 @@ int main(int argc, char * argv[]) } else if(program.is_subcommand_used(cmd_admix)) { + opts.ptol = cmd_admix.get("--min-P"); opts.in_bin.assign(cmd_admix.get("--bin")); opts.out.assign(cmd_admix.get("--out")); opts.seed = cmd_admix.get("--seed"); From 9ff9718a3a0e12a1df8aa134ea4a69e272e20b94 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 12 Feb 2024 11:24:21 +0100 Subject: [PATCH 59/67] update defaults --- src/admixture.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/admixture.hpp b/src/admixture.hpp index 14202c0..2032c8b 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -16,9 +16,9 @@ class Admixture // randon engine std::default_random_engine rng = std::default_random_engine{}; // BOUNDING - double clusterFreqThreshold{1e-6}; // threshold for F - double admixtureThreshold{1e-6}; // threshold for Q - double magicTol{1e-2}; // threshold for cluster frequency estimated from fastphase + double clusterFreqThreshold{1e-9}; // threshold for F + double admixtureThreshold{1e-9}; // threshold for Q + double magicTol{0.0}; // threshold for cluster frequency estimated from fastphase bool debug = false; bool nonewQ = false; bool cF = false; From c6b988565a35a1e264542fc692e1c113346850d7 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 12 Feb 2024 11:55:11 +0100 Subject: [PATCH 60/67] AE < P --- src/admixture.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index c6454e8..3232845 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -38,7 +38,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & { c12 = c1 * C + c2; double xz = cl(c12, s); - if(AE(c1, s) < magicTol || AE(c2, s) < magicTol) xz = 0.0; + if(AE(c12, s) < magicTol) xz = 0.0; double zy = Hz(c1) * Hz(c2); tmp += xz * zy; } @@ -92,7 +92,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); - if(AE(c1, s) < magicTol || AE(c2, s) < magicTol) xz = 0.0; + if(AE(c12, s) < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) From 5e5e65f080da2ca6be1d1bc50003ce6b9f0dfad0 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 13 Feb 2024 17:45:55 +0100 Subject: [PATCH 61/67] fix refillHaps --- src/fastphase.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 0cb6397..7877085 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -57,20 +57,19 @@ void FastPhaseK2::setFlags(double tol_p, double tol_f, double tol_q, bool debug_ void FastPhaseK2::refillHaps(int strategy) { - int s{0}, ic{0}, g{0}, i{0}; + int s{0}, c{0}, ic{0}, g{0}, i{0}, sg{0}; int nchunks = pos_chunk.size() - 1; // bin hapsum per 100 snps ? - for(ic = 0; ic < nchunks; ic++) + for(c = 0; c < C; c++) { - const int S = pos_chunk[ic + 1] - pos_chunk[ic]; - const auto se = find_grid_start_end(collapse.segment(pos_chunk[ic], S)); - for(g = 0; g < (int)se.size(); g++) + for(ic = 0, sg = 0; ic < nchunks; ic++) { - for(int c = 0; c < C; c++) + const int S = pos_chunk[ic + 1] - pos_chunk[ic]; + const auto se = find_grid_start_end(collapse.segment(pos_chunk[ic], S)); + for(g = 0; g < (int)se.size(); g++, sg++) { - - if(HapSum(c, g) >= minHapfreq) continue; - MyArr1D h = HapSum.col(g); + if(HapSum(c, sg) >= minHapfreq) continue; + MyArr1D h = HapSum.col(sg); h(c) = 0; // do not re-sample current h /= h.sum(); MyFloat1D p(h.data(), h.data() + h.size()); @@ -87,11 +86,11 @@ void FastPhaseK2::refillHaps(int strategy) else if(strategy == 2) { h.maxCoeff(&choice); // if no binning, this may be better - P(i + pos_chunk[ic], c) = P(g, choice); + P(i + pos_chunk[ic], c) = P(i + pos_chunk[ic], choice); } else { - P(i + pos_chunk[ic], c) = P(g, choice); + P(i + pos_chunk[ic], c) = P(i + pos_chunk[ic], choice); } s++; } From 9395b6fd67c81d53547d21b3a2a8644a6b0a5539 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Thu, 15 Feb 2024 11:59:45 +0100 Subject: [PATCH 62/67] output F in joint --- src/phaseless.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/phaseless.cpp b/src/phaseless.cpp index 6bd33f2..cb43d04 100644 --- a/src/phaseless.cpp +++ b/src/phaseless.cpp @@ -277,8 +277,6 @@ int run_phaseless_main(Options & opts) std::unique_ptr genome = std::make_unique(); init_bigass(genome, opts); vector> res; - std::ofstream oanc(opts.out + ".Q"); - std::ofstream op(opts.out + ".P"); Phaseless faith(opts.K, opts.C, genome->nsamples, genome->nsnps, opts.seed); faith.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.nP, opts.nF, opts.nR); faith.setStartPoint(opts.in_qfile, opts.in_pfile); @@ -409,9 +407,16 @@ int run_phaseless_main(Options & opts) } } } + std::ofstream oanc(opts.out + ".Q"); oanc << std::fixed << faith.Q.transpose().format(fmt10) << "\n"; oanc.close(); + std::ofstream op(opts.out + ".P"); op << faith.P.format(fmt6) << "\n"; + if(opts.oF) + { + std::ofstream of(opts.out + ".F"); + for(size_t k = 0; k < faith.F.size(); k++) of << faith.F[k].format(fmt6) << "\n"; + } std::unique_ptr par = std::make_unique(); par->init(faith.K, faith.C, faith.M, faith.N, faith.er, faith.P, faith.Q, faith.F); par->pos = genome->pos; From 36797e99f72ce84d8c9bde50beefeb22338a2994 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 18 Feb 2024 14:31:26 +0100 Subject: [PATCH 63/67] new idea -> pi 0.5.2 --- src/admixture.cpp | 50 +++++++++++++++++++++++++++++------------------ src/admixture.hpp | 4 ++-- src/common.hpp | 2 +- src/main.cpp | 12 ++++++++---- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index 3232845..a35eb36 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -15,8 +15,8 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); double norm = 0, llike = 0, tmp = 0; - int c1, k1, s, c2, c12; - for(int ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) + int c1, k1, s, c2, c12, ic, g, ss, ng; + for(ic = 0, g = 0, ss = 0, ng = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); const int nGrids = grids[ic]; @@ -31,21 +31,28 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, g++) { - for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); - for(norm = 0, c1 = 0; c1 < C; c1++) + if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) + { + kapa.col(s).fill(1.0); + } + else { - for(tmp = 0, c2 = 0; c2 < C; c2++) + ng++; + for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); + for(norm = 0, c1 = 0; c1 < C; c1++) { - c12 = c1 * C + c2; - double xz = cl(c12, s); - if(AE(c12, s) < magicTol) xz = 0.0; - double zy = Hz(c1) * Hz(c2); - tmp += xz * zy; + for(tmp = 0, c2 = 0; c2 < C; c2++) + { + c12 = c1 * C + c2; + double xz = cl(c12, s); + double zy = Hz(c1) * Hz(c2); + tmp += xz * zy; + } + norm += tmp; + kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); } - norm += tmp; - kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); + llike += log(norm); } - llike += log(norm); kapa.col(s) /= kapa.col(s).sum(); for(k1 = 0; k1 < K; k1++) Ekg(k1, s) = 2 * kapa.middleRows(k1 * C, C).col(s).sum(); } @@ -57,7 +64,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & } } // update Q, iQ.sum() should be 2M - if(!nonewQ) Q.col(ind) = iQ / (2 * G); + if(!nonewQ) Q.col(ind) = iQ / (2 * ng); return llike; } @@ -92,7 +99,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); - if(AE(c12, s) < magicTol) xz = 0.0; + if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) @@ -156,7 +163,7 @@ void Admixture::protectPars() { if(!nonewQ) { - if(Q.isNaN().any()) cao.error("NaN in Q\n"); + if(Q.isNaN().any()) cao.error("NaN in Q\n", Q); Q = (Q < admixtureThreshold).select(admixtureThreshold, Q); // lower bound Q = (Q > 1 - admixtureThreshold).select(1 - admixtureThreshold, Q); // upper bound Q.rowwise() /= Q.colwise().sum(); // normalize Q per individual @@ -187,8 +194,13 @@ void Admixture::constrainF() } } -void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile) +void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile) { + if(!pifile.empty()) + { + pi.setZero(C, G); + load_csv(pi, pifile); + } P = MyArr2D(C, G); collapse = Bool1D::Constant(genome->nsnps, false); int ic{0}, sg{0}, ss{0}; @@ -243,8 +255,8 @@ int run_admix_main(Options & opts) Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); - admixer.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.cF); - admixer.setStartPoint(genome, opts.in_qfile); + admixer.setFlags(opts.tol_pi, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.cF); + admixer.setStartPoint(genome, opts.in_qfile, opts.pi_file); vector> llike; if(!opts.noaccel) { diff --git a/src/admixture.hpp b/src/admixture.hpp index 2032c8b..7a57e40 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -39,7 +39,7 @@ class Admixture // SHARED VARIBALES const int N, G, C, K; // M: number of grids in total MyArr2D F; // (C x K) x M - MyArr2D P; // C x M, for each k, F <= P + MyArr2D P, pi; // C x M, for each k, F <= P MyArr2D Q; // K x N MyArr2D Ekc; // (C * K) x M, expected number of alleles per c per k MyArr2D NormF; // K x M @@ -51,7 +51,7 @@ class Admixture void protectPars(); void constrainF(); void setFlags(double, double, double, bool, bool, bool); - void setStartPoint(const std::unique_ptr & genome, std::string qfile); + void setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); double runOptimalWithBigAss(int ind, const std::unique_ptr & genome); }; diff --git a/src/common.hpp b/src/common.hpp index c9749be..86e5281 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -105,7 +105,7 @@ struct Options bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}, cF{0}, force{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; - std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; + std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}, pi_file{""}; std::string opts_in_effect{"Options in effect:\n "}; }; diff --git a/src/main.cpp b/src/main.cpp index d9d6769..f0cf3b2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -192,13 +192,16 @@ int main(int argc, char * argv[]) cmd_admix.add_argument("-b", "--bin") .help("binary format from impute command as input") .default_value(std::string{""}); + cmd_admix.add_argument("-p", "--pi-file") + .help("pi file from impute command") + .default_value(std::string{""}); cmd_admix.add_argument("-k", "--ancestry") .help("number of ancestry in admixture assumption") .default_value(2) .scan<'i', int>(); cmd_admix.add_argument("-i", "--iterations") .help("number of maximun EM iterations") - .default_value(2000) + .default_value(1000) .scan<'i', int>(); cmd_admix.add_argument("-n", "--threads") .help("number of threads") @@ -217,8 +220,8 @@ int main(int argc, char * argv[]) cmd_admix.add_argument("-F", "--constrain-F") .help("apply constraint on F so that it is not smaller than cluster frequency in fastphase model") .flag(); - cmd_admix.add_argument("-P", "--min-P") - .help("set cluster likelihood to zeros if P (in fastphase) < min-P") + cmd_admix.add_argument("-P", "--min-pi") + .help("set cluster likelihood to zeros if PI (in fastphase) < min-pi") .default_value(0.0) .scan<'g', double>(); @@ -312,8 +315,9 @@ int main(int argc, char * argv[]) } else if(program.is_subcommand_used(cmd_admix)) { - opts.ptol = cmd_admix.get("--min-P"); + opts.tol_pi = cmd_admix.get("--min-pi"); opts.in_bin.assign(cmd_admix.get("--bin")); + opts.pi_file.assign(cmd_admix.get("--pi-file")); opts.out.assign(cmd_admix.get("--out")); opts.seed = cmd_admix.get("--seed"); opts.K = cmd_admix.get("-k"); From c93b6e49edf744c111b9722a9ebf7a50350d9273 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 18 Feb 2024 14:40:49 +0100 Subject: [PATCH 64/67] trick on pi --- src/admixture.cpp | 36 ++++++++++++++++-------------------- src/main.cpp | 2 +- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index a35eb36..fd8bd7b 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -31,28 +31,23 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, g++) { - if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) + ng++; + for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); + for(norm = 0, c1 = 0; c1 < C; c1++) { - kapa.col(s).fill(1.0); - } - else - { - ng++; - for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); - for(norm = 0, c1 = 0; c1 < C; c1++) + for(tmp = 0, c2 = 0; c2 < C; c2++) { - for(tmp = 0, c2 = 0; c2 < C; c2++) - { - c12 = c1 * C + c2; - double xz = cl(c12, s); - double zy = Hz(c1) * Hz(c2); - tmp += xz * zy; - } - norm += tmp; - kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); + c12 = c1 * C + c2; + double xz = cl(c12, s); + double zy = Hz(c1) * Hz(c2); + if(magicTol > 0 && pi(c1, g) < magicTol) xz = 0.0; + if(magicTol > 0 && pi(c2, g) < magicTol) xz = 0.0; + tmp += xz * zy; } - llike += log(norm); + norm += tmp; + kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); } + llike += log(norm); kapa.col(s) /= kapa.col(s).sum(); for(k1 = 0; k1 < K; k1++) Ekg(k1, s) = 2 * kapa.middleRows(k1 * C, C).col(s).sum(); } @@ -99,7 +94,8 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); - if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) xz = 0.0; + if(magicTol > 0 && pi(c1, g) < magicTol) xz = 0.0; + if(magicTol > 0 && pi(c2, g) < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) @@ -196,7 +192,7 @@ void Admixture::constrainF() void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile) { - if(!pifile.empty()) + if(!pifile.empty() && magicTol > 0) { pi.setZero(C, G); load_csv(pi, pifile); diff --git a/src/main.cpp b/src/main.cpp index f0cf3b2..b50cf2d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.5.1"}; + const std::string VERSION{"0.5.2"}; // below for catching ctrl+c, and dumping files struct sigaction sa; From 9a4d8418d4d2a4512b443951215d5bae6329e77e Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 25 Feb 2024 10:50:14 +0100 Subject: [PATCH 65/67] Revert "trick on pi" This reverts commit c93b6e49edf744c111b9722a9ebf7a50350d9273. --- src/admixture.cpp | 36 ++++++++++++++++++++---------------- src/main.cpp | 2 +- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index fd8bd7b..a35eb36 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -31,23 +31,28 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, g++) { - ng++; - for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); - for(norm = 0, c1 = 0; c1 < C; c1++) + if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) { - for(tmp = 0, c2 = 0; c2 < C; c2++) + kapa.col(s).fill(1.0); + } + else + { + ng++; + for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); + for(norm = 0, c1 = 0; c1 < C; c1++) { - c12 = c1 * C + c2; - double xz = cl(c12, s); - double zy = Hz(c1) * Hz(c2); - if(magicTol > 0 && pi(c1, g) < magicTol) xz = 0.0; - if(magicTol > 0 && pi(c2, g) < magicTol) xz = 0.0; - tmp += xz * zy; + for(tmp = 0, c2 = 0; c2 < C; c2++) + { + c12 = c1 * C + c2; + double xz = cl(c12, s); + double zy = Hz(c1) * Hz(c2); + tmp += xz * zy; + } + norm += tmp; + kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); } - norm += tmp; - kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); + llike += log(norm); } - llike += log(norm); kapa.col(s) /= kapa.col(s).sum(); for(k1 = 0; k1 < K; k1++) Ekg(k1, s) = 2 * kapa.middleRows(k1 * C, C).col(s).sum(); } @@ -94,8 +99,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); - if(magicTol > 0 && pi(c1, g) < magicTol) xz = 0.0; - if(magicTol > 0 && pi(c2, g) < magicTol) xz = 0.0; + if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) @@ -192,7 +196,7 @@ void Admixture::constrainF() void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile) { - if(!pifile.empty() && magicTol > 0) + if(!pifile.empty()) { pi.setZero(C, G); load_csv(pi, pifile); diff --git a/src/main.cpp b/src/main.cpp index b50cf2d..f0cf3b2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ int main(int argc, char * argv[]) { // ========= helper message and parameters parsing =========================== - const std::string VERSION{"0.5.2"}; + const std::string VERSION{"0.5.1"}; // below for catching ctrl+c, and dumping files struct sigaction sa; From 846071fba361b5b380063b9d1453380b06a6fde0 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sun, 25 Feb 2024 10:51:38 +0100 Subject: [PATCH 66/67] Revert "new idea -> pi 0.5.2" This reverts commit 36797e99f72ce84d8c9bde50beefeb22338a2994. --- src/admixture.cpp | 50 ++++++++++++++++++----------------------------- src/admixture.hpp | 4 ++-- src/common.hpp | 2 +- src/main.cpp | 12 ++++-------- 4 files changed, 26 insertions(+), 42 deletions(-) diff --git a/src/admixture.cpp b/src/admixture.cpp index a35eb36..3232845 100644 --- a/src/admixture.cpp +++ b/src/admixture.cpp @@ -15,8 +15,8 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & MyArr1D iQ = MyArr1D::Zero(K); MyArr1D Hz(C); double norm = 0, llike = 0, tmp = 0; - int c1, k1, s, c2, c12, ic, g, ss, ng; - for(ic = 0, g = 0, ss = 0, ng = 0; ic < genome->nchunks; ic++) + int c1, k1, s, c2, c12; + for(int ic = 0, g = 0, ss = 0; ic < genome->nchunks; ic++) { const int S = genome->pos[ic].size(); const int nGrids = grids[ic]; @@ -31,28 +31,21 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & Ekg.setZero(K, nGrids); for(s = 0; s < nGrids; s++, g++) { - if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) - { - kapa.col(s).fill(1.0); - } - else + for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); + for(norm = 0, c1 = 0; c1 < C; c1++) { - ng++; - for(c1 = 0; c1 < C; c1++) Hz(c1) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)).sum(); - for(norm = 0, c1 = 0; c1 < C; c1++) + for(tmp = 0, c2 = 0; c2 < C; c2++) { - for(tmp = 0, c2 = 0; c2 < C; c2++) - { - c12 = c1 * C + c2; - double xz = cl(c12, s); - double zy = Hz(c1) * Hz(c2); - tmp += xz * zy; - } - norm += tmp; - kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); + c12 = c1 * C + c2; + double xz = cl(c12, s); + if(AE(c12, s) < magicTol) xz = 0.0; + double zy = Hz(c1) * Hz(c2); + tmp += xz * zy; } - llike += log(norm); + norm += tmp; + kapa(Eigen::seqN(c1, K, C), s) = (Q.col(ind) * F(Eigen::seqN(c1, K, C), g)) * tmp / Hz(c1); } + llike += log(norm); kapa.col(s) /= kapa.col(s).sum(); for(k1 = 0; k1 < K; k1++) Ekg(k1, s) = 2 * kapa.middleRows(k1 * C, C).col(s).sum(); } @@ -64,7 +57,7 @@ double Admixture::runOptimalWithBigAss(int ind, const std::unique_ptr & } } // update Q, iQ.sum() should be 2M - if(!nonewQ) Q.col(ind) = iQ / (2 * ng); + if(!nonewQ) Q.col(ind) = iQ / (2 * G); return llike; } @@ -99,7 +92,7 @@ double Admixture::runNativeWithBigAss(int ind, const std::unique_ptr & g { c12 = c1 * C + c2; double xz = cl(c12, s); - if(magicTol > 0 && pi.col(g).maxCoeff() < magicTol) xz = 0.0; + if(AE(c12, s) < magicTol) xz = 0.0; for(k1 = 0; k1 < K; k1++) { for(k2 = 0; k2 < K; k2++) @@ -163,7 +156,7 @@ void Admixture::protectPars() { if(!nonewQ) { - if(Q.isNaN().any()) cao.error("NaN in Q\n", Q); + if(Q.isNaN().any()) cao.error("NaN in Q\n"); Q = (Q < admixtureThreshold).select(admixtureThreshold, Q); // lower bound Q = (Q > 1 - admixtureThreshold).select(1 - admixtureThreshold, Q); // upper bound Q.rowwise() /= Q.colwise().sum(); // normalize Q per individual @@ -194,13 +187,8 @@ void Admixture::constrainF() } } -void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile) +void Admixture::setStartPoint(const std::unique_ptr & genome, std::string qfile) { - if(!pifile.empty()) - { - pi.setZero(C, G); - load_csv(pi, pifile); - } P = MyArr2D(C, G); collapse = Bool1D::Constant(genome->nsnps, false); int ic{0}, sg{0}, ss{0}; @@ -255,8 +243,8 @@ int run_admix_main(Options & opts) Admixture admixer(genome->nsamples, genome->G, genome->C, opts.K, opts.seed); cao.warn(tim.date(), "-> running admixture with seed =", opts.seed); - admixer.setFlags(opts.tol_pi, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.cF); - admixer.setStartPoint(genome, opts.in_qfile, opts.pi_file); + admixer.setFlags(opts.ptol, opts.ftol, opts.qtol, opts.debug, opts.nQ, opts.cF); + admixer.setStartPoint(genome, opts.in_qfile); vector> llike; if(!opts.noaccel) { diff --git a/src/admixture.hpp b/src/admixture.hpp index 7a57e40..2032c8b 100644 --- a/src/admixture.hpp +++ b/src/admixture.hpp @@ -39,7 +39,7 @@ class Admixture // SHARED VARIBALES const int N, G, C, K; // M: number of grids in total MyArr2D F; // (C x K) x M - MyArr2D P, pi; // C x M, for each k, F <= P + MyArr2D P; // C x M, for each k, F <= P MyArr2D Q; // K x N MyArr2D Ekc; // (C * K) x M, expected number of alleles per c per k MyArr2D NormF; // K x M @@ -51,7 +51,7 @@ class Admixture void protectPars(); void constrainF(); void setFlags(double, double, double, bool, bool, bool); - void setStartPoint(const std::unique_ptr & genome, std::string qfile, std::string pifile); + void setStartPoint(const std::unique_ptr & genome, std::string qfile); double runNativeWithBigAss(int ind, const std::unique_ptr & genome); double runOptimalWithBigAss(int ind, const std::unique_ptr & genome); }; diff --git a/src/common.hpp b/src/common.hpp index 86e5281..c9749be 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -105,7 +105,7 @@ struct Options bool noaccel{0}, noscreen{0}, single_chunk{0}, debug{0}, collapse{0}; bool nQ{0}, nP{0}, nF{0}, nR{0}, aQ{0}, oVCF{0}, eHap{0}, oF{0}, cF{0}, force{0}; std::string out, in_beagle, in_vcf, in_bin, in_impute, in_joint; - std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}, pi_file{""}; + std::string samples{""}, region{""}, in_plink{""}, in_qfile{""}, in_pfile{""}, in_rfile{""}; std::string opts_in_effect{"Options in effect:\n "}; }; diff --git a/src/main.cpp b/src/main.cpp index f0cf3b2..d9d6769 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -192,16 +192,13 @@ int main(int argc, char * argv[]) cmd_admix.add_argument("-b", "--bin") .help("binary format from impute command as input") .default_value(std::string{""}); - cmd_admix.add_argument("-p", "--pi-file") - .help("pi file from impute command") - .default_value(std::string{""}); cmd_admix.add_argument("-k", "--ancestry") .help("number of ancestry in admixture assumption") .default_value(2) .scan<'i', int>(); cmd_admix.add_argument("-i", "--iterations") .help("number of maximun EM iterations") - .default_value(1000) + .default_value(2000) .scan<'i', int>(); cmd_admix.add_argument("-n", "--threads") .help("number of threads") @@ -220,8 +217,8 @@ int main(int argc, char * argv[]) cmd_admix.add_argument("-F", "--constrain-F") .help("apply constraint on F so that it is not smaller than cluster frequency in fastphase model") .flag(); - cmd_admix.add_argument("-P", "--min-pi") - .help("set cluster likelihood to zeros if PI (in fastphase) < min-pi") + cmd_admix.add_argument("-P", "--min-P") + .help("set cluster likelihood to zeros if P (in fastphase) < min-P") .default_value(0.0) .scan<'g', double>(); @@ -315,9 +312,8 @@ int main(int argc, char * argv[]) } else if(program.is_subcommand_used(cmd_admix)) { - opts.tol_pi = cmd_admix.get("--min-pi"); + opts.ptol = cmd_admix.get("--min-P"); opts.in_bin.assign(cmd_admix.get("--bin")); - opts.pi_file.assign(cmd_admix.get("--pi-file")); opts.out.assign(cmd_admix.get("--out")); opts.seed = cmd_admix.get("--seed"); opts.K = cmd_admix.get("-k"); From 0b50b148fd1f110c942ac5392a969ec326f41f6c Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Tue, 16 Jul 2024 15:01:41 +0200 Subject: [PATCH 67/67] local tests --- R/plot_haplotypes.R | 54 +++++++++++++++++++++++++++++++++++++++++++++ src/common.hpp | 6 ++--- src/fastphase.cpp | 4 ++-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/R/plot_haplotypes.R b/R/plot_haplotypes.R index f762845..0770be6 100644 --- a/R/plot_haplotypes.R +++ b/R/plot_haplotypes.R @@ -54,3 +54,57 @@ plot.hapfreq <- function(hapfreq, if(!is.null(recomb)) lines(pos[-1], recomb, type = "l", col = "red") } + + +table(dy <- as.matrix(read.table("~/Downloads/chr16.Mkomazi.males.Ychr.txt"))) + +table(da <- as.matrix(read.table("~/Downloads/chr16.Mkomazi.males.Achr.txt"))) + +image(da) + + +N <- 4 +M <- 7570 +d <- 1 +xleft <- 1:M - d +xright <- 1:M - d + +samples <- c("NGrataE03715", "NGrataE03718", "NGrataE03717", "NGrataE03723") + +f <- function(da){ + for(i in 1:N){ + ybottom <- i + array(0, M) + ytop <- ybottom+1 + rect(xleft = xleft - d, xright = xright + d, ybottom = ybottom, ytop = ytop, col = da[i,]+1, lwd = 3, border = NA) + rect(xleft = xleft - d, xright = xright + d, ybottom = ytop-0.01, ytop = ytop, col = "gray", lwd = 3, border = NA) + mtext(samples[i], side = 2, at = (ytop[1]+ybottom[1])/2, cex = 1.5) + } +} + +##par(mfrow = c(2,1), cex.lab = 2, cex.main = 2) + +op <- par(mfrow = c(2,1), + oma = c(6,0,0,0) + 0.1, + mar = c(0,4,2,1) + 0.1, + cex.lab = 2, cex.main=2) +plot(0, 0, col = "white", axes=FALSE, xlim = c(0, M), ylim = c(1, N + 1), main = "Y chromosome", xlab="",ylab="") +f(dy) +plot(0, 0, col = "white", axes=FALSE, xlim = c(0, M), ylim = c(1, N + 1), main = "A chromosome", xlab="SNP index", ylab="") +f(da) + + +mycols <- c("gray", "black") + + + +rect(xleft = xleft - d, xright = xright + d, ybottom = ybottom, ytop = ytop, col = mycols[da[i,]+1], lwd = 2, border = NA) + + + +p <- read.table("~/Downloads/fypa.pyfa.pos") + +dev.off() + + +plot.hapfreq(d,pos[1:ncol(d)], colors=1:2) +plot.hapfreq(d, 1:ncol(d), colors=1:2) diff --git a/src/common.hpp b/src/common.hpp index c9749be..fc8c55f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -310,9 +310,9 @@ inline void protect_er(MyArr1D & er) inline MyArr1D calc_er(const Int1D & dl, double nGen, double expRate = 0.5) { MyArr1D er(dl.size()); - for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] / 1e6); - // for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] * expRate * nGen / 1e8); - protect_er(er); + // for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] / 1e6); + for(size_t i = 1; i < dl.size(); i++) er(i) = std::exp(-dl[i] * expRate * nGen / 1e8); + // protect_er(er); return er; } diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 7877085..5abab1a 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -39,7 +39,7 @@ void FastPhaseK2::initRecombination(const Int2D & pos, std::string rfile, int B_ grid_chunk[nchunks] = sg; // add sentinel if(!rfile.empty()) load_csv(R, rfile, true); er = R.row(0).sqrt(); - protect_er(er); + // protect_er(er); R = er2R(er); } @@ -149,7 +149,7 @@ void FastPhaseK2::protectPars() er(i) = er(i) < miner ? miner : er(i); er(i) = er(i) > maxer ? maxer : er(i); } - protect_er(er); + // protect_er(er); R = er2R(er); } // protect PI