From 600b1b6851b9d304e8725d58a2548255f4e59fb7 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Mon, 29 Jan 2024 21:54:36 +0100 Subject: [PATCH] update pars.bin --- src/common.hpp | 14 +++++++------- src/fastphase.cpp | 6 ++++++ src/parse-phaseless.cpp | 14 +++++--------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/common.hpp b/src/common.hpp index 1d8c65b..f9dc8fc 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -108,7 +108,7 @@ struct BigAss { int chunksize, nsamples, nsnps, nchunks; int B, G, C; // B: snps in a grid; G: total number of grids in a genome - MyFloat2D PI, F, R, GammaAE; // M x C, 3 x M, fastphase pars + MyFloat2D PI, F, R, AE; // M x C, 3 x M, fastphase pars Int1D ends; // chunk index where each chromo ends String1D sampleids, chrs; Int2D pos; // store position of markers of each chunk @@ -484,13 +484,12 @@ inline MyArr1D get_cluster_probability(int ind, return cs; } -inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFloat1D & PI_) +/// R: 3 x M; PI: C x M +inline auto get_cluster_frequency(const MyArr2D & R, const MyArr2D & PI) { - const int C2 = ae.rows(); - const int M = ae.cols(); - const int C = std::sqrt(C2); - Eigen::Map PI(PI_.data(), C, M); - Eigen::Map R(R_.data(), 3, M); + const int C = PI.rows(); + const int M = R.cols(); + MyArr2D ae(C * C, M); int s{0}; ae.col(s) = (PI.col(s).matrix() * PI.col(s).transpose().matrix()).reshaped().array(); @@ -519,6 +518,7 @@ inline void get_cluster_frequency(MyArr2D & ae, const MyFloat1D & R_, const MyFl // ae = (ae < tol).select(tol, ae); // ae = (ae > 1 - tol).select(1 - tol, ae); // ae.rowwise() /= ae.colwise().sum(); + return ae; } inline auto get_cluster_likelihoods(int ind, diff --git a/src/fastphase.cpp b/src/fastphase.cpp index 1d77b8c..dbf7866 100644 --- a/src/fastphase.cpp +++ b/src/fastphase.cpp @@ -222,6 +222,12 @@ int run_impute_main(Options & opts) break; } } + genome->R.emplace_back(MyFloat1D(faith.R.data(), faith.R.data() + faith.R.size())); + genome->PI.emplace_back(MyFloat1D(faith.PI.data(), faith.PI.data() + faith.PI.size())); + genome->F.emplace_back(MyFloat1D(faith.F.data(), faith.F.data() + faith.F.size())); + // reuse Ezj + faith.Ezj = get_cluster_frequency(faith.R, faith.PI); + genome->AE.emplace_back(MyFloat1D(faith.Ezj.data(), faith.Ezj.data() + faith.Ezj.size())); constexpr auto OPTIONS = alpaca::options::fixed_length_encoding; std::ofstream ofs(opts.out + ".pars.bin", std::ios::out | std::ios::binary); auto bytes_written = alpaca::serialize(*genome, ofs); diff --git a/src/parse-phaseless.cpp b/src/parse-phaseless.cpp index 26497a9..0dddee6 100644 --- a/src/parse-phaseless.cpp +++ b/src/parse-phaseless.cpp @@ -141,6 +141,7 @@ List parse_impute_opt(std::string filename) { return List::create(Named("C") = genome->C, Named("B") = genome->B, Named("G") = genome->G, + Named("clusterfreq") = genome->AE, Named("chunksize") = genome->chunksize, Named("nsamples") = genome->nsamples, Named("nsnps") = genome->nsnps, @@ -170,7 +171,7 @@ List parse_impute_par(std::string filename, int ic = -1) List ret(N); for(auto ind : ids) { - List gammaI(nchunks), aeI(nchunks); + List gamma(nchunks); for(int c = 0; c < nchunks; c++) { ic = nchunks > 1 ? c : std::max(ic, c); const int iM = genome->pos[ic].size(); @@ -178,14 +179,9 @@ List parse_impute_par(std::string filename, int ic = -1) alpha.setZero(genome->C * genome->C, nGrids); beta.setZero(genome->C * genome->C, nGrids); get_cluster_probability(ind, iM, alpha, beta, genome->gls[ic], genome->R[ic], genome->PI[ic], genome->F[ic]); - if(!((1 - (alpha * beta).colwise().sum()).abs() < 1e-6).all()) cao.error("gamma sum is not 1.0!\n"); - ae.setZero(genome->C * genome->C, nGrids); - get_cluster_frequency(ae, genome->R[ic], genome->PI[ic]); - gammaI[c] = alpha * beta; - aeI[c] = ae; + gamma[c] = alpha * beta; } - ret[ind] = List::create(Named("gamma") = gammaI, - Named("ae") = aeI); + ret[ind] = gamma; } - return ret; + return List::create(Named("gamma")=ret, Named("ae") = genome->AE); }