Skip to content

Commit

Permalink
logging: add logging support for query_all
Browse files Browse the repository at this point in the history
refactored getting the size of the cpg map, storing it before allocating
the vectors for seqnames, starts and rownames
  • Loading branch information
jamespeapen committed Aug 28, 2024
1 parent 93009e9 commit 0e1850f
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 8 deletions.
3 changes: 3 additions & 0 deletions R/query_all.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ query_all <- function(
getOption("iscream.threads"),
check_thread_count(nthreads)
)

validate_log_level(n_threads = n_threads)

Cpp_query_all(bedfiles, regions, bismark, merged, sparse, nthreads = n_threads)
}

37 changes: 29 additions & 8 deletions src/query_all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "query.hpp"
#include "parsers.hpp"
#include "decoders.hpp"
#include "log.hpp"
#include <unordered_map>
#include "../inst/include/khashl.h"
#include "../inst/include/iscream_types.h"
Expand Down Expand Up @@ -105,14 +106,17 @@ QueryAll<Mat>::QueryAll(std::vector<std::string>& bedfile_vec, std::vector<std::
cov_mat.resize(5, bedfile_vec.size());
m_mat.resize(5, bedfile_vec.size());

Rprintf("Querying %zu regions from %zu bedfiles\n", regions.size(), bedfile_vec.size());
setup_logger("iscream::query_all");

spdlog::info("Querying {0} regions from {1} bedfiles\n", regions.size(), bedfile_vec.size());
Progress bar(bedfile_vec.size(), true);

#if defined(_OPENMP)
#pragma omp parallel for num_threads(nthreads)
#endif
for (int bedfile_n = 0; bedfile_n < bedfile_vec.size(); bedfile_n++) {
if ( !Progress::check_abort() ) {
spdlog::debug("Querying {}", bedfile_vec[bedfile_n]);
MultiRegionQuery cpgs_in_file = query_intervals(bedfile_vec[bedfile_n].c_str(), regions);
for (RegionQuery cpgs_in_interval : cpgs_in_file) {
populate_matrix(cpgs_in_interval, bedfile_n, bismark);
Expand All @@ -122,9 +126,11 @@ QueryAll<Mat>::QueryAll(std::vector<std::string>& bedfile_vec, std::vector<std::
}
bar.cleanup();

Rcpp::CharacterVector c(kh_size(cpg_map));
Rcpp::IntegerVector s(kh_size(cpg_map));
Rcpp::CharacterVector rownames(kh_size(cpg_map));
int mapsize = kh_size(cpg_map);
Rcpp::CharacterVector c(mapsize);
Rcpp::IntegerVector s(mapsize);
Rcpp::CharacterVector rownames(mapsize);
spdlog::debug("Created temporary seqnames, samplenames and rownames vectors of size {}", kh_size(cpg_map));

khint_t iter;
for (iter = 0; iter < kh_end(cpg_map); ++iter) {
Expand All @@ -140,21 +146,25 @@ QueryAll<Mat>::QueryAll(std::vector<std::string>& bedfile_vec, std::vector<std::
}
seqnames = c;
start = s;
spdlog::debug("Populated seqnames, samplenames and rownames vectors");

int mapsize = kh_size(cpg_map);
if (cov_mat.n_rows > mapsize) {
Rprintf("Correcting matrix size\n");
int diff_rows = cov_mat.n_rows - mapsize;
spdlog::debug("{} extra rows allocated", diff_rows);
cov_mat.resize(mapsize, bedfile_vec.size());
m_mat.resize(mapsize, bedfile_vec.size());
spdlog::debug("Corrected matrix size");
}

spdlog::debug("Setting sample names");
for (int i = 0; i < sample_names.size(); i++) {
std::filesystem::path sample_path = bedfile_vec[i];
sample_names[i] = sample_path.extension() == ".gz" ? sample_path.stem().stem().string() : sample_path.stem().string();
// spdlog::debug("Got {} as sample name from {}", sample_names[i], bedfile_vec[i]);
}

if (sparse) {
spdlog::debug("Creating sparse matrix");
Rcpp::S4 cov_rmat = Rcpp::wrap(cov_mat);
Rcpp::S4 M_rmat = Rcpp::wrap(m_mat);
cov_rmat.slot("Dimnames") = Rcpp::List::create(rownames, sample_names);
Expand All @@ -164,6 +174,7 @@ QueryAll<Mat>::QueryAll(std::vector<std::string>& bedfile_vec, std::vector<std::
Rcpp::_["M"] = M_rmat
);
} else {
spdlog::debug("Creating dense matrix");
Rcpp::NumericMatrix cov_rmat = Rcpp::wrap(cov_mat);
Rcpp::NumericMatrix M_rmat = Rcpp::wrap(m_mat);
Rcpp::colnames(cov_rmat) = sample_names;
Expand All @@ -182,17 +193,26 @@ QueryAll<Mat>::QueryAll(std::vector<std::string>& bedfile_vec, std::vector<std::
template <class Mat>
void QueryAll<Mat>::populate_matrix(RegionQuery& query, int& col_n, const bool bismark) {

int cpg_count = query.cpgs_in_interval.size();
std::vector<BedLine> lines;
std::vector<CpG> ids;
#pragma omp critical
{
khmap_m_resize(cpg_map, query.cpgs_in_interval.size());
khmap_m_resize(cpg_map, cpg_count);
}
for (std::string cpg_string : query.cpgs_in_interval) {

BedLine parsed_bedline = bismark ? parseCovRecord(cpg_string) : parseBEDRecord(cpg_string);
lines.push_back(parsed_bedline);

spdlog::trace(
"Parsed {} into chr: {}, start: {}, end: {}",
cpg_string,
parsed_bedline.chr,
parsed_bedline.start,
parsed_bedline.end,
parsed_bedline.cov,
parsed_bedline.m_count
);
#pragma omp critical
{
if (!chr_map.count(parsed_bedline.chr)) {
Expand Down Expand Up @@ -229,6 +249,7 @@ void QueryAll<Mat>::populate_matrix(RegionQuery& query, int& col_n, const bool b

khint_t retrieve_b;
int idx;
spdlog::debug("Inserting {} CpGs into matrix", cpg_count);
for (size_t i = 0; i < lines.size(); i++) {
retrieve_b = khmap_get(cpg_map, ids[i]);
idx = kh_val(cpg_map, retrieve_b);
Expand Down

0 comments on commit 0e1850f

Please sign in to comment.