diff --git a/lib/cpp/preprocessing/longitudinal_features_lagger.cpp b/lib/cpp/preprocessing/longitudinal_features_lagger.cpp index de94398bb..5c3e61813 100644 --- a/lib/cpp/preprocessing/longitudinal_features_lagger.cpp +++ b/lib/cpp/preprocessing/longitudinal_features_lagger.cpp @@ -6,33 +6,38 @@ #include "tick/preprocessing/longitudinal_features_lagger.h" + LongitudinalFeaturesLagger::LongitudinalFeaturesLagger( - const SBaseArrayDouble2dPtrList1D &features, const SArrayULongPtr n_lags) - : n_intervals(features[0]->n_rows()), - n_lags(n_lags), - n_samples(features.size()), - n_observations(n_samples * n_intervals), - n_features(features[0]->n_cols()), - n_lagged_features(n_lags->sum() + n_lags->size()) { - col_offset = ArrayULong(n_lags->size()); - col_offset.init_to_zero(); - if (n_features != n_lags->size()) { - TICK_ERROR("Features matrix column number should match n_lags length."); - } - if ((*n_lags)[0] >= n_intervals) { - TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1)."); - } + ulong n_intervals, + SArrayULongPtr _n_lags) + : n_intervals(n_intervals), + n_lags(_n_lags), + n_features(_n_lags->size()), + n_lagged_features(_n_lags->size() + _n_lags->sum()) { + if (n_lags != nullptr) compute_col_offset(n_lags); +} + +void LongitudinalFeaturesLagger::compute_col_offset(const SArrayULongPtr n_lags) { + ArrayULong col_offset_temp = ArrayULong(n_lags->size()); + col_offset_temp.init_to_zero(); for (ulong i(1); i < n_lags->size(); i++) { if ((*n_lags)[i] >= n_intervals) { TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1)."); } - col_offset[i] = col_offset[i - 1] + (*n_lags)[i - 1] + 1; + col_offset_temp[i] = col_offset_temp[i - 1] + (*n_lags)[i-1] + 1; } + col_offset = col_offset_temp.as_sarray_ptr(); } void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out, ulong censoring) const { + if (n_intervals != features.n_rows()) { + TICK_ERROR("Features matrix rows count should match n_intervals."); + } + if (n_features != features.n_cols()) { + TICK_ERROR("Features matrix column count should match n_lags length."); + } if (out.n_cols() != n_lagged_features) { TICK_ERROR( "n_columns of &out should be equal to n_features + sum(n_lags)."); @@ -46,8 +51,9 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features, n_cols_feature = (*n_lags)[feature] + 1; for (ulong j = 0; j < n_intervals; j++) { row = j; - col = col_offset[feature]; - value = features(row, feature); + col = (*col_offset)[feature]; + // use view_row instead of (row, feature) to be const + value = view_row(features, row)[feature]; max_col = col + n_cols_feature; if (value != 0) { while (row < censoring && col < max_col) { @@ -60,9 +66,14 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features, } } -void LongitudinalFeaturesLagger::sparse_lag_preprocessor( - ArrayULong &row, ArrayULong &col, ArrayDouble &data, ArrayULong &out_row, - ArrayULong &out_col, ArrayDouble &out_data, ulong censoring) const { +void LongitudinalFeaturesLagger::sparse_lag_preprocessor(ArrayULong &row, + ArrayULong &col, + ArrayDouble &data, + ArrayULong &out_row, + ArrayULong &out_col, + ArrayDouble &out_data, + ulong censoring) const { + // TODO: add checks here ? Or do them in Python ? ulong j(0), r, c, offset, new_col, max_col; double value; @@ -70,7 +81,7 @@ void LongitudinalFeaturesLagger::sparse_lag_preprocessor( value = data[i]; r = row[i]; c = col[i]; - offset = col_offset[c]; + offset = (*col_offset)[c]; max_col = offset + (*n_lags)[c] + 1; new_col = offset; diff --git a/lib/cpp/preprocessing/longitudinal_features_lagger_mp.cpp b/lib/cpp/preprocessing/longitudinal_features_lagger_mp.cpp new file mode 100644 index 000000000..d8406615c --- /dev/null +++ b/lib/cpp/preprocessing/longitudinal_features_lagger_mp.cpp @@ -0,0 +1,259 @@ +// License: BSD 3 clause + +// +// Created by Maryan Morel on 15/05/2017. +// + +#include +#include "tick/preprocessing/longitudinal_features_lagger_mp.h" + +LongitudinalFeaturesLagger_MP::LongitudinalFeaturesLagger_MP(ulong n_intervals, + SArrayULongPtr _n_lags, size_t n_jobs) + : LongitudinalPreprocessor(n_jobs), + n_intervals(n_intervals), + n_lags(_n_lags), + n_features(_n_lags->size()), + n_lagged_features(_n_lags->size() + _n_lags->sum()) { + if (n_lags != nullptr) compute_col_offset(n_lags); + n_output_features = get_n_output_features(); +} + +void LongitudinalFeaturesLagger_MP::compute_col_offset(const SArrayULongPtr n_lags) { + ArrayULong col_offset_temp = ArrayULong(n_lags->size()); + col_offset_temp.init_to_zero(); + for (ulong i(1); i < n_lags->size(); i++) { + if ((*n_lags)[i] > n_intervals) { // (*n_lags)[i] >= n_intervals + TICK_ERROR("n_lags elements must be between 0 and n_intervals."); // (n_intervals - 1) was + // actually wrong? + } + col_offset_temp[i] = col_offset_temp[i - 1] + (*n_lags)[i - 1] + 1; + } + col_offset = col_offset_temp.as_sarray_ptr(); +} + +void LongitudinalFeaturesLagger_MP::dense_lag_preprocessor(ArrayDouble2d &features, + ArrayDouble2d &out, + ulong censoring) const { + if (n_intervals != features.n_rows()) { + TICK_ERROR("Features matrix rows count should match n_intervals."); + } + if (n_features != features.n_cols()) { + TICK_ERROR("Features matrix column count should match n_lags length."); + } + if (out.n_cols() != n_lagged_features) { + TICK_ERROR("n_columns of &out should be equal to n_features + sum(n_lags)."); + } + if (out.n_rows() != n_intervals) { + TICK_ERROR("n_rows of &out is inconsistent with n_intervals"); + } + ulong n_cols_feature, row, col, max_col; + double value; + for (ulong feature = 0; feature < n_features; feature++) { + n_cols_feature = (*n_lags)[feature] + 1; + for (ulong j = 0; j < n_intervals; j++) { + row = j; + col = (*col_offset)[feature]; + // use view_row instead of (row, feature) to be const + value = view_row(features, row)[feature]; + max_col = col + n_cols_feature; + if (value != 0) { + while (row < censoring && col < max_col) { + out[row * n_lagged_features + col] = value; + row++; + col++; + } + } + } + } +} + +void LongitudinalFeaturesLagger_MP::sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col, + ArrayDouble &data, ArrayULong &out_row, + ArrayULong &out_col, + ArrayDouble &out_data, + ulong censoring) const { + // TODO: add checks here ? Or do them in Python ? + if (row.size() != col.size() || col.size() != data.size() || data.size() != row.size()) + TICK_ERROR("row, col and data arrays should have the same size (coo matrix)"); + if (out_row.size() != out_col.size() || out_col.size() != out_data.size() || + out_data.size() != out_row.size()) + TICK_ERROR("out_row, out_col and out_data arrays should have the same size (coo matrix)"); + + ulong j(0), r, c, offset, new_col, max_col; + double value; + + for (ulong i = 0; i < data.size(); i++) { + value = data[i]; + r = row[i]; + c = col[i]; + offset = (*col_offset)[c]; + max_col = offset + (*n_lags)[c] + 1; + new_col = offset; + + while (r < censoring && new_col < max_col) { + out_row[j] = r; + out_col[j] = new_col; + out_data[j] = value; + r++; + new_col++; + j++; + } + } +} + +ulong LongitudinalFeaturesLagger_MP::get_n_output_features() { + ulong arraysum = 0; + std::vector arraysumint; + for (size_t i = 0; i < n_lags->size(); i++) arraysumint.push_back((*n_lags)[i] + 1); + for (ulong i : arraysumint) arraysum += i; + return arraysum; +} + +SSparseArrayDouble2dPtr LongitudinalFeaturesLagger_MP::sparse_lagger( + SSparseArrayDouble2dPtr &feature_matrix, ulong censoring_i) { + if (censoring_i > n_intervals || censoring_i < 1) + TICK_ERROR("censoring shoud be an integer in [1, n_intervals]"); + + CooMatrix coo(feature_matrix); + + // TODO FIX this is wrong, but the coo.toSparse removes all the useless zero data. + ulong estimated_nnz = coo.nnz * n_output_features; + + // std::cout << "estimated nnz : " << estimated_nnz << " nnz : " << coo.nnz + // << " arraysum : " << n_output_features << std::endl; + + ArrayULong out_row(estimated_nnz); + ArrayULong out_col(estimated_nnz); + ArrayDouble out_data(estimated_nnz); + + out_row.init_to_zero(); + out_col.init_to_zero(); + out_data.init_to_zero(); + + sparse_lag_preprocessor(coo.rows, coo.cols, coo.data, out_row, out_col, out_data, censoring_i); + + coo.rows = out_row; + coo.cols = out_col; + coo.data = out_data; + + return coo.toSparse(n_intervals, n_output_features); +} + +void LongitudinalFeaturesLagger_MP::transform_thread_dense( + std::vector splited_features, std::vector &output, + std::mutex &thread_mutex, std::vector splited_censoring) { + for (ulong i = 0; i < splited_features.size(); i++) { + ArrayDouble2d transformed(splited_features[i].n_rows(), n_output_features); + transformed.init_to_zero(); + dense_lag_preprocessor(splited_features[i], transformed, splited_censoring[i]); + thread_mutex.lock(); // just in case, needed ? + output.push_back(transformed); + thread_mutex.unlock(); + } +} + +void LongitudinalFeaturesLagger_MP::transform_thread_sparse( + std::vector splited_features, + std::vector &output, std::mutex &thread_mutex, + std::vector splited_censoring) { + for (ulong i = 0; i < splited_features.size(); i++) { + SSparseArrayDouble2dPtr transformed = sparse_lagger(splited_features[i], splited_censoring[i]); + thread_mutex.lock(); // just in case, needed ? + output.push_back(transformed); + thread_mutex.unlock(); + } +} + +std::vector LongitudinalFeaturesLagger_MP::transform( + std::vector features, std::vector censoring) { + if (features.empty()) TICK_ERROR("features is empty"); + + if (censoring.empty()) { + for (ulong i = 0; i < features.size(); i++) censoring.push_back(n_intervals); + } + + if (features.size() != censoring.size()) + TICK_ERROR("features size and censoring size doesn\'t match"); + + std::pair base_shape = {features[0].n_rows(), features[0].n_cols()}; + for (ArrayDouble2d f : features) + if (f.n_rows() != base_shape.first || f.n_cols() != base_shape.second) + TICK_ERROR("All the elements of features should have the same shape"); + + size_t thread_count = std::min((size_t)features.size(), n_jobs); + std::vector> splited_features = split_vector(features, thread_count); + features.clear(); + std::vector> splited_censoring = split_vector(censoring, thread_count); + censoring.clear(); + + if (splited_features.size() != splited_censoring.size()) + TICK_ERROR("Unexepected error : splited_features.size() != splited_censoring.size()"); + if (splited_features.size() != thread_count || splited_censoring.size() != thread_count) + TICK_ERROR( + "Unexepected error : splited_features.size() != thread_count || splited_censoring.size() " + "!= thread_count"); + if (splited_features.empty() || splited_censoring.empty()) + TICK_ERROR("Unexepected error : splited_features.empty() || splited_censoring.empty()"); + + std::vector output; + std::vector threads; + std::mutex thread_mutex; + + for (size_t i = 0; i < thread_count; i++) + threads.push_back(std::thread(&LongitudinalFeaturesLagger_MP::transform_thread_dense, this, + splited_features[i], std::ref(output), std::ref(thread_mutex), + splited_censoring[i])); + + splited_features.clear(); + splited_censoring.clear(); + + for (size_t i = 0; i < threads.size(); i++) threads[i].join(); + + return output; +} + +std::vector LongitudinalFeaturesLagger_MP::transform( + std::vector features, std::vector censoring) { + if (features.empty()) TICK_ERROR("features is empty"); + + if (censoring.empty()) { + for (ulong i = 0; i < features.size(); i++) censoring.push_back(n_intervals); + } + + if (features.size() != censoring.size()) + TICK_ERROR("features size and censoring size doesn\'t match"); + + std::pair base_shape = {features[0]->n_rows(), features[0]->n_cols()}; + n_intervals = base_shape.first; + for (SSparseArrayDouble2dPtr f : features) + if (f->n_rows() != base_shape.first || f->n_cols() != base_shape.second) + TICK_ERROR("All the elements of features should have the same shape"); + + size_t thread_count = std::min((size_t)features.size(), n_jobs); + std::vector> splited_features = + split_vector(features, thread_count); + features.clear(); + std::vector> splited_censoring = split_vector(censoring, thread_count); + censoring.clear(); + + if (splited_features.size() != splited_censoring.size()) + TICK_ERROR("Unexepected error : splited_features.size() != splited_censoring.size()"); + if (splited_features.empty() || splited_censoring.empty()) + TICK_ERROR("Unexepected error : splited_features.empty() || splited_censoring.empty()"); + + std::vector output; + std::vector threads; + std::mutex thread_mutex; + + for (size_t i = 0; i < thread_count; i++) + threads.push_back(std::thread(&LongitudinalFeaturesLagger_MP::transform_thread_sparse, this, + splited_features[i], std::ref(output), std::ref(thread_mutex), + splited_censoring[i])); + + splited_features.clear(); + splited_censoring.clear(); + + for (size_t i = 0; i < threads.size(); i++) threads[i].join(); + + return output; +} diff --git a/lib/cpp/preprocessing/longitudinal_preprocessor.cpp b/lib/cpp/preprocessing/longitudinal_preprocessor.cpp new file mode 100644 index 000000000..5eb1c744a --- /dev/null +++ b/lib/cpp/preprocessing/longitudinal_preprocessor.cpp @@ -0,0 +1,40 @@ +#include "tick/preprocessing/longitudinal_preprocessor.h" + +template +std::vector> LongitudinalPreprocessor::split_vector(std::vector array, size_t chunks) { + if (chunks == 0) + TICK_ERROR("Chunks size cannot be zero"); + + ulong size = array.size(); + + ulong closest_multi = size; + for (; closest_multi%chunks != 0; closest_multi++) {} + + ulong chunk_size = ceil(closest_multi / chunks); + + std::vector> out; + for (ulong i = 0; i < size; i++) { + ulong new_size = std::min(size - chunk_size * i, chunk_size); + if (new_size == 0) + break; + std::vector tmp(new_size); + for (ulong j = 0; j < new_size; j++) { + tmp[j] = array[i*chunk_size+j]; + } + out.push_back(tmp); + } + return out; +} + +template DLL_PUBLIC + std::vector> + LongitudinalPreprocessor::split_vector(std::vector array, size_t chunks); + +template DLL_PUBLIC + std::vector> + LongitudinalPreprocessor::split_vector(std::vector array, size_t chunks); + +template DLL_PUBLIC + std::vector> + LongitudinalPreprocessor::split_vector(std::vector array, size_t chunks); + diff --git a/lib/cpp/preprocessing/sparse_longitudinal_features_product.cpp b/lib/cpp/preprocessing/sparse_longitudinal_features_product.cpp index 9f935634d..a522a7594 100644 --- a/lib/cpp/preprocessing/sparse_longitudinal_features_product.cpp +++ b/lib/cpp/preprocessing/sparse_longitudinal_features_product.cpp @@ -7,10 +7,6 @@ #include "tick/preprocessing/sparse_longitudinal_features_product.h" #include -SparseLongitudinalFeaturesProduct::SparseLongitudinalFeaturesProduct( - const SBaseArrayDouble2dPtrList1D &features) - : n_features(features[0]->n_cols()) {} - ulong SparseLongitudinalFeaturesProduct::get_feature_product_col( ulong col1, ulong col2, ulong n_cols) const { if (col1 > col2) { // ensure we have the right order as the following formula diff --git a/lib/include/tick/array/coo_matrix.h b/lib/include/tick/array/coo_matrix.h new file mode 100644 index 000000000..b6925a060 --- /dev/null +++ b/lib/include/tick/array/coo_matrix.h @@ -0,0 +1,154 @@ + + +#ifndef LIB_INCLUDE_TICK_ARRAY_COO_MATRIX_H_ +#define LIB_INCLUDE_TICK_ARRAY_COO_MATRIX_H_ + +template +class CooMatrix { + public: + CooMatrix() {} + + CooMatrix(const ArrayULong &rows, const ArrayULong &cols, const Array data) { + this->rows = rows; + this->cols = cols; + this->data = data; + checkCoo(); + } + + explicit CooMatrix(std::shared_ptr> sparse) { + rows = ArrayULong(sparse->size_sparse()); + nnz = 0; + + std::vector nnz_rows; + for (ulong i = 0; i < sparse->n_rows(); i++) { + nnz_rows.push_back(sparse->row_indices()[i + 1] - sparse->row_indices()[i]); + } + + ulong out_i = 0; + ulong row_i = 0; + for (ulong nnz_i : nnz_rows) { + nnz += nnz_i; + for (ulong i = 0; i < nnz_i; i++) { + rows[out_i] = row_i; + out_i++; + if (out_i > rows.size()) TICK_ERROR("Invalid sparse matrix"); + } + row_i++; + } + + auto toArrayULong = [](ArrayUInt &array) { + ArrayULong out(array.size()); + for (ulong i = 0; i < array.size(); i++) out[i] = (ulong)array[i]; + return out; + }; + + ArrayUInt temp(sparse->size_sparse(), sparse->indices()); + cols = toArrayULong(temp); + data = Array(sparse->size_sparse(), sparse->data()); + + checkCoo(); + } + + void checkCoo() { + if (rows.size() != cols.size() || cols.size() != data.size() || data.size() != rows.size()) + TICK_ERROR("CooMatrix::checkCoo row, cols, and data size are different"); + // more check? + } + + void clearZero() { + checkCoo(); + + std::vector out_row; + std::vector out_col; + std::vector out_data; + for (ulong i = 0; i < rows.size(); i++) { + if (rows[i] != 0 || cols[i] != 0 || data[i] != (T)0) { + out_row.push_back(rows[i]); + out_col.push_back(cols[i]); + out_data.push_back(data[i]); + } + } + rows = ArrayULong(out_row.size()); + cols = ArrayULong(out_col.size()); + data = Array(out_data.size()); + + for (ulong i = 0; i < rows.size(); i++) { + rows[i] = out_row[i]; + cols[i] = out_col[i]; + data[i] = out_data[i]; + } + } + + void sortByRow() { + checkCoo(); + + std::vector> sort_data; + for (ulong i = 0; i < rows.size(); i++) sort_data.emplace_back(rows[i], cols[i], data[i]); + + std::sort(sort_data.begin(), sort_data.end()); + + for (ulong i = 0; i < rows.size(); i++) { + rows[i] = std::get<0>(sort_data[i]); + cols[i] = std::get<1>(sort_data[i]); + data[i] = std::get<2>(sort_data[i]); + } + + checkCoo(); + } + + std::shared_ptr> toSparse(ulong n_rows, ulong n_cols) { + checkCoo(); + + clearZero(); + sortByRow(); + + std::vector rows_vec(n_rows + 1); + std::vector cols_vec; + rows_vec[0] = 0; + + std::vector nnz_rows; + for (ulong i = 0; i < n_rows; i++) { + ulong nnz_this_row = 0; + for (ulong j = 0; j < rows.size(); j++) { + if (rows[j] == i) { + nnz_this_row++; + } + } + nnz_rows.push_back(nnz_this_row); + } + + if (nnz_rows.size() != n_rows) { + TICK_ERROR("Unexcepted error nnz_rows.size() != n_rows"); + } + + for (ulong i = 1; i < n_rows + 1; i++) { + rows_vec[i] = rows_vec[i - 1] + nnz_rows[i - 1]; + } + + ulong maxcol = 0; + for (ulong i = 0; i < cols.size(); i++) { + if (cols[i] > maxcol) maxcol = cols[i]; + cols_vec.push_back(cols[i]); + } + + unsigned int *row_ptr = new unsigned int[rows_vec.size()]; + unsigned int *col_ptr = new unsigned int[cols_vec.size()]; + T *data_ptr = new T[data.size()]; + + memcpy(row_ptr, rows_vec.data(), rows_vec.size() * sizeof(unsigned int)); + memcpy(col_ptr, cols_vec.data(), cols_vec.size() * sizeof(unsigned int)); + memcpy(data_ptr, data.data(), data.size() * sizeof(T)); + + std::shared_ptr> arrayptr = SSparseArray2d::new_ptr(0, 0, 0); + + arrayptr->set_data_indices_rowindices(data_ptr, col_ptr, row_ptr, n_rows, n_cols); + return arrayptr; + } + + ArrayULong rows; + ArrayULong cols; + Array data; + ulong nnz; +}; + +#endif // LIB_INCLUDE_TICK_ARRAY_COO_MATRIX_H_ diff --git a/lib/include/tick/preprocessing/longitudinal_features_lagger.h b/lib/include/tick/preprocessing/longitudinal_features_lagger.h index 8856ec8d4..61e76db7d 100644 --- a/lib/include/tick/preprocessing/longitudinal_features_lagger.h +++ b/lib/include/tick/preprocessing/longitudinal_features_lagger.h @@ -10,22 +10,27 @@ #include #include #include "tick/base/base.h" +#include "tick/base/serialization.h" -class LongitudinalFeaturesLagger { +class DLL_PUBLIC LongitudinalFeaturesLagger { protected: ulong n_intervals; SArrayULongPtr n_lags; - ArrayULong col_offset; - ulong n_samples; - ulong n_observations; ulong n_features; ulong n_lagged_features; + SArrayULongPtr col_offset; public: - LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features, - const SArrayULongPtr n_lags); + // This exists solely for cereal/swig + LongitudinalFeaturesLagger() = default; - void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out, + LongitudinalFeaturesLagger(ulong n_intervals, + SArrayULongPtr n_lags); + + void compute_col_offset(SArrayULongPtr n_lags); + + void dense_lag_preprocessor(ArrayDouble2d &features, + ArrayDouble2d &out, ulong censoring) const; void sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col, @@ -34,14 +39,26 @@ class LongitudinalFeaturesLagger { ulong censoring) const; template - void serialize(Archive &ar) { + void load(Archive &ar) { + ar(CEREAL_NVP(n_intervals)); + ar(CEREAL_NVP(n_features)); + ar(CEREAL_NVP(n_lagged_features)); + + Array temp_n_lags, temp_col_offset; + ar(cereal::make_nvp("n_lags", temp_n_lags)); + + n_lags = temp_n_lags.as_sarray_ptr(); + col_offset = temp_col_offset.as_sarray_ptr(); + } + + + template + void save(Archive &ar) const { ar(CEREAL_NVP(n_intervals)); - ar(CEREAL_NVP(n_lags)); - ar(CEREAL_NVP(col_offset)); - ar(CEREAL_NVP(n_samples)); - ar(CEREAL_NVP(n_observations)); ar(CEREAL_NVP(n_features)); ar(CEREAL_NVP(n_lagged_features)); + ar(cereal::make_nvp("n_lags", *n_lags)); + ar(cereal::make_nvp("col_offset", *col_offset)); } }; diff --git a/lib/include/tick/preprocessing/longitudinal_features_lagger_mp.h b/lib/include/tick/preprocessing/longitudinal_features_lagger_mp.h new file mode 100644 index 000000000..7b8692d87 --- /dev/null +++ b/lib/include/tick/preprocessing/longitudinal_features_lagger_mp.h @@ -0,0 +1,81 @@ +// +// Created by Maryan Morel on 15/05/2017. +// + +#ifndef LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_FEATURES_LAGGER_MP_H_ +#define LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_FEATURES_LAGGER_MP_H_ + +// License: BSD 3 clause + +#include +#include +#include "tick/base/base.h" +#include "tick/base/serialization.h" +#include "tick/array/coo_matrix.h" +#include "tick/preprocessing/longitudinal_preprocessor.h" + +class DLL_PUBLIC LongitudinalFeaturesLagger_MP : LongitudinalPreprocessor { + protected: + ulong n_intervals; + SArrayULongPtr n_lags; + ulong n_features; + ulong n_lagged_features; + SArrayULongPtr col_offset; + ulong n_output_features; + + private: + void transform_thread_sparse(std::vector splited_features, + std::vector &output, + std::mutex &thread_mutex, std::vector splited_censoring); + void transform_thread_dense(std::vector splited_features, + std::vector &output, std::mutex &thread_mutex, + std::vector splited_censoring); + ulong get_n_output_features(); + + public: + // This exists solely for cereal/swig + LongitudinalFeaturesLagger_MP() = default; + + LongitudinalFeaturesLagger_MP(ulong n_intervals, SArrayULongPtr n_lags, size_t n_jobs = 1); + + void compute_col_offset(SArrayULongPtr n_lags); + + void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out, ulong censoring) const; + + void sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col, ArrayDouble &data, + ArrayULong &out_row, ArrayULong &out_col, ArrayDouble &out_data, + ulong censoring) const; + + SSparseArrayDouble2dPtr sparse_lagger(SSparseArrayDouble2dPtr &feature_matrix, ulong censoring_i); + + std::vector transform(std::vector features, + std::vector censoring = {}); + std::vector transform(std::vector features, + std::vector censoring = {}); + + template + void load(Archive &ar) { + ar(CEREAL_NVP(n_intervals)); + ar(CEREAL_NVP(n_features)); + ar(CEREAL_NVP(n_lagged_features)); + ar(CEREAL_NVP(n_output_features)); + + Array temp_n_lags, temp_col_offset; + ar(cereal::make_nvp("n_lags", temp_n_lags)); + + n_lags = temp_n_lags.as_sarray_ptr(); + col_offset = temp_col_offset.as_sarray_ptr(); + } + + template + void save(Archive &ar) const { + ar(CEREAL_NVP(n_intervals)); + ar(CEREAL_NVP(n_features)); + ar(CEREAL_NVP(n_lagged_features)); + ar(CEREAL_NVP(n_output_features)); + ar(cereal::make_nvp("n_lags", *n_lags)); + ar(cereal::make_nvp("col_offset", *col_offset)); + } +}; + +#endif // LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_FEATURES_LAGGER_MP_H_ diff --git a/lib/include/tick/preprocessing/longitudinal_preprocessor.h b/lib/include/tick/preprocessing/longitudinal_preprocessor.h new file mode 100644 index 000000000..36fe012e7 --- /dev/null +++ b/lib/include/tick/preprocessing/longitudinal_preprocessor.h @@ -0,0 +1,36 @@ + +#ifndef LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_PREPROCESSOR_H_ +#define LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_PREPROCESSOR_H_ + +#include +#include +#include "tick/base/base.h" +#include "tick/base/serialization.h" + +class DLL_PUBLIC LongitudinalPreprocessor { + protected: + size_t n_jobs; + + public: + explicit LongitudinalPreprocessor(size_t n_jobs) { + if (n_jobs == 0) + this->n_jobs = std::thread::hardware_concurrency(); + else + this->n_jobs = n_jobs; + } + + template + std::vector> split_vector(std::vector array, size_t chunks); + + template + void load(Archive &ar) { + ar(CEREAL_NVP(n_jobs)); + } + + template + void save(Archive &ar) const { + ar(CEREAL_NVP(n_jobs)); + } +}; + +#endif // LIB_INCLUDE_TICK_PREPROCESSING_LONGITUDINAL_PREPROCESSOR_H_ diff --git a/lib/include/tick/preprocessing/sparse_longitudinal_features_product.h b/lib/include/tick/preprocessing/sparse_longitudinal_features_product.h index a8e7be058..829cab950 100644 --- a/lib/include/tick/preprocessing/sparse_longitudinal_features_product.h +++ b/lib/include/tick/preprocessing/sparse_longitudinal_features_product.h @@ -10,14 +10,18 @@ #include #include #include "tick/base/base.h" +#include "tick/base/serialization.h" -class SparseLongitudinalFeaturesProduct { +class DLL_PUBLIC SparseLongitudinalFeaturesProduct { protected: ulong n_features; public: + // This exists soley for cereal/swig + SparseLongitudinalFeaturesProduct() = default; + explicit SparseLongitudinalFeaturesProduct( - const SBaseArrayDouble2dPtrList1D &features); + const ulong n_features): n_features(n_features) {} inline ulong get_feature_product_col(ulong col1, ulong col2, ulong n_cols) const; @@ -28,7 +32,12 @@ class SparseLongitudinalFeaturesProduct { ArrayDouble &out_data) const; template - void serialize(Archive &ar) { + void load(Archive &ar) { + ar(CEREAL_NVP(n_features)); + } + + template + void save(Archive &ar) const { ar(CEREAL_NVP(n_features)); } }; diff --git a/lib/swig/tick/preprocessing/longitudinal_features_lagger.i b/lib/swig/tick/preprocessing/longitudinal_features_lagger.i index 7cef9bf80..65958cd23 100644 --- a/lib/swig/tick/preprocessing/longitudinal_features_lagger.i +++ b/lib/swig/tick/preprocessing/longitudinal_features_lagger.i @@ -7,21 +7,20 @@ class LongitudinalFeaturesLagger { public: - LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features, - const SArrayULongPtr n_lags); + // This exists soley for cereal/swig + LongitudinalFeaturesLagger(); + + LongitudinalFeaturesLagger(ulong n_intervals, + SArrayULongPtr n_lags); void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out, ulong censoring) const; - void sparse_lag_preprocessor(ArrayULong &row, - ArrayULong &col, - ArrayDouble &data, - ArrayULong &out_row, - ArrayULong &out_col, - ArrayDouble &out_data, + void sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col, + ArrayDouble &data, ArrayULong &out_row, + ArrayULong &out_col, ArrayDouble &out_data, ulong censoring) const; - }; TICK_MAKE_PICKLABLE(LongitudinalFeaturesLagger); \ No newline at end of file diff --git a/lib/swig/tick/preprocessing/preprocessing_module.i b/lib/swig/tick/preprocessing/preprocessing_module.i index fd2b999a0..f4d83fa3a 100644 --- a/lib/swig/tick/preprocessing/preprocessing_module.i +++ b/lib/swig/tick/preprocessing/preprocessing_module.i @@ -2,9 +2,9 @@ %module preprocessing -%include tick/base/defs.i %include std_shared_ptr.i -%include tick/base/serialization.i +%include tick/base/defs.i +#include tick/base/serialization.h %shared_ptr(SparseLongitudinalFeaturesProduct); %shared_ptr(LongitudinalFeaturesLagger); diff --git a/lib/swig/tick/preprocessing/sparse_longitudinal_features_product.i b/lib/swig/tick/preprocessing/sparse_longitudinal_features_product.i index 4675c182f..6867bdb3e 100644 --- a/lib/swig/tick/preprocessing/sparse_longitudinal_features_product.i +++ b/lib/swig/tick/preprocessing/sparse_longitudinal_features_product.i @@ -7,7 +7,10 @@ class SparseLongitudinalFeaturesProduct { public: - SparseLongitudinalFeaturesProduct(const SBaseArrayDouble2dPtrList1D &features); + // This exists soley for cereal/swig + SparseLongitudinalFeaturesProduct(); + + SparseLongitudinalFeaturesProduct(const ulong n_features); void sparse_features_product(ArrayULong &row, ArrayULong &col, diff --git a/tick/preprocessing/longitudinal_features_lagger.py b/tick/preprocessing/longitudinal_features_lagger.py index 549ab7461..b5f5de798 100644 --- a/tick/preprocessing/longitudinal_features_lagger.py +++ b/tick/preprocessing/longitudinal_features_lagger.py @@ -137,7 +137,7 @@ def fit(self, features, labels=None, censoring=None): self._set("_n_intervals", n_intervals) self._set("_n_output_features", int((self.n_lags + 1).sum())) self._set("_cpp_preprocessor", - _LongitudinalFeaturesLagger(features, self.n_lags)) + _LongitudinalFeaturesLagger(n_intervals, self.n_lags)) self._set("_fitted", True) return self diff --git a/tick/preprocessing/longitudinal_features_product.py b/tick/preprocessing/longitudinal_features_product.py index e3ed421c8..7b89b2ce4 100644 --- a/tick/preprocessing/longitudinal_features_product.py +++ b/tick/preprocessing/longitudinal_features_product.py @@ -157,7 +157,7 @@ def fit(self, features, labels=None, censoring=None): if sps.issparse(features[0]) and self.exposure_type == "infinite": self._set("_preprocessor", - SparseLongitudinalFeaturesProduct(features)) + SparseLongitudinalFeaturesProduct(n_intervals)) self._set("_fitted", True)