diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1a7c293
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# Python
+**/.idea
+__pycache__
+
+# docs
+/docs/_build
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9590a65
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,168 @@
+# Levenberg Marquardt curve fitting in CUDA
+# https://github.com/gpufit/Gpufit
+# see also CMake configuration in /docs/installation.rst
+
+# CMake
+
+cmake_minimum_required( VERSION 3.7 )
+set_property( GLOBAL PROPERTY USE_FOLDERS ON )
+
+if( NOT PROJECT_NAME )
+  project( Gpufit VERSION 1.0.0 )
+  include( CTest )
+endif()
+
+if( MSVC ) # link runtime statically
+  foreach( type ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} )
+    string( TOUPPER ${type} TYPE )
+    foreach( flags CMAKE_C_FLAGS_${TYPE} CMAKE_CXX_FLAGS_${TYPE} )
+      get_property( help CACHE ${flags} PROPERTY HELPSTRING )
+      string( REPLACE "/MD" "/MT" ${flags} "${${flags}}" )
+      set( ${flags} "${${flags}}" CACHE STRING "${help}" FORCE )
+    endforeach()
+  endforeach()
+endif()
+
+# Writes <target>.vcxproj.user into the current binary directory. The file holds
+# the debugger command, its arguments and the working directory as MSBuild
+# properties. Only done for MSVC12/MSVC14; otherwise the function does nothing.
+function( add_launcher target executable arguments working_directory )
+  if( MSVC12 OR MSVC14 )
+    file( WRITE ${CMAKE_CURRENT_BINARY_DIR}/${target}.vcxproj.user
+"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+"<Project xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\n"
+"  <PropertyGroup>\n"
+"    <LocalDebuggerCommand>${executable}</LocalDebuggerCommand>\n"
+"    <LocalDebuggerCommandArguments>${arguments}</LocalDebuggerCommandArguments>\n"
+"    <LocalDebuggerWorkingDirectory>${working_directory}</LocalDebuggerWorkingDirectory>\n"
+"  </PropertyGroup>\n"
+"</Project>\n"
+    )
+  endif()
+endfunction()
+
+# Boost
+
+find_package( Boost 1.58.0 )
+if( Boost_FOUND )
+  # Builds <modules>_Test_<name> from <name>.cpp plus the shared test utilities,
+  # links it against `modules` and Boost, and registers it as a CTest test.
+  function( add_boost_test modules name )
+    string( REPLACE ";" "_" prefix "${modules}" )
+    set( target ${prefix}_Test_${name} )
+    add_executable( ${target} ${name}.cpp
+      ${PROJECT_SOURCE_DIR}/Tests/utils.h
+      ${PROJECT_SOURCE_DIR}/Tests/utils.cpp
+    )
+    target_include_directories( ${target} PRIVATE ${PROJECT_SOURCE_DIR} )
+    target_link_libraries( ${target} ${modules} Boost::boost )
+    set_property( TARGET ${target}
+      PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+    set_property( TARGET ${target} PROPERTY FOLDER Tests )
+
+    add_test( NAME ${target}
+      COMMAND ${target} --build_info --log_level=all --report_level=detailed )
+  endfunction()
+else()
+  set( BUILD_TESTING OFF )
+  message( WARNING "Boost NOT found - skipping tests! (set BOOST_ROOT manually)" )
+endif()
+
+# MATLAB
+
+find_package( Matlab )
+if( Matlab_FOUND )
+  find_program( Matlab_EXECUTABLE matlab
+    PATHS "${Matlab_ROOT_DIR}/bin" PATH_SUFFIXES win32 win64 NO_DEFAULT_PATH )
+  # Launcher that starts MATLAB with the build output directory and every path
+  # in ARGN added via addpath; the last path doubles as the working directory.
+  function( add_matlab_launcher target )
+    set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} )
+    list( GET paths -1 working_directory )
+    string( REPLACE ";" "','" paths "${paths}" )
+    set( arguments "-r addpath('${paths}');addpath(genpath(pwd))" )
+    add_launcher( ${target} "${Matlab_EXECUTABLE}" "${arguments}" "${working_directory}" )
+  endfunction()
+endif()
+
+# Python
+
+find_package( PythonInterp )
+if( PYTHONINTERP_FOUND )
+  # Launcher that starts an interactive Python with the build output directory
+  # and every path in ARGN appended to sys.path; the last path doubles as the working directory.
+  function( add_python_launcher target )
+    set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} )
+    list( GET paths -1 working_directory )
+    string( REPLACE ";" "')\nsys.path.append('" paths "${paths}" )
+    set( arguments "-i -c \"import sys\nsys.path.append('${paths}')\"" )
+    add_launcher( ${target} "${PYTHON_EXECUTABLE}" "${arguments}" "${working_directory}" )
+  endfunction()
+endif()
+
+# Cpufit
+
+add_subdirectory( Cpufit )
+
+# Gpufit
+
+add_subdirectory( Gpufit )
+
+# Examples using Gpufit and Cpufit
+
+add_subdirectory( examples )
+
+# Launcher
+#
+# Uses the following variables:
+#
+#   Matlab_WORKING_DIRECTORY (Default: user home directory)
+#   -- Working directory for MATLAB applications using Cpufit and Gpufit.
+#   Python_WORKING_DIRECTORY (Default: user home directory)
+#   -- Working directory for Python applications using Gpufit.
+
+if( WIN32 )
+  file( TO_CMAKE_PATH "$ENV{HOMEDRIVE}$ENV{HOMEPATH}" home ) # HOMEPATH alone has no drive letter
+else()
+  file( TO_CMAKE_PATH "$ENV{HOME}" home )
+endif()
+
+if( Matlab_FOUND )
+  set( Matlab_WORKING_DIRECTORY "${home}" CACHE PATH "MATLAB working directory" )
+  if( Matlab_WORKING_DIRECTORY )
+    add_custom_target( RUN_MATLAB )
+    set_property( TARGET RUN_MATLAB PROPERTY FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_MATLAB CpufitMex GpufitMex )
+    add_matlab_launcher( RUN_MATLAB
+      "${CMAKE_SOURCE_DIR}/Cpufit/matlab"
+      "${CMAKE_SOURCE_DIR}/Gpufit/matlab"
+      "${Matlab_WORKING_DIRECTORY}"
+    )
+  endif()
+endif()
+
+if( PYTHONINTERP_FOUND )
+  set( Python_WORKING_DIRECTORY "${home}" CACHE PATH "Python working directory" )
+  if( Python_WORKING_DIRECTORY )
+    add_custom_target( RUN_PYTHON )
+    set_property( TARGET RUN_PYTHON PROPERTY FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_PYTHON Gpufit )
+    add_python_launcher( RUN_PYTHON
+      "${CMAKE_SOURCE_DIR}/Gpufit/python"
+      "${Python_WORKING_DIRECTORY}"
+    )
+  endif()
+endif()
+
+# Tests
+
+if( BUILD_TESTING )
+  add_subdirectory( tests )
+endif()
+
+# Package
+
+#set( CPACK_PACKAGE_VERSION ${PROJECT_VERSION} )
+#set( CPACK_GENERATOR ZIP )
+
+#include( CPack )
diff --git a/Cpufit/CMakeLists.txt b/Cpufit/CMakeLists.txt
new file mode 100644
index 0000000..9af1643
--- /dev/null
+++ b/Cpufit/CMakeLists.txt
@@ -0,0 +1,31 @@
+
+# Cpufit
+
+# Source names are spelled exactly as the files on disk (cpufit.h, cpufit.cpp)
+# so that the target also configures on case-sensitive file systems.
+set( CpuHeaders
+  cpufit.h
+  info.h
+  lm_fit.h
+  interface.h
+)
+
+set( CpuSources
+  cpufit.cpp
+  info.cpp
+  lm_fit.cpp
+  lm_fit_cpp.cpp
+  interface.cpp
+  Cpufit.def
+)
+
+add_library( Cpufit SHARED
+  ${CpuHeaders}
+  ${CpuSources}
+)
+set_property( TARGET Cpufit
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Cpufit RUNTIME DESTINATION bin )
+
+add_subdirectory( matlab )
diff --git a/Cpufit/Cpufit.def b/Cpufit/Cpufit.def
new file mode 100644
index 0000000..07c1849
--- /dev/null
+++ b/Cpufit/Cpufit.def
@@ -0,0 +1,4 @@
+LIBRARY "Cpufit"
+EXPORTS
+    cpufit @1
+    cpufit_get_last_error @2
\ No
newline at end of file
diff --git a/Cpufit/README.md b/Cpufit/README.md
new file mode 100644
index 0000000..cee0619
--- /dev/null
+++ b/Cpufit/README.md
@@ -0,0 +1 @@
+# Cpufit
\ No newline at end of file
diff --git a/Cpufit/cpufit.cpp b/Cpufit/cpufit.cpp
new file mode 100644
index 0000000..c8c74cb
--- /dev/null
+++ b/Cpufit/cpufit.cpp
@@ -0,0 +1,83 @@
+#include "cpufit.h"
+#include "interface.h"
+
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+// Message of the most recent failure; returned by cpufit_get_last_error().
+std::string last_error ;
+
+// Runs the requested fits through FitInterface.
+// Returns STATUS_OK on success. Any exception is caught here: its message is
+// stored in last_error and STATUS_ERROR is returned.
+int cpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+try
+{
+    // FitInterface takes the number of points as int, so larger values are
+    // rejected here instead of being truncated.
+    if (n_points > static_cast<size_t>(std::numeric_limits<int>::max()))
+    {
+        throw std::runtime_error("maximum number of data points per fit exceeded");
+    }
+    int const n_points_int = static_cast<int>(n_points);
+
+    FitInterface fi(
+        data,
+        weights,
+        n_fits,
+        n_points_int,
+        tolerance,
+        max_n_iterations,
+        estimator_id,
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK;
+}
+catch (std::exception const & exception)
+{
+    last_error = exception.what();
+
+    return STATUS_ERROR;
+}
+catch (...)
+{
+    last_error = "Unknown Error";
+
+    return STATUS_ERROR;
+}
+
+// Returns the message recorded by the most recent failing cpufit() call.
+// The pointer refers to last_error's buffer and is invalidated when last_error
+// is next modified.
+char const * cpufit_get_last_error()
+{
+    return last_error.c_str();
+}
diff --git a/Cpufit/cpufit.h b/Cpufit/cpufit.h
new file mode 100644
index 0000000..1575636
--- /dev/null
+++ b/Cpufit/cpufit.h
@@ -0,0 +1,60 @@
+#ifndef CPU_FIT_H_INCLUDED
+#define CPU_FIT_H_INCLUDED
+
+#include <stddef.h> // size_t
+
+// fitting model ID
+#define GAUSS_1D 0
+#define GAUSS_2D 1
+#define GAUSS_2D_ELLIPTIC 2
+#define GAUSS_2D_ROTATED 3
+#define CAUCHY_2D_ELLIPTIC 4
+#define LINEAR_1D 5
+
+// estimator ID
+#define LSE 0
+#define MLE 1
+
+// fit state
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+
+// cpufit return state
+#define STATUS_OK 0
+#define STATUS_ERROR -1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Returns STATUS_OK on success and STATUS_ERROR on failure (see cpufit_get_last_error).
+int cpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+) ;
+
+// Message of the most recent cpufit() failure.
+char const * cpufit_get_last_error() ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CPU_FIT_H_INCLUDED
diff --git a/Cpufit/info.cpp b/Cpufit/info.cpp
new file mode 100644
index 0000000..dcd5085
--- /dev/null
+++ b/Cpufit/info.cpp
@@ -0,0 +1,33 @@
+#include "info.h"
+
+// Zero-initialises every member; the initialisers follow the declaration order in info.h.
+Info::Info(void) :
+    n_parameters_(0),
+    n_parameters_to_fit_(0),
+    n_fits_(0),
+    n_points_(0),
+    max_n_iterations_(0),
+    model_id_(0),
+    estimator_id_(0),
+    user_info_size_(0)
+{
+}
+
+Info::~Info(void)
+{
+}
+
+// Counts the non-zero entries among the first n_parameters_ flags of
+// parameters_to_fit and stores the result in n_parameters_to_fit_.
+void Info::set_number_of_parameters_to_fit(int const * parameters_to_fit)
+{
+    n_parameters_to_fit_ = n_parameters_;
+
+    for (int i = 0; i < n_parameters_; i++)
+    {
+        if (!parameters_to_fit[i])
+        {
+            n_parameters_to_fit_--;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Cpufit/info.h b/Cpufit/info.h new
file mode 100644
index 0000000..0faa764
--- /dev/null
+++ b/Cpufit/info.h
@@ -0,0 +1,29 @@
+#ifndef CPUFIT_PARAMETERS_H_INCLUDED
+#define CPUFIT_PARAMETERS_H_INCLUDED
+
+#include <cstddef>
+
+// Settings describing one fit run (sizes, model, estimator, iteration limit).
+class Info
+{
+public:
+    Info();
+    virtual ~Info();
+    void set_number_of_parameters_to_fit(int const * parameters_to_fit);
+
+private:
+
+public:
+    int n_parameters_;
+    int n_parameters_to_fit_;
+    std::size_t n_fits_;
+    std::size_t n_points_;
+    int max_n_iterations_;
+    int model_id_;
+    int estimator_id_;
+    std::size_t user_info_size_;
+
+private:
+};
+
+#endif
diff --git a/Cpufit/interface.cpp b/Cpufit/interface.cpp
new file mode 100644
index 0000000..50dc01d
--- /dev/null
+++ b/Cpufit/interface.cpp
@@ -0,0 +1,137 @@
+#include "cpufit.h"
+#include "interface.h"
+
+#include <limits>
+#include <stdexcept>
+
+// Stores the caller's pointers and settings; nothing is validated or copied
+// here. n_parameters_ stays 0 until fit() selects a model.
+FitInterface::FitInterface(
+    float const * data,
+    float const * weights,
+    std::size_t n_fits,
+    int n_points,
+    float tolerance,
+    int max_n_iterations,
+    int estimator_id,
+    float const * initial_parameters,
+    int const * parameters_to_fit,
+    char * user_info,
+    std::size_t user_info_size,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations) :
+    n_parameters_(0),
+    data_(data),
+    weight_(weights),
+    n_fits_(n_fits),
+    n_points_(n_points),
+    tolerance_(tolerance),
+    max_n_iterations_(max_n_iterations),
+    estimator_id_(estimator_id),
+    initial_parameters_(initial_parameters),
+    parameters_to_fit_(parameters_to_fit),
+    user_info_(user_info),
+    user_info_size_(user_info_size),
+    output_parameters_(output_parameters),
+    output_states_(output_states),
+    output_chi_squares_(output_chi_squares),
+    output_n_iterations_(output_n_iterations)
+{}
+
+FitInterface::~FitInterface()
+{}
+
+// Rejects problem sizes whose byte counts (n_fits * n_points * sizeof(float)
+// or n_fits * n_parameters * sizeof(float)) would not fit into std::size_t.
+// Throws std::runtime_error on failure.
+void FitInterface::check_sizes()
+{
+    // Both values are used as divisors below; dividing by zero is undefined.
+    if (n_points_ <= 0 || n_parameters_ <= 0)
+    {
+        throw std::runtime_error("invalid number of data points or parameters");
+    }
+
+    std::size_t const maximum_size = std::numeric_limits< std::size_t >::max();
+
+    if (n_fits_ > maximum_size / n_points_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum absolute number of data points exceeded");
+    }
+
+    if (n_fits_ > maximum_size / n_parameters_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum number of fits and/or parameters exceeded");
+    }
+}
+
+// Copies the interface settings into info and lets it count the parameters to fit.
+void FitInterface::configure_info(Info & info, int const model_id)
+{
+    info.model_id_ = model_id;
+    info.n_fits_ = n_fits_;
+    info.n_points_ = n_points_;
+    info.max_n_iterations_ = max_n_iterations_;
+    info.estimator_id_ = estimator_id_;
+    info.user_info_size_ = user_info_size_;
+    info.n_parameters_ = n_parameters_;
+
+    info.set_number_of_parameters_to_fit(parameters_to_fit_);
+}
+
+// Sets n_parameters_ for the given model ID.
+// Throws std::runtime_error for an ID that is not one of the model constants.
+void FitInterface::set_number_of_parameters(int const model_id)
+{
+    switch (model_id)
+    {
+    case GAUSS_1D:
+        n_parameters_ = 4;
+        break;
+    case GAUSS_2D:
+        n_parameters_ = 5;
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case GAUSS_2D_ROTATED:
+        n_parameters_ = 7;
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case LINEAR_1D:
+        n_parameters_ = 2;
+        break;
+    default:
+        // An unknown ID would leave n_parameters_ at 0 and break the size checks.
+        throw std::runtime_error("unknown model ID");
+    }
+}
+
+// Selects the model, validates the sizes, then runs every fit through LMFit.
+void FitInterface::fit(int const model_id)
+{
+    set_number_of_parameters(model_id);
+
+    check_sizes();
+
+    Info info;
+    configure_info(info, model_id);
+
+    LMFit lmfit(
+        data_,
+        weight_,
+        info,
+        initial_parameters_,
+        parameters_to_fit_,
+        user_info_,
+        output_parameters_,
+        output_states_,
+        output_chi_squares_,
+        output_n_iterations_);
+
+    lmfit.run(tolerance_);
+}
diff --git a/Cpufit/interface.h b/Cpufit/interface.h
new file mode 100644
index 0000000..09bdc11
--- /dev/null
+++ b/Cpufit/interface.h
@@ -0,0 +1,57 @@
+#ifndef CPUFIT_INTERFACE_H_INCLUDED
+#define CPUFIT_INTERFACE_H_INCLUDED
+
+#include "lm_fit.h"
+
+class FitInterface
+{
+public:
+    FitInterface(
+        float const * data,
+        float const * weights,
+        std::size_t n_fits,
+        int n_points,
+        float tolerance,
+        int max_n_iterations,
+        int estimator_id,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        std::size_t user_info_size,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations);
+
+ virtual ~FitInterface(); + + void fit(int const model_id); + +private: + void set_number_of_parameters(int const model_id); + void check_sizes(); + void configure_info(Info & info, int const model_id); + +public: + +private: + int n_parameters_; + float const * const data_; + float const * const weight_; + std::size_t const n_fits_; + int const n_points_; + float const tolerance_; + int const max_n_iterations_; + int const estimator_id_; + float const * const initial_parameters_; + int const * const parameters_to_fit_; + char * const user_info_; + std::size_t const user_info_size_; + + float * output_parameters_; + int * output_states_; + float * output_chi_squares_; + int * output_n_iterations_; +}; + +#endif diff --git a/Cpufit/lm_fit.cpp b/Cpufit/lm_fit.cpp new file mode 100644 index 0000000..e6fa64f --- /dev/null +++ b/Cpufit/lm_fit.cpp @@ -0,0 +1,57 @@ +#include "lm_fit.h" +#include +#include +#include +#include +#include +#include + +LMFit::LMFit( + float const * const data, + float const * const weights, + Info const & info, + float const * const initial_parameters, + int const * const parameters_to_fit, + char * const user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations + ) : + data_(data), + weights_(weights), + initial_parameters_(initial_parameters), + parameters_to_fit_(parameters_to_fit), + user_info_(user_info), + output_parameters_(output_parameters), + output_states_(output_states), + output_chi_squares_(output_chi_squares), + output_n_iterations_(output_n_iterations), + info_(info) +{} + +LMFit::~LMFit() +{ +} + +void LMFit::run(float const tolerance) +{ + for (std::size_t fit_index = 0; fit_index < info_.n_fits_; fit_index++) + { + LMFitCPP gf_cpp( + tolerance, + fit_index, + data_ + fit_index*info_.n_points_, + weights_ ? 
weights_ + fit_index*info_.n_points_ : 0, + info_, + initial_parameters_ + fit_index*info_.n_parameters_, + parameters_to_fit_, + user_info_, + output_parameters_ + fit_index*info_.n_parameters_, + output_states_ + fit_index, + output_chi_squares_ + fit_index, + output_n_iterations_ + fit_index); + + gf_cpp.run(); + } +} \ No newline at end of file diff --git a/Cpufit/lm_fit.h b/Cpufit/lm_fit.h new file mode 100644 index 0000000..a5fd96d --- /dev/null +++ b/Cpufit/lm_fit.h @@ -0,0 +1,137 @@ +#ifndef CPUFIT_GAUSS_FIT_H_INCLUDED +#define CPUFIT_GAUSS_FIT_H_INCLUDED + +#include "info.h" + +class LMFitCPP; + +class LMFit +{ +public: + LMFit( + float const * data, + float const * weights, + Info const& info, + float const * initial_parameters, + int const * parameters_to_fit, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations); + + virtual ~LMFit(); + + void run(float const tolerance); + +private: + float const * const data_; + float const * const weights_; + float const * const initial_parameters_; + int const * const parameters_to_fit_; + char * const user_info_; + + float * output_parameters_; + int * output_states_; + float * output_chi_squares_; + int * output_n_iterations_; + + Info const & info_; +}; + +class LMFitCPP +{ +public: + LMFitCPP( + float const tolerance, + std::size_t const fit_index, + float const * data, + float const * weight, + Info const & info, + float const * initial_parameters, + int const * parameters_to_fit, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations); + + virtual ~LMFitCPP() + {}; + + void run(); + +private: + void calc_curve_values(); + void calc_coefficients(); + + void calc_curve_values(std::vector& curve, std::vector& derivatives); + + void calc_values_gauss2d(std::vector& gaussian); + void calc_derivatives_gauss2d(std::vector & derivatives); + + void 
calc_values_gauss2delliptic(std::vector& gaussian); + void calc_derivatives_gauss2delliptic(std::vector & derivatives); + + void calc_values_gauss2drotated(std::vector& gaussian); + void calc_derivatives_gauss2drotated(std::vector & derivatives); + + void calc_values_gauss1d(std::vector& gaussian); + void calc_derivatives_gauss1d(std::vector & derivatives); + + void calc_values_cauchy2delliptic(std::vector& cauchy); + void calc_derivatives_cauchy2delliptic(std::vector & derivatives); + + void calc_values_linear1d(std::vector& line); + void calc_derivatives_linear1d(std::vector & derivatives); + + void calculate_hessian(std::vector const & derivatives, + std::vector const & curve); + + void calc_gradient(std::vector const & derivatives, + std::vector const & curve); + + void calc_chi_square( + std::vector const & curve); + + void modify_step_width(); + void gauss_jordan(); + void update_parameters(); + + bool check_for_convergence(); + void evaluate_iteration(int const iteration); + void prepare_next_iteration(); + +public: + +private: + + std::size_t const fit_index_; + float const * const data_; + float const * const weight_; + float const * const initial_parameters_; + int const * const parameters_to_fit_; + + bool converged_; + float * parameters_; + int * state_; + float * chi_square_; + int * n_iterations_; + + std::vector prev_parameters_; + Info const & info_; + + float lambda_; + std::vector curve_; + std::vector derivatives_; + std::vector hessian_; + std::vector modified_hessian_; + std::vector gradient_; + std::vector delta_; + float prev_chi_square_; + float const tolerance_; + + char * const user_info_; +}; + +#endif \ No newline at end of file diff --git a/Cpufit/lm_fit_cpp.cpp b/Cpufit/lm_fit_cpp.cpp new file mode 100644 index 0000000..7eaae9d --- /dev/null +++ b/Cpufit/lm_fit_cpp.cpp @@ -0,0 +1,711 @@ +#include "cpufit.h" +#include "lm_fit.h" + +#include +#include +#include + +LMFitCPP::LMFitCPP( + float const tolerance, + std::size_t const 
fit_index, + float const * data, + float const * weight, + Info const & info, + float const * initial_parameters, + int const * parameters_to_fit, + char * user_info, + float * output_parameters, + int * output_state, + float * output_chi_square, + int * output_n_iterations + ) : + fit_index_(fit_index), + data_(data), + weight_(weight), + initial_parameters_(initial_parameters), + tolerance_(tolerance), + converged_(false), + info_(info), + parameters_to_fit_(parameters_to_fit), + curve_(info.n_points_), + derivatives_(info.n_points_*info.n_parameters_), + hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_), + modified_hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_), + gradient_(info.n_parameters_to_fit_), + delta_(info.n_parameters_to_fit_), + prev_chi_square_(0), + lambda_(0.001f), + prev_parameters_(info.n_parameters_to_fit_), + user_info_(user_info), + parameters_(output_parameters), + state_(output_state), + chi_square_(output_chi_square), + n_iterations_(output_n_iterations) +{} + +void LMFitCPP::calc_derivatives_gauss2d( + std::vector & derivatives) +{ + std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_)); + + for (std::size_t y = 0; y < fit_size_x; y++) + for (std::size_t x = 0; x < fit_size_x; x++) + { + float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]); + float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[3] * parameters_[3]); + float const ex = exp(-(argx + argy)); + + derivatives[0 * info_.n_points_ + y*fit_size_x + x] + = ex; + derivatives[1 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]); + derivatives[2 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[3] * parameters_[3]); + derivatives[3 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] + * ((x - parameters_[1])*(x - parameters_[1]) + + (y - 
parameters_[2])*(y - parameters_[2]))*ex) + / (parameters_[3] * parameters_[3] * parameters_[3]); + derivatives[4 * info_.n_points_ + y*fit_size_x + x] + = 1; + } +} + +void LMFitCPP::calc_derivatives_gauss2delliptic( + std::vector & derivatives) +{ + std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_)); + + for (std::size_t y = 0; y < fit_size_x; y++) + for (std::size_t x = 0; x < fit_size_x; x++) + { + float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]); + float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[4] * parameters_[4]); + float const ex = exp(-(argx +argy)); + + derivatives[0 * info_.n_points_ + y*fit_size_x + x] + = ex; + derivatives[1 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]); + derivatives[2 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[4] * parameters_[4]); + derivatives[3 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[3] * parameters_[3] * parameters_[3]); + derivatives[4 * info_.n_points_ + y*fit_size_x + x] + = (parameters_[0] * (y - parameters_[2])*(y - parameters_[2])*ex) / (parameters_[4] * parameters_[4] * parameters_[4]); + derivatives[5 * info_.n_points_ + y*fit_size_x + x] + = 1; + } +} + +void LMFitCPP::calc_derivatives_gauss2drotated( + std::vector & derivatives) +{ + std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_)); + + float const amplitude = parameters_[0]; + float const x0 = parameters_[1]; + float const y0 = parameters_[2]; + float const sig_x = parameters_[3]; + float const sig_y = parameters_[4]; + float const background = parameters_[5]; + float const rot_sin = sin(parameters_[6]); + float const rot_cos = cos(parameters_[6]); + + for (std::size_t y = 0; y < fit_size_x; y++) + for (std::size_t x = 0; x < 
fit_size_x; x++) + { + float const arga = ((x - x0) * rot_cos) - ((y - y0) * rot_sin); + float const argb = ((x - x0) * rot_sin) + ((y - y0) * rot_cos); + float const ex = exp((-0.5f) * (((arga / sig_x) * (arga / sig_x)) + ((argb / sig_y) * (argb / sig_y)))); + + derivatives[0 * info_.n_points_ + y*fit_size_x + x] + = ex; + derivatives[1 * info_.n_points_ + y*fit_size_x + x] + = ex * (amplitude * rot_cos * arga / (sig_x*sig_x) + amplitude * rot_sin *argb / (sig_y*sig_y)); + derivatives[2 * info_.n_points_ + y*fit_size_x + x] + = ex * (-amplitude * rot_sin * arga / (sig_x*sig_x) + amplitude * rot_cos *argb / (sig_y*sig_y)); + derivatives[3 * info_.n_points_ + y*fit_size_x + x] + = ex * amplitude * arga * arga / (sig_x*sig_x*sig_x); + derivatives[4 * info_.n_points_ + y*fit_size_x + x] + = ex * amplitude * argb * argb / (sig_y*sig_y*sig_y); + derivatives[5 * info_.n_points_ + y*fit_size_x + x] + = 1; + derivatives[6 * info_.n_points_ + y*fit_size_x + x] + = ex * amplitude * arga * argb * (1.0f / (sig_x*sig_x) - 1.0f / (sig_y*sig_y)); + } +} + +void LMFitCPP::calc_derivatives_gauss1d( + std::vector & derivatives) +{ + for (std::size_t x = 0; x < info_.n_points_; x++) + { + float argx = ((x - parameters_[1])*(x - parameters_[1])) / (2 * parameters_[2] * parameters_[2]); + float ex = exp(-argx); + + derivatives[0 * info_.n_points_ + x] = ex; + derivatives[1 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[2] * parameters_[2]); + derivatives[2 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[2] * parameters_[2] * parameters_[2]); + derivatives[3 * info_.n_points_ + x] = 1; + } +} + +void LMFitCPP::calc_derivatives_cauchy2delliptic( + std::vector & derivatives) +{ + std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_)); + + for (std::size_t y = 0; y < fit_size_x; y++) + for (std::size_t x = 0; x < fit_size_x; x++) + { + float const argx = + ((parameters_[1] - x) / 
parameters_[3]) + *((parameters_[1] - x) / parameters_[3]) + 1.f; + float const argy = + ((parameters_[2] - y) / parameters_[4]) + *((parameters_[2] - y) / parameters_[4]) + 1.f; + + derivatives[0 * info_.n_points_ + y*fit_size_x + x] + = 1.f / (argx*argy); + derivatives[1 * info_.n_points_ + y*fit_size_x + x] = + -2.f * parameters_[0] * (parameters_[1] - x) + / (parameters_[3] * parameters_[3] * argx*argx*argy); + derivatives[2 * info_.n_points_ + y*fit_size_x + x] = + -2.f * parameters_[0] * (parameters_[2] - y) + / (parameters_[4] * parameters_[4] * argy*argy*argx); + derivatives[3 * info_.n_points_ + y*fit_size_x + x] = + 2.f * parameters_[0] * (parameters_[1] - x) * (parameters_[1] - x) + / (parameters_[3] * parameters_[3] * parameters_[3] * argx*argx*argy); + derivatives[4 * info_.n_points_ + y*fit_size_x + x] = + 2.f * parameters_[0] * (parameters_[2] - y) * (parameters_[2] - y) + / (parameters_[4] * parameters_[4] * parameters_[4] * argy*argy*argx); + derivatives[5 * info_.n_points_ + y*fit_size_x + x] + = 1.f; + } +} + +void LMFitCPP::calc_derivatives_linear1d( + std::vector & derivatives) +{ + float * user_info_float = (float*)user_info_; + float x = 0.f; + + for (std::size_t point_index = 0; point_index < info_.n_points_; point_index++) + { + if (!user_info_float) + { + x = float(point_index); + } + else if (info_.user_info_size_ / sizeof(float) == info_.n_points_) + { + x = user_info_float[point_index]; + } + else if (info_.user_info_size_ / sizeof(float) > info_.n_points_) + { + std::size_t const fit_begin = fit_index_ * info_.n_points_; + x = user_info_float[fit_begin + point_index]; + } + + derivatives[0 * info_.n_points_ + point_index] = 1.f; + derivatives[1 * info_.n_points_ + point_index] = x; + } +} + +void LMFitCPP::calc_values_cauchy2delliptic(std::vector& cauchy) +{ + int const size_x = int(std::sqrt(float(info_.n_points_))); + int const size_y = size_x; + + for (int iy = 0; iy < size_y; iy++) + { + for (int ix = 0; ix < size_x; ix++) + { + 
float const argx = + ((parameters_[1] - ix) / parameters_[3]) + *((parameters_[1] - ix) / parameters_[3]) + 1.f; + float const argy = + ((parameters_[2] - iy) / parameters_[4]) + *((parameters_[2] - iy) / parameters_[4]) + 1.f; + + cauchy[iy*size_x + ix] = parameters_[0] / (argx * argy) + parameters_[5]; + } + } +} + +void LMFitCPP::calc_values_gauss2d(std::vector& gaussian) +{ + int const size_x = int(std::sqrt(float(info_.n_points_))); + int const size_y = size_x; + + for (int iy = 0; iy < size_y; iy++) + { + for (int ix = 0; ix < size_x; ix++) + { + float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]); + float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[3] * parameters_[3]); + float ex = exp(-(argx +argy)); + + gaussian[iy*size_x + ix] = parameters_[0] * ex + parameters_[4]; + } + } +} + +void LMFitCPP::calc_values_gauss2delliptic(std::vector& gaussian) +{ + int const size_x = int(std::sqrt(float(info_.n_points_))); + int const size_y = size_x; + for (int iy = 0; iy < size_y; iy++) + { + for (int ix = 0; ix < size_x; ix++) + { + float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]); + float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[4] * parameters_[4]); + float ex = exp(-(argx + argy)); + + gaussian[iy*size_x + ix] + = parameters_[0] * ex + parameters_[5]; + } + } +} + +void LMFitCPP::calc_values_gauss2drotated(std::vector& gaussian) +{ + int const size_x = int(std::sqrt(float(info_.n_points_))); + int const size_y = size_x; + + float amplitude = parameters_[0]; + float background = parameters_[5]; + float x0 = parameters_[1]; + float y0 = parameters_[2]; + float sig_x = parameters_[3]; + float sig_y = parameters_[4]; + float rot_sin = sin(parameters_[6]); + float rot_cos = cos(parameters_[6]); + + for (int iy = 0; iy < size_y; iy++) + { + for (int ix = 0; ix < size_x; ix++) + { + int const pixel_index = iy*size_x + 
ix; + + float arga = ((ix - x0) * rot_cos) - ((iy - y0) * rot_sin); + float argb = ((ix - x0) * rot_sin) + ((iy - y0) * rot_cos); + + float ex + = exp((-0.5f) * (((arga / sig_x) * (arga / sig_x)) + ((argb / sig_y) * (argb / sig_y)))); + + gaussian[pixel_index] = amplitude * ex + background; + } + } +} + +void LMFitCPP::calc_values_gauss1d(std::vector& gaussian) +{ + for (std::size_t ix = 0; ix < info_.n_points_; ix++) + { + float argx + = ((ix - parameters_[1])*(ix - parameters_[1])) + / (2 * parameters_[2] * parameters_[2]); + float ex = exp(-argx); + gaussian[ix] = parameters_[0] * ex + parameters_[3]; + } +} + +void LMFitCPP::calc_values_linear1d(std::vector& line) +{ + float * user_info_float = (float*)user_info_; + float x = 0.f; + for (std::size_t point_index = 0; point_index < info_.n_points_; point_index++) + { + if (!user_info_float) + { + x = float(point_index); + } + else if (info_.user_info_size_ / sizeof(float) == info_.n_points_) + { + x = user_info_float[point_index]; + } + else if (info_.user_info_size_ / sizeof(float) > info_.n_points_) + { + std::size_t const fit_begin = fit_index_ * info_.n_points_; + x = user_info_float[fit_begin + point_index]; + } + line[point_index] = parameters_[0] + parameters_[1] * x; + } +} + +void LMFitCPP::calc_curve_values(std::vector& curve, std::vector& derivatives) +{ + if (info_.model_id_ == GAUSS_1D) + { + calc_values_gauss1d(curve); + calc_derivatives_gauss1d(derivatives); + } + else if (info_.model_id_ == GAUSS_2D) + { + calc_values_gauss2d(curve); + calc_derivatives_gauss2d(derivatives); + } + else if (info_.model_id_ == GAUSS_2D_ELLIPTIC) + { + calc_values_gauss2delliptic(curve); + calc_derivatives_gauss2delliptic(derivatives); + } + else if (info_.model_id_ == GAUSS_2D_ROTATED) + { + calc_values_gauss2drotated(curve); + calc_derivatives_gauss2drotated(derivatives); + } + else if (info_.model_id_ == CAUCHY_2D_ELLIPTIC) + { + calc_values_cauchy2delliptic(curve); + calc_derivatives_cauchy2delliptic(derivatives); 
+ } + else if (info_.model_id_ == LINEAR_1D) + { + calc_values_linear1d(curve); + calc_derivatives_linear1d(derivatives); + } +} + +void LMFitCPP::calculate_hessian( + std::vector const & derivatives, + std::vector const & curve) +{ + for (int jp = 0, jhessian = 0; jp < info_.n_parameters_; jp++) + { + if (parameters_to_fit_[jp]) + { + for (int ip = 0, ihessian = 0; ip < jp + 1; ip++) + { + if (parameters_to_fit_[ip]) + { + std::size_t const ijhessian + = ihessian * info_.n_parameters_to_fit_ + jhessian; + std::size_t const jihessian + = jhessian * info_.n_parameters_to_fit_ + ihessian; + std::size_t const derivatives_index_i = ip*info_.n_points_; + std::size_t const derivatives_index_j = jp*info_.n_points_; + + double sum = 0.0; + for (std::size_t pixel_index = 0; pixel_index < info_.n_points_; pixel_index++) + { + if (info_.estimator_id_ == LSE) + { + if (!weight_) + { + sum + += derivatives[derivatives_index_i + pixel_index] + * derivatives[derivatives_index_j + pixel_index]; + } + else + { + sum + += derivatives[derivatives_index_i + pixel_index] + * derivatives[derivatives_index_j + pixel_index] + * weight_[pixel_index]; + } + } + else if (info_.estimator_id_ == MLE) + { + sum + += data_[pixel_index] / (curve[pixel_index] * curve[pixel_index]) + * derivatives[derivatives_index_i + pixel_index] + * derivatives[derivatives_index_j + pixel_index]; + } + } + hessian_[ijhessian] = float(sum); + if (ijhessian != jihessian) + { + hessian_[jihessian] + = hessian_[ijhessian]; + } + ihessian++; + } + } + jhessian++; + } + } + +} + +void LMFitCPP::calc_gradient( + std::vector const & derivatives, + std::vector const & curve) +{ + + for (int ip = 0, gradient_index = 0; ip < info_.n_parameters_; ip++) + { + if (parameters_to_fit_[ip]) + { + std::size_t const derivatives_index = ip*info_.n_points_; + double sum = 0.0; + for (std::size_t pixel_index = 0; pixel_index < info_.n_points_; pixel_index++) + { + float deviant = data_[pixel_index] - curve[pixel_index]; + + if 
(info_.estimator_id_ == LSE) + { + if (!weight_) + { + sum + += deviant * derivatives[derivatives_index + pixel_index]; + } + else + { + sum + += deviant * derivatives[derivatives_index + pixel_index] * weight_[pixel_index]; + } + + } + else if (info_.estimator_id_ == MLE) + { + sum + += -derivatives[derivatives_index + pixel_index] * (1 - data_[pixel_index] / curve[pixel_index]); + } + } + gradient_[gradient_index] = float(sum); + gradient_index++; + } + } + +} + +void LMFitCPP::calc_chi_square( + std::vector const & values) +{ + double sum = 0.0; + for (size_t pixel_index = 0; pixel_index < values.size(); pixel_index++) + { + float deviant = values[pixel_index] - data_[pixel_index]; + if (info_.estimator_id_ == LSE) + { + if (!weight_) + { + sum += deviant * deviant; + } + else + { + sum += deviant * deviant * weight_[pixel_index]; + } + } + else if (info_.estimator_id_ == MLE) + { + if (values[pixel_index] <= 0.f) + { + (*state_) = 3; + return; + } + if (data_[pixel_index] != 0.f) + { + sum + += 2 * (deviant - data_[pixel_index] * logf(values[pixel_index] / data_[pixel_index])); + } + else + { + sum += 2 * deviant; + } + } + } + *chi_square_ = float(sum); +} + +void LMFitCPP::calc_curve_values() +{ + std::vector & curve = curve_; + std::vector & derivatives = derivatives_; + + calc_curve_values(curve, derivatives); +} + +void LMFitCPP::calc_coefficients() +{ + std::vector & curve = curve_; + std::vector & derivatives = derivatives_; + + calc_chi_square(curve); + + if ((*chi_square_) < prev_chi_square_ || prev_chi_square_ == 0) + { + calculate_hessian(derivatives, curve); + calc_gradient(derivatives, curve); + } +} + +void LMFitCPP::gauss_jordan() +{ + delta_ = gradient_; + + std::vector & alpha = modified_hessian_; + std::vector & beta = delta_; + + int icol, irow; + float big, dum, pivinv; + + std::vector indxc(info_.n_parameters_to_fit_, 0); + std::vector indxr(info_.n_parameters_to_fit_, 0); + std::vector ipiv(info_.n_parameters_to_fit_, 0); + + for (int kp = 
0; kp < info_.n_parameters_to_fit_; kp++) + { + big = 0.0; + for (int jp = 0; jp < info_.n_parameters_to_fit_; jp++) + { + if (ipiv[jp] != 1) + { + for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++) + { + if (ipiv[ip] == 0) + { + if (fabs(alpha[jp*info_.n_parameters_to_fit_ + ip]) >= big) + { + big = fabs(alpha[jp*info_.n_parameters_to_fit_ + ip]); + irow = jp; + icol = ip; + } + } + } + } + } + ++(ipiv[icol]); + + + if (irow != icol) + { + for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++) + { + std::swap(alpha[irow*info_.n_parameters_to_fit_ + ip], alpha[icol*info_.n_parameters_to_fit_ + ip]); + } + std::swap(beta[irow], beta[icol]); + } + indxr[kp] = irow; + indxc[kp] = icol; + if (alpha[icol*info_.n_parameters_to_fit_ + icol] == 0.0) + { + (*state_) = 2; + break; + } + pivinv = 1.0f / alpha[icol*info_.n_parameters_to_fit_ + icol]; + alpha[icol*info_.n_parameters_to_fit_ + icol] = 1.0; + for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++) + { + alpha[icol*info_.n_parameters_to_fit_ + ip] *= pivinv; + } + beta[icol] *= pivinv; + + for (int jp = 0; jp < info_.n_parameters_to_fit_; jp++) + { + if (jp != icol) + { + dum = alpha[jp*info_.n_parameters_to_fit_ + icol]; + alpha[jp*info_.n_parameters_to_fit_ + icol] = 0.0; + for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++) + { + alpha[jp*info_.n_parameters_to_fit_ + ip] -= alpha[icol*info_.n_parameters_to_fit_ + ip] * dum; + } + beta[jp] -= beta[icol] * dum; + } + } + } +} + +void LMFitCPP::update_parameters() +{ + for (int parameter_index = 0, delta_index = 0; parameter_index < info_.n_parameters_; parameter_index++) + { + if (parameters_to_fit_[parameter_index]) + { + prev_parameters_[parameter_index] = parameters_[parameter_index]; + parameters_[parameter_index] = parameters_[parameter_index] + delta_[delta_index++]; + } + } +} + +bool LMFitCPP::check_for_convergence() +{ + bool const fit_found + = std::abs(*chi_square_ - prev_chi_square_) < std::max(tolerance_, tolerance_ * 
std::abs(*chi_square_)); + + return fit_found; +} + +void LMFitCPP::evaluate_iteration(int const iteration) +{ + bool const max_iterations_reached = iteration == info_.max_n_iterations_ - 1; + if (converged_ || max_iterations_reached) + { + (*n_iterations_) = iteration + 1; + if (!converged_) + { + (*state_) = 1; + } + } +} + +void LMFitCPP::prepare_next_iteration() +{ + if ((*chi_square_) < prev_chi_square_) + { + lambda_ *= 0.1f; + prev_chi_square_ = (*chi_square_); + } + else + { + lambda_ *= 10.f; + (*chi_square_) = prev_chi_square_; + for (int parameter_index = 0, delta_index = 0; parameter_index < info_.n_parameters_; parameter_index++) + { + if (parameters_to_fit_[parameter_index]) + { + parameters_[parameter_index] = prev_parameters_[parameter_index]; + } + } + } +} + +void LMFitCPP::modify_step_width() +{ + modified_hessian_ = hessian_; + size_t const n_parameters = (size_t)(sqrt((float)(hessian_.size()))); + for (size_t parameter_index = 0; parameter_index < n_parameters; parameter_index++) + { + modified_hessian_[parameter_index*n_parameters + parameter_index] + = modified_hessian_[parameter_index*n_parameters + parameter_index] + * (1.0f + (lambda_)); + } +} + +void LMFitCPP::run() +{ + for (int i = 0; i < info_.n_parameters_; i++) + parameters_[i] = initial_parameters_[i]; + + (*state_) = 0; + calc_curve_values(); + calc_coefficients(); + prev_chi_square_ = (*chi_square_); + + for (int iteration = 0; (*state_) == 0; iteration++) + { + modify_step_width(); + + gauss_jordan(); + + update_parameters(); + + calc_curve_values(); + calc_coefficients(); + + converged_ = check_for_convergence(); + + evaluate_iteration(iteration); + + prepare_next_iteration(); + + if (converged_ || (*state_) != 0) + { + break; + } + } +} diff --git a/Cpufit/matlab/CMakeLists.txt b/Cpufit/matlab/CMakeLists.txt new file mode 100644 index 0000000..46276bd --- /dev/null +++ b/Cpufit/matlab/CMakeLists.txt @@ -0,0 +1,62 @@ + +# MATLAB Cpufit binding + +find_package( Matlab COMPONENTS 
MX_LIBRARY ) + +if( NOT Matlab_FOUND ) + message( STATUS "Matlab and/or MX_Library NOT found - skipping Cpufit Matlab binding!" ) + return() +endif() + +# Matlab MEX FILE + +set( Headers + ) + +set( Sources + mex/CpufitMex.cpp + ) + +add_library( CpufitMex SHARED + ${Headers} + ${Sources} + ) +set_property( TARGET CpufitMex + PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} ) +set_property( TARGET CpufitMex + PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) + +target_include_directories( CpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +target_link_libraries( CpufitMex Cpufit ${Matlab_LIBRARIES} ) + +if( WIN32 ) + SET(CMAKE_SHARED_LINKER_FLAGS "/export:mexFunction") +endif() + +add_matlab_launcher( CpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" ) + +# MATLAB Cpufit + Gpufit PACKAGE + +set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" ) +set( package_files + "${CMAKE_CURRENT_SOURCE_DIR}/cpufit.m" +) +set( binary_gpufit $ ) +set( binary_mex $ ) + +add_custom_target( MATLAB_CPUFIT_GPUFIT_PACKAGE ALL + COMMAND ${CMAKE_COMMAND} -E + make_directory ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${package_files} ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${binary_gpufit} ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${binary_mex} ${build_directory} + COMMENT "Adding Cpufit to Matlab package" +) +set_property( TARGET MATLAB_CPUFIT_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets ) +add_dependencies( MATLAB_CPUFIT_GPUFIT_PACKAGE MATLAB_GPUFIT_PACKAGE Cpufit CpufitMex ) + +# add launcher + diff --git a/Cpufit/matlab/README.md b/Cpufit/matlab/README.md new file mode 100644 index 0000000..a2dc84c --- /dev/null +++ b/Cpufit/matlab/README.md @@ -0,0 +1,3 @@ +Matlab binding for Cpufit, the control CPU implementation of +the [Gpufit library](https://github.com/gpufit/Gpufit) which +implements Levenberg Marquardt curve fitting in CUDA \ No newline at end of file diff --git 
a/Cpufit/matlab/cpufit.m b/Cpufit/matlab/cpufit.m new file mode 100644 index 0000000..243c654 --- /dev/null +++ b/Cpufit/matlab/cpufit.m @@ -0,0 +1,119 @@ +function [parameters, states, chi_squares, n_iterations, time]... + = cpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info) +% Wrapper around the Cpufit mex file. +% +% Optional arguments can be given as empty matrix []. +% +% Default values as specified + +%% size checks + +% number of input parameter (variable) +if nargin < 9 + user_info = []; + if nargin < 8 + estimator_id = []; + if nargin < 7 + parameters_to_fit = []; + if nargin < 6 + max_n_iterations = []; + if nargin < 5 + tolerance = []; + assert(nargin == 4, 'Not enough parameters'); + end + end + end + end +end + +% data is 2D and read number of points and fits +data_size = size(data); +assert(length(data_size) == 2, 'data is not two-dimensional'); +n_points = data_size(1); +n_fits = data_size(2); + +% consistency with weights (if given) +if ~isempty(weights) + assert(isequal(data_size, size(weights)), 'Dimension mismatch between data and weights') +end + +% initial parameters is 2D and read number of parameters +initial_parameters_size = size(initial_parameters); +assert(length(initial_parameters_size) == 2, 'initial_parameters is not two-dimensional'); +n_parameters = initial_parameters_size(1); +assert(n_fits == initial_parameters_size(2), 'Dimension mismatch in number of fits between data and initial_parameters'); + +% consistency with parameters_to_fit (if given) +if ~isempty(parameters_to_fit) + assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit'); +end + +%% default values + +% tolerance +if isempty(tolerance) + tolerance = 1e-4; +end + +% max_n_iterations +if isempty(max_n_iterations) + max_n_iterations = 25; +end + +% estimator_id +if isempty(estimator_id) + estimator_id = 
EstimatorID.LSE; +end + +% parameters_to_fit +if isempty(parameters_to_fit) + parameters_to_fit = ones(n_parameters, 1, 'int32'); +end + +% now only weights and user_info could be not given (empty matrix) + +%% type checks + +% data, weights (if given), initial_parameters are all single +assert(isa(data, 'single'), 'Type of data is not single'); +if ~isempty(weights) + assert(isa(weights, 'single'), 'Type of weights is not single'); +end +assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single'); + +% parameters_to_fit is int32 (cast to int32 if incorrect type) +if ~isa(parameters_to_fit, 'int32') + parameters_to_fit = int32(parameters_to_fit); +end + +% max_n_iterations must be int32 (cast if incorrect type) +if ~isa(max_n_iterations, 'int32') + max_n_iterations = int32(max_n_iterations); +end + +% tolerance must be single (cast if incorrect type) +if ~isa(tolerance, 'single') + tolerance = single(tolerance); +end + +% we don't check type of user_info, but we extract the size in bytes of it +if ~isempty(user_info) + user_info_info = whos('user_info'); + user_info_size = user_info_info.bytes; +else + user_info_size = 0; +end + + +%% run Cpufit taking the time +tic; +[parameters, states, chi_squares, n_iterations] ... 
+ = CpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size); + +time = toc; + +% reshape the output parameters array to have dimensions +% (n_parameters,n_fits) +parameters = reshape(parameters,n_parameters,n_fits); + +end diff --git a/Cpufit/matlab/examples/gauss2d.m b/Cpufit/matlab/examples/gauss2d.m new file mode 100644 index 0000000..3cb91f0 --- /dev/null +++ b/Cpufit/matlab/examples/gauss2d.m @@ -0,0 +1,182 @@ +function gauss2d() +% Example of the Matlab binding of the Gpufit library implementing +% Levenberg Marquardt curve fitting in C/C++ +% https://github.com/gpufit/Gpufit +% +% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise +% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab + +% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak +fit_gauss2d(); + +% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak +fit_gauss2d_rotated(); + +end +function fit_gauss2d() + +%% number of fits and fit points +number_fits = 1e4; +size_x = 20; +number_parameters = 5; + +%% set input arguments + +% true parameters +true_parameters = single([20, 9.5, 9.5, 3, 10]); + +% initialize random number generator +rng(0); + +% initial parameters (randomized) +initial_parameters = repmat(single(true_parameters'), [1, number_fits]); +% randomize relative to width for positions +initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits)); +% randomize relative for other parameters +initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% generate data with Poisson noise +data = gaussian_2d(x, y, true_parameters); +data = repmat(data(:), [1, number_fits]); +data = poissrnd(data); + +% tolerance 
+tolerance = 1e-3; + +% maximum number of iterations +max_n_iterations = 20; + +% estimator id +estimator_id = EstimatorID.MLE; + +% model ID +model_id = ModelID.GAUSS_2D; + +%% run Cpufit +[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ... + model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +%% displaying results +display_results('2D Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations); + +end + +function fit_gauss2d_rotated() + +%% number of fits and fit points +number_fits = 1e4; +size_x = 20; +number_parameters = 7; + +%% set input arguments + +% true parameters +true_parameters = single([200, 9.5, 9.5, 3, 4, 10, 0.5]); + +% initialize random number generator +rng(0); + +% initial parameters (randomized) +initial_parameters = repmat(single(true_parameters'), [1, number_fits]); +% randomize relative to width for positions +initial_parameters(2, :) = initial_parameters(2, :) + true_parameters(4) * (-0.2 + 0.4 * rand(1, number_fits)); +initial_parameters(3, :) = initial_parameters(3, :) + true_parameters(5) * (-0.2 + 0.4 * rand(1, number_fits)); +% randomize relative for other parameters +initial_parameters([1,4,5,6,7], :) = initial_parameters([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, number_fits)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% generate data with Poisson noise +data = gaussian_2d_rotated(x, y, true_parameters); +data = repmat(data(:), [1, number_fits]); +data = poissrnd(data); + +% tolerance +tolerance = 1e-3; + +% maximum number of iterations +max_n_iterations = 20; + +% estimator id +estimator_id = EstimatorID.MLE; + +% model ID +model_id = ModelID.GAUSS_2D_ROTATED; + +%% run Cpufit +[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ... 
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +%% displaying results +display_results('2D rotated Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations); + + +end + +function g = gaussian_2d(x, y, p) +% Generates a 2D Gaussian peak. +% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d +% +% x,y - x and y grid position values +% p - parameters (amplitude, x,y center position, width, offset) + +g = p(1) * exp(-((x - p(2)).^2 + (y - p(3)).^2) / (2 * p(4)^2)) + p(5); + +end + +function g = gaussian_2d_rotated(x, y, p) +% Generates a 2D rotated elliptic Gaussian peak. +% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak +% +% x,y - x and y grid position values +% p - parameters (amplitude, x,y center position, width, offset) + +% cosine and sine of rotation angle +cp = cos(p(7)); +sp = sin(p(7)); + +% Gaussian peak with two axes +arga = (x - p(2)) .* cp - (y - p(3)) .* sp; +argb = (x - p(2)) .* sp + (y - p(3)) .* cp; +ex = exp(-0.5 .* (((arga / p(4)) .* (arga / p(4))) + ((argb / p(5)) .* (argb / p(5))))); +g = p(1) .* ex + p(6); + +end + +function display_results(name, model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations) + +%% displaying results +converged = states == 0; +fprintf('\nCpufit of %s\n', name); + +% print summary +fprintf('\nmodel ID: %d\n', model_id); +fprintf('number of fits: %d\n', number_fits); +fprintf('fit size: %d x %d\n', size_x, size_x); +fprintf('mean chi-square: %6.2f\n', mean(chi_squares(converged))); +fprintf('mean iterations: %6.2f\n', mean(n_iterations(converged))); +fprintf('time: %6.2f s\n', time); + +% get fit states +number_converged = sum(converged); +fprintf('\nratio converged %6.2f %%\n', number_converged / number_fits * 100); +fprintf('ratio max it. 
exceeded %6.2f %%\n', sum(states == 1) / number_fits * 100); +fprintf('ratio singular hessian %6.2f %%\n', sum(states == 2) / number_fits * 100); +fprintf('ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100); + +% mean and std of fitted parameters +converged_parameters = parameters(:, converged); +converged_parameters_mean = mean(converged_parameters, 2); +converged_parameters_std = std(converged_parameters, [], 2); +fprintf('\nparameters of %s\n', name); +for i = 1 : number_parameters + fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i)); +end + +end \ No newline at end of file diff --git a/Cpufit/matlab/examples/gauss2d_plot.m b/Cpufit/matlab/examples/gauss2d_plot.m new file mode 100644 index 0000000..8d34707 --- /dev/null +++ b/Cpufit/matlab/examples/gauss2d_plot.m @@ -0,0 +1,117 @@ +function gauss2d_plot() +% Example of the Matlab binding of the Gpufit library implementing +% Levenberg Marquardt curve fitting in C/C++ +% https://github.com/gpufit/Gpufit +% +% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise +% repeated for a different total number of fits each time and plotting the +% results +% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab + +%% number of fit points +size_x = 5; +n_points = size_x * size_x; + +%% set input arguments + +% mean true parameters +mean_true_parameters = single([100, 3, 3, 1, 10]); + +% average noise level +average_noise_level = 10; + +% initialize random number generator +rng(0); + +% tolerance +tolerance = 1e-4; + +% max number of itetations +max_n_iterations = 10; + +% model id +model_id = ModelID.GAUSS_2D; + +%% loop over different number of fits +n_fits_all = round(logspace(2, 6, 20)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% loop +speed = zeros(length(n_fits_all), 1); +for i = 1:length(n_fits_all) + n_fits = n_fits_all(i); + + % vary 
positions of 2D Gaussians peaks slightly + test_parameters = repmat(mean_true_parameters', [1, n_fits]); + test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits)); + + % generate data + data = gaussians_2d(x, y, test_parameters); + data = reshape(data, [n_points, n_fits]); + + % add noise + data = data + average_noise_level * randn(size(data), 'single'); + + % initial parameters (randomized) + initial_parameters = repmat(mean_true_parameters', [1, n_fits]); + % randomize relative to width for positions + initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits)); + % randomize relative for other parameters + initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits)); + + % run Cpufit + [parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ... + model_id, initial_parameters, tolerance, max_n_iterations); + + % analyze result + converged = states == 0; + speed(i) = n_fits / time; + precision_x0 = std(parameters(2, converged) - test_parameters(2, converged)); + + % display result + fprintf(' iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ... 
+ mean(n_iterations(converged)), time, speed(i)); +end + +%% plot +figure(); +semilogx(n_fits_all, speed, 'bo-') +xlabel('number of fits per function call') +ylabel('fits per second') +legend('Cpufit', 'Location', 'NorthWest') +grid on; +xlim(n_fits_all([1,end])); + +end + +function g = gaussians_2d(x, y, p) +% Generates many 2D Gaussians peaks for a given set of parameters + +n_fits = size(p, 2); +msg = sprintf('generating %d fits ', n_fits); +fprintf(msg); + +g = zeros([size(x), n_fits], 'single'); + +progress = 0; +L = 50; % length of progressbar +l = 0; +for i = 1 : n_fits + + pi = p(:, i); + g(:, :, i) = pi(1) * exp(-((x - pi(2)).^2 + (y - pi(3)).^2) / (2 * pi(4)^2)) + pi(5); + + progress = progress + 1; + if progress >= n_fits / L + progress = 0; + fprintf('|'); + l = l + 1; + end +end +fprintf(repmat('\b', [1, length(msg) + l])); +fprintf('%7d fits', n_fits); + +end diff --git a/Cpufit/matlab/mex/CpufitMex.cpp b/Cpufit/matlab/mex/CpufitMex.cpp new file mode 100644 index 0000000..3a10184 --- /dev/null +++ b/Cpufit/matlab/mex/CpufitMex.cpp @@ -0,0 +1,145 @@ +#include "Cpufit/cpufit.h" + +#include + +#include +#include + +/* + Get a arbitrary scalar (non complex) and check for class id. 
+ https://www.mathworks.com/help/matlab/apiref/mxclassid.html +*/ +template inline bool get_scalar(const mxArray *p, T &v, const mxClassID id) +{ + if (mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1 && mxGetClassID(p) == id) + { + v = *static_cast(mxGetData(p)); + return true; + } + else { + return false; + } +} + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[]) +{ + int expected_nrhs = 0; + int expected_nlhs = 0; + bool wrong_nrhs = false; + bool wrong_nlhs = false; + + expected_nrhs = 13; + expected_nlhs = 4; + if (nrhs != expected_nrhs) + { + wrong_nrhs = true; + } + else if (nlhs != expected_nlhs) + { + wrong_nlhs = true; + } + + if (wrong_nrhs || wrong_nlhs) + { + if (nrhs != expected_nrhs) + { + char s1[50]; + _itoa_s(expected_nrhs, s1, 10); + char const s2[] = " input arguments required."; + size_t const string_length = strlen(s1) + 1 + strlen(s2); + strcat_s(s1, string_length, s2); + mexErrMsgIdAndTxt("Cpufit:Mex", s1); + } + else if (nlhs != expected_nlhs) + { + char s1[50]; + _itoa_s(expected_nlhs, s1, 10); + char const s2[] = " output arguments required."; + size_t const string_length = strlen(s1) + 1 + strlen(s2); + strcat_s(s1, string_length, s2); + mexErrMsgIdAndTxt("Cpufit:Mex", s1); + } + } + + // input parameters + float * data = (float*)mxGetPr(prhs[0]); + float * weights = (float*)mxGetPr(prhs[1]); + std::size_t n_fits = (std::size_t)*mxGetPr(prhs[2]); + std::size_t n_points = (std::size_t)*mxGetPr(prhs[3]); + + // tolerance + float tolerance = 0; + if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS)) + { + mexErrMsgIdAndTxt("Cpufit:Mex", "tolerance is not a single"); + } + + // max_n_iterations + int max_n_iterations = 0; + if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS)) + { + mexErrMsgIdAndTxt("Cpufit:Mex", "max_n_iteration is not int32"); + } + + int estimator_id = (int)*mxGetPr(prhs[6]); + float * initial_parameters = (float*)mxGetPr(prhs[7]); + int * parameters_to_fit = 
(int*)mxGetPr(prhs[8]); + int model_id = (int)*mxGetPr(prhs[9]); + int n_parameters = (int)*mxGetPr(prhs[10]); + int * user_info = (int*)mxGetPr(prhs[11]); + std::size_t user_info_size = (std::size_t)*mxGetPr(prhs[12]); + + // output parameters + float * output_parameters; + mxArray * mx_parameters; + mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL); + output_parameters = (float*)mxGetData(mx_parameters); + plhs[0] = mx_parameters; + + int * output_states; + mxArray * mx_states; + mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL); + output_states = (int*)mxGetData(mx_states); + plhs[1] = mx_states; + + float * output_chi_squares; + mxArray * mx_chi_squares; + mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL); + output_chi_squares = (float*)mxGetData(mx_chi_squares); + plhs[2] = mx_chi_squares; + + int * output_n_iterations; + mxArray * mx_n_iterations; + mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL); + output_n_iterations = (int*)mxGetData(mx_n_iterations); + plhs[3] = mx_n_iterations; + + // call to gpufit + int const status + = cpufit + ( + n_fits, + n_points, + data, + weights, + model_id, + initial_parameters, + tolerance, + max_n_iterations, + parameters_to_fit, + estimator_id, + user_info_size, + reinterpret_cast< char * >( user_info ), + output_parameters, + output_states, + output_chi_squares, + output_n_iterations + ) ; + + // check status + if (status != STATUS_OK) + { + std::string const error = cpufit_get_last_error() ; + mexErrMsgIdAndTxt( "Cpufit:Mex", error.c_str() ) ; + } +} diff --git a/Gpufit/CMakeLists.txt b/Gpufit/CMakeLists.txt new file mode 100644 index 0000000..76da81e --- /dev/null +++ b/Gpufit/CMakeLists.txt @@ -0,0 +1,160 @@ + +# CUDA +# +# Uses the following variables: +# +# CUDA_ARCHITECTURES (Default All) +# -- Argument passed to CUDA_SELECT_NVCC_ARCH_FLAGS(...) 
+#      resulting in code_generation_flags
+#      (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+#      CUDA_ARCHITECTURES: Auto | Common | All | ARCH_AND_PTX ...
+#      Auto: Detects local machine GPU architecture.
+#      Common: Covers common subset of architectures.
+#      All: Covers all known architectures.
+#      ARCH_AND_PTX: NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+#      NUM: Any number.
+#           Only those pairs are currently accepted by NVCC though:
+#           2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
+#      Examples:
+#        2.1(2.0) results in
+#          -gencode;arch=compute_20,code=sm_21
+#        Kepler+Tesla results in
+#          -gencode;arch=compute_37,code=sm_37
+#        6.2+PTX results in
+#          -gencode;arch=compute_62,code=sm_62;-gencode;arch=compute_62,code=compute_62
+#
+#   CUDA_NVCC_FLAGS (Default ${code_generation_flags})
+#   -- Additional NVCC command line arguments
+#      (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+#      NOTE that multiple arguments must be semi-colon delimited
+#      (e.g. --compiler-options;-Wall)
+#
+# Multiple CUDA versions installed, specify which version to use
+# Set CUDA_BIN_PATH before running CMake or CUDA_TOOLKIT_ROOT_DIR after first configuration
+# to installation folder of desired CUDA version
+
+find_package( CUDA 6.5 REQUIRED )
+
+set( CUDA_ARCHITECTURES All CACHE STRING
+  "Auto | Common | All | ... see CUDA_SELECT_NVCC_ARCH_FLAGS(...)" )
+
+if( CUDA_ARCHITECTURES STREQUAL Auto )
+  # Compile and run a tiny program that prints "major.minor " for every
+  # CUDA device it can query; its output becomes the architecture list.
+  set( file ${PROJECT_BINARY_DIR}/detect_cuda_architectures.cpp )
+  file( WRITE ${file} ""
+    "#include <cuda_runtime.h>\n"
+    "#include <cstdio>\n"
+    "int main()\n"
+    "{\n"
+    "  int count = 0;\n"
+    "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+    "  if (count == 0) return -1;\n"
+    "  for (int device = 0; device < count; ++device)\n"
+    "  {\n"
+    "    cudaDeviceProp prop;\n"
+    "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+    "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+    "  }\n"
+    "  return 0;\n"
+    "}\n"
+  )
+  try_run( run_result compile_result ${PROJECT_BINARY_DIR} ${file}
+    CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}"
+    LINK_LIBRARIES ${CUDA_LIBRARIES}
+    RUN_OUTPUT_VARIABLE architectures
+  )
+  if( run_result EQUAL 0 )
+    string( REPLACE "2.1" "2.1(2.0)" architectures "${architectures}" )
+    if( CUDA_VERSION VERSION_LESS "7.0" )
+      string( REGEX REPLACE "3\\.[27]|5\\.[23]|6\\.[01]" "5.2+PTX" architectures "${architectures}" )
+    elseif( CUDA_VERSION VERSION_LESS "8.0" )
+      string( REGEX REPLACE "5\\.3|6\\.[01]" "5.3+PTX" architectures "${architectures}" )
+    endif()
+    set( CUDA_ARCHITECTURES "${architectures}" )
+  endif()
+elseif( CUDA_ARCHITECTURES STREQUAL All )
+# All does not include the latest PTX!
+  set( CUDA_ARCHITECTURES "2.1(2.0)" "3.0" "3.5" "5.0" "5.2" )
+  if( CUDA_VERSION VERSION_GREATER "6.5" )
+    list( APPEND CUDA_ARCHITECTURES "3.2" "3.7" "5.3" )
+  endif()
+  if( CUDA_VERSION VERSION_GREATER "7.5" )
+    list( APPEND CUDA_ARCHITECTURES "6.0" "6.1" )
+  endif()
+  # Appending to the list's string value tags only its last entry with +PTX.
+  string( APPEND CUDA_ARCHITECTURES "+PTX" )
+endif()
+CUDA_SELECT_NVCC_ARCH_FLAGS( code_generation_flags "${CUDA_ARCHITECTURES}" )
+list( APPEND CUDA_NVCC_FLAGS ${code_generation_flags} )
+message( STATUS "CUDA_NVCC_FLAGS=${code_generation_flags}" )
+
+# Gpufit
+
+set( GpuHeaders
+  gpufit.h
+  definitions.h
+  info.h
+  lm_fit.h
+  interface.h
+)
+
+# The module definition file is added as Gpufit/Gpufit.def; the name must match
+# its case exactly or the source is not found on case-sensitive filesystems.
+set( GpuSources
+  gpufit.cpp
+  info.cpp
+  lm_fit.cpp
+  lm_fit_cuda.cpp
+  interface.cpp
+  Gpufit.def
+)
+
+set( GpuCudaHeaders
+  linear_1d.cuh
+  gauss_1d.cuh
+  gauss_2d.cuh
+  gauss_2d_rotated.cuh
+  gauss_2d_elliptic.cuh
+  cauchy_2d_elliptic.cuh
+  lse.cuh
+  mle.cuh
+  cuda_gaussjordan.cuh
+  cuda_kernels.cuh
+  gpu_data.cuh
+)
+
+set( GpuCudaSources
+  lm_fit_cuda.cu
+  cuda_gaussjordan.cu
+  cuda_kernels.cu
+  info.cu
+  gpu_data.cu
+)
+
+source_group("CUDA Source Files" FILES ${GpuCudaSources})
+source_group("CUDA Header Files" FILES ${GpuCudaHeaders})
+
+cuda_add_library( Gpufit SHARED
+  ${GpuHeaders}
+  ${GpuSources}
+  ${GpuCudaHeaders}
+  ${GpuCudaSources}
+)
+
+set_property( TARGET Gpufit
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Gpufit RUNTIME DESTINATION bin )
+
+# Examples
+
+add_subdirectory( examples )
+
+# Tests
+
+if( BUILD_TESTING )
+  add_subdirectory( tests )
+endif()
+
+# Bindings
+
+add_subdirectory( matlab )
+add_subdirectory( python )
+
diff --git a/Gpufit/Gpufit.def b/Gpufit/Gpufit.def
new file mode 100644
index 0000000..0e3b9db
--- /dev/null
+++ b/Gpufit/Gpufit.def
@@ -0,0 +1,7 @@
+LIBRARY "Gpufit"
+EXPORTS
+  gpufit @1
+  gpufit_get_last_error @2
+  gpufit_get_cuda_version @3
+  gpufit_cuda_available @4
+  gpufit_portable_interface @5
\ No newline at end of file
diff --git a/Gpufit/cauchy_2d_elliptic.cuh
b/Gpufit/cauchy_2d_elliptic.cuh new file mode 100644 index 0000000..b1c2a4e --- /dev/null +++ b/Gpufit/cauchy_2d_elliptic.cuh @@ -0,0 +1,107 @@ +#ifndef GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED +#define GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED + +/* Description of the calculate_cauchy2delliptic function +* ======================================================= +* +* This function calculates the values of two-dimensional elliptic cauchy model +* functions and their partial derivatives with respect to the model parameters. +* +* No independent variables are passed to this model function. Hence, the +* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For +* a fit size of M x N data points, the (X, Y) coordinates of the data are +* simply the corresponding array index values of the data array, starting from +* zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: amplitude +* p[1]: center coordinate x +* p[2]: center coordinate y +* p[3]: width x (standard deviation) +* p[4]: width y (standard deviation) +* p[5]: offset +* +* n_fits: The number of fits. (not used) +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. +* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. (not used) +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_cauchy2delliptic function +* =============================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. When calling the function, the blocks and threads of the __global__ +* function must be set up correctly, as shown in the following example code. 
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Evaluates, for the (fit, point) pair handled by this thread, the model
+//
+//     f(x, y) = p[0] / (argx * argy) + p[5]
+//     argx    = ((p[1] - x) / p[3])^2 + 1
+//     argy    = ((p[2] - y) / p[4])^2 + 1
+//
+// and its six partial derivatives. The value is stored at
+// values[fit_index * n_points + point_index]; the derivative with respect to
+// p[k] is stored at derivatives[(fit_index * n_parameters + k) * n_points + point_index].
+// n_fits, chunk_index, user_info and user_info_size are not used.
+__device__ void calculate_cauchy2delliptic(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // The n_points data values are treated as a grid with sqrt(n_points)
+    // columns; the result of sqrt is truncated to int.
+    int const n_points_x = sqrt((float)n_points);
+
+    // Map the linear thread index to (fit, point).
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block*n_points);
+    int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block;
+
+    // Grid coordinates of this point (row-major, starting at zero).
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - (point_index_y*n_points_x);
+
+    float* current_value = &values[fit_index*n_points];
+    float const * p = &parameters[fit_index*n_parameters];
+
+    float const argx = ((p[1] - point_index_x) / p[3]) *((p[1] - point_index_x) / p[3]) + 1;
+    float const argy = ((p[2] - point_index_y) / p[4]) *((p[2] - point_index_y) / p[4]) + 1;
+    current_value[point_index] = p[0] * 1 / argx * 1 / argy + p[5];
+
+    //////////////////////////////////////////////////////////////////////////////
+
+    // Derivatives are stored parameter-major: one run of n_points values per parameter.
+    float * current_derivative = &derivatives[fit_index * n_points*n_parameters];
+
+    // d/d(amplitude)
+    current_derivative[0 * n_points + point_index]
+        = 1 / (argx*argy);
+    // d/d(center x)
+    current_derivative[1 * n_points + point_index]
+        = -2 * p[0] * (p[1] - point_index_x) * 1 / (p[3] * p[3] * argx*argx*argy);
+    // d/d(center y)
+    current_derivative[2 * n_points + point_index]
+        = -2 * p[0] * (p[2] - point_index_y) * 1 / (p[4] * p[4] * argy*argy*argx);
+    // d/d(width x)
+    current_derivative[3 * n_points + point_index]
+        = 2 * p[0] * (p[1] - point_index_x) * (p[1] - point_index_x)
+        / (p[3] * p[3] * p[3] * argx * argx * argy);
+    // d/d(width y)
+    current_derivative[4 * n_points + point_index]
+        = 2 * p[0] * (p[2] - point_index_y) * (p[2] - point_index_y)
+        / (p[4] * p[4] * p[4] * argy * argy * argx);
+    // d/d(offset)
+    current_derivative[5 * n_points + point_index]
+        = 1;
+}
+
+#endif
diff --git a/Gpufit/cuda_gaussjordan.cu b/Gpufit/cuda_gaussjordan.cu
new file mode 100644
index 0000000..c6519bc
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cu
@@ -0,0 +1,279 @@
+/* CUDA implementation of Gauss-Jordan elimination algorithm.
+*
+* Gauss-Jordan elimination method
+* ===============================
+*
+* This function solves a set of linear equations using the Gauss-Jordan elimination method.
+* Considering a set of N equations with N unknowns, this can be written in matrix form as
+* an NxN matrix of coefficients and a Nx1 column vector of right-hand side values.
+*
+* For example, consider the following problem with 3 equations and 3 unknowns (N=3):
+*
+*   A x + B y + C z = MM
+*   D x + E y + F z = NN
+*   G x + H y + I z = PP
+*
+* We can write this as follows in matrix form:
+*
+*   [ A B C ] [ x ] = [ MM ]
+*   [ D E F ] [ y ] = [ NN ]
+*   [ G H I ] [ z ] = [ PP ]
+*
+* or, [A]*[X] = [B] where [A] is the matrix of coefficients and [B] is the vector of
+* right-hand side values.
+*
+* The Gauss Jordan elimination method solves the system of equations in the following
+* manner. First, we form the augmented matrix (A|B):
+*
+*   [ A B C | MM ]
+*   [ D E F | NN ]
+*   [ G H I | PP ]
+*
+* and then the augmented matrix is manipulated until its left side has the reduced
+* row-echelon form. That is to say that any individual row may be multiplied
+* by a scalar factor, and any linear combination of rows may be added to another
+* row. Finally, two rows may be swapped without affecting the solution.
+*
+* When the manipulations are complete and the left side of the matrix has the desired
+* form, the right side then corresponds to the solution of the system.
+*
+*
+* Description of the cuda_gaussjordan function
+* ============================================
+*
+* This algorithm is designed to perform many solutions of the Gauss Jordan elimination
+* method in parallel. One limitation of the algorithm implemented here is that for
+* each solution the number of equations and unknowns (N) must be identical.
+*
+* Parameters:
+*
+* delta: An output vector which receives the result vector X of each solution,
+*        concatenated together for all input problems. For a set of M inputs, the
+*        size of the vector is MxN.
+*
+* alpha: Coefficients matrices. The matrix of coefficients for a single solution is
+*        a vector of NxN, where N is the number of equations. This array stores the
+*        coefficients for the entire set of M input problems, concatenated end to end,
+*        and hence the total size of the array is MxNxN.
+*
+* beta: Vector of right hand side values, concatenated together for all input problems.
+*       For a set of M inputs, the size of the vector is MxN. This vector is only read;
+*       upon completion, the results vector X for each solution is stored in delta.
+*
+* skip_calculation: An input vector which allows the calculation to be skipped for
+*                   a particular solution. For a set of M inputs, the size of this
+*                   vector is M.
+*
+* singular: An output vector used to report whether a given solution is singular. For
+*           a set of M inputs, this vector has size M. Memory needs to be allocated
+*           by the calling function.
+*
+* n_equations: The number of equations and unknowns for a single solution. This is
+*              equal to the size N.
+*
+* n_equations_pow2: The next highest power of 2 greater than n_equations.
+*
+*
+* Calling the cuda_gaussjordan function
+* =====================================
+*
+* When calling the function, the blocks and threads must be set up correctly, as well
+* as the shared memory space, as shown in the following example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = n_equations + 1;
+*   threads.y = n_equations;
+*   blocks.x = n_solutions;
+*   blocks.y = 1;
+*
+*   int const shared_size = sizeof(float) *
+*       ( (threads.x * threads.y) + n_equations_pow2 + n_equations_pow2 );
+*
+*   int * singular;
+*   CUDA_CHECK_STATUS(cudaMalloc((void**)&singular, n_solutions * sizeof(int)));
+*
+*   cuda_gaussjordan<<< blocks, threads, shared_size >>>(
+*       delta,
+*       beta,
+*       alpha,
+*       skip_calculation,
+*       singular,
+*       n_equations,
+*       n_equations_pow2);
+*
+*/
+
+#include "cuda_gaussjordan.cuh"
+
+// Solves one linear system per thread block: alpha (NxN) and beta (N) are read,
+// the solution is written to delta, and singular[blockIdx.x] is set to 1 when a
+// row with only zero coefficients is encountered.
+__global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2)
+{
+    extern __shared__ float extern_array[];     //shared memory between threads of a single block,
+                                                //used for storing the calculation_matrix, the
+                                                //abs_row vector, and the abs_row_index vector
+
+    // In this routine we will store the augmented matrix (A|B), referred to here
+    // as the calculation matrix in a shared memory space which is visible to all
+    // threads within a block.  Also stored in shared memory are two vectors which
+    // are used to find the largest element in each row (the pivot).  These vectors
+    // are called abs_row and abs_row_index.
+    //
+    // Sizes of data stored in shared memory:
+    //
+    //      calculation_matrix: n_equations * (n_equations+1)
+    //      abs_row:            n_equations_pow2
+    //      abs_row_index:      n_equations_pow2
+    //
+    // Note that each thread represents an element of the augmented matrix, with
+    // the column and row indicated by the x and y index of the thread.  Each
+    // solution is calculated within one block, and the solution index is the
+    // block index x value.
+
+    int const col_index = threadIdx.x;                  //column index in the calculation_matrix
+    int const row_index = threadIdx.y;                  //row index in the calculation_matrix
+    int const solution_index = blockIdx.x;
+
+    int const n_col = blockDim.x;                       //number of columns in calculation matrix (=threads.x)
+    int const n_row = blockDim.y;                       //number of rows in calculation matrix (=threads.y)
+    int const alpha_size = blockDim.y * blockDim.y;     //number of entries in alpha matrix for one solution (NxN)
+
+    //the skip flag is per solution (= per block), so either all threads of the
+    //block leave here or none does; no barrier below is reached by only part of a block
+    if (skip_calculation[solution_index])
+        return;
+
+    float p;                                            //local variable used in pivot calculation
+
+    float * calculation_matrix = extern_array;          //point to the shared memory
+
+    float * abs_row = extern_array + n_equations * (n_equations + 1);   //abs_row is located after the calculation_matrix
+                                                                        //within the shared memory
+
+    int * abs_row_index = (int *)abs_row + n_equations_pow2;            //abs_row_index is located after abs_row
+                                                                        //
+                                                                        //note that although the shared memory is defined as
+                                                                        //float, we are storing data of type int in this
+                                                                        //part of the shared memory
+
+    //initialize the singular vector
+    if (col_index == 0 && row_index == 0)
+    {
+        singular[solution_index] = 0;
+    }
+
+    //initialize abs_row and abs_row_index, using only the threads on the diagonal;
+    //this fills the last n_equations slots, which covers the zero padding between
+    //n_equations and n_equations_pow2 used by the maximum search below
+    if (col_index == row_index)
+    {
+        abs_row[col_index + (n_equations_pow2 - n_equations)] = 0.0f;
+        abs_row_index[col_index + (n_equations_pow2 - n_equations)] = col_index + (n_equations_pow2 - n_equations);
+    }
+
+    //initialize the calculation_matrix (alpha and beta, concatenated, for one solution)
+    if (col_index != n_equations)
+        calculation_matrix[row_index*n_col + col_index] = alpha[solution_index * alpha_size + row_index * n_equations + col_index];
+    else
+        calculation_matrix[row_index*n_col + col_index] = beta[solution_index * n_equations + row_index];
+
+    //wait for thread synchronization
+
+    __syncthreads();
+
+    //start of main outer loop over the rows of the calculation matrix
+
+    for (int current_row = 0; current_row < n_equations; current_row++)
+    {
+
+        // work in only one row, skipping the last column
+        if (row_index == current_row && col_index != n_equations)
+        {
+
+            //save the absolute values of the current row
+            abs_row[col_index] = abs(calculation_matrix[row_index * n_col + col_index]);
+
+            //save the column indices
+            abs_row_index[col_index] = col_index;
+
+            // NOTE(review): __threadfence() orders this thread's memory accesses but is
+            // not a barrier, and the reduction steps below are not separated by
+            // __syncthreads(). This appears to rely on all threads of the row executing
+            // in lockstep - confirm for rows that span more than one warp.
+            __threadfence();
+
+            //find the largest absolute value in the current row and write its index in abs_row_index[0]
+            for (int n = 2; n <= n_equations_pow2; n = n * 2)
+            {
+                if (col_index < (n_equations_pow2 / n))
+                {
+                    if (abs_row[abs_row_index[col_index]] < abs_row[abs_row_index[col_index + (n_equations_pow2 / n)]])
+                    {
+                        abs_row_index[col_index] = abs_row_index[col_index + (n_equations_pow2 / n)];
+                    }
+                }
+            }
+        }
+
+        __syncthreads();
+
+        //singularity check - if all values in the row are zero, no solution exists
+        if (row_index == current_row && col_index != n_equations)
+        {
+            if (abs_row[abs_row_index[0]] == 0.0f)
+            {
+                singular[solution_index] = 1;
+            }
+        }
+
+        //divide the row by the biggest value in the row
+        // NOTE(review): the division is carried out even when the row was just flagged
+        // as singular (pivot value 0.0f). Also, the thread in the pivot column overwrites
+        // the divisor that the other threads of the row read in the same statement, with
+        // no barrier in between - confirm this is safe for rows spanning several warps.
+        if (row_index == current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index]
+                = calculation_matrix[row_index * n_col + col_index] / calculation_matrix[row_index * n_col + abs_row_index[0]];
+        }
+
+        __syncthreads();
+
+        //The value of the largest element of the current row was found, and then current
+        //row was divided by this value such that the largest value of the current row
+        //is equal to one.
+        //
+        //Next, the matrix is manipulated to reduce to zero all other entries in the column
+        //in which the largest value was found.  To do this, the values in the current row
+        //are scaled appropriately and subtracted from the other rows of the matrix.
+        //
+        //For each element of the matrix that is not in the current row, calculate the value
+        //to be subtracted and let each thread store this value in the scalar variable p.
+
+        p = calculation_matrix[current_row * n_col + col_index] * calculation_matrix[row_index * n_col + abs_row_index[0]];
+        //barrier: every thread must have read its row's pivot-column entry (used for p)
+        //before any thread overwrites that entry in the subtraction below
+        __syncthreads();
+
+        if (row_index != current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index] = calculation_matrix[row_index * n_col + col_index] - p;
+        }
+        __syncthreads();
+
+    }
+
+    //At this point, if the solution exists, the calculation matrix has been reduced to the
+    //identity matrix on the left side, and the solution vector on the right side.  However
+    //we have not swapped rows during the procedure, so the identity matrix is out of order.
+    //
+    //For example, starting with the following augmented matrix as input:
+    //
+    //  [  3  2 -4 |  4 ]
+    //  [  2  3  3 | 15 ]
+    //  [  5 -3  1 | 14 ]
+    //
+    //we will obtain:
+    //
+    //  [  0  0  1 |  2 ]
+    //  [  0  1  0 |  1 ]
+    //  [  1  0  0 |  3 ]
+    //
+    //Which needs to be re-arranged to obtain the correct solution vector.  In the final
+    //step, each thread checks to see if its value equals 1, and if so it assigns the value
+    //in its rightmost column to the appropriate entry in the delta vector.  The solution is
+    //stored in delta upon completion.
+
+    //the comparison with 1 is exact; it works on the values left by the division and
+    //subtraction steps above
+    if (col_index != n_equations && calculation_matrix[row_index * n_col + col_index] == 1)
+        delta[n_row * solution_index + col_index] = calculation_matrix[row_index * n_col + n_equations];
+
+    __syncthreads();
+}
diff --git a/Gpufit/cuda_gaussjordan.cuh b/Gpufit/cuda_gaussjordan.cuh
new file mode 100644
index 0000000..2d41cda
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cuh
@@ -0,0 +1,15 @@
+#ifndef GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+#define GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+
+#include
+
+extern __global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2);
+
+#endif
\ No newline at end of file
diff --git a/Gpufit/cuda_kernels.cu b/Gpufit/cuda_kernels.cu
new file mode 100644
index 0000000..2661a7e
--- /dev/null
+++ b/Gpufit/cuda_kernels.cu
@@ -0,0 +1,1081 @@
+#include "gpufit.h"
+#include "cuda_kernels.cuh"
+#include "definitions.h"
+#include "linear_1d.cuh"
+#include "gauss_1d.cuh"
+#include "gauss_2d.cuh"
+#include "gauss_2d_elliptic.cuh"
+#include "gauss_2d_rotated.cuh"
+#include "cauchy_2d_elliptic.cuh"
+#include "lse.cuh"
+#include "mle.cuh"
+
+/* Description of the cuda_calc_curve_values function
+* ===================================================
+*
+* This function calls one of the fitting curve functions depending on the input
+* parameter model_id. The fitting curve function calculates the values of
+* the fitting curves and its partial derivatives with respect to the fitting
+* curve parameters. Multiple fits are calculated in parallel.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*
+* n_fits: The number of fits.
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of curve parameters.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* n_fits_per_block: The number of fits calculated by each threadblock.
+*
+* model_id: The fitting model ID.
+*
+* chunk_index: The chunk index.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calc_curve_values function
+* ===========================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calc_curve_values<<< blocks, threads >>>(
+*       parameters,
+*       n_fits,
+*       n_points,
+*       n_parameters,
+*       finished,
+*       values,
+*       derivatives,
+*       n_fits_per_block,
+*       model_id,
+*       chunk_index,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calc_curve_values(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    int const * finished,
+    float * values,
+    float * derivatives,
+    int const n_fits_per_block,
+    int const model_id,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Only the fit index is needed here to look up the finished flag; the model
+    // functions derive their own fit and point indices from the thread index.
+    // (The previous "point_index >= n_points" test was removed: point_index was
+    // the remainder of threadIdx.x divided by n_points and so could never reach
+    // n_points.)
+    int const fit_in_block = threadIdx.x / n_points;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    // Fits that are already finished keep their current values and derivatives.
+    if (finished[fit_index])
+        return;
+
+    // Dispatch to the model function; an unknown model_id does nothing.
+    if (model_id == GAUSS_1D)
+        calculate_gauss1d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+    else if (model_id == GAUSS_2D)
+        calculate_gauss2d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+    else if (model_id == GAUSS_2D_ELLIPTIC)
+        calculate_gauss2delliptic(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+    else if (model_id == GAUSS_2D_ROTATED)
+        calculate_gauss2drotated(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+    else if (model_id == CAUCHY_2D_ELLIPTIC)
+        calculate_cauchy2delliptic(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+    else if (model_id == LINEAR_1D)
+        calculate_linear1d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+}
+
+/* Description of the sum_up_floats function
+* ==========================================
+*
+* This function sums up a vector of float values and stores the result at the
+* first place of the vector.
+*
+* Parameters:
+*
+* shared_array: An input vector of float values. The vector must be stored
+*               on the shared memory of the GPU. The size of this vector must be a
+*               power of two. Use zero padding to extend it to the next highest
+*               power of 2 greater than the number of elements.
+*
+* size: The number of elements in the input vector considering zero padding.
+*
+* Calling the sum_up_floats function
+* ==================================
+*
+* This __device__ function can only be called from a __global__ function or
+* another __device__ function. When calling the function, the blocks and threads
+* of the __global__ function must be set up correctly, as shown in the following
+* example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = size * vectors_per_block;
+*   blocks.x = n_vectors / vectors_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Tree reduction: in every step the upper half of the active range is added
+// onto the lower half, until the total is left in shared_array[0].
+__device__ void sum_up_floats(volatile float* shared_array, int const size)
+{
+    // Position of this thread inside its own vector of `size` elements.
+    int const vector_in_block = threadIdx.x / size;
+    int const element = threadIdx.x - (vector_in_block * size);
+
+    // All contributions must have been written before the first step reads them.
+    __syncthreads();
+
+    for (int half = size >> 1; half != 0; half >>= 1)
+    {
+        if (element < half)
+        {
+            shared_array[element] += shared_array[element + half];
+        }
+        // Finish this step everywhere before the next one reads its results.
+        __syncthreads();
+    }
+}
+
+/* Description of the cuda_calculate_chi_squares function
+* ========================================================
+*
+* This function calculates the chi-square values calling a __device__ function.
+* The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* chi_squares: An output vector of concatenated chi-square values.
+*
+* states: An output vector of values which indicate whether the fitting process
+*         was carried out correctly or which problem occurred. In this function
+*         it is only used for MLE. It is set to 3 if a fitting curve value is
+*         negative. This vector includes the states for multiple fits.
+*
+* iteration_falied: An output vector which indicates whether the chi-square values
+*                   calculated by the current iteration decreased compared to the
+*                   previous iteration.
+*
+* prev_chi_squares: An input vector of concatenated chi-square values calculated
+*                   by the previous iteration.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+*         while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_chi_squares function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as well as the shared memory space (one float per thread), as shown in the
+* following example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = power_of_two_n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calculate_chi_squares<<< blocks, threads, sizeof(float) * threads.x >>>(
+*       chi_squares,
+*       states,
+*       iteration_falied,
+*       prev_chi_squares,
+*       data,
+*       values,
+*       weight,
+*       n_points,
+*       estimator_id,
+*       finished,
+*       n_fits_per_block,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calculate_chi_squares(
+    float * chi_squares,
+    int * states,
+    int * iteration_falied,
+    float const * prev_chi_squares,
+    float const * data,
+    float const * values,
+    float const * weights,
+    int const n_points,
+    int const estimator_id,
+    int const * finished,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Each fit owns a contiguous slice of the block's threads and of the shared buffer.
+    int const slice_size = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / slice_size;
+    int const point_index = threadIdx.x - fit_in_block * slice_size;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    if (finished[fit_index])
+    {
+        return;
+    }
+
+    int const first_point = fit_index * n_points;
+
+    extern __shared__ float extern_array[];
+
+    volatile float * shared_chi_square = &extern_array[fit_in_block * slice_size];
+
+    if (point_index < n_points)
+    {
+        float const * current_data = &data[first_point];
+        float const * current_weight = weights ? &weights[first_point] : NULL;
+        float const * current_value = &values[first_point];
+        int * current_state = &states[fit_index];
+
+        // Per-point contribution; the estimator writes it into the shared slice.
+        if (estimator_id == LSE)
+        {
+            calculate_chi_square_lse(
+                shared_chi_square,
+                point_index,
+                current_data,
+                current_value,
+                current_weight,
+                current_state,
+                user_info,
+                user_info_size);
+        }
+        else if (estimator_id == MLE)
+        {
+            calculate_chi_square_mle(
+                shared_chi_square,
+                point_index,
+                current_data,
+                current_value,
+                current_weight,
+                current_state,
+                user_info,
+                user_info_size);
+        }
+    }
+    else
+    {
+        // Padding threads contribute nothing to the sum.
+        shared_chi_square[point_index] = 0.f;
+    }
+
+    sum_up_floats(shared_chi_square, slice_size);
+
+    float const chi_square = shared_chi_square[0];
+    chi_squares[fit_index] = chi_square;
+
+    // A previous value of exactly zero is treated as "not yet initialized".
+    float const previous_chi_square = prev_chi_squares[fit_index];
+    bool const failed = (previous_chi_square != 0) && (chi_square >= previous_chi_square);
+
+    iteration_falied[fit_index] = failed ? 1 : 0;
+}
+
+/* Description of the cuda_calculate_gradients function
+* ========================================================
+*
+* This function calculates the gradient values of the chi-square function calling
+* a __device__ function. The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* gradients: An output vector of concatenated sets of gradient vector values.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* derivatives: An input vector of concatenated sets of model function partial
+*              derivatives.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+*         while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* n_parameters_to_fit: The number of fitting curve parameters, that are not held
+*                      fixed.
+*
+* parameters_to_fit_indices: An input vector of indices of fitting curve parameters,
+*                            that are not held fixed.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* skip: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_gradients function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as well as the shared memory space (one float per thread), as shown in the
+* following example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = power_of_two_n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calculate_gradients<<< blocks, threads, sizeof(float) * threads.x >>>(
+*       gradients,
+*       data,
+*       values,
+*       derivatives,
+*       weight,
+*       n_points,
+*       n_parameters,
+*       n_parameters_to_fit,
+*       parameters_to_fit_indices,
+*       estimator_id,
+*       finished,
+*       skip,
+*       n_fits_per_block,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calculate_gradients(
+    float * gradients,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * finished,
+    int const * skip,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Each fit owns a contiguous slice of the block's threads and of the shared buffer.
+    int const shared_size = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / shared_size;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    int const point_index = threadIdx.x - fit_in_block * shared_size;
+    int const first_point = fit_index * n_points;
+
+    if (finished[fit_index] || skip[fit_index])
+    {
+        return;
+    }
+
+    float const * current_data = &data[first_point];
+    float const * current_weight = weights ? &weights[first_point] : NULL;
+    float const * current_derivative = &derivatives[first_point * n_parameters];
+    float const * current_value = &values[first_point];
+
+    extern __shared__ float extern_array[];
+
+    volatile float * shared_gradient = &extern_array[fit_in_block * shared_size];
+
+    // Padding threads contribute nothing to the sums; their slots are never
+    // written again because the reduction only writes into the lower half.
+    if (point_index >= n_points)
+    {
+        shared_gradient[point_index] = 0.f;
+    }
+
+    // One reduction over all points per free parameter.
+    for (int parameter_index = 0; parameter_index < n_parameters_to_fit; parameter_index++)
+    {
+        if (point_index < n_points)
+        {
+            int const derivative_index = parameters_to_fit_indices[parameter_index] * n_points + point_index;
+
+            if (estimator_id == LSE)
+            {
+                calculate_gradient_lse(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+            else if (estimator_id == MLE)
+            {
+                calculate_gradient_mle(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+        }
+        sum_up_floats(shared_gradient, shared_size);
+
+        // Only the thread that performed the last addition publishes the sum.
+        // There is no barrier between this read and the next iteration's writes
+        // to the shared slice, so letting every thread copy shared_gradient[0]
+        // allowed a thread of a slower warp to read - and store - the next
+        // parameter's partial term instead of this parameter's sum. Thread 0 of
+        // the slice reads the slot before it overwrites it itself.
+        if (point_index == 0)
+        {
+            gradients[fit_index * n_parameters_to_fit + parameter_index] = shared_gradient[0];
+        }
+    }
+}
+
+/* Description of the cuda_calculate_hessians function
+* ========================================================
+*
+* This function calculates the hessian matrix values of the chi-square function
+* calling __device__ functions. The calculation is performed for multiple fits
+* in parallel.
+*
+* Parameters:
+*
+* hessians: An output vector of concatenated sets of hessian matrix values.
+* +* data: An input vector of data for multiple fits +* +* values: An input vector of concatenated sets of model function values. +* +* derivatives: An input vector of concatenated sets of model function partial +* derivatives. +* +* weight: An input vector of values for weighting chi-square, gradient and hessian, +* while using LSE +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of fitting curve parameters. +* +* n_parameters_to_fit: The number of fitting curve parameters, that are not held +* fixed. +* +* parameters_to_fit_indices: An input vector of indices of fitting curve parameters, +* that are not held fixed. +* +* estimator_id: The estimator ID. +* +* skip: An input vector which allows the calculation to be skipped for single fits. +* +* finished: An input vector which allows the calculation to be skipped for single +* fits. +* +* user_info: An input vector containing user information. +* +* user_info_size: The number of elements in user_info. +* +* Calling the cuda_calculate_hessians function +* ================================================ +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. 
+* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_parameters_to_fit; +* threads.y = n_parameters_to_fit; +* blocks.x = n_fits; +* +* cuda_calculate_hessians<<< blocks, threads >>>( +* hessians, +* data, +* values, +* derivatives, +* weight, +* n_points, +* n_parameters, +* n_parameters_to_fit, +* parameters_to_fit_indices, +* estimator_id, +* skip, +* finished, +* user_info, +* user_info_size); +* +*/ + +__global__ void cuda_calculate_hessians( + float * hessians, + float const * data, + float const * values, + float const * derivatives, + float const * weights, + int const n_points, + int const n_parameters, + int const n_parameters_to_fit, + int const * parameters_to_fit_indices, + int const estimator_id, + int const * skip, + int const * finished, + char * user_info, + std::size_t const user_info_size) +{ + int const fit_index = blockIdx.x; /* one thread block per fit */ + int const first_point = fit_index * n_points; + + int const parameter_index_i = threadIdx.x; + int const parameter_index_j = threadIdx.y; /* thread (i, j) fills hessian element (i, j) */ + + if (finished[fit_index] || skip[fit_index]) + { + return; + } + + float * current_hessian = &hessians[fit_index * n_parameters_to_fit * n_parameters_to_fit]; + float const * current_data = &data[first_point]; + float const * current_weight = weights ? 
&weights[first_point] : NULL; + float const * current_derivative = &derivatives[first_point*n_parameters]; + float const * current_value = &values[first_point]; + + int const hessian_index_ij = parameter_index_i * n_parameters_to_fit + parameter_index_j; + int const derivative_index_i = parameters_to_fit_indices[parameter_index_i] * n_points; + int const derivative_index_j = parameters_to_fit_indices[parameter_index_j] * n_points; + + double sum = 0.0; /* accumulated in double, narrowed to float on store */ + for (int point_index = 0; point_index < n_points; point_index++) + { + if (estimator_id == LSE) + { + calculate_hessian_lse( + &sum, + point_index, + derivative_index_i + point_index, + derivative_index_j + point_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + else if (estimator_id == MLE) + { + calculate_hessian_mle( + &sum, + point_index, + derivative_index_i + point_index, + derivative_index_j + point_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + } + current_hessian[hessian_index_ij] = sum; +} + +/* Description of the cuda_modify_step_widths function +* ==================================================== +* +* This function modifies the diagonal elements of the hessian matrices by multiplying +* them by the factor (1 + lambda). This operation controls the step widths of the +* iteration. If the last iteration failed, before modifying the hessian, the diagonal +* elements of the hessian are calculated back to represent unmodified values. +* +* hessians: An input and output vector of hessian matrices, which are modified by +* the lambda values. +* +* lambdas: An input vector of values for modifying the hessians. +* +* n_parameters: The number of fitting curve parameters. +* +* iteration_failed: An input vector which indicates whether the previous iteration +* failed. 
+* +* finished: An input vector which allows the calculation to be skipped for single fits. 
+* +* n_fits_per_block: The number of fits calculated by each thread block. +* +* Calling the cuda_modify_step_widths function +* ============================================ +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_parameters_to_fit * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* cuda_modify_step_widths<<< blocks, threads >>>( +* hessians, +* lambdas, +* n_parameters, +* iteration_failed, +* finished, +* n_fits_per_block); +* +*/ + +__global__ void cuda_modify_step_widths( + float * hessians, + float const * lambdas, + unsigned int const n_parameters, + int const * iteration_failed, + int const * finished, + int const n_fits_per_block) +{ + int const shared_size = blockDim.x / n_fits_per_block; /* threads per fit, one per diagonal element */ + int const fit_in_block = threadIdx.x / shared_size; + int const parameter_index = threadIdx.x - fit_in_block * shared_size; + int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block; + + if (finished[fit_index]) + { + return; + } + + float * current_hessian = &hessians[fit_index * n_parameters * n_parameters]; + + if (iteration_failed[fit_index]) /* divide out (1 + lambda / 10); presumably lambda / 10 is the lambda used before cuda_prepare_next_iteration multiplied it by 10 - confirm */ + { + current_hessian[parameter_index * n_parameters + parameter_index] + = current_hessian[parameter_index * n_parameters + parameter_index] + / (1.0f + lambdas[fit_index] / 10.f); + } + + current_hessian[parameter_index * n_parameters + parameter_index] + = current_hessian[parameter_index * n_parameters + parameter_index] + * (1.0f + lambdas[fit_index]); +} + +/* Description of the cuda_update_parameters function +* =================================================== +* +* This function stores the fitting curve parameter values in prev_parameters and +* updates them 
after each iteration. +* +* Parameters: +* +* deltas: An input vector of concatenated delta values, which are added to the +* model parameters. 
+* +* parameters: An input and output vector of concatenated sets of model +* parameters. +* +* n_parameters_to_fit: The number of fitted curve parameters. +* +* parameters_to_fit_indices: The indices of fitted curve parameters. +* +* finished: An input vector which allows the calculation to be skipped for single fits. +* +* n_fits_per_block: The number of fits calculated by each thread block. +* +* Calling the cuda_update_parameters function +* =========================================== +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_parameters * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* cuda_update_parameters<<< blocks, threads >>>( +* parameters, prev_parameters, +* deltas, +* n_parameters_to_fit, +* parameters_to_fit_indices, +* finished, +* n_fits_per_block); +* +*/ + +__global__ void cuda_update_parameters( + float * parameters, + float * prev_parameters, + float const * deltas, + int const n_parameters_to_fit, + int const * parameters_to_fit_indices, + int const * finished, + int const n_fits_per_block) +{ + int const n_parameters = blockDim.x / n_fits_per_block; /* one thread per model parameter of each fit */ + int const fit_in_block = threadIdx.x / n_parameters; + int const parameter_index = threadIdx.x - fit_in_block * n_parameters; + int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block; + + float * current_parameters = &parameters[fit_index * n_parameters]; + float * current_prev_parameters = &prev_parameters[fit_index * n_parameters]; + + current_prev_parameters[parameter_index] = current_parameters[parameter_index]; /* saved for finished fits too. NOTE(review): parameter k is saved by thread k but updated by the thread whose parameters_to_fit_indices entry is k, with no barrier in between - confirm the save cannot observe the update */ + + if (finished[fit_index]) + { + return; + } + + if (parameter_index >= n_parameters_to_fit) /* only the first n_parameters_to_fit threads of a fit apply a delta */ + { + return; + } + + float const * current_deltas = &deltas[fit_index * n_parameters_to_fit]; + + current_parameters[parameters_to_fit_indices[parameter_index]] += current_deltas[parameter_index]; +} + +/* Description of the 
cuda_update_state_after_gaussjordan function +* ================================================================ +* +* This function interprets the singular flag vector of the Gauss Jordan function +* according to this LM implementation. +* +* Parameters: +* +* n_fits: The number of fits. +* +* singular_checks: An input vector used to report whether a fit is singular. +* +* states: An output vector of values which indicate whether the fitting process +* was carried out correctly or which problem occurred. If a hessian +* matrix of a fit is singular, it is set to 2. +* +* Calling the cuda_update_state_after_gaussjordan function +* ======================================================== +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* int const example_value = 256; +* +* threads.x = min(n_fits, example_value); +* blocks.x = int(ceil(float(n_fits) / float(threads.x))); +* +* cuda_update_state_after_gaussjordan<<< blocks, threads >>>( +* n_fits, +* singular_checks, +* states); +* +*/ + + +__global__ void cuda_update_state_after_gaussjordan( + int const n_fits, + int const * singular_checks, + int * states) +{ + int const fit_index = blockIdx.x * blockDim.x + threadIdx.x; /* one thread per fit */ + + if (fit_index >= n_fits) /* the grid may hold more threads than fits */ + { + return; + } + + if (singular_checks[fit_index] == 1) /* states of all other fits are left untouched */ + { + states[fit_index] = STATE_SINGULAR_HESSIAN; + } + +} + +/* Description of the cuda_check_for_convergence function +* ======================================================= +* +* This function checks after each iteration whether the fits are converged or not. +* It also checks whether the set maximum number of iterations is reached. 
+* +* Parameters: +* +* finished: An input and output vector which allows the calculation to be skipped +* for single fits. +* +* tolerance: The tolerance value for the convergence set by user. 
+* +* states: An output vector of values which indicate whether the fitting process +* was carried out correctly or which problem occurred. If the maximum +* number of iterations is reached without converging, it is set to 1. If +* the fit converged it keeps its initial value of 0. +* +* chi_squares: An input vector of chi-square values for multiple fits. Used for the +* convergence check. +* +* prev_chi_squares: An input vector of chi-square values for multiple fits calculated +* in the previous iteration. Used for the convergence check. +* +* iteration: The value of the current iteration. It is compared to the value +* of the maximum number of iterations set by the user. +* +* max_n_iterations: The maximum number of iterations set by user. +* +* n_fits: The number of fits. +* +* Calling the cuda_check_for_convergence function +* =============================================== +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. 
+* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* int const example_value = 256; +* +* threads.x = min(n_fits, example_value); +* blocks.x = int(ceil(float(n_fits) / float(threads.x))); +* +* cuda_check_for_convergence<<< blocks, threads >>>( +* finished, +* tolerance, +* states, +* chi_squares, +* prev_chi_squares, +* iteration, +* max_n_iterations, +* n_fits); +* +*/ + +__global__ void cuda_check_for_convergence( + int * finished, + float const tolerance, + int * states, + float const * chi_squares, + float const * prev_chi_squares, + int const iteration, + int const max_n_iterations, + int const n_fits) +{ + int const fit_index = blockIdx.x * blockDim.x + threadIdx.x; /* one thread per fit */ + + if (fit_index >= n_fits) + { + return; + } + + if (finished[fit_index]) + { + return; + } + + /* converged when the chi-square change is below tolerance * max(1, chi-square) */ + int const fit_found = abs(chi_squares[fit_index] - prev_chi_squares[fit_index]) < tolerance * fmaxf(1, chi_squares[fit_index]); + + int const max_n_iterations_reached = iteration == max_n_iterations - 1; /* true on the last permitted iteration */ + + if (fit_found) + { + finished[fit_index] = 1; + } + else if (max_n_iterations_reached) /* only the state is set here; finished is not touched in this branch */ + { + states[fit_index] = STATE_MAX_ITERATION; + } +} + +/* Description of the cuda_evaluate_iteration function +* ==================================================== +* +* This function evaluates the current iteration. +* - It marks a fit as finished if a problem occurred. +* - It saves the needed number of iterations if a fit finished. +* - It checks if all fits finished +* +* Parameters: +* +* all_finished: An output flag, that indicates whether all fits finished. +* +* n_iterations: An output vector of needed iterations for each fit. +* +* finished: An input and output vector which allows the evaluation to be skipped +* for single fits +* +* iteration: The value of the current iteration. 
+* +* states: An input vector of values which indicate whether the fitting process +* was carried out correctly or which problem occurred. +* +* n_fits: The number of fits. 
+* +* Calling the cuda_evaluate_iteration function +* ============================================ +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* int const example_value = 256; +* +* threads.x = min(n_fits, example_value); +* blocks.x = int(ceil(float(n_fits) / float(threads.x))); +* +* cuda_evaluate_iteration<<< blocks, threads >>>( +* all_finished, +* n_iterations, +* finished, +* iteration, +* states, +* n_fits); +* +*/ + +__global__ void cuda_evaluate_iteration( + int * all_finished, + int * n_iterations, + int * finished, + int const iteration, + int const * states, + int const n_fits) +{ + int const fit_index = blockIdx.x * blockDim.x + threadIdx.x; /* one thread per fit */ + + if (fit_index >= n_fits) + { + return; + } + + if (states[fit_index] != STATE_CONVERGED) /* any other state ends the fit */ + { + finished[fit_index] = 1; + } + + if (finished[fit_index] && n_iterations[fit_index] == 0) /* record the count only once; 0 means not recorded yet */ + { + n_iterations[fit_index] = iteration + 1; + } + + if (!finished[fit_index]) /* the flag is only ever cleared here. NOTE(review): presumably the host sets it to 1 before the launch - confirm */ + { + * all_finished = 0; + } +} + +/* Description of the cuda_prepare_next_iteration function +* ======================================================== +* +* This function prepares the next iteration. It either updates chi-square values +* or sets chi-squares and curve parameters to previous values. This function also +* updates lambda values. +* +* Parameters: +* +* lambdas: An input and output vector of values which control the step width by modifying +* the diagonal elements of the hessian matrices. +* +* chi_squares: An input and output vector of chi-square values for multiple fits. 
+* +* prev_chi_squares: An input and output vector of chi-square values for multiple fits calculated +* in the previous iteration. +* +* parameters: An output vector of concatenated sets of model parameters. +* +* prev_parameters: An input vector of concatenated sets of model parameters +* calculated in the previous iteration. +* +* n_fits: The number of fits. 
+* +* n_parameters: The number of fitting curve parameters. +* +* Calling the cuda_prepare_next_iteration function +* ================================================ +* +* When calling the function, the blocks and threads must be set up correctly, +* as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* int const example_value = 256; +* +* threads.x = min(n_fits, example_value); +* blocks.x = int(ceil(float(n_fits) / float(threads.x))); +* +* cuda_prepare_next_iteration<<< blocks, threads >>>( +* lambdas, +* chi_squares, +* prev_chi_squares, +* parameters, +* prev_parameters, +* n_fits, +* n_parameters); +* +*/ + +__global__ void cuda_prepare_next_iteration( + float * lambdas, + float * chi_squares, + float * prev_chi_squares, + float * parameters, + float const * prev_parameters, + int const n_fits, + int const n_parameters) +{ + int const fit_index = blockIdx.x * blockDim.x + threadIdx.x; /* one thread per fit */ + + if (fit_index >= n_fits) + { + return; + } + + if (chi_squares[fit_index] < prev_chi_squares[fit_index]) /* chi-square decreased: keep the step, shrink lambda */ + { + lambdas[fit_index] *= 0.1f; + prev_chi_squares[fit_index] = chi_squares[fit_index]; + } + else /* no decrease: grow lambda and restore chi-square and all parameters of this fit */ + { + lambdas[fit_index] *= 10.f; + chi_squares[fit_index] = prev_chi_squares[fit_index]; + for (int iparameter = 0; iparameter < n_parameters; iparameter++) + { + parameters[fit_index * n_parameters + iparameter] = prev_parameters[fit_index * n_parameters + iparameter]; + } + } +} diff --git a/Gpufit/cuda_kernels.cuh b/Gpufit/cuda_kernels.cuh new file mode 100644 index 0000000..6836480 --- /dev/null +++ b/Gpufit/cuda_kernels.cuh @@ -0,0 +1,108 @@ +#ifndef GPUFIT_CUDA_KERNELS_CUH_INCLUDED +#define GPUFIT_CUDA_KERNELS_CUH_INCLUDED + +#include + +extern __global__ void cuda_calculate_chi_squares( + float * chi_squares, + int * states, + int * iteration_falied, + float const * 
prev_chi_squares, + float const * data, + float const * values, + float const * weights, + int const n_points, + int const estimator_id, + int const * 
finished, + int const n_fits_per_block, + char * user_info, + std::size_t const user_info_size); +extern __global__ void cuda_calculate_gradients( + float * gradients, + float const * data, + float const * values, + float const * derivatives, + float const * weights, + int const n_points, + int const n_parameters, + int const n_parameters_to_fit, + int const * parameters_to_fit_indices, + int const estimator_id, + int const * finished, + int const * skip, + int const n_fits_per_block, + char * user_info, + std::size_t const user_info_size); +extern __global__ void cuda_calculate_hessians( + float * hessians, + float const * data, + float const * values, + float const * derivatives, + float const * weights, + int const n_points, + int const n_parameters, + int const n_parameters_to_fit, + int const * parameters_to_fit_indices, + int const estimator_id, + int const * skip, + int const * finished, + char * user_info, + std::size_t const user_info_size); +extern __global__ void cuda_modify_step_widths( + float * hessians, + float const * lambdas, + unsigned int const n_parameters, + int const * iteration_failed, + int const * finished, + int const n_fits_per_block); +extern __global__ void cuda_calc_curve_values( + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + int const * finished, + float * values, + float * derivatives, + int const n_fits_per_block, + int const model_id, + int const chunk_index, + char * user_info, + std::size_t const user_info_size); +extern __global__ void cuda_update_parameters( + float * parameters, + float * prev_parameters, + float const * deltas, + int const n_parameters_to_fit, + int const * parameters_to_fit_indices, + int const * finished, + int const n_fits_per_block); +extern __global__ void cuda_check_for_convergence( + int * finished, + float const tolerance, + int * states, + float const * chi_squares, + float const * prev_chi_squares, + int const iteration, + int const 
max_n_iterations, + int const n_fits); +extern __global__ void cuda_evaluate_iteration( + int * all_finished, + int * n_iterations, + int * finished, + int const iteration, + int const * states, + int const n_fits); +extern __global__ void cuda_prepare_next_iteration( + float * lambdas, + float * chi_squares, + float * prev_chi_squares, + float * function_parameters, + float const * prev_parameters, + int const n_fits, + int const n_parameters); +extern __global__ void cuda_update_state_after_gaussjordan( + int const n_fits, + int const * singular_checks, + int * states); + +#endif diff --git a/Gpufit/definitions.h b/Gpufit/definitions.h new file mode 100644 index 0000000..348220d --- /dev/null +++ b/Gpufit/definitions.h @@ -0,0 +1,12 @@ +#ifndef GPUFIT_DEFINITIONS_H_INCLUDED +#define GPUFIT_DEFINITIONS_H_INCLUDED + + // Status +#include +#define CUDA_CHECK_STATUS( cuda_function_call ) \ + if (cudaError_t const status = cuda_function_call) \ + { \ + throw std::runtime_error( cudaGetErrorString( status ) ) ; \ + } + +#endif diff --git a/Gpufit/examples/CMakeLists.txt b/Gpufit/examples/CMakeLists.txt new file mode 100644 index 0000000..bb4902f --- /dev/null +++ b/Gpufit/examples/CMakeLists.txt @@ -0,0 +1,14 @@ + +function( add_example module name ) + add_executable( ${name} ${name}.cpp ) + target_link_libraries( ${name} ${module} ) + set_property( TARGET ${name} + PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) + set_property( TARGET ${name} PROPERTY FOLDER GpufitExamples ) +endfunction() + +# Examples + +add_example( Gpufit Simple_Example ) +add_example( Gpufit Linear_Regression_Example ) +add_example( Gpufit Gauss_Fit_2D_Example ) diff --git a/Gpufit/examples/Gauss_Fit_2D_Example.cpp b/Gpufit/examples/Gauss_Fit_2D_Example.cpp new file mode 100644 index 0000000..8e628c7 --- /dev/null +++ b/Gpufit/examples/Gauss_Fit_2D_Example.cpp @@ -0,0 +1,260 @@ +#include "../gpufit.h" + +#include +#include +#include +#include +#include +#include + +void 
generate_gauss_2d( + std::vector< float > const & x, + std::vector< float > const & y, + std::vector< float > & g, + std::vector< float > const & p) +{ + // generates a Gaussian 2D peak function on a set of x and y values with some parameters p (size 5) + // we assume that x.size == y.size == g.size, no checks done + + // given x and y values and parameters p computes a model function g + for (size_t i = 0; i < x.size(); i++) + { + float arg = -((x[i] - p[1]) * (x[i] - p[1]) + (y[i] - p[2]) * (y[i] - p[2])) / (2 * p[3] * p[3]); + g[i] = p[0] * exp(arg) + p[4]; + } +} + +void gauss_fit_2d_example() +{ + /* + This example generates test data in form of 10000 two dimensional Gaussian + peaks with the size of 20x20 data points per peak. It is noised by Poisson + distributed noise. The initial guesses were randomized, within a specified + range of the true value. The GAUSS_2D model is fitted to the test data sets + using the MLE estimator. + + The console output shows + - the execution time, + - the ratio of converged fits including ratios of not converged fits for + different reasons, + - the values of the true parameters and the mean values of the fitted + parameters including their standard deviation, + - the mean chi square value + - and the mean number of iterations needed. + + True parameters and noise and number of fits is the same as for the Matlab/Python 2D Gaussian examples. 
+ */ + + + // number of fits, fit points and parameters + size_t const number_fits = 10000; + size_t const size_x = 20; + size_t const number_points = size_x * size_x; + size_t const number_parameters = 5; + + // true parameters (amplitude, center x position, center y position, width, offset) + std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f}; + + // initialize random number generator + std::mt19937 rng; + rng.seed(0); + std::uniform_real_distribution< float> uniform_dist(0, 1); + + // initial parameters (randomized) + std::vector< float > initial_parameters(number_fits * number_parameters); + for (size_t i = 0; i < number_fits; i++) + { + for (size_t j = 0; j < number_parameters; j++) + { + if (j == 1 || j == 2) + { + initial_parameters[i * number_parameters + j] + = true_parameters[j] + true_parameters[3] + * (-0.2f + 0.4f * uniform_dist(rng)); + } + else + { + initial_parameters[i * number_parameters + j] + = true_parameters[j] * (0.8f + 0.4f * uniform_dist(rng)); + } + } + } + + // generate x and y values + std::vector< float > x(number_points); + std::vector< float > y(number_points); + for (size_t i = 0; i < size_x; i++) + { + for (size_t j = 0; j < size_x; j++) { + x[i * size_x + j] = static_cast< float >(j); + y[i * size_x + j] = static_cast< float >(i); + } + } + + // generate test data with Poisson noise + std::vector< float > temp(number_points); + generate_gauss_2d(x, y, temp, true_parameters); + + std::vector< float > data(number_fits * number_points); + for (size_t i = 0; i < number_fits; i++) + { + for (size_t j = 0; j < number_points; j++) + { + std::poisson_distribution< int > poisson_dist(temp[j]); + data[i * number_points + j] = static_cast< float >(poisson_dist(rng)); + } + } + + // tolerance + float const tolerance = 0.001f; + + // maximal number of iterations + int const max_number_iterations = 20; + + // estimator ID + int const estimator_id = MLE; + + // model ID + int const model_id = GAUSS_2D; + + // parameters to fit (all 
of them) + std::vector< 
int > parameters_to_fit(number_parameters, 1); + + // output parameters + std::vector< float > output_parameters(number_fits * number_parameters); + std::vector< int > output_states(number_fits); + std::vector< float > output_chi_square(number_fits); + std::vector< int > output_number_iterations(number_fits); + + // call to gpufit (C interface) + std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now(); + int const status = gpufit + ( + number_fits, + number_points, + data.data(), + 0, + model_id, + initial_parameters.data(), + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + 0, + 0, + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now(); + + // check status + if (status != STATUS_OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } + + // print execution time + std::cout + << "execution time " + << std::chrono::duration_cast< std::chrono::milliseconds >(time_1 - time_0).count() << " ms\n"; + + // get fit states + std::vector< int > output_states_histogram(5, 0); + for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it) + { + output_states_histogram[*it]++; + } + + std::cout << "ratio converged " << (float)output_states_histogram[0] / number_fits << "\n"; + std::cout << "ratio max iteration exceeded " << (float)output_states_histogram[1] / number_fits << "\n"; + std::cout << "ratio singular hessian " << (float)output_states_histogram[2] / number_fits << "\n"; + std::cout << "ratio neg curvature MLE " << (float)output_states_histogram[3] / number_fits << "\n"; + std::cout << "ratio gpu not read " << (float)output_states_histogram[4] / number_fits << "\n"; + + // compute mean of fitted parameters for converged fits + std::vector< float > 
output_parameters_mean(number_parameters, 0); + for (size_t i = 0; i != 
number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_mean[j] += output_parameters[i * number_parameters + j]; + } + } + } + // normalize + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_mean[j] /= output_states_histogram[0]; + } + + // compute std of fitted parameters for converged fits + std::vector< float > output_parameters_std(number_parameters, 0); + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_std[j] + += (output_parameters[i * number_parameters + j] - output_parameters_mean[j]) + * (output_parameters[i * number_parameters + j] - output_parameters_mean[j]); + } + } + } + // normalize and take square root + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_std[j] = sqrt(output_parameters_std[j] / output_states_histogram[0]); + } + + // print true value, fitted mean and std for every parameter + for (size_t j = 0; j < number_parameters; j++) + { + std::cout + << "parameter " << j + << " true " << true_parameters[j] + << " fitted mean " << output_parameters_mean[j] + << " std " << output_parameters_std[j] << "\n"; + } + + // compute mean chi-square for those converged + float output_chi_square_mean = 0; + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + output_chi_square_mean += output_chi_square[i]; + } + } + output_chi_square_mean /= static_cast< float >(output_states_histogram[0]); + std::cout << "mean chi square " << output_chi_square_mean << "\n"; + + // compute mean number of iterations for those converged + float output_number_iterations_mean = 0; + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + output_number_iterations_mean += static_cast< float >(output_number_iterations[i]); + } + } + // normalize + 
output_number_iterations_mean /= static_cast< float >(output_states_histogram[0]); + std::cout << "mean number of iterations " << output_number_iterations_mean << "\n"; + +} + +int main(int argc, char *argv[]) +{ + gauss_fit_2d_example(); + + std::cout << std::endl << "Example completed!" << std::endl; + std::cout << "Press ENTER to exit" << std::endl; + std::getchar(); + + return 0; +} diff --git a/Gpufit/examples/Linear_Regression_Example.cpp b/Gpufit/examples/Linear_Regression_Example.cpp new file mode 100644 index 0000000..e70e05d --- /dev/null +++ b/Gpufit/examples/Linear_Regression_Example.cpp @@ -0,0 +1,207 @@ +#include "../gpufit.h" + +#include +#include +#include +#include + +void linear_regression_example() +{ + /* + This example generates test data in form of 10000 one dimensional linear + curves with the size of 20 data points per curve. It is noised by normal + distributed noise. The initial guesses were randomized, within a specified + range of the true value. The LINEAR_1D model is fitted to the test data sets + using the LSE estimator. The optional parameter user_info is used to pass + custom x positions of the data sets. The same x position values are used for + every fit. + + The console output shows + - the ratio of converged fits including ratios of not converged fits for + different reasons, + - the values of the true parameters and the mean values of the fitted + parameters including their standard deviation, + - the mean chi square value + - and the mean number of iterations needed. 
+    */
+
+    // number of fits, fit points and parameters
+    size_t const number_fits = 10000;
+    size_t const number_points = 20;
+    size_t const number_parameters = 2;
+
+    // custom x positions (x_i = 2^i) for the data points of every fit, stored in user info
+    std::vector< float > user_info(number_points);
+    for (size_t i = 0; i < number_points; i++)
+    {
+        user_info[i] = static_cast<float>(pow(2, i));
+    }
+
+    // size of user info in bytes
+    size_t const user_info_size = number_points * sizeof(float);
+
+    // initialize random number generator (fixed seed, so every run produces the same data)
+    std::mt19937 rng;
+    rng.seed(0);
+    std::uniform_real_distribution< float > uniform_dist(0, 1);
+    std::normal_distribution< float > normal_dist(0, 1);
+
+    // true parameters
+    std::vector< float > true_parameters { 5, 2 }; // offset, slope
+
+    // initial parameters (randomized within 80% .. 120% of the respective true value)
+    std::vector< float > initial_parameters(number_fits * number_parameters);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        // random offset, derived from the true offset
+        initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng));
+        // random slope, derived from the true slope (index 1, not the offset)
+        initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng));
+    }
+
+    // generate data: the same noise-free line for every fit plus normally distributed noise
+    std::vector< float > data(number_points * number_fits);
+    for (size_t i = 0; i != data.size(); i++)
+    {
+        size_t j = i / number_points; // the fit (not used below)
+        size_t k = i % number_points; // the position within a fit
+
+        float x = user_info[k];
+        float y = true_parameters[0] + x * true_parameters[1];
+        data[i] = y + normal_dist(rng);
+    }
+
+    // tolerance
+    float const tolerance = 0.001f;
+
+    // maximal number of iterations
+    int const max_number_iterations = 20;
+
+    // estimator ID
+    int const estimator_id = LSE;
+
+    // model ID
+    int const model_id = LINEAR_1D;
+
+    // parameters to fit (all of them)
+    std::vector< int > parameters_to_fit(number_parameters, 1);
+
+    // output parameters
+    std::vector< float > output_parameters(number_fits * number_parameters);
+    std::vector< int > output_states(number_fits);
+    std::vector< float > output_chi_square(number_fits);
+    std::vector< int > output_number_iterations(number_fits);
+
+    // call to gpufit (C interface)
+    int const status = gpufit
+    (
+        number_fits,
+        number_points,
+        data.data(),
+        0, // weights: a null pointer is passed
+        model_id,
+        initial_parameters.data(),
+        tolerance,
+        max_number_iterations,
+        parameters_to_fit.data(),
+        estimator_id,
+        user_info_size,
+        reinterpret_cast< char * >( user_info.data() ),
+        output_parameters.data(),
+        output_states.data(),
+        output_chi_square.data(),
+        output_number_iterations.data()
+    );
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        throw std::runtime_error(gpufit_get_last_error());
+    }
+
+    // get fit states (histogram indexed by the STATE_* codes 0 .. 4)
+    std::vector< int > output_states_histogram(5, 0);
+    for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+    {
+        output_states_histogram[*it]++;
+    }
+
+    std::cout << "ratio converged " << (float) output_states_histogram[0] / number_fits << "\n";
+    std::cout << "ratio max iteration exceeded " << (float) output_states_histogram[1] / number_fits << "\n";
+    std::cout << "ratio singular hessian " << (float) output_states_histogram[2] / number_fits << "\n";
+    std::cout << "ratio neg curvature MLE " << (float) output_states_histogram[3] / number_fits << "\n";
+    std::cout << "ratio gpu not ready " << (float) output_states_histogram[4] / number_fits << "\n";
+
+    // compute mean fitted parameters for converged fits
+    std::vector< float > output_parameters_mean(number_parameters, 0);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            // add offset
+            output_parameters_mean[0] += output_parameters[i * number_parameters + 0];
+            // add slope
+            output_parameters_mean[1] += output_parameters[i * number_parameters + 1];
+        }
+    }
+    output_parameters_mean[0] /= output_states_histogram[0];
+    output_parameters_mean[1] /= output_states_histogram[0];
+
+    // compute std of fitted parameters for converged fits
+
std::vector< float > output_parameters_std(number_parameters, 0); + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + // add squared deviation for offset + output_parameters_std[0] += (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]) * (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]); + // add squared deviation for slope + output_parameters_std[1] += (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]) * (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]); + } + } + // divide and take square root + output_parameters_std[0] = sqrt(output_parameters_std[0] / output_states_histogram[0]); + output_parameters_std[1] = sqrt(output_parameters_std[1] / output_states_histogram[0]); + + // print mean and std + std::cout << "offset true " << true_parameters[0] << " mean " << output_parameters_mean[0] << " std " << output_parameters_std[0] << "\n"; + std::cout << "slope true " << true_parameters[1] << " mean " << output_parameters_mean[1] << " std " << output_parameters_std[1] << "\n"; + + // compute mean chi-square for those converged + float output_chi_square_mean = 0; + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + output_chi_square_mean += output_chi_square[i]; + } + } + output_chi_square_mean /= static_cast(output_states_histogram[0]); + std::cout << "mean chi square " << output_chi_square_mean << "\n"; + + // compute mean number of iterations for those converged + float output_number_iterations_mean = 0; + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + output_number_iterations_mean += static_cast(output_number_iterations[i]); + } + } + + // normalize + output_number_iterations_mean /= static_cast(output_states_histogram[0]); + std::cout << "mean number of iterations " << output_number_iterations_mean << "\n"; +} + + +int main(int argc, 
char *argv[]) +{ + linear_regression_example(); + + std::cout << std::endl << "Example completed!" << std::endl; + std::cout << "Press ENTER to exit" << std::endl; + std::getchar(); + + return 0; +} diff --git a/Gpufit/examples/Simple_Example.cpp b/Gpufit/examples/Simple_Example.cpp new file mode 100644 index 0000000..6d8ea91 --- /dev/null +++ b/Gpufit/examples/Simple_Example.cpp @@ -0,0 +1,94 @@ +#include "../gpufit.h" +#include +#include + +void simple_example() +{ + /* + Simple example demonstrating a minimal call of all needed parameters to + the C interface. It can be built and executed, but in this example + gpufit doesn't do anything useful and it doesn't yield meaningful + output. No test data is generated. The values of the input data vector + and the initial fit parameters vector are set to 0. + + This example can be divided into three parts: + - definition of input and output parameters + - call to gpufit + - status check + */ + + /*************** definition of input and output parameters ***************/ + + // number of fits, number of points per fit + size_t const number_fits = 10; + size_t const number_points = 10; + + // model ID and number of parameters + int const model_id = GAUSS_1D; + size_t const number_parameters = 4; + + // initial parameters + std::vector< float > initial_parameters(number_fits * number_parameters); + + // data + std::vector< float > data(number_points * number_fits); + + // tolerance + float const tolerance = 0.001f; + + // maximal number of iterations + int const max_number_iterations = 10; + + // estimator ID + int const estimator_id = LSE; + + // parameters to fit (all of them) + std::vector< int > parameters_to_fit(number_parameters, 1); + + // output parameters + std::vector< float > output_parameters(number_fits * number_parameters); + std::vector< int > output_states(number_fits); + std::vector< float > output_chi_square(number_fits); + std::vector< int > output_number_iterations(number_fits); + +
/***************************** call to gpufit ****************************/ + + int const status = gpufit + ( + number_fits, + number_points, + data.data(), + 0, + model_id, + initial_parameters.data(), + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + 0, + 0, + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + + /****************************** status check *****************************/ + + if (status != STATUS_OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } +} + + +int main(int argc, char *argv[]) +{ + simple_example(); + + std::cout << std::endl << "Example completed!" << std::endl; + std::cout << "Press ENTER to exit" << std::endl; + std::getchar(); + + return 0; +} diff --git a/Gpufit/gauss_1d.cuh b/Gpufit/gauss_1d.cuh new file mode 100644 index 0000000..5fefc55 --- /dev/null +++ b/Gpufit/gauss_1d.cuh @@ -0,0 +1,91 @@ +#ifndef GPUFIT_GAUSS1D_CUH_INCLUDED +#define GPUFIT_GAUSS1D_CUH_INCLUDED + +/* Description of the calculate_gauss1d function +* ============================================== +* +* This function calculates the values of one-dimensional gauss model functions +* and their partial derivatives with respect to the model parameters. +* +* No independent variables are passed to this model function. Hence, the +* (X) coordinate of the first data value is assumed to be (0.0). For +* a fit size of M data points, the (X) coordinates of the data are +* simply the corresponding array index values of the data array, starting from +* zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: amplitude +* p[1]: center coordinate +* p[2]: width (standard deviation) +* p[3]: offset +* +* n_fits: The number of fits. (not used) +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. 
+* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. (not used) +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_gauss1d function +* ====================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. When calling the function, the blocks and threads of the __global__ +* function must be set up correctly, as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_points * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* global_function<<< blocks,threads >>>(parameter1, ...); +* +*/ + +__device__ void calculate_gauss1d( + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + float * values, + float * derivatives, + int const chunk_index, + char * user_info, + std::size_t const user_info_size) +{ + int const n_fits_per_block = blockDim.x / n_points; + int const fit_in_block = threadIdx.x / n_points; + int const point_index = threadIdx.x - (fit_in_block*n_points); + int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block; + + float * current_value = &values[fit_index * n_points]; + float const * p = ¶meters[fit_index * n_parameters]; + + float const argx = (point_index - p[1]) * (point_index - p[1]) / (2 * p[2] * p[2]); + float const ex = exp(-argx); + current_value[point_index] = p[0] * ex + p[3]; + + // derivatives + + float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index]; + + current_derivative[0] = ex; + current_derivative[1 * n_points] = p[0] * ex * (point_index - p[1]) / (p[2] * p[2]); + current_derivative[2 * n_points] = p[0] * ex * 
(point_index - p[1]) * (point_index - p[1]) / (p[2] * p[2] * p[2]); + current_derivative[3 * n_points] = 1.f; +} + +#endif diff --git a/Gpufit/gauss_2d.cuh b/Gpufit/gauss_2d.cuh new file mode 100644 index 0000000..0448cfa --- /dev/null +++ b/Gpufit/gauss_2d.cuh @@ -0,0 +1,97 @@ +#ifndef GPUFIT_GAUSS2D_CUH_INCLUDED +#define GPUFIT_GAUSS2D_CUH_INCLUDED + +/* Description of the calculate_gauss2d function +* ============================================== +* +* This function calculates the values of two-dimensional gauss model functions +* and their partial derivatives with respect to the model parameters. +* +* No independent variables are passed to this model function. Hence, the +* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For +* a fit size of M x N data points, the (X, Y) coordinates of the data are +* simply the corresponding array index values of the data array, starting from +* zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: amplitude +* p[1]: center coordinate x +* p[2]: center coordinate y +* p[3]: width (standard deviation; equal width in x and y dimensions) +* p[4]: offset +* +* n_fits: The number of fits. (not used) +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. +* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. (not used) +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_gauss2d function +* ====================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. 
When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3 threads(1, 1, 1);
+*   dim3 blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss2d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_points_x = sqrt((float)n_points); // row length; the fit region is treated as a square grid
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - fit_in_block * n_points;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    int const point_index_y = point_index / n_points_x; // row of this data point
+    int const point_index_x = point_index - point_index_y * n_points_x; // column of this data point
+
+    float* current_value = &values[fit_index * n_points];
+    float const * p = &parameters[fit_index * n_parameters];
+
+    float const argx = (point_index_x - p[1]) * (point_index_x - p[1]) / (2 * p[3] * p[3]);
+    float const argy = (point_index_y - p[2]) * (point_index_y - p[2]) / (2 * p[3] * p[3]);
+    float const ex = exp(-(argx + argy));
+    current_value[point_index] = p[0] * ex + p[4];
+
+    // derivatives (the entry for parameter k of this point is stored at offset k * n_points)
+
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;
+    current_derivative[1 * n_points] = p[0] * ex * (point_index_x - p[1]) / (p[3] * p[3]);
+    current_derivative[2 * n_points] = p[0] * ex * (point_index_y - p[2]) / (p[3] * p[3]);
+    current_derivative[3 * n_points] = ex * p[0] * ((point_index_x - p[1]) * (point_index_x - p[1]) + (point_index_y - p[2]) * (point_index_y - p[2])) / (p[3] * p[3] * p[3]);
+    current_derivative[4 * n_points] = 1;
+}
+
+#endif
diff --git
a/Gpufit/gauss_2d_elliptic.cuh b/Gpufit/gauss_2d_elliptic.cuh new file mode 100644 index 0000000..5417667 --- /dev/null +++ b/Gpufit/gauss_2d_elliptic.cuh @@ -0,0 +1,100 @@ +#ifndef GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED +#define GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED + +/* Description of the calculate_gauss2delliptic function +* ====================================================== +* +* This function calculates the values of two-dimensional elliptic gauss model +* functions and their partial derivatives with respect to the model parameters. +* +* No independent variables are passed to this model function. Hence, the +* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For +* a fit size of M x N data points, the (X, Y) coordinates of the data are +* simply the corresponding array index values of the data array, starting from +* zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: amplitude +* p[1]: center coordinate x +* p[2]: center coordinate y +* p[3]: width x (standard deviation) +* p[4]: width y (standard deviation) +* p[5]: offset +* +* n_fits: The number of fits. (not used) +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. +* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. (not used) +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_gauss2delliptic function +* ============================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. 
When calling the function, the blocks and threads of the __global__ +* function must be set up correctly, as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_points * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* global_function<<< blocks,threads >>>(parameter1, ...); +* +*/ + +__device__ void calculate_gauss2delliptic( + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + float * values, + float * derivatives, + int const chunk_index, + char * user_info, + std::size_t const user_info_size) +{ + int const n_points_x = sqrt((float)n_points); + int const n_fits_per_block = blockDim.x / n_points; + int const fit_in_block = threadIdx.x / n_points; + int const point_index = threadIdx.x - (fit_in_block*n_points); + int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block; + + int const point_index_y = point_index / n_points_x; + int const point_index_x = point_index - point_index_y * n_points_x; + + float* current_value = &values[fit_index * n_points]; + float const * p = ¶meters[fit_index * n_parameters]; + + float const argx = (point_index_x - p[1]) * (point_index_x - p[1]) / (2 * p[3] * p[3]); + float const argy = (point_index_y - p[2]) * (point_index_y - p[2]) / (2 * p[4] * p[4]); + float const ex = exp(-(argx + argy)); + current_value[point_index] = p[0] * ex + p[5]; + + // derivatives + + float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index]; + + current_derivative[0] = ex; + current_derivative[1 * n_points] = p[0] * ex * (point_index_x - p[1]) / (p[3] * p[3]); + current_derivative[2 * n_points] = p[0] * ex * (point_index_y - p[2]) / (p[4] * p[4]); + current_derivative[3 * n_points] = p[0] * ex * (point_index_x - p[1]) * (point_index_x - p[1]) / (p[3] * p[3] * p[3]); + current_derivative[4 * n_points] = p[0] * ex * (point_index_y - p[2]) * (point_index_y - p[2]) / (p[4] * p[4] * p[4]); + 
current_derivative[5 * n_points] = 1; +} + +#endif diff --git a/Gpufit/gauss_2d_rotated.cuh b/Gpufit/gauss_2d_rotated.cuh new file mode 100644 index 0000000..09d042f --- /dev/null +++ b/Gpufit/gauss_2d_rotated.cuh @@ -0,0 +1,106 @@ +#ifndef GPUFIT_GAUSS2DROTATED_CUH_INCLUDED +#define GPUFIT_GAUSS2DROTATED_CUH_INCLUDED + +/* Description of the calculate_gauss2drotated function +* ===================================================== +* +* This function calculates the values of two-dimensional elliptic gauss model +* functions including a rotation parameter and their partial derivatives with +* respect to the model parameters. +* +* No independent variables are passed to this model function. Hence, the +* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For +* a fit size of M x N data points, the (X, Y) coordinates of the data are +* simply the corresponding array index values of the data array, starting from +* zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: amplitude +* p[1]: center coordinate x +* p[2]: center coordinate y +* p[3]: width x (standard deviation) +* p[4]: width y (standard deviation) +* p[5]: offset +* p[6]: rotation angle [radians] +* +* n_fits: The number of fits. (not used) +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. +* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. (not used) +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_gauss2drotated function +* ============================================= +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. 
When calling the function, the blocks and threads of the __global__ +* function must be set up correctly, as shown in the following example code. +* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_points * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* global_function<<< blocks,threads >>>(parameter1, ...); +* +*/ + +__device__ void calculate_gauss2drotated( + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + float * values, + float * derivatives, + int const chunk_index, + char * user_info, + std::size_t const user_info_size) +{ + int const n_points_x = sqrt((float)n_points); + int const n_fits_per_block = blockDim.x / n_points; + int const fit_in_block = threadIdx.x / n_points; + int const point_index = threadIdx.x - (fit_in_block*n_points); + int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block; + + int const point_index_y = point_index / n_points_x; + int const point_index_x = point_index - point_index_y * n_points_x; + + float* current_value = &values[fit_index * n_points]; + float const * p = ¶meters[fit_index * n_parameters]; + + float const cosp6 = cosf(p[6]); + float const sinp6 = sinf(p[6]); + + float const arga = (point_index_x - p[1]) * cosp6 - (point_index_y - p[2]) * sinp6; + float const argb = (point_index_x - p[1]) * sinp6 + (point_index_y - p[2]) * cosp6; + float const ex = exp(-0.5 * (((arga / p[3]) * (arga / p[3])) + ((argb / p[4]) * (argb / p[4])))); + current_value[point_index] = p[0] * ex + p[5]; + + // derivatives + + float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index]; + + current_derivative[0] = ex; + current_derivative[1 * n_points] = (((p[0] * cosp6 * arga) / (p[3] * p[3])) + ((p[0] * sinp6 * argb) / (p[4] * p[4]))) * ex; + current_derivative[2 * n_points] = (((-p[0] * sinp6 * arga) / (p[3] * p[3])) + ((p[0] * cosp6 * argb) / (p[4] * p[4]))) * ex; + current_derivative[3 * n_points] = p[0] * arga * 
arga / (p[3] * p[3] * p[3]) * ex; + current_derivative[4 * n_points] = p[0] * argb * argb / (p[4] * p[4] * p[4]) * ex; + current_derivative[5 * n_points] = 1; + current_derivative[6 * n_points] = p[0] * arga * argb * (1.0 / (p[3] * p[3]) - 1.0 / (p[4] * p[4])) * ex; +} + +#endif diff --git a/Gpufit/gpu_data.cu b/Gpufit/gpu_data.cu new file mode 100644 index 0000000..afbca05 --- /dev/null +++ b/Gpufit/gpu_data.cu @@ -0,0 +1,175 @@ +#include "gpu_data.cuh" +#include +#include + +GPUData::GPUData(Info const & info) : + chunk_size_(0), + info_(info), + + data_( info_.max_chunk_size_*info_.n_points_ ), + weights_( info_.use_weights_ ? info_.n_points_ * info_.max_chunk_size_ : 0 ), + parameters_( info_.max_chunk_size_*info_.n_parameters_ ), + prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ), + parameters_to_fit_indices_( info_.n_parameters_to_fit_ ), + user_info_( info_.user_info_size_ ), + + chi_squares_( info_.max_chunk_size_ ), + prev_chi_squares_( info_.max_chunk_size_ ), + gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ ), + hessians_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_ ), + deltas_(info_.max_chunk_size_ * info_.n_parameters_to_fit_), + + values_( info_.max_chunk_size_ * info_.n_points_ ), + derivatives_( info_.max_chunk_size_ * info_.n_points_ * info_.n_parameters_ ), + + lambdas_( info_.max_chunk_size_ ), + states_( info_.max_chunk_size_ ), + finished_( info_.max_chunk_size_ ), + iteration_falied_(info_.max_chunk_size_), + all_finished_( 1 ), + n_iterations_( info_.max_chunk_size_ ) +{ + +} + +void GPUData::reset(int const chunk_size) +{ + chunk_size_ = chunk_size; + + set(data_, 0.f, chunk_size_ * info_.n_points_); + if (info_.use_weights_) + set(weights_, 0.f, chunk_size_ * info_.n_points_); + set(parameters_, 0.f, chunk_size_ * info_.n_parameters_); + set(prev_parameters_, 0.f, chunk_size_ * info_.n_parameters_); + set(parameters_to_fit_indices_, 0, info_.n_parameters_to_fit_); + + 
set(chi_squares_, 0.f, chunk_size_); + set(prev_chi_squares_, 0.f, chunk_size_); + set(gradients_, 0.f, chunk_size_ * info_.n_parameters_to_fit_); + set(hessians_, 0.f, chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_); + set(deltas_, 0.f, chunk_size_ * info_.n_parameters_to_fit_); + + set(values_, 0.f, chunk_size_*info_.n_points_); + set(derivatives_, 0.f, chunk_size_ * info_.n_points_ * info_.n_parameters_); + + set(lambdas_, 0.f, chunk_size_); + set(states_, 0, chunk_size_); + set(finished_, 0, chunk_size_); + set(iteration_falied_, 0, chunk_size_); + set(all_finished_, 0, 1); + set(n_iterations_, 0, chunk_size_); +} + +void GPUData::init +( + int const chunk_index, + float const * const data, + float const * const weights, + float const * const initial_parameters, + std::vector const & parameters_to_fit_indices) +{ + chunk_index_ = chunk_index; + write( + data_, + &data[chunk_index_*info_.max_chunk_size_*info_.n_points_], + chunk_size_*info_.n_points_); + if (info_.use_weights_) + write(weights_, &weights[chunk_index_*info_.max_chunk_size_*info_.n_points_], + chunk_size_*info_.n_points_); + write( + parameters_, + &initial_parameters[chunk_index_*info_.max_chunk_size_*info_.n_parameters_], + chunk_size_ * info_.n_parameters_); + write(parameters_to_fit_indices_, parameters_to_fit_indices); + + set(lambdas_, 0.001f, chunk_size_); +} + +void GPUData::init_user_info(char const * const user_info) +{ + if (info_.user_info_size_ > 0) + write(user_info_, user_info, info_.user_info_size_); +} + +void GPUData::read(bool * dst, int const * src) +{ + int int_dst = 0; + CUDA_CHECK_STATUS(cudaMemcpy(&int_dst, src, sizeof(int), cudaMemcpyDeviceToHost)); + * dst = (int_dst == 1) ? 
true : false; +} + +void GPUData::write(float* dst, float const * src, int const count) +{ + CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(float), cudaMemcpyHostToDevice)); +} + +void GPUData::write(int* dst, std::vector const & src) +{ + std::size_t const size = src.size() * sizeof(int); + CUDA_CHECK_STATUS(cudaMemcpy(dst, src.data(), size, cudaMemcpyHostToDevice)); +} + +void GPUData::write(char* dst, char const * src, std::size_t const count) +{ + CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(char), cudaMemcpyHostToDevice)); +} + +void GPUData::copy(float * dst, float const * src, std::size_t const count) +{ + CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(float), cudaMemcpyDeviceToDevice)); +} + +__global__ void set_kernel(int* dst, int const value, int const count) +{ + int const index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= count) + return; + + dst[index] = value; +} + +void GPUData::set(int* arr, int const value, int const count) +{ + int const tx = 256; + int const bx = (count / tx) + 1; + + dim3 threads(tx, 1, 1); + dim3 blocks(bx, 1, 1); + + set_kernel<<< blocks, threads >>>(arr, value, count); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void GPUData::set(int* arr, int const value) +{ + int const tx = 1; + int const bx = 1; + + dim3 threads(tx, 1, 1); + dim3 blocks(bx, 1, 1); + + set_kernel<<< blocks, threads >>>(arr, value, 1); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +__global__ void set_kernel(float* dst, float const value, std::size_t const count) +{ + std::size_t const index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= count) + return; + + dst[index] = value; +} + +void GPUData::set(float* arr, float const value, int const count) +{ + int const tx = 256; + int const bx = (count / tx) + 1; + + dim3 threads(tx, 1, 1); + dim3 blocks(bx, 1, 1); + set_kernel<<< blocks, threads >>>(arr, value, count); + CUDA_CHECK_STATUS(cudaGetLastError()); +} diff --git a/Gpufit/gpu_data.cuh 
b/Gpufit/gpu_data.cuh new file mode 100644 index 0000000..b35f09d --- /dev/null +++ b/Gpufit/gpu_data.cuh @@ -0,0 +1,122 @@ +#ifndef GPUFIT_GPU_DATA_CUH_INCLUDED +#define GPUFIT_GPU_DATA_CUH_INCLUDED + +#include "info.h" + +#include + +#include +#include +#include + +template< typename Type > +struct Device_Array +{ + explicit Device_Array( std::size_t const size ) + { + std::size_t const maximum_size = std::numeric_limits< std::size_t >::max() ; + std::size_t const type_size = sizeof( Type ) ; + if (size <= maximum_size / type_size) + { + cudaError_t const status = cudaMalloc( & data_, size * type_size ) ; + if (status == cudaSuccess) + { + return ; + } + else + { + throw std::runtime_error( cudaGetErrorString( status ) ) ; + } + } + else + { + throw std::runtime_error( "maximum array size exceeded" ) ; + } + } + + ~Device_Array() { cudaFree( data_ ) ; } + + operator Type * () { return static_cast< Type * >( data_ ) ; } + operator Type const * () const { return static_cast< Type * >( data_ ) ; } + + Type * copy( std::size_t const size, Type * const to ) const + { + /// \todo check size parameter + + std::size_t const type_size = sizeof( Type ) ; + cudaError_t const status + = cudaMemcpy( to, data_, size * type_size, cudaMemcpyDeviceToHost ) ; + if (status == cudaSuccess) + { + return to + size ; + } + else + { + throw std::runtime_error( cudaGetErrorString( status ) ) ; + } + } + +private: + void * data_ ; +} ; + +class GPUData +{ +public: + GPUData(Info const & info); + + void reset(int const chunk_size); + void init + ( + int const chunk_index, + float const * data, + float const * weights, + float const * initial_parameters, + std::vector const & parameters_to_fit_indices + ) ; + void init_user_info(char const * user_info); + + void read(bool * dst, int const * src); + void set(int* arr, int const value); + void copy(float * dst, float const * src, std::size_t const count); + +private: + void set(float* arr, float const value, int const count); + void set(int* 
arr, int const value, int const count); + void write(float* dst, float const * src, int const count); + void write(int* dst, std::vector const & src); + void write(char* dst, char const * src, std::size_t const count); + +private: + int chunk_size_; + Info const & info_; + +public: + int chunk_index_; + + Device_Array< float > data_; + Device_Array< float > weights_; + Device_Array< float > parameters_; + Device_Array< float > prev_parameters_; + Device_Array< int > parameters_to_fit_indices_; + Device_Array< char > user_info_; + + Device_Array< float > chi_squares_; + Device_Array< float > prev_chi_squares_; + Device_Array< float > gradients_; + Device_Array< float > hessians_; + Device_Array< float > deltas_; + + + Device_Array< float > values_; + Device_Array< float > derivatives_; + + Device_Array< float > lambdas_; + Device_Array< int > states_; + Device_Array< int > finished_; + Device_Array< int > iteration_falied_; + Device_Array< int > all_finished_; + Device_Array< int > n_iterations_; +}; + +#endif diff --git a/Gpufit/gpufit.cpp b/Gpufit/gpufit.cpp new file mode 100644 index 0000000..e7f2d31 --- /dev/null +++ b/Gpufit/gpufit.cpp @@ -0,0 +1,130 @@ +#include "gpufit.h" +#include "interface.h" + +#include + +std::string last_error ; + +int gpufit +( + size_t n_fits, + size_t n_points, + float * data, + float * weights, + int model_id, + float * initial_parameters, + float tolerance, + int max_n_iterations, + int * parameters_to_fit, + int estimator_id, + size_t user_info_size, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations +) +try +{ + __int32 n_points_32 = 0; + if (n_points <= (unsigned int)(std::numeric_limits<__int32>::max())) + { + n_points_32 = __int32(n_points); + } + else + { + throw std::runtime_error("maximum number of data points per fit exceeded"); + } + + FitInterface fi( + data, + weights, + n_fits, + n_points_32, + tolerance, + max_n_iterations, + estimator_id, 
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK ;
+}
+catch( std::exception & exception )
+{
+    last_error = exception.what() ;
+
+    return STATUS_ERROR ;
+}
+catch( ... )
+{
+    last_error = "unknown error" ;
+
+    return STATUS_ERROR;
+}
+
+char const * gpufit_get_last_error()
+{
+    return last_error.c_str() ;
+}
+
+int gpufit_cuda_available()
+{
+    try
+    {
+        getDeviceCount();
+        return 1;
+    }
+    catch (std::exception & exception)
+    {
+        last_error = exception.what();
+
+        return 0;
+    }
+}
+
+int gpufit_get_cuda_version(int * runtime_version, int * driver_version) // returns 1 on success, 0 on failure
+{
+    // the CUDA version queries report failure through their return code, they do not throw
+    cudaError_t status = cudaRuntimeGetVersion(runtime_version);
+    if (status == cudaSuccess)
+    {
+        status = cudaDriverGetVersion(driver_version);
+    }
+    if (status != cudaSuccess)
+    {
+        last_error = cudaGetErrorString(status); // retrievable via gpufit_get_last_error()
+        return 0;
+    }
+    return 1;
+}
+
+int gpufit_portable_interface(int argc, void *argv[])
+{
+
+    return gpufit(
+        *((size_t *) argv[0]),
+        *((size_t *) argv[1]),
+        (float *) argv[2],
+        (float *) argv[3],
+        *((int *) argv[4]),
+        (float *) argv[5],
+        *((float *) argv[6]),
+        *((int *) argv[7]),
+        (int *) argv[8],
+        *((int *) argv[9]),
+        *((size_t *) argv[10]),
+        (char *) argv[11],
+        (float *) argv[12],
+        (int *) argv[13],
+        (float *) argv[14],
+        (int *) argv[15]);
+
+}
\ No newline at end of file
diff --git a/Gpufit/gpufit.h b/Gpufit/gpufit.h
new file mode 100644
index 0000000..985e6d7
--- /dev/null
+++ b/Gpufit/gpufit.h
@@ -0,0 +1,63 @@
+#ifndef GPU_FIT_H_INCLUDED
+#define GPU_FIT_H_INCLUDED
+
+// fitting model ID
+#define GAUSS_1D 0
+#define GAUSS_2D 1
+#define GAUSS_2D_ELLIPTIC 2
+#define GAUSS_2D_ROTATED 3
+#define CAUCHY_2D_ELLIPTIC 4
+#define LINEAR_1D 5
+
+// estimator ID
+#define LSE 0
+#define MLE 1
+
+// fit state
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+#define STATE_GPU_NOT_READY 4
+
+// gpufit return
state +#define STATUS_OK 0 +#define STATUS_ERROR -1 + +#ifdef __cplusplus +extern "C" { +#endif + +int gpufit +( + size_t n_fits, + size_t n_points, + float * data, + float * weights, + int model_id, + float * initial_parameters, + float tolerance, + int max_n_iterations, + int * parameters_to_fit, + int estimator_id, + size_t user_info_size, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations +) ; + +char const * gpufit_get_last_error() ; + +int gpufit_cuda_available(); + +int gpufit_get_cuda_version(int * runtime_version, int * driver_version); + +int gpufit_portable_interface(int argc, void *argv[]); + +#ifdef __cplusplus +} +#endif + +#endif // GPU_FIT_H_INCLUDED diff --git a/Gpufit/info.cpp b/Gpufit/info.cpp new file mode 100644 index 0000000..e2fecca --- /dev/null +++ b/Gpufit/info.cpp @@ -0,0 +1,124 @@ +#include "info.h" +#include + +Info::Info() : + n_parameters_(0), + n_parameters_to_fit_(0), + max_chunk_size_(0), + max_n_iterations_(0), + n_points_(0), + power_of_two_n_points_(0), + n_fits_(0), + user_info_size_(0), + n_fits_per_block_(0), + model_id_(0), + estimator_id_(0), + max_threads_(0), + max_blocks_(0), + available_gpu_memory_(0) +{ +} + +Info::~Info(void) +{ +} + +void Info::set_number_of_parameters_to_fit(int const * const parameters_to_fit) +{ + n_parameters_to_fit_ = n_parameters_; + + for (int i = 0; i < n_parameters_; i++) + { + if (!parameters_to_fit[i]) + { + n_parameters_to_fit_--; + } + } +} + +void Info::set_fits_per_block(std::size_t const current_chunk_size) +{ + n_fits_per_block_ = 8; + bool is_divisible = false; + bool enough_threads = false; + do + { + n_fits_per_block_ /= 2; + is_divisible = current_chunk_size % n_fits_per_block_ == 0; + enough_threads = n_fits_per_block_ * n_points_ < max_threads_ / 4; + } while ((!is_divisible || !enough_threads) && n_fits_per_block_ > 1); +} + +void Info::set_max_chunk_size() +{ + int one_fit_memory + = sizeof(float) 
+ *(2 * n_points_ + + 2 * n_parameters_ + + 2 * n_parameters_to_fit_ + + 1 * n_parameters_to_fit_*n_parameters_to_fit_ + + 1 * n_points_*n_parameters_ + + 4) + + sizeof(int) + * 3; + + if (use_weights_) + one_fit_memory += sizeof(float) * n_points_; + + std::size_t tmp_chunk_size = available_gpu_memory_ / one_fit_memory; + + if (tmp_chunk_size == 0) + { + throw std::runtime_error("not enough free GPU memory available"); + } + + tmp_chunk_size = (std::min)(tmp_chunk_size, max_blocks_); + + std::size_t highest_factor = 1; + + if (n_parameters_to_fit_) + { + highest_factor + = n_points_ + * n_parameters_to_fit_ + * n_parameters_to_fit_ + * sizeof(float); + } + else + { + highest_factor = n_points_ * n_parameters_; + } + + std::size_t const highest_size_t_value + = std::numeric_limits< std::size_t >::max(); + + if (tmp_chunk_size > highest_size_t_value / highest_factor) + { + tmp_chunk_size = highest_size_t_value / highest_factor; + } + + max_chunk_size_ = tmp_chunk_size; + + int i = 1; + int const divisor = 10; + while (tmp_chunk_size > divisor) + { + i *= divisor; + tmp_chunk_size /= divisor; + } + max_chunk_size_ = max_chunk_size_ / i * i; + max_chunk_size_ = std::min(max_chunk_size_, n_fits_); +} + + +void Info::configure() +{ + power_of_two_n_points_ = 1; + while (power_of_two_n_points_ < n_points_) + { + power_of_two_n_points_ *= 2; + } + + get_gpu_properties(); + set_max_chunk_size(); +} diff --git a/Gpufit/info.cu b/Gpufit/info.cu new file mode 100644 index 0000000..60568f8 --- /dev/null +++ b/Gpufit/info.cu @@ -0,0 +1,31 @@ +#include "info.h" +#include + +void Info::get_gpu_properties() +{ + cudaDeviceProp devProp; + CUDA_CHECK_STATUS(cudaGetDeviceProperties(&devProp, 0)); + max_threads_ = devProp.maxThreadsPerBlock; + max_blocks_ = devProp.maxGridSize[0]; + + std::size_t free_bytes; + std::size_t total_bytes; + CUDA_CHECK_STATUS(cudaMemGetInfo(&free_bytes, &total_bytes)); + available_gpu_memory_ = std::size_t(double(free_bytes) * 0.1); + + if 
(available_gpu_memory_ > user_info_size_) + { + available_gpu_memory_ -= user_info_size_; + } + else + { + throw std::runtime_error("maximum user info size exceeded"); + } +} + +int getDeviceCount() +{ + int deviceCount; + CUDA_CHECK_STATUS(cudaGetDeviceCount(&deviceCount)); + return deviceCount; +} \ No newline at end of file diff --git a/Gpufit/info.h b/Gpufit/info.h new file mode 100644 index 0000000..3f17623 --- /dev/null +++ b/Gpufit/info.h @@ -0,0 +1,48 @@ +#ifndef GPUFIT_PARAMETERS_H_INCLUDED +#define GPUFIT_PARAMETERS_H_INCLUDED + +#include "definitions.h" +#include + + +class Info +{ +public: + Info(); + virtual ~Info(); + + void set_fits_per_block(std::size_t const n_fits); + void set_number_of_parameters_to_fit(int const * parameters_to_fit); + void configure(); + +private: + void get_gpu_properties(); + void set_max_chunk_size(); + +public: + int n_parameters_; + int n_parameters_to_fit_; + + int n_points_; + int power_of_two_n_points_; + + std::size_t n_fits_; + + std::size_t user_info_size_; + + int max_n_iterations_; + std::size_t max_chunk_size_; + int n_fits_per_block_; + int model_id_; + int estimator_id_; + bool use_weights_; + +private: + int max_threads_; + std::size_t max_blocks_; + std::size_t available_gpu_memory_; +}; + +int getDeviceCount(); + +#endif diff --git a/Gpufit/interface.cpp b/Gpufit/interface.cpp new file mode 100644 index 0000000..e8ddac3 --- /dev/null +++ b/Gpufit/interface.cpp @@ -0,0 +1,123 @@ +#include "gpufit.h" +#include "interface.h" + +FitInterface::FitInterface +( + float const * data, + float const * weights, + std::size_t n_fits, + int n_points, + float tolerance, + int max_n_iterations, + int estimator_id, + float const * initial_parameters, + int * parameters_to_fit, + char * user_info, + std::size_t user_info_size, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations +) : + data_( data ), + weights_( weights ), + initial_parameters_( initial_parameters ), + 
parameters_to_fit_( parameters_to_fit ), + user_info_( user_info ), + n_fits_(n_fits), + n_points_(n_points), + tolerance_(tolerance), + max_n_iterations_(max_n_iterations), + estimator_id_(estimator_id), + user_info_size_(user_info_size), + output_parameters_( output_parameters ), + output_states_(output_states), + output_chi_squares_(output_chi_squares), + output_n_iterations_(output_n_iterations), + n_parameters_(0) +{} + +FitInterface::~FitInterface() +{} + +void FitInterface::check_sizes() +{ + std::size_t maximum_size = std::numeric_limits< std::size_t >::max(); + + if (n_fits_ > maximum_size / n_points_ / sizeof(float)) + { + throw std::runtime_error("maximum absolute number of data points exceeded"); + } + + if (n_fits_ > maximum_size / n_parameters_ / sizeof(float)) + { + throw std::runtime_error("maximum number of fits and/or parameters exceeded"); + } +} + +void FitInterface::set_number_of_parameters(int const model_id) +{ + switch (model_id) + { + case GAUSS_1D: + n_parameters_ = 4; + break; + case GAUSS_2D: + n_parameters_ = 5; + break; + case GAUSS_2D_ELLIPTIC: + n_parameters_ = 6; + break; + case GAUSS_2D_ROTATED: + n_parameters_ = 7; + break; + case CAUCHY_2D_ELLIPTIC: + n_parameters_ = 6; + break; + case LINEAR_1D: + n_parameters_ = 2; + break; + default: + break; + } +} + +void FitInterface::configure_info(Info & info, int const model_id) +{ + info.model_id_ = model_id; + info.n_fits_ = n_fits_; + info.n_points_ = n_points_; + info.max_n_iterations_ = max_n_iterations_; + info.estimator_id_ = estimator_id_; + info.user_info_size_ = user_info_size_; + info.n_parameters_ = n_parameters_; + info.use_weights_ = weights_ ? 
true : false; + + info.set_number_of_parameters_to_fit(parameters_to_fit_); + info.configure(); +} + +void FitInterface::fit(int const model_id) +{ + set_number_of_parameters(model_id); + + check_sizes(); + + Info info; + configure_info(info, model_id); + + LMFit lmfit + ( + data_, + weights_, + info, + initial_parameters_, + parameters_to_fit_, + user_info_, + output_parameters_, + output_states_, + output_chi_squares_, + output_n_iterations_ + ) ; + lmfit.run(tolerance_); +} diff --git a/Gpufit/interface.h b/Gpufit/interface.h new file mode 100644 index 0000000..27814aa --- /dev/null +++ b/Gpufit/interface.h @@ -0,0 +1,63 @@ +#ifndef GPUFIT_INTERFACE_H_INCLUDED +#define GPUFIT_INTERFACE_H_INCLUDED + +#include "lm_fit.h" + +static_assert( sizeof( int ) == 4, "32 bit 'int' type required" ) ; + +class FitInterface +{ +public: + FitInterface + ( + float const * data, + float const * weights, + std::size_t n_fits, + int n_points, + float tolerance, + int max_n_iterations, + int estimator_id, + float const * initial_parameters, + int * parameters_to_fit, + char * user_info, + std::size_t user_info_size, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations + ) ; + + virtual ~FitInterface(); + void fit(int const model_id); + +private: + void set_number_of_parameters(int const model_id); + void check_sizes(); + void configure_info(Info & info, int const model_id); + +public: + +private: + //input + float const * const data_ ; + float const * const weights_; + float const * const initial_parameters_; + int const * const parameters_to_fit_; + char * const user_info_; + int n_parameters_; + + std::size_t const n_fits_; + int const n_points_; + float const tolerance_; + int const max_n_iterations_; + int const estimator_id_; + std::size_t const user_info_size_; + + //output + float * output_parameters_; + int * output_states_; + float * output_chi_squares_; + int * output_n_iterations_; +}; + +#endif diff --git 
a/Gpufit/linear_1d.cuh b/Gpufit/linear_1d.cuh new file mode 100644 index 0000000..0b6a5c8 --- /dev/null +++ b/Gpufit/linear_1d.cuh @@ -0,0 +1,103 @@ +#ifndef GPUFIT_LINEAR1D_CUH_INCLUDED +#define GPUFIT_LINEAR1D_CUH_INCLUDED + +/* Description of the calculate_linear1d function +* =================================================== +* +* This function calculates the values of one-dimensional linear model functions +* and their partial derivatives with respect to the model parameters. +* +* This function makes use of the user information data to pass in the +* independent variables (X values) corresponding to the data. +* +* Note that if no user information is provided, the (X) coordinate of the +* first data value is assumed to be (0.0). In this case, for a fit size of +* M data points, the (X) coordinates of the data are simply the corresponding +* array index values of the data array, starting from zero. +* +* Parameters: +* +* parameters: An input vector of concatenated sets of model parameters. +* p[0]: offset +* p[1]: slope +* +* n_fits: The number of fits. +* +* n_points: The number of data points per fit. +* +* n_parameters: The number of model parameters. +* +* values: An output vector of concatenated sets of model function values. +* +* derivatives: An output vector of concatenated sets of model function partial +* derivatives. +* +* chunk_index: The chunk index. Used for indexing of user_info. +* +* user_info: An input vector containing user information. +* +* user_info_size: The number of elements in user_info. +* +* Calling the calculate_linear1d function +* ======================================= +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. When calling the function, the blocks and threads of the __global__ +* function must be set up correctly, as shown in the following example code. 
+* +* dim3 threads(1, 1, 1); +* dim3 blocks(1, 1, 1); +* +* threads.x = n_points * n_fits_per_block; +* blocks.x = n_fits / n_fits_per_block; +* +* global_function<<< blocks,threads >>>(parameter1, ...); +* +*/ + +__device__ void calculate_linear1d( + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + float * values, + float * derivatives, + int const chunk_index, + char * user_info, + std::size_t const user_info_size) +{ + int const n_fits_per_block = blockDim.x / n_points; + int const fit_in_block = threadIdx.x / n_points; + int const point_index = threadIdx.x - (fit_in_block*n_points); + int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block; + + float * user_info_float = (float*) user_info; + float x = 0.0f; + if (!user_info_float) + { + x = point_index; + } + else if (user_info_size / sizeof(float) == n_points) + { + x = user_info_float[point_index]; + } + else if (user_info_size / sizeof(float) > n_points) + { + int const chunk_begin = chunk_index * n_fits * n_points; + int const fit_begin = fit_index * n_points; + x = user_info_float[chunk_begin + fit_begin + point_index]; + } + + float* current_value = &values[fit_index*n_points]; + float const * current_parameters = &parameters[fit_index * n_parameters]; + + current_value[point_index] = current_parameters[0] + current_parameters[1] * x; + + // derivatives + + float * current_derivative = &derivatives[fit_index * n_parameters * n_points + point_index]; + current_derivative[0] = 1.f; + current_derivative[1 * n_points] = x; +} + +#endif diff --git a/Gpufit/lm_fit.cpp b/Gpufit/lm_fit.cpp new file mode 100644 index 0000000..19a658f --- /dev/null +++ b/Gpufit/lm_fit.cpp @@ -0,0 +1,92 @@ +#include "lm_fit.h" +#include + +LMFit::LMFit +( + float const * const data, + float const * const weights, + Info & info, + float const * const initial_parameters, + int const * const parameters_to_fit, + char * const user_info, + float * output_parameters, + int * 
output_states, + float * output_chi_squares, + int * output_n_iterations +) : + data_( data ), + weights_( weights ), + initial_parameters_( initial_parameters ), + parameters_to_fit_( parameters_to_fit ), + user_info_( user_info ), + output_parameters_( output_parameters ), + output_states_( output_states ), + output_chi_squares_( output_chi_squares ), + output_n_iterations_( output_n_iterations ), + info_(info), + chunk_size_(0), + ichunk_(0), + n_fits_left_(info.n_fits_), + parameters_to_fit_indices_(0) +{} + +LMFit::~LMFit() +{} + +void LMFit::set_parameters_to_fit_indices() +{ + int const n_parameters_to_fit = info_.n_parameters_; + for (int i = 0; i < n_parameters_to_fit; i++) + { + if (parameters_to_fit_[i]) + { + parameters_to_fit_indices_.push_back(i); + } + } +} + +void LMFit::get_results(GPUData const & gpu_data, int const n_fits) +{ + output_parameters_ + = gpu_data.parameters_.copy( n_fits*info_.n_parameters_, output_parameters_ ) ; + output_states_ = gpu_data.states_.copy( n_fits, output_states_ ) ; + output_chi_squares_ = gpu_data.chi_squares_.copy( n_fits, output_chi_squares_ ) ; + output_n_iterations_ = gpu_data.n_iterations_.copy( n_fits, output_n_iterations_ ) ; +} + +void LMFit::run(float const tolerance) +{ + set_parameters_to_fit_indices(); + + GPUData gpu_data(info_); + gpu_data.init_user_info(user_info_); + + // loop over data chunks + while (n_fits_left_ > 0) + { + chunk_size_ = int((std::min)(n_fits_left_, info_.max_chunk_size_)); + + info_.set_fits_per_block(chunk_size_); + + gpu_data.reset(chunk_size_); + gpu_data.init( + ichunk_, + data_, + weights_, + initial_parameters_, + parameters_to_fit_indices_); + + LMFitCUDA lmfit_cuda( + tolerance, + info_, + gpu_data, + chunk_size_); + + lmfit_cuda.run(); + + get_results(gpu_data, chunk_size_); + + n_fits_left_ -= chunk_size_; + ichunk_++; + } +} diff --git a/Gpufit/lm_fit.h b/Gpufit/lm_fit.h new file mode 100644 index 0000000..6ee3b86 --- /dev/null +++ b/Gpufit/lm_fit.h @@ -0,0 +1,88 @@ 
+#ifndef GPUFIT_LM_FIT_H_INCLUDED +#define GPUFIT_LM_FIT_H_INCLUDED + +#include "definitions.h" +#include "info.h" +#include "gpu_data.cuh" + +class LMFitCUDA; + +class LMFit +{ +public: + LMFit + ( + float const * data, + float const * weights, + Info & info, + float const * initial_parameters, + int const * parameters_to_fit, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations + ) ; + + virtual ~LMFit(); + + void run(float const tolerance); + +private: + void set_parameters_to_fit_indices(); + void get_results(GPUData const & gpu_data, int const n_fits); + + float const * const data_ ; + float const * const weights_ ; + float const * const initial_parameters_ ; + int const * const parameters_to_fit_; + char const * const user_info_; + + float * output_parameters_ ; + int * output_states_ ; + float * output_chi_squares_ ; + int * output_n_iterations_ ; + + int ichunk_; + int chunk_size_; + std::size_t n_fits_left_; + + Info & info_; + + std::vector<int> parameters_to_fit_indices_; +}; + +class LMFitCUDA +{ +public: + LMFitCUDA( + float const tolerance, + Info const & info, + GPUData & gpu_data, + int const n_fits); + + virtual ~LMFitCUDA(); + + void run(); + +private: + void calc_curve_values(); + void calc_chi_squares(); + void calc_gradients(); + void calc_hessians(); + void evaluate_iteration(int const iteration); + void solve_equation_system(); + +public: + +private: + Info const & info_; + GPUData & gpu_data_; + int const n_fits_; + + bool all_finished_; + + float tolerance_; +}; + +#endif diff --git a/Gpufit/lm_fit_cuda.cpp b/Gpufit/lm_fit_cuda.cpp new file mode 100644 index 0000000..94799a0 --- /dev/null +++ b/Gpufit/lm_fit_cuda.cpp @@ -0,0 +1,57 @@ +#include "lm_fit.h" + +LMFitCUDA::LMFitCUDA( + float const tolerance, + Info const & info, + GPUData & gpu_data, + int const n_fits + ) : + info_(info), + gpu_data_(gpu_data), + n_fits_(n_fits), + all_finished_(false), + 
tolerance_(tolerance) +{ +} + +LMFitCUDA::~LMFitCUDA() +{ +} + +void LMFitCUDA::run() +{ + // initialize the chi-square values + calc_curve_values(); + calc_chi_squares(); + calc_gradients(); + calc_hessians(); + + gpu_data_.copy( + gpu_data_.prev_chi_squares_, + gpu_data_.chi_squares_, + n_fits_); + + // loop over the fit iterations + for (int iteration = 0; !all_finished_; iteration++) + { + // modify step width + // Gauss Jordan + // update fitting parameters + solve_equation_system(); + + // calculate fitting curve values and its derivatives + // calculate chi-squares, gradients and hessians + calc_curve_values(); + calc_chi_squares(); + calc_gradients(); + calc_hessians(); + + // check which fits have converged + // flag finished fits + // check whether all fits finished + // save the number of needed iterations by each fitting process + // check whether chi-squares are increasing or decreasing + // update chi-squares, curve parameters and lambdas + evaluate_iteration(iteration); + } +} \ No newline at end of file diff --git a/Gpufit/lm_fit_cuda.cu b/Gpufit/lm_fit_cuda.cu new file mode 100644 index 0000000..8d74fb9 --- /dev/null +++ b/Gpufit/lm_fit_cuda.cu @@ -0,0 +1,253 @@ +#include "lm_fit.h" +#include +#include "cuda_kernels.cuh" +#include "cuda_gaussjordan.cuh" + +void LMFitCUDA::solve_equation_system() +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + threads.x = info_.n_parameters_to_fit_*info_.n_fits_per_block_; + threads.y = 1; + blocks.x = n_fits_ / info_.n_fits_per_block_; + blocks.y = 1; + cuda_modify_step_widths<<< blocks, threads >>>( + gpu_data_.hessians_, + gpu_data_.lambdas_, + info_.n_parameters_to_fit_, + gpu_data_.iteration_falied_, + gpu_data_.finished_, + info_.n_fits_per_block_); + CUDA_CHECK_STATUS(cudaGetLastError()); + + int n_parameters_pow2 = 1; + + while (n_parameters_pow2 < info_.n_parameters_to_fit_) + { + n_parameters_pow2 *= 2; + } + + //set up to run the Gauss Jordan elimination + int const n_equations = 
info_.n_parameters_to_fit_; + int const n_solutions = n_fits_; + + threads.x = n_equations + 1; + threads.y = n_equations; + blocks.x = n_solutions; + blocks.y = 1; + + //set the size of the shared memory area for each block + int const shared_size + = sizeof(float) * ((threads.x * threads.y) + + n_parameters_pow2 + n_parameters_pow2); + + //set up the singular_test vector + int * singular_tests; + CUDA_CHECK_STATUS(cudaMalloc((void**)&singular_tests, n_fits_ * sizeof(int))); + + //run the Gauss Jordan elimination + cuda_gaussjordan<<< blocks, threads, shared_size >>>( + gpu_data_.deltas_, + gpu_data_.gradients_, + gpu_data_.hessians_, + gpu_data_.finished_, + singular_tests, + info_.n_parameters_to_fit_, + n_parameters_pow2); + CUDA_CHECK_STATUS(cudaGetLastError()); + + //set up to update the lm_state_gpu_ variable with the Gauss Jordan results + threads.x = std::min(n_fits_, 256); + threads.y = 1; + blocks.x = int(std::ceil(float(n_fits_) / float(threads.x))); + blocks.y = 1; + + //update the lm_state_gpu_ variable + cuda_update_state_after_gaussjordan<<< blocks, threads >>>( + n_fits_, + singular_tests, + gpu_data_.states_); + CUDA_CHECK_STATUS(cudaGetLastError()); + + CUDA_CHECK_STATUS(cudaFree(singular_tests)); + + threads.x = info_.n_parameters_*info_.n_fits_per_block_; + threads.y = 1; + blocks.x = n_fits_ / info_.n_fits_per_block_; + blocks.y = 1; + cuda_update_parameters<<< blocks, threads >>>( + gpu_data_.parameters_, + gpu_data_.prev_parameters_, + gpu_data_.deltas_, + info_.n_parameters_to_fit_, + gpu_data_.parameters_to_fit_indices_, + gpu_data_.finished_, + info_.n_fits_per_block_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void LMFitCUDA::calc_curve_values() +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + threads.x = info_.n_points_ * info_.n_fits_per_block_; + threads.y = 1; + blocks.x = n_fits_ / info_.n_fits_per_block_; + blocks.y = 1; + + cuda_calc_curve_values << < blocks, threads >> >( + gpu_data_.parameters_, + n_fits_, + 
info_.n_points_, + info_.n_parameters_, + gpu_data_.finished_, + gpu_data_.values_, + gpu_data_.derivatives_, + info_.n_fits_per_block_, + info_.model_id_, + gpu_data_.chunk_index_, + gpu_data_.user_info_, + info_.user_info_size_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void LMFitCUDA::calc_chi_squares() +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + int const shared_size + = sizeof(float) + * info_.power_of_two_n_points_ + * info_.n_fits_per_block_; + + threads.x = info_.power_of_two_n_points_*info_.n_fits_per_block_; + threads.y = 1; + blocks.x = n_fits_ / info_.n_fits_per_block_; + blocks.y = 1; + + cuda_calculate_chi_squares <<< blocks, threads, shared_size >>>( + gpu_data_.chi_squares_, + gpu_data_.states_, + gpu_data_.iteration_falied_, + gpu_data_.prev_chi_squares_, + gpu_data_.data_, + gpu_data_.values_, + gpu_data_.weights_, + info_.n_points_, + info_.estimator_id_, + gpu_data_.finished_, + info_.n_fits_per_block_, + gpu_data_.user_info_, + info_.user_info_size_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void LMFitCUDA::calc_gradients() +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + int const shared_size + = sizeof(float) + * info_.power_of_two_n_points_ + * info_.n_fits_per_block_; + + threads.x = info_.power_of_two_n_points_*info_.n_fits_per_block_; + threads.y = 1; + blocks.x = n_fits_ / info_.n_fits_per_block_; + blocks.y = 1; + + cuda_calculate_gradients <<< blocks, threads, shared_size >>>( + gpu_data_.gradients_, + gpu_data_.data_, + gpu_data_.values_, + gpu_data_.derivatives_, + gpu_data_.weights_, + info_.n_points_, + info_.n_parameters_, + info_.n_parameters_to_fit_, + gpu_data_.parameters_to_fit_indices_, + info_.estimator_id_, + gpu_data_.finished_, + gpu_data_.iteration_falied_, + info_.n_fits_per_block_, + gpu_data_.user_info_, + info_.user_info_size_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void LMFitCUDA::calc_hessians() +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + threads.x = 
info_.n_parameters_to_fit_; + threads.y = info_.n_parameters_to_fit_; + blocks.x = n_fits_; + blocks.y = 1; + + cuda_calculate_hessians <<< blocks, threads >>>( + gpu_data_.hessians_, + gpu_data_.data_, + gpu_data_.values_, + gpu_data_.derivatives_, + gpu_data_.weights_, + info_.n_points_, + info_.n_parameters_, + info_.n_parameters_to_fit_, + gpu_data_.parameters_to_fit_indices_, + info_.estimator_id_, + gpu_data_.iteration_falied_, + gpu_data_.finished_, + gpu_data_.user_info_, + info_.user_info_size_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} + +void LMFitCUDA::evaluate_iteration(int const iteration) +{ + dim3 threads(1, 1, 1); + dim3 blocks(1, 1, 1); + + threads.x = std::min(n_fits_, 256); + threads.y = 1; + blocks.x = int(std::ceil(float(n_fits_) / float(threads.x))); + blocks.y = 1; + + cuda_check_for_convergence<<< blocks, threads >>>( + gpu_data_.finished_, + tolerance_, + gpu_data_.states_, + gpu_data_.chi_squares_, + gpu_data_.prev_chi_squares_, + iteration, + info_.max_n_iterations_, + n_fits_); + CUDA_CHECK_STATUS(cudaGetLastError()); + + gpu_data_.set(gpu_data_.all_finished_, 1); + + cuda_evaluate_iteration<<< blocks, threads >>>( + gpu_data_.all_finished_, + gpu_data_.n_iterations_, + gpu_data_.finished_, + iteration, + gpu_data_.states_, + n_fits_); + CUDA_CHECK_STATUS(cudaGetLastError()); + + gpu_data_.read(&all_finished_, gpu_data_.all_finished_); + + cuda_prepare_next_iteration<<< blocks, threads >>>( + gpu_data_.lambdas_, + gpu_data_.chi_squares_, + gpu_data_.prev_chi_squares_, + gpu_data_.parameters_, + gpu_data_.prev_parameters_, + n_fits_, + info_.n_parameters_); + CUDA_CHECK_STATUS(cudaGetLastError()); +} diff --git a/Gpufit/lse.cuh b/Gpufit/lse.cuh new file mode 100644 index 0000000..e615b01 --- /dev/null +++ b/Gpufit/lse.cuh @@ -0,0 +1,186 @@ +#ifndef GPUFIT_LSE_CUH_INCLUDED +#define GPUFIT_LSE_CUH_INCLUDED + +/* Description of the calculate_chi_square_lse function +* ===================================================== +* +* This 
function calculates the chi-square values for the weighted LSE estimator. +* +* Parameters: +* +* chi_square: An output vector of chi-square values for each data point. +* +* point_index: The data point index. +* +* data: An input vector of data values. +* +* value: An input vector of fitting curve values. +* +* weight: An input vector of values for weighting the chi-square values. +* +* state: A pointer to a value which indicates whether the fitting +* process was carried out correctly or which problem occurred. +* In this function it is not used. It can be used in functions calculating +* other estimators than the LSE, such as MLE. It is passed into this function +* to provide the same interface for all estimator functions. +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_chi_square_lse function +* ============================================= +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. +* +*/ + +__device__ void calculate_chi_square_lse( + volatile float * chi_square, + int const point_index, + float const * data, + float const * value, + float const * weight, + int * state, + char * user_info, + std::size_t const user_info_size) +{ + float const deviation = value[point_index] - data[point_index]; + + if (weight) + { + chi_square[point_index] = deviation * deviation * weight[point_index]; + } + else + { + chi_square[point_index] = deviation * deviation; + } +} + +/* Description of the calculate_hessian_lse function +* ================================================== +* +* This function calculates the hessian matrix values of the weighted LSE equation. +* The calculation is performed based on previously calculated fitting curve derivative +* values. +* +* Parameters: +* +* hessian: An output vector of values of the hessian matrix for each data point. 
+* +* point_index: The data point index. +* +* parameter_index_i: Index of the hessian column. +* +* parameter_index_j: Index of the hessian row. +* +* data: An input vector of data values. +* +* value: An input vector of fitting curve values. +* +* derivative: An input vector of partial derivative values of the fitting +* curve for each data point. +* +* weight: An input vector of values for weighting the hessian matrix values. +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_hessian_lse function +* ========================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. +* +*/ + +__device__ void calculate_hessian_lse( + double * hessian, + int const point_index, + int const parameter_index_i, + int const parameter_index_j, + float const * data, + float const * value, + float const * derivative, + float const * weight, + char * user_info, + std::size_t const user_info_size) +{ + if (weight) + { + *hessian + += derivative[parameter_index_i] * derivative[parameter_index_j] + * weight[point_index]; + } + else + { + *hessian + += derivative[parameter_index_i] * derivative[parameter_index_j]; + } +} + +/* Description of the calculate_gradient_lse function +* =================================================== +* +* This function calculates the gradient values of the weighted LSE equation +* based on previously calculated fitting curve derivative values. +* +* Parameters: +* +* gradient: An output vector of values of the gradient vector for each data point. +* +* point_index: The data point index. +* +* parameter_index: The parameter index. +* +* n_parameters: The number of fitting curve parameters. +* +* data: An input vector of data values. +* +* value: An input vector of fitting curve values. 
+* +* derivative: An input vector of partial derivative values of the fitting +* curve for each data point. +* +* weight: An input vector of values for weighting gradient values. +* +* user_info: An input vector containing user information. (not used) +* +* user_info_size: The number of elements in user_info. (not used) +* +* Calling the calculate_gradient_lse function +* =========================================== +* +* This __device__ function can be only called from a __global__ function or an other +* __device__ function. +* +*/ + +__device__ void calculate_gradient_lse( + volatile float * gradient, + int const point_index, + int const parameter_index, + float const * data, + float const * value, + float const * derivative, + float const * weight, + char * user_info, + std::size_t const user_info_size) +{ + float const deviation = data[point_index] - value[point_index]; + + if (weight) + { + gradient[point_index] + = derivative[parameter_index] * deviation * weight[point_index]; + } + else + { + gradient[point_index] + = derivative[parameter_index] * deviation; + } +} + +#endif diff --git a/Gpufit/matlab/CMakeLists.txt b/Gpufit/matlab/CMakeLists.txt new file mode 100644 index 0000000..b0c5dc8 --- /dev/null +++ b/Gpufit/matlab/CMakeLists.txt @@ -0,0 +1,69 @@ + +# MATLAB Gpufit binding + +find_package( Matlab COMPONENTS MX_LIBRARY ) + +if( NOT Matlab_FOUND ) + message( STATUS "Matlab and/or MX_Library NOT found - skipping Gpufit Matlab binding!" 
) + return() +endif() + +# MATLAB MEX FILE + +set( Headers + ) + +set( Sources + mex/GpufitMex.cpp + ) + +add_library( GpufitMex SHARED + ${Headers} + ${Sources} + ) + +set_property( TARGET GpufitMex + PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} ) + +set_property( TARGET GpufitMex + PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) + +target_include_directories( GpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR} ) + +target_link_libraries( GpufitMex Gpufit ${Matlab_LIBRARIES} ) + +if( WIN32 ) + SET(CMAKE_SHARED_LINKER_FLAGS "/export:mexFunction") +endif() + +add_matlab_launcher( GpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" ) + +# MATLAB Gpufit PACKAGE + +set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" ) +set( package_files + "${CMAKE_CURRENT_SOURCE_DIR}/EstimatorID.m" + "${CMAKE_CURRENT_SOURCE_DIR}/gpufit.m" + "${CMAKE_CURRENT_SOURCE_DIR}/ModelID.m" + "${CMAKE_CURRENT_SOURCE_DIR}/README.txt" +) +set( binary_gpufit $<TARGET_FILE:Gpufit> ) +set( binary_mex $<TARGET_FILE:GpufitMex> ) + +add_custom_target( MATLAB_GPUFIT_PACKAGE + COMMAND ${CMAKE_COMMAND} -E + remove_directory ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + make_directory ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${package_files} ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${binary_gpufit} ${build_directory} + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${binary_mex} ${build_directory} + COMMENT "Creating Gpufit Matlab package" +) +set_property( TARGET MATLAB_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets ) +add_dependencies( MATLAB_GPUFIT_PACKAGE Gpufit GpufitMex) + +# add launcher diff --git a/Gpufit/matlab/EstimatorID.m b/Gpufit/matlab/EstimatorID.m new file mode 100644 index 0000000..a853ffa --- /dev/null +++ b/Gpufit/matlab/EstimatorID.m @@ -0,0 +1,6 @@ +classdef EstimatorID + properties (Constant = true) + LSE = 0 + MLE = 1 + end +end \ No newline at end of file diff --git a/Gpufit/matlab/ModelID.m b/Gpufit/matlab/ModelID.m new file mode 
classdef ModelID
    % Numeric identifiers of the fit models, to be passed as model_id to gpufit.
    properties (Constant)
        GAUSS_1D = 0
        GAUSS_2D = 1
        GAUSS_2D_ELLIPTIC = 2
        GAUSS_2D_ROTATED = 3
        CAUCHY_2D_ELLIPTIC = 4
        LINEAR_1D = 5
    end
end
\ No newline at end of file diff --git a/Gpufit/matlab/examples/gauss2d.m b/Gpufit/matlab/examples/gauss2d.m new file mode 100644 index 0000000..bf478a4 --- /dev/null +++ b/Gpufit/matlab/examples/gauss2d.m @@ -0,0 +1,182 @@ +function gauss2d() +% Example of the Matlab binding of the Gpufit library implementing +% Levenberg Marquardt curve fitting in CUDA +% https://github.com/gpufit/Gpufit +% +% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise +% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab + +% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak +fit_gauss2d(); + +% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak +fit_gauss2d_rotated(); + +end +function fit_gauss2d() + +%% number of fits and fit points +number_fits = 1e4; +size_x = 20; +number_parameters = 5; + +%% set input arguments + +% true parameters +true_parameters = single([20, 9.5, 9.5, 3, 10]); + +% initialize random number generator +rng(0); + +% initial parameters (randomized) +initial_parameters = repmat(single(true_parameters'), [1, number_fits]); +% randomize relative to width for positions +initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits)); +% randomize relative for other parameters +initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% generate data with Poisson noise +data = gaussian_2d(x, y, true_parameters); +data = repmat(data(:), [1, number_fits]); +data = poissrnd(data); + +% tolerance +tolerance = 1e-3; + +% maximum number of iterations +max_n_iterations = 20; + +% estimator id +estimator_id = EstimatorID.MLE; + +% model ID +model_id = ModelID.GAUSS_2D; + +%% run Gpufit +[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ... 
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +%% displaying results +display_results('2D Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations); + +end + +function fit_gauss2d_rotated() + +%% number of fits and fit points +number_fits = 1e4; +size_x = 20; +number_parameters = 7; + +%% set input arguments + +% true parameters +true_parameters = single([200, 9.5, 9.5, 3, 4, 10, 0.5]); + +% initialize random number generator +rng(0); + +% initial parameters (randomized) +initial_parameters = repmat(single(true_parameters'), [1, number_fits]); +% randomize relative to width for positions +initial_parameters(2, :) = initial_parameters(2, :) + true_parameters(4) * (-0.2 + 0.4 * rand(1, number_fits)); +initial_parameters(3, :) = initial_parameters(3, :) + true_parameters(5) * (-0.2 + 0.4 * rand(1, number_fits)); +% randomize relative for other parameters +initial_parameters([1,4,5,6,7], :) = initial_parameters([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, number_fits)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% generate data with Poisson noise +data = gaussian_2d_rotated(x, y, true_parameters); +data = repmat(data(:), [1, number_fits]); +data = poissrnd(data); + +% tolerance +tolerance = 1e-3; + +% maximum number of iterations +max_n_iterations = 20; + +% estimator id +estimator_id = EstimatorID.MLE; + +% model ID +model_id = ModelID.GAUSS_2D_ROTATED; + +%% run Gpufit +[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ... + model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +%% displaying results +display_results('2D rotated Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations); + + +end + +function g = gaussian_2d(x, y, p) +% Generates a 2D Gaussian peak. 
function g = gaussian_2d_rotated(x, y, p)
% Generates a 2D rotated elliptic Gaussian peak.
% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak
%
% x,y - x and y grid position values
% p - parameters (amplitude, x,y center position, width along the first and
%     second peak axis, offset, rotation angle)

% position relative to the peak center
dx = x - p(2);
dy = y - p(3);

% rotate into the coordinate system of the peak axes
c = cos(p(7));
s = sin(p(7));
u = dx .* c - dy .* s;
v = dx .* s + dy .* c;

% distance along each axis in units of the respective width
un = u / p(4);
vn = v / p(5);

g = p(1) .* exp(-0.5 .* ((un .* un) + (vn .* vn))) + p(6);

end
exceeded %6.2f %%\n', sum(states == 1) / number_fits * 100); +fprintf('ratio singular hessian %6.2f %%\n', sum(states == 2) / number_fits * 100); +fprintf('ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100); + +% mean and std of fitted parameters +converged_parameters = parameters(:, converged); +converged_parameters_mean = mean(converged_parameters, 2); +converged_parameters_std = std(converged_parameters, [], 2); +fprintf('\nparameters of %s\n', name); +for i = 1 : number_parameters + fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i)); +end + +end \ No newline at end of file diff --git a/Gpufit/matlab/examples/gauss2d_comparison.m b/Gpufit/matlab/examples/gauss2d_comparison.m new file mode 100644 index 0000000..39dc68b --- /dev/null +++ b/Gpufit/matlab/examples/gauss2d_comparison.m @@ -0,0 +1,206 @@ +function gauss2d_comparison() +% Example of the Matlab binding of the Gpufit library implementing +% Levenberg Marquardt curve fitting in CUDA +% https://github.com/gpufit/Gpufit +% +% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise +% compared to a generic Matlab implementation using fminunc and supplying +% the gradient by the user (uses quasi-newton as algorithm) +% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab + +%% number of fits and fit points +number_fits = 1e3; +size_x = 20; +number_parameters = 5; + +%% set input arguments + +% true parameters +true_parameters = single([10, 9.5, 9.5, 3, 10]); + +% initialize random number generator +rng(0); + +% initial parameters (randomized) +initial_parameters = repmat(single(true_parameters'), [1, number_fits]); +% randomize relative to width for positions +initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits)); +% randomize relative for other parameters +initial_parameters([1,4,5], :) = initial_parameters([1,4,5], 
:) .* (0.8 + 0.4 * rand(3, number_fits)); + +% generate x and y values +g = single(0 : size_x - 1); +[x, y] = ndgrid(g, g); + +% generate data with Poisson noise +data = gaussian_2d(x, y, true_parameters); +data = repmat(data(:), [1, number_fits]); +data = poissrnd(data); + +% tolerance +tolerance = 1e-4; + +% maximum number of iterations +max_n_iterations = 20; + +% estimator id +estimator_id = EstimatorID.MLE; + +% model ID +model_id = ModelID.GAUSS_2D; % Gaussian peak in 2D + +%% run Gpufit +fprintf('run Gpufit\n'); +[gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time] = gpufit(data, [], ... + model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +% display results +display_results('Gpufit', gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time, true_parameters); + +% store parameters + +%% run Matlab + +% convert data and initial_parameters to double (otherwise causes an error +% in fminunc) +data = double(data); +initial_parameters = double(initial_parameters); +xi = double(x(:)'); +yi = double(y(:)'); + +% set fit options +options = optimoptions(@fminunc,'Display', 'off', 'MaxIter', max_n_iterations, 'Algorithm', 'quasi-newton', 'TolFun', tolerance, 'GradObj', 'on', 'DerivativeCheck', 'off', 'Diagnostics', 'off'); + +% initialize output arrays +m_parameters = zeros(number_parameters, number_fits); +m_states = zeros(1, number_fits); +m_chi_squares = zeros(1, number_fits); +m_n_iterations = zeros(1, number_fits); + +% loop over each fit +fprintf('\n') +progress = 0; +L = 50; % length of progressbar +tic; +for i = 1 : number_fits + + % get data and initial_parameters + d = data(:, i)'; + p0 = initial_parameters(:, i); + + % define minimizer function (give grid and data as implicit parameters) + fun = @(p) minimizer(p, xi, yi, d); + + % call to fminunc + [p, fval, exitflag, output] = fminunc(fun, p0, options); + + % copy to output + m_parameters(:, i) = p; + m_chi_squares(i) = fval; + m_states(i) = exitflag - 
function [f, g] = poisson_likelihood(m, mg, d)
% Value and (optionally) gradient of the Poisson likelihood function.
%
% m  - model values
% mg - derivatives of the model, one row per parameter (only read if the
%      gradient is requested)
% d  - data values
%
% f  - likelihood value
% g  - column vector with one gradient entry per row of mg

% the logarithmic term is only evaluated where the data is positive
positive = d > 0;
log_term = sum(d(positive) .* log(m(positive) ./ d(positive)));
f = 2 * (sum(m-d) - log_term);

if nargout > 1 % gradient required
    % per-point factor; the model is clamped from below to avoid division by zero
    factor = 2 * (1 - d ./ max(m, 1e-6));
    factor = repmat(factor, [size(mg, 1), 1]);
    g = sum(factor .* mg, 2);
end

end
function [f, g] = gaussian_2d_with_gradient(x, y, p)
% Computes a 2D Gaussian peak and its derivatives with respect to the parameters.
%
% x,y - x and y grid position values (row vectors)
% p - parameters (amplitude, x,y center position, width, offset)
%
% f - model values
% g - derivatives, one row per parameter in the order of p

amplitude = p(1);
width = p(4);

delta_x = x - p(2);
delta_y = y - p(3);
width_sq = width^2;

% squared distance to the center in units of the squared width
scaled_r2 = (delta_x.^2 + delta_y.^2) / width_sq;
peak_shape = exp(-0.5 * scaled_r2);
scaled_peak = amplitude * peak_shape;

f = scaled_peak + p(5);

g = [ ...
    peak_shape; ...                           % d/d amplitude
    scaled_peak .* delta_x / width_sq; ...    % d/d x center
    scaled_peak .* delta_y / width_sq; ...    % d/d y center
    scaled_peak .* scaled_r2 / width; ...     % d/d width
    ones(size(x)) ...                         % d/d offset
    ];

end
zeros(length(n_fits_all), 1); +for i = 1:length(n_fits_all) + n_fits = n_fits_all(i); + + % vary positions of 2D Gaussians peaks slightly + test_parameters = repmat(mean_true_parameters', [1, n_fits]); + test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits)); + + % generate data + data = gaussians_2d(x, y, test_parameters); + data = reshape(data, [n_points, n_fits]); + + % add noise + data = data + average_noise_level * randn(size(data), 'single'); + + % initial parameters (randomized) + initial_parameters = repmat(mean_true_parameters', [1, n_fits]); + % randomize relative to width for positions + initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits)); + % randomize relative for other parameters + initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits)); + + % run Gpufit + [parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ... + model_id, initial_parameters, tolerance, max_n_iterations); + + % analyze result + converged = states == 0; + speed(i) = n_fits / time; + precision_x0 = std(parameters(2, converged) - test_parameters(2, converged)); + + % display result + fprintf(' iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ... 
function g = gaussians_2d(x, y, p)
% Generates many 2D Gaussian peaks, one per column of p, and prints a
% progress bar while doing so.
%
% x,y - x and y grid position values
% p - parameters, one column per peak (amplitude, x,y center position,
%     width, offset)
%
% g - single array of size [size(x), number of peaks]

n_fits = size(p, 2);
msg = sprintf('generating %d fits ', n_fits);
fprintf(msg);

g = zeros([size(x), n_fits], 'single');

bar_length = 50; % length of progressbar
since_last_bar = 0;
n_bars = 0;
for i = 1 : n_fits

    % parameters of the current peak (named p_i to keep the builtin pi usable)
    p_i = p(:, i);
    r2 = (x - p_i(2)).^2 + (y - p_i(3)).^2;
    g(:, :, i) = p_i(1) * exp(-r2 / (2 * p_i(4)^2)) + p_i(5);

    since_last_bar = since_last_bar + 1;
    if since_last_bar >= n_fits / bar_length
        since_last_bar = 0;
        fprintf('|');
        n_bars = n_bars + 1;
    end
end

% erase the message and the progress bar, then print the summary
fprintf(repmat('\b', [1, length(msg) + n_bars]));
fprintf('%7d fits', n_fits);

end
function [parameters, states, chi_squares, n_iterations, time]...
    = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
% Wrapper around the Gpufit mex file (GpufitMex).
%
% Checks sizes and types of the inputs, fills in default values for the
% optional arguments and measures the run time of the mex call.
%
% Required arguments: data, weights (may be []), model_id, initial_parameters.
% Optional arguments can be omitted or given as empty matrix []:
%   tolerance          default 1e-4, converted to single
%   max_n_iterations   default 25, converted to int32
%   parameters_to_fit  default ones(n_parameters, 1), converted to int32
%   estimator_id       default EstimatorID.LSE
%   user_info          default [], its size in bytes is handed to the mex file
%
% data, weights (if given) and initial_parameters must be of type single and
% contain one column per fit.
%
% The returned parameters have dimensions (n_parameters, n_fits); time is the
% duration of the mex call as measured by tic/toc.

%% optional arguments

assert(nargin >= 4, 'Not enough parameters');
if nargin < 5, tolerance = []; end
if nargin < 6, max_n_iterations = []; end
if nargin < 7, parameters_to_fit = []; end
if nargin < 8, estimator_id = []; end
if nargin < 9, user_info = []; end

%% size checks

% data is 2D: rows are points, columns are fits
assert(ndims(data) == 2, 'data is not two-dimensional');
[n_points, n_fits] = size(data);

% weights (if given) must have the same size as data
if ~isempty(weights)
    assert(isequal(size(data), size(weights)), 'Dimension mismatch between data and weights')
end

% initial_parameters is 2D: rows are parameters, columns are fits
assert(ndims(initial_parameters) == 2, 'initial_parameters is not two-dimensional');
[n_parameters, n_parameter_sets] = size(initial_parameters);
assert(n_fits == n_parameter_sets, 'Dimension mismatch in number of fits between data and initial_parameters');

% parameters_to_fit (if given) needs one row per parameter
if ~isempty(parameters_to_fit)
    assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit');
end

%% default values

if isempty(tolerance)
    tolerance = 1e-4;
end
if isempty(max_n_iterations)
    max_n_iterations = 25;
end
if isempty(estimator_id)
    estimator_id = EstimatorID.LSE;
end
if isempty(parameters_to_fit)
    parameters_to_fit = ones(n_parameters, 1, 'int32');
end

% from here on only weights and user_info can still be empty

%% type checks

% data, weights (if given) and initial_parameters are not converted silently
assert(isa(data, 'single'), 'Type of data is not single');
if ~isempty(weights)
    assert(isa(weights, 'single'), 'Type of weights is not single');
end
assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single');

% the remaining arguments are converted to the types the mex file expects
% (the conversions leave values that already have the right type unchanged)
parameters_to_fit = int32(parameters_to_fit);
max_n_iterations = int32(max_n_iterations);
tolerance = single(tolerance);

% the type of user_info is not checked, only its size in bytes is determined
user_info_size = 0;
if ~isempty(user_info)
    info = whos('user_info');
    user_info_size = info.bytes;
end

%% run Gpufit taking the time
tic;
[parameters, states, chi_squares, n_iterations] ...
    = GpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size);
time = toc;

% the mex file returns the parameters as one long row, bring them into the
% shape (n_parameters, n_fits)
parameters = reshape(parameters, n_parameters, n_fits);

end
+ https://www.mathworks.com/help/matlab/apiref/mxclassid.html +*/ +template inline bool get_scalar(const mxArray *p, T &v, const mxClassID id) +{ + if (mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1 && mxGetClassID(p) == id) + { + v = *static_cast(mxGetData(p)); + return true; + } + else { + return false; + } +} + +void mexFunction( + int nlhs, + mxArray *plhs[], + int nrhs, + mxArray const *prhs[]) +{ + int expected_nrhs = 0; + int expected_nlhs = 0; + bool wrong_nrhs = false; + bool wrong_nlhs = false; + + // expects a certain number of input (nrhs) and output (nlhs) arguments + expected_nrhs = 13; + expected_nlhs = 4; + if (nrhs != expected_nrhs) + { + wrong_nrhs = true; + } + else if (nlhs != expected_nlhs) + { + wrong_nlhs = true; + } + + if (wrong_nrhs || wrong_nlhs) + { + if (nrhs != expected_nrhs) + { + char s1[50]; + _itoa_s(expected_nrhs, s1, 10); + char const s2[] = " input arguments required."; + size_t const string_length = strlen(s1) + 1 + strlen(s2); + strcat_s(s1, string_length, s2); + mexErrMsgIdAndTxt("Gpufit:Mex", s1); + } + else if (nlhs != expected_nlhs) + { + char s1[50]; + _itoa_s(expected_nlhs, s1, 10); + char const s2[] = " output arguments required."; + size_t const string_length = strlen(s1) + 1 + strlen(s2); + strcat_s(s1, string_length, s2); + mexErrMsgIdAndTxt("Gpufit:Mex", s1); + } + } + + // input parameters + float * data = (float*)mxGetPr(prhs[0]); + float * weights = (float*)mxGetPr(prhs[1]); + std::size_t n_fits = (std::size_t)*mxGetPr(prhs[2]); + std::size_t n_points = (std::size_t)*mxGetPr(prhs[3]); + + // tolerance + float tolerance = 0; + if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS)) + { + mexErrMsgIdAndTxt("Gpufit:Mex", "tolerance is not a single"); + } + + // max_n_iterations + int max_n_iterations = 0; + if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS)) + { + mexErrMsgIdAndTxt("Gpufit:Mex", "max_n_iteration is not int32"); + } + + int estimator_id = (int)*mxGetPr(prhs[6]); + float * 
initial_parameters = (float*)mxGetPr(prhs[7]); + int * parameters_to_fit = (int*)mxGetPr(prhs[8]); + int model_id = (int)*mxGetPr(prhs[9]); + int n_parameters = (int)*mxGetPr(prhs[10]); + int * user_info = (int*)mxGetPr(prhs[11]); + std::size_t user_info_size = (std::size_t)*mxGetPr(prhs[12]); + + // output parameters + float * output_parameters; + mxArray * mx_parameters; + mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL); + output_parameters = (float*)mxGetData(mx_parameters); + plhs[0] = mx_parameters; + + int * output_states; + mxArray * mx_states; + mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL); + output_states = (int*)mxGetData(mx_states); + plhs[1] = mx_states; + + float * output_chi_squares; + mxArray * mx_chi_squares; + mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL); + output_chi_squares = (float*)mxGetData(mx_chi_squares); + plhs[2] = mx_chi_squares; + + int * output_n_iterations; + mxArray * mx_n_iterations; + mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL); + output_n_iterations = (int*)mxGetData(mx_n_iterations); + plhs[3] = mx_n_iterations; + + // call to gpufit + int const status + = gpufit + ( + n_fits, + n_points, + data, + weights, + model_id, + initial_parameters, + tolerance, + max_n_iterations, + parameters_to_fit, + estimator_id, + user_info_size, + reinterpret_cast< char * >( user_info ), + output_parameters, + output_states, + output_chi_squares, + output_n_iterations + ) ; + + // check status + if (status != STATUS_OK) + { + std::string const error = gpufit_get_last_error() ; + mexErrMsgIdAndTxt( "Gpufit:Mex", error.c_str() ) ; + } +} diff --git a/Gpufit/matlab/tests/gauss_fit_1d_test.m b/Gpufit/matlab/tests/gauss_fit_1d_test.m new file mode 100644 index 0000000..412c72e --- /dev/null +++ b/Gpufit/matlab/tests/gauss_fit_1d_test.m @@ -0,0 +1,35 @@ +% Equivalent/similar to tests/Gauss_Fit_1D.cpp + +% constants +n_fits = 1; +n_points 
function run_tests()
% Runs all test scripts in this folder and displays the results.
% See also: http://www.mathworks.com/help/matlab/script-based-unit-tests.html

% testsuite() without arguments collects the tests of the current folder
disp(run(testsuite()));
end
/* Description of the calculate_chi_square_mle function
* =====================================================
*
* This function calculates the chi-square value of a single data point for
* the MLE estimator.
*
* Parameters:
*
* chi_square: An output vector of chi-square values for each data point. Only
*             the element at point_index is written.
*
* point_index: The data point index.
*
* data: An input vector of data.
*
* value: An input vector of fitting curve values.
*
* weight: An input vector of values for weighting chi-square values. It is not
*         used in this function.
*
* state: A pointer to a value which indicates whether the fitting process was
*        carried out correctly or which problem occurred. It is set to 3 if the
*        fitting curve value is negative and left untouched otherwise.
*
* user_info: An input vector containing user information. (not used)
*
* user_info_size: The number of elements in user_info. (not used)
*
* Calling the calculate_chi_square_mle function
* =============================================
*
* This __device__ function can only be called from a __global__ function or
* another __device__ function.
*
*/

__device__ void calculate_chi_square_mle(
    volatile float * chi_square,
    int const point_index,
    float const * data,
    float const * value,
    float const * weight,
    int * state,
    char * user_info,
    std::size_t const user_info_size)
{
    float const model = value[point_index];
    float const observed = data[point_index];

    // a negative model value is flagged, the calculation continues nevertheless
    if (model < 0)
    {
        *state = 3;
    }

    float const deviation = model - observed;

    // the logarithmic term is skipped for a data value of zero
    float result = 2 * deviation;
    if (observed != 0)
    {
        result = 2 * (deviation - observed * logf(model / observed));
    }

    chi_square[point_index] = result;
}
/* Description of the calculate_hessian_mle function
* ==================================================
*
* This function adds the contribution of a single data point to one element
* of the hessian matrix of the MLE equation. The calculation is performed
* based on previously calculated derivative values.
*
* Parameters:
*
* hessian: A pointer to the hessian matrix element; the contribution of the
*          data point is added to the value it points to.
*
* point_index: The data point index.
*
* parameter_index_i: Index of the hessian column (indexes derivatives).
*
* parameter_index_j: Index of the hessian row (indexes derivatives).
*
* data: An input vector of data values.
*
* value: An input vector of fitting curve values.
*
* derivatives: An input vector of partial derivative values of the fitting
*              curve, indexed by the parameter indices.
*
* weight: An input vector of values for weighting hessian matrix values. It is
*         not used in this function.
*
* user_info: An input vector containing user information. (not used)
*
* user_info_size: The number of elements in user_info. (not used)
*
* Calling the calculate_hessian_mle function
* ==========================================
*
* This __device__ function can only be called from a __global__ function or
* another __device__ function.
*
*/

__device__ void calculate_hessian_mle(
    double * hessian,
    int const point_index,
    int const parameter_index_i,
    int const parameter_index_j,
    float const * data,
    float const * value,
    float const * derivatives,
    float const * weight,
    char * user_info,
    std::size_t const user_info_size)
{
    float const model = value[point_index];

    // the contribution is evaluated in single precision and only the
    // accumulation into the hessian element is done in double precision
    float const scale = data[point_index] / (model * model);
    float const contribution
        = scale * derivatives[parameter_index_i] * derivatives[parameter_index_j];

    *hessian += contribution;
}
/* Description of the calculate_gradient_mle function
* ===================================================
*
* This function calculates the gradient value of the MLE equation for a single
* data point and a single parameter, based on previously calculated derivative
* values.
*
* Parameters:
*
* gradient: An output vector of gradient values for each data point. Only the
*           element at point_index is written.
*
* point_index: The data point index.
*
* parameter_index: The parameter index (indexes derivative).
*
* data: An input vector of data values.
*
* value: An input vector of fitting curve values.
*
* derivative: An input vector of partial derivative values of the fitting
*             curve, indexed by parameter_index.
*
* weight: An input vector of values for weighting gradient vector values. It is
*         not used in this function.
*
* user_info: An input vector containing user information. (not used)
*
* user_info_size: The number of elements in user_info. (not used)
*
* Calling the calculate_gradient_mle function
* ===========================================
*
* This __device__ function can only be called from a __global__ function or
* another __device__ function.
*
*/

__device__ void calculate_gradient_mle(
    volatile float * gradient,
    int const point_index,
    int const parameter_index,
    float const * data,
    float const * value,
    float const * derivative,
    float const * weight,
    char * user_info,
    std::size_t const user_info_size)
{
    // ratio of the measured value to the model value at this data point
    float const ratio = data[point_index] / value[point_index];
    float const negated_derivative = -derivative[parameter_index];

    gradient[point_index] = negated_derivative * (1 - ratio);
}
) + return() +endif() + +# Python wheel (output name is incorrect, requires plattform tag, see packaging) + +add_custom_target( PYTHON_WHEEL ALL + COMMAND ${CMAKE_COMMAND} -E + chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py clean --all + COMMAND ${CMAKE_COMMAND} -E + chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py bdist_wheel + COMMENT "Preparing Python Wheel" +) +set_property( TARGET PYTHON_WHEEL PROPERTY FOLDER CMakePredefinedTargets ) +add_dependencies( PYTHON_WHEEL PYTHON_PACKAGE ) + +# add launcher to Python package diff --git a/Gpufit/python/README.txt b/Gpufit/python/README.txt new file mode 100644 index 0000000..2e58557 --- /dev/null +++ b/Gpufit/python/README.txt @@ -0,0 +1,27 @@ +Python binding for the [Gpufit library](https://github.com/gpufit/Gpufit) which implements Levenberg Marquardt curve fitting in CUDA + +Requirements + +- A CUDA capable graphics card with a recent Nvidia graphics driver (at least 367.48 / July 2016) +- Windows +- Python 2 or 3 with NumPy + +Installation + +Currently the wheel file has to be installed locally. + +If NumPy is not yet installed, install it using pip from the command line + +pip install numpy + +Then install pyGpufit from the local folder via: + +pip install --no-index --find-links=LocalPathToWheelFile pyGpufit + +Examples + +See examples folder. + +Troubleshooting + +A common reason for the error message 'CUDA driver version is insufficient for CUDA runtime version' is an outdated Nvidia graphics driver. 
def generate_gauss_2d(p, xi, yi):
    """
    Evaluates a symmetric 2D Gaussian peak on the given coordinates.
    http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d

    :param p: Parameters (amplitude, x center, y center, width, offset)
    :param xi: x positions
    :param yi: y positions
    :return: The peak values, amplitude * exp(-r^2 / (2 * width^2)) + offset
    """

    amplitude, center_x, center_y, width, offset = p[0], p[1], p[2], p[3], p[4]

    # squared distance of every position from the peak center
    squared_distance = np.square(xi - center_x) + np.square(yi - center_y)

    exponent = -squared_distance / (2*width*width)

    return amplitude * np.exp(exponent) + offset
xi, yi) + data = np.reshape(data, (1, number_points)) + data = np.tile(data, (number_fits, 1)) + + # add Poisson noise + data = np.random.poisson(data) + data = data.astype(np.float32) + + # tolerance + tolerance = 0.0001 + + # maximum number of iterations + max_number_iterations = 20 + + # estimator ID + estimator_id = gf.EstimatorID.MLE + + # model ID + model_id = gf.ModelID.GAUSS_2D + + # run Gpufit + parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, \ + tolerance, max_number_iterations, None, estimator_id, None) + + # print fit results + converged = states == 0 + print('*Gpufit*') + + # print summary + print('\nmodel ID: {}'.format(model_id)) + print('number of fits: {}'.format(number_fits)) + print('fit size: {} x {}'.format(size_x, size_x)) + print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged]))) + print('iterations: {:.2f}'.format(np.mean(number_iterations[converged]))) + print('time: {:.2f} s'.format(execution_time)) + + # get fit states + number_converged = np.sum(converged) + print('\nratio converged {:6.2f} %'.format(number_converged / number_fits * 100)) + print('ratio max it. 
exceeded {:6.2f} %'.format(np.sum(states == 1) / number_fits * 100)) + print('ratio singular hessian {:6.2f} %'.format(np.sum(states == 2) / number_fits * 100)) + print('ratio neg curvature MLE {:6.2f} %'.format(np.sum(states == 3) / number_fits * 100)) + + # mean, std of fitted parameters + converged_parameters = parameters[converged, :] + converged_parameters_mean = np.mean(converged_parameters, axis=0) + converged_parameters_std = np.std(converged_parameters, axis=0) + print('\nparameters of 2D Gaussian peak') + for i in range(number_parameters): + print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i])) + diff --git a/Gpufit/python/examples/gauss2d_plot.py b/Gpufit/python/examples/gauss2d_plot.py new file mode 100644 index 0000000..d7feb8e --- /dev/null +++ b/Gpufit/python/examples/gauss2d_plot.py @@ -0,0 +1,114 @@ +""" + Example of the Python binding of the Gpufit library which implements + Levenberg Marquardt curve fitting in CUDA + https://github.com/gpufit/Gpufit + + Multiple fits of a 2D Gaussian peak function with Poisson distributed noise + repeated for a different total number of fits each time and plotting the results + http://gpufit.readthedocs.io/en/latest/bindings.html#python + + This example additionally requires numpy (http://www.numpy.org/) and matplotlib (http://matplotlib.org/). 
def gaussians_2d(x, y, p):
    """
    Generates many 2D Gaussian peaks, one per row of the parameter array.

    :param x: x positions - 2D NumPy array
    :param y: y positions - 2D NumPy array of the same shape as x
    :param p: Parameters - 2D NumPy array with one row per peak holding
              (amplitude, x center, y center, width, offset)
    :return: NumPy array of shape [number of peaks, x.shape[0], x.shape[1]] and data type np.float32
    """

    n_fits = p.shape[0]

    # separate name for the output so that the y coordinate argument is not overwritten
    peaks = np.zeros((n_fits, x.shape[0], x.shape[1]), dtype=np.float32)

    # loop over each fit
    for i in range(n_fits):
        pi = p[i, :]
        # use the coordinates passed in, not variables of the calling script
        arg = -(np.square(x - pi[1]) + np.square(y - pi[2])) / (2 * pi[3] * pi[3])
        peaks[i, :, :] = pi[0] * np.exp(arg) + pi[4]

    return peaks
np.random.rand(n_fits, 2)) + initial_parameters[:, (0,3,4)] *= 0.8 + 0.4 * np.random.rand(n_fits, 3) + + # run Gpufit + parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations) + + # analyze result + converged = states == 0 + speed[i] = n_fits / execution_time + precision_x0 = np.std(parameters[converged, 1] - test_parameters[converged, 1], axis=0, dtype=np.float64) + + # display result + '{} fits '.format(n_fits) + print('{:7} fits iterations: {:6.2f} | time: {:6.3f} s | speed: {:8.0f} fits/s'\ + .format(n_fits, np.mean(number_iterations[converged]), execution_time, speed[i])) + +# plot +plt.semilogx(n_fits_all, speed, 'bo-') +plt.grid(True) +plt.xlabel('number of fits per function call') +plt.ylabel('fits per second') +plt.legend(['Gpufit'], loc='upper left') +ax = plt.gca() +ax.set_xlim(n_fits_all[0], n_fits_all[-1]) + +plt.show() \ No newline at end of file diff --git a/Gpufit/python/examples/simple.py b/Gpufit/python/examples/simple.py new file mode 100644 index 0000000..5184001 --- /dev/null +++ b/Gpufit/python/examples/simple.py @@ -0,0 +1,30 @@ +""" + Example of the Python binding of the Gpufit library which implements + Levenberg Marquardt curve fitting in CUDA + https://github.com/gpufit/Gpufit + + Simple example demonstrating a minimal call of all needed parameters for the Python interface + http://gpufit.readthedocs.io/en/latest/bindings.html#python +""" + +import numpy as np +import pygpufit.gpufit as gf + +if __name__ == '__main__': + + # number of fits, number of points per fit + number_fits = 10 + number_points = 10 + + # model ID and number of parameter + model_id = gf.ModelID.GAUSS_1D + number_parameter = 5 + + # initial parameters + initial_parameters = np.zeros((number_fits, number_parameter), dtype=np.float32) + + # data + data = np.zeros((number_fits, number_points), dtype=np.float32) + + # run Gpufit + parameters, states, chi_squares, 
class ModelID():
    """
    Integer constants identifying the fit models.
    One of these values is passed as the model_id argument of fit().
    """

    GAUSS_1D = 0
    GAUSS_2D = 1
    GAUSS_2D_ELLIPTIC = 2
    GAUSS_2D_ROTATED = 3
    CAUCHY_2D_ELLIPTIC = 4
    LINEAR_1D = 5


class EstimatorID():
    """
    Integer constants identifying the estimators.
    One of these values is passed as the estimator_id argument of fit().
    """

    LSE = 0
    MLE = 1
def fit(data, weights, model_id, initial_parameters, tolerance=None, max_number_iterations=None, \
        parameters_to_fit=None, estimator_id=None, user_info=None):
    """
    Calls the C interface fit function in the library.
    (see also http://gpufit.readthedocs.io/en/latest/bindings.html#python)

    All 2D NumPy arrays must be in row-major order (standard in NumPy), i.e. array.flags.C_CONTIGUOUS must be True
    (see also https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html#internal-memory-layout-of-an-ndarray)

    :param data: The data - 2D NumPy array of dimension [number_fits, number_points] and data type np.float32
    :param weights: The weights - 2D NumPy array of the same dimension and data type as parameter data or None (no weights available)
    :param model_id: The model ID
    :param initial_parameters: Initial values for parameters - NumPy array of dimension [number_fits, number_parameters] and data type np.float32
    :param tolerance: The fit tolerance or None (will use default value)
    :param max_number_iterations: The maximal number of iterations or None (will use default value)
    :param parameters_to_fit: Which parameters to fit - NumPy array of length number_parameters and type np.int32 or None (will fit all parameters)
    :param estimator_id: The Estimator ID or None (will use default values)
    :param user_info: User info - NumPy array of type np.char or None (no user info available)
    :return: parameters, states, chi_squares, number_iterations, execution_time
    :raises RuntimeError: if an input has the wrong memory layout, dimension or data type,
        or if the library call returns a non-zero status
    """

    # check all 2D NumPy arrays for row-major memory layout (otherwise interpretation of order of dimensions fails)
    if not data.flags.c_contiguous:
        raise RuntimeError('Memory layout of data array mismatch.')

    if weights is not None and not weights.flags.c_contiguous:
        raise RuntimeError('Memory layout of weights array mismatch.')

    if not initial_parameters.flags.c_contiguous:
        raise RuntimeError('Memory layout of initial_parameters array mismatch.')

    # size check: data is 2D and read number of points and fits
    if data.ndim != 2:
        raise RuntimeError('data is not two-dimensional')
    number_points = data.shape[1]
    number_fits = data.shape[0]

    # size check: consistency with weights (if given)
    # (shapes are tuples, so != compares both their length and their entries)
    if weights is not None and data.shape != weights.shape:
        raise RuntimeError('dimension mismatch between data and weights')

    # size check: initial parameters is 2D and read number of parameters
    if initial_parameters.ndim != 2:
        raise RuntimeError('initial_parameters is not two-dimensional')
    number_parameters = initial_parameters.shape[1]
    if initial_parameters.shape[0] != number_fits:
        raise RuntimeError('dimension mismatch in number of fits between data and initial_parameters')

    # size check: consistency with parameters_to_fit (if given)
    if parameters_to_fit is not None and parameters_to_fit.shape[0] != number_parameters:
        raise RuntimeError('dimension mismatch in number of parameters between initial_parameters and parameters_to_fit')

    # default value: tolerance
    if not tolerance:
        tolerance = 1e-4

    # default value: max_number_iterations
    if not max_number_iterations:
        max_number_iterations = 25

    # default value: estimator ID
    if not estimator_id:
        estimator_id = EstimatorID.LSE

    # default value: parameters_to_fit
    if parameters_to_fit is None:
        parameters_to_fit = np.ones(number_parameters, dtype=np.int32)

    # now only weights and user_info could be not given

    # type check: data, weights (if given), initial_parameters are all np.float32
    if data.dtype != np.float32:
        raise RuntimeError('type of data is not np.float32')
    if weights is not None and weights.dtype != np.float32:
        raise RuntimeError('type of weights is not np.float32')
    if initial_parameters.dtype != np.float32:
        raise RuntimeError('type of initial_parameters is not np.float32')

    # type check: parameters_to_fit is np.int32
    if parameters_to_fit.dtype != np.int32:
        raise RuntimeError('type of parameters_to_fit is not np.int32')

    # we don't check type of user_info, but we extract the size in bytes of it
    if user_info is not None:
        user_info_size = user_info.nbytes
    else:
        user_info_size = 0

    # pre-allocate output variables
    parameters = np.zeros((number_fits, number_parameters), dtype=np.float32)
    states = np.zeros(number_fits, dtype=np.int32)
    chi_squares = np.zeros(number_fits, dtype=np.float32)
    number_iterations = np.zeros(number_fits, dtype=np.int32)

    # conversion to ctypes types for optional C interface parameters using NULL pointer (None) as default argument
    if weights is not None:
        weights_p = weights.ctypes.data_as(gpufit_func.argtypes[3])
    else:
        weights_p = None
    if user_info is not None:
        user_info_p = user_info.ctypes.data_as(gpufit_func.argtypes[11])
    else:
        user_info_p = None

    # time.clock was removed in Python 3.8; use time.perf_counter where it exists
    # and fall back to time.clock only on interpreters that lack it (Python 2)
    timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock

    # call into the library (measure time)
    t0 = timer()
    status = gpufit_func(
        gpufit_func.argtypes[0](number_fits), \
        gpufit_func.argtypes[1](number_points), \
        data.ctypes.data_as(gpufit_func.argtypes[2]), \
        weights_p, \
        gpufit_func.argtypes[4](model_id), \
        initial_parameters.ctypes.data_as(gpufit_func.argtypes[5]), \
        gpufit_func.argtypes[6](tolerance), \
        gpufit_func.argtypes[7](max_number_iterations), \
        parameters_to_fit.ctypes.data_as(gpufit_func.argtypes[8]), \
        gpufit_func.argtypes[9](estimator_id), \
        gpufit_func.argtypes[10](user_info_size), \
        user_info_p, \
        parameters.ctypes.data_as(gpufit_func.argtypes[12]), \
        states.ctypes.data_as(gpufit_func.argtypes[13]), \
        chi_squares.ctypes.data_as(gpufit_func.argtypes[14]), \
        number_iterations.ctypes.data_as(gpufit_func.argtypes[15]))
    t1 = timer()

    # check status
    if status != 0:
        # get error from last error and raise runtime error
        error_message = error_func()
        raise RuntimeError('status = {}, message = {}'.format(status, error_message))

    # return output values
    return parameters, states, chi_squares, number_iterations, t1 - t0
def generate_gauss_1d(parameters, x):
    """
    Evaluates a 1D Gaussian curve at the given positions.

    :param parameters: The parameters (amplitude a, center x0, width s, offset b)
    :param x: The x values
    :return: The curve values a * exp(-(x - x0)^2 / (2 * s^2)) + b
    """

    amplitude, center, width, offset = (parameters[k] for k in range(4))

    exponent = -np.square(x - center) / (2 * width**2)

    return amplitude * np.exp(exponent) + offset
class Test(unittest.TestCase):

    def test_linear_regression(self):
        """
        Runs a single LINEAR_1D fit through the two points (0, 0) and (1, 1) and
        checks that the fitted parameters equal the true parameters (0, 1).
        """
        # constants
        n_fits = 1
        n_points = 2
        n_parameter = 2

        # true parameters
        true_parameters = np.array((0, 1), dtype=np.float32)

        # data values
        data = np.empty((n_fits, n_points), dtype=np.float32)
        data[0, :] = (0, 1)

        # max number iterations
        max_number_iterations = 10

        # initial parameters
        initial_parameters = np.empty((n_fits, n_parameter), dtype=np.float32)
        initial_parameters[0, :] = (0, 0)

        # model id
        model_id = gf.ModelID.LINEAR_1D

        # tolerance
        tolerance = 0.001

        # user info
        user_info = np.array((0, 1), dtype=np.float32)

        # call to gpufit (the iteration limit is passed on so that the check below tests the limit in effect)
        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id,
                                                                                    initial_parameters, tolerance, \
                                                                                    max_number_iterations, None, None, user_info)

        # print results
        for i in range(n_parameter):
            print(' p{} true {} fit {}'.format(i, true_parameters[i], parameters[0, i]))
        print('fit state : {}'.format(states))
        print('chi square: {}'.format(chi_squares))
        print('iterations: {}'.format(number_iterations))
        print('time: {} s'.format(execution_time))

        # unittest assertions still run under "python -O" and report the failing condition
        self.assertTrue(np.all(chi_squares < 1e-6))
        self.assertTrue(np.all(states == 0))
        self.assertTrue(np.all(number_iterations <= max_number_iterations))
        for i in range(n_parameter):
            self.assertLess(abs(true_parameters[i] - parameters[0, i]), 1e-6)
// Fills values with an elliptic 2D Cauchy peak sampled on a square grid.
//
// The grid edge length is the integer square root of SIZE, points are stored
// row by row (index = y * size_x + x) and the peak is centered on the grid.
// Peak parameters are fixed: amplitude 4, widths 0.4 (x) and 0.6 (y), offset 1.
// If SIZE is not a perfect square, the trailing elements are left untouched.
template <std::size_t SIZE>
void generate_cauchy_2d_elliptic(std::array< float, SIZE>& values)
{
    int const size_x = int(std::sqrt(SIZE));
    int const size_y = size_x;

    float const a = 4;
    float const x0 = (float(size_x) - 1.f) / 2.f;
    float const y0 = (float(size_y) - 1.f) / 2.f;
    float const sx = 0.4f;
    float const sy = 0.6f;
    float const b = 1.f;

    for (int point_index_y = 0; point_index_y < size_y; point_index_y++)
    {
        for (int point_index_x = 0; point_index_x < size_x; point_index_x++)
        {
            int const point_index = point_index_y * size_x + point_index_x;
            float const argx = ((x0 - point_index_x) / sx) *((x0 - point_index_x) / sx) + 1.f;
            float const argy = ((y0 - point_index_y) / sy) *((y0 - point_index_y) / sy) + 1.f;
            values[point_index] = a / argx / argy + b;
        }
    }
}
a/Gpufit/tests/Error_Handling.cpp b/Gpufit/tests/Error_Handling.cpp new file mode 100644 index 0000000..c35a078 --- /dev/null +++ b/Gpufit/tests/Error_Handling.cpp @@ -0,0 +1,51 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Gpufit/gpufit.h" + +#include + +#include + +BOOST_AUTO_TEST_CASE( Error_Handling ) +{ + std::size_t const n_fits{ 1 } ; + std::size_t const n_points{ 2 } ; + std::array< float, n_points > data{ { 0, 1 } } ; + std::array< float, n_points > weights{ { 1, 1 } } ; + std::array< float, 2 > initial_parameters{ { 0, 0 } } ; + float tolerance{ 0.001f } ; + int max_n_iterations{ 10 } ; + std::array< int, 2 > parameters_to_fit{ { 0, 0 } } ; + std::array< int, 2 > user_info{ { 0, 1 } } ; + std::array< float, 2 > output_parameters ; + int output_states ; + float output_chi_square ; + int output_n_iterations ; + + int const status + = gpufit + ( + n_fits, + n_points, + data.data(), + weights.data(), + LINEAR_1D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + n_points * sizeof( int ), + reinterpret_cast< char * >( user_info.data() ), + output_parameters.data(), + & output_states, + & output_chi_square, + & output_n_iterations + ) ; + + BOOST_CHECK( status == - 1 ) ; + + std::string const error = gpufit_get_last_error() ; + + BOOST_CHECK( error == "invalid configuration argument" ) ; +} diff --git a/Gpufit/tests/Gauss_Fit_1D.cpp b/Gpufit/tests/Gauss_Fit_1D.cpp new file mode 100644 index 0000000..81a8c64 --- /dev/null +++ b/Gpufit/tests/Gauss_Fit_1D.cpp @@ -0,0 +1,87 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Gpufit/gpufit.h" + +#include + +#include + +template +void generate_gauss_1d( + std::array< float, n_points >& values, + std::array< float, 4 > const & parameters ) +{ + float const a = parameters[ 0 ]; + float const x0 = parameters[ 1 ]; + float const s = parameters[ 2 ]; + float const b = parameters[ 3 ]; + + for ( int point_index = 0; point_index < n_points; point_index++ ) + { + float const 
argx = ( ( point_index - x0 )*( point_index - x0 ) ) / ( 2.f * s * s ); + float const ex = exp( -argx ); + values[ point_index ] = a * ex + b; + } +} + +BOOST_AUTO_TEST_CASE( Gauss_Fit_1D ) +{ + /* + Performs a single fit using the GAUSS_1D model. + - Doesn't use user_info or weights. + - No noise is added. + - Checks fitted parameters equalling the true parameters. + */ + + std::size_t const n_fits{ 1 } ; + std::size_t const n_points{ 5 } ; + + std::array< float, 4 > const true_parameters{ { 4.f, 2.f, 0.5f, 1.f } }; + + std::array< float, n_points > data{}; + generate_gauss_1d( data, true_parameters ); + + std::array< float, 4 > initial_parameters{ { 2.f, 1.5f, 0.3f, 0.f } }; + + float tolerance{ 0.001f }; + + int max_n_iterations{ 10 }; + + std::array< int, 4 > parameters_to_fit{ { 1, 1, 1, 1 } }; + + std::array< float, 4 > output_parameters; + int output_states; + float output_chi_square; + int output_n_iterations; + + int const status + = gpufit + ( + n_fits, + n_points, + data.data(), + 0, + GAUSS_1D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + output_parameters.data(), + &output_states, + &output_chi_square, + &output_n_iterations + ) ; + + BOOST_CHECK( status == 0 ) ; + BOOST_CHECK( output_states == 0 ); + BOOST_CHECK( output_chi_square < 1e-6f ); + BOOST_CHECK( output_n_iterations <= max_n_iterations ); + + BOOST_CHECK( std::fabsf(output_parameters[ 0 ] - true_parameters[ 0 ] ) < 1e-6f ); + BOOST_CHECK( std::fabsf(output_parameters[ 1 ] - true_parameters[ 1 ] ) < 1e-6f ); + BOOST_CHECK( std::fabsf(output_parameters[ 2 ] - true_parameters[ 2 ] ) < 1e-6f ); + BOOST_CHECK( std::fabsf(output_parameters[ 3 ] - true_parameters[ 3 ] ) < 1e-6f ); +} diff --git a/Gpufit/tests/Gauss_Fit_2D.cpp b/Gpufit/tests/Gauss_Fit_2D.cpp new file mode 100644 index 0000000..0222933 --- /dev/null +++ b/Gpufit/tests/Gauss_Fit_2D.cpp @@ -0,0 +1,96 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Gpufit/gpufit.h" + 
// Fills values with a symmetric 2D Gaussian peak sampled on a square grid.
//
// The grid edge length is the integer square root of SIZE, points are stored
// row by row (index = y * size_x + x) and the peak is centered on the grid.
// Peak parameters are fixed: amplitude 4, width 0.5, offset 1.
// If SIZE is not a perfect square, the trailing elements are left untouched.
template <std::size_t SIZE>
void generate_gauss_2d(std::array< float , SIZE>& values)
{
    int const size_x = int(std::sqrt(SIZE));
    int const size_y = size_x;

    float const a = 4.f;
    float const x0 = (float(size_x) - 1.f) / 2.f;
    float const y0 = (float(size_y) - 1.f) / 2.f;
    float const s = 0.5f;
    float const b = 1.f;

    for (int point_index_y = 0; point_index_y < size_y; point_index_y++)
    {
        for (int point_index_x = 0; point_index_x < size_x; point_index_x++)
        {
            int const point_index = point_index_y * size_x + point_index_x;
            float const argx = ((point_index_x - x0)*(point_index_x - x0)) / (2.f * s * s);
            float const argy = ((point_index_y - y0)*(point_index_y - y0)) / (2.f * s * s);
            // std::exp selects the float overload on every implementation
            float const ex = std::exp(-argx) * std::exp(-argy);
            values[point_index] = a * ex + b;
        }
    }
}
+ &output_chi_square, + &output_n_iterations + ) ; + + BOOST_CHECK( status_with_weights == 0 ) ; +} diff --git a/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp new file mode 100644 index 0000000..072169c --- /dev/null +++ b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp @@ -0,0 +1,74 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Gpufit/gpufit.h" + +#include + +#include + +template +void generate_gauss_2d_elliptic(std::array< float, SIZE>& values) +{ + int const size_x = int(std::sqrt(SIZE)); + int const size_y = size_x; + + float const a = 4; + float const x0 = (float(size_x) - 1.f) / 2.f; + float const y0 = (float(size_y) - 1.f) / 2.f; + float const sx = 0.4f; + float const sy = 0.6f; + float const b = 1.f; + + for (int point_index_y = 0; point_index_y < size_y; point_index_y++) + { + for (int point_index_x = 0; point_index_x < size_x; point_index_x++) + { + int const point_index = point_index_y * size_x + point_index_x; + float const argx = ((point_index_x - x0)*(point_index_x - x0)) / (2.f * sx * sx); + float const argy = ((point_index_y - y0)*(point_index_y - y0)) / (2.f* sy * sy); + float const ex = exp(-argx) * exp(-argy); + values[point_index] = a * ex + b; + } + } +} + +BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Elliptic ) +{ + std::size_t const n_fits{ 1 } ; + std::size_t const n_points{ 25 } ; + std::array< float, n_points > data{}; + generate_gauss_2d_elliptic(data); + std::array< float, n_points > weights{}; + std::fill(weights.begin(), weights.end(), 1.f); + std::array< float, 6 > initial_parameters{ { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f } }; + float tolerance{ 0.001f }; + int max_n_iterations{ 10 }; + std::array< int, 6 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1 } }; + std::array< float, 6 > output_parameters; + int output_states; + float output_chi_square; + int output_n_iterations; + + int const status + = gpufit + ( + n_fits, + n_points, + data.data(), + weights.data(), + GAUSS_2D_ELLIPTIC, + initial_parameters.data(), + tolerance, + 
max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + output_parameters.data(), + &output_states, + &output_chi_square, + &output_n_iterations + ) ; + + BOOST_CHECK( status == 0 ) ; +} diff --git a/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp new file mode 100644 index 0000000..55cd682 --- /dev/null +++ b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp @@ -0,0 +1,77 @@ +#define BOOST_TEST_MODULE Gpufit + +#define PI 3.1415926535897f + +#include "Gpufit/gpufit.h" + +#include + +#include + +template +void generate_gauss_2d_rotated(std::array< float, SIZE>& values) +{ + int const size_x = int(std::sqrt(SIZE)); + int const size_y = size_x; + + float const a = 10.f; + float const x0 = (float(size_x) - 1.f) / 2.f; + float const y0 = (float(size_y) - 1.f) / 2.f; + float const sx = 0.4f; + float const sy = 0.5f; + float const b = 1.f; + float const r = PI / 16.f; + + for (int point_index_y = 0; point_index_y < size_y; point_index_y++) + { + for (int point_index_x = 0; point_index_x < size_x; point_index_x++) + { + int const point_index = point_index_y * size_x + point_index_x; + float const arga = ((point_index_x - x0) * cosf(r)) - ((point_index_y - y0) * sinf(r)); + float const argb = ((point_index_x - x0) * sinf(r)) + ((point_index_y - y0) * cosf(r)); + float const ex = exp((-0.5f) * (((arga / sx) * (arga / sx)) + ((argb / sy) * (argb / sy)))); + values[point_index] = a * ex + b; + } + } +} + +BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Rotated ) +{ + std::size_t const n_fits{ 1 } ; + std::size_t const n_points{ 64 } ; + std::array< float, n_points > data{}; + generate_gauss_2d_rotated(data); + std::array< float, n_points > weights{}; + std::fill(weights.begin(), weights.end(), 1.f); + std::array< float, 7 > initial_parameters{ { 8.f, 3.4f, 3.6f, 0.4f, 0.5f, 2.f, 0.f } }; + float tolerance{ 0.001f }; + int max_n_iterations{ 10 }; + std::array< int, 7 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1, 1 } }; + std::array< float, 7 > output_parameters; + int 
output_states; + float output_chi_square; + int output_n_iterations; + + int const status + = gpufit + ( + n_fits, + n_points, + data.data(), + weights.data(), + GAUSS_2D_ROTATED, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + output_parameters.data(), + &output_states, + &output_chi_square, + &output_n_iterations + ) ; + + BOOST_CHECK( status == 0 ) ; +} diff --git a/Gpufit/tests/Linear_Fit_1D.cpp b/Gpufit/tests/Linear_Fit_1D.cpp new file mode 100644 index 0000000..abd7c81 --- /dev/null +++ b/Gpufit/tests/Linear_Fit_1D.cpp @@ -0,0 +1,101 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Gpufit/gpufit.h" + +#include + +#include + +BOOST_AUTO_TEST_CASE( Linear_Fit_1D ) +{ + /* + Performs a single fit using the Linear Fit (LINEAR_1D) model. + - Uses user info + - Uses trivial weights. + - No noise is added. + - Checks that the fitted parameters equal the true parameters, with both the LSE and the MLE estimator. + */ + + std::size_t const n_fits{ 1 } ; + std::size_t const n_points{ 2 } ; + + std::array< float, 2 > const true_parameters{ { 1, 1 } }; + + std::array< float, n_points > data{ { 1, 2 } } ; + + std::array< float, n_points > weights{ { 1, 1 } } ; + + std::array< float, 2 > initial_parameters{ { 1, 0 } } ; + + float tolerance{ 0.001f } ; + + int max_n_iterations{ 10 } ; + + std::array< int, 2 > parameters_to_fit{ { 1, 1 } } ; + + std::array< float, n_points > user_info{ { 0.f, 1.f } } ; + + std::array< float, 2 > output_parameters ; + int output_states ; + float output_chi_squares ; + int output_n_iterations ; + + // test with LSE + int status = gpufit + ( + n_fits, + n_points, + data.data(), + weights.data(), + LINEAR_1D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + n_points * sizeof( float ), + reinterpret_cast< char * >( user_info.data() ), + output_parameters.data(), + & output_states, + & output_chi_squares, + & output_n_iterations + ) ; + + BOOST_CHECK( status == 0 ) ; + BOOST_CHECK( 
output_states == 0 ); + BOOST_CHECK( output_n_iterations <= max_n_iterations ); + BOOST_CHECK( output_chi_squares < 1e-6f ); + + /* std::fabs (float overload) is used because std::fabsf is not declared in namespace std by every standard library. */ BOOST_CHECK(std::fabs(output_parameters[0] - true_parameters[0]) < 1e-6f); + BOOST_CHECK(std::fabs(output_parameters[1] - true_parameters[1]) < 1e-6f); + + // test with MLE + status = gpufit + ( + n_fits, + n_points, + data.data(), + weights.data(), + LINEAR_1D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + MLE, + n_points * sizeof(float), + reinterpret_cast< char * >(user_info.data()), + output_parameters.data(), + &output_states, + &output_chi_squares, + &output_n_iterations + ); + + BOOST_CHECK(status == 0); + BOOST_CHECK(output_states == 0); + BOOST_CHECK(output_n_iterations <= max_n_iterations); + BOOST_CHECK(output_chi_squares < 1e-6f); + + BOOST_CHECK(std::fabs(output_parameters[0] - true_parameters[0]) < 1e-6f); + BOOST_CHECK(std::fabs(output_parameters[1] - true_parameters[1]) < 1e-4f); + +} diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..6fe98c3 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..498877e --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# Gpufit + +Levenberg Marquardt curve fitting in CUDA. + +Homepage: [github.com/gpufit/Gpufit](https://github.com/gpufit/Gpufit) + +## Quick start instructions + +To verify that Gpufit is working correctly on the host computer, go to the folder gpufit_performance_test of the binary package and run Gpufit_Cpufit_Performance_Comparison.exe. Further details of the test executable can be found in the documentation package. + +## Binary distribution + +The latest Gpufit binary release, supporting Windows 32-bit and 64-bit machines, can be found on the [release page](https://github.com/gpufit/Gpufit/releases). + +## Documentation + +[![Documentation Status](https://readthedocs.org/projects/gpufit/badge/?version=latest)](http://gpufit.readthedocs.io/en/latest/?badge=latest) + +Documentation for the Gpufit library may be found online ([latest documentation](http://gpufit.readthedocs.io/en/latest/?badge=latest)), and also +as a PDF file in the binary distribution of Gpufit. + +## Building Gpufit from source code + +Instructions for building Gpufit are found in the documentation: [Building from source code](https://github.com/gpufit/Gpufit/blob/master/docs/installation.rst). + +## Using the Gpufit binary distribution + +Instructions for using the binary distribution may be found in the documentation. 
The binary package contains: + +- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and + the Gpufit header file which contains the function definitions. The Gpufit + SDK is intended to be used when calling Gpufit from an external application + written in e.g. C code. +- Gpufit Performance test: A simple console application comparing the execution speed of curve fitting on the GPU and CPU. This program also serves as a test to ensure the correct functioning of Gpufit. +- Matlab 32 bit and 64 bit bindings, with Matlab examples. +- Python version 2.x and version 3.x bindings (compiled as wheel files) and + Python examples. +- The Gpufit manual in PDF format. + +## License + +MIT License + +Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..6c92e05 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,15 @@ +.wy-nav-content { + max-width: 1100px !important; +} + +@media screen and (max-width: 767px) { + .wy-table-responsive table td { + white-space: nowrap; + } +} + +@media screen and (min-width: 768px) { + .wy-table-responsive table td { + white-space: normal; + } +} \ No newline at end of file diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000..b0a4480 --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,4 @@ +{% extends "!layout.html" %} +{% block extrahead %} + +{% endblock %} \ No newline at end of file diff --git a/docs/appendix.rst b/docs/appendix.rst new file mode 100644 index 0000000..103df3e --- /dev/null +++ b/docs/appendix.rst @@ -0,0 +1,31 @@ +======== +Appendix +======== + +Levenberg-Marquardt algorithm +----------------------------- + +A flowchart of the implementation of the Levenberg-Marquardt algorithm is given in :numref:`appendix-gpufit-flowchart`. + +.. _appendix-gpufit-flowchart: + +.. figure:: /images/gpufit_program_flow_skeleton_v2.png + :width: 14 cm + :align: center + + Levenberg-Marquardt algorithm flow as implemented in |GF|. + + +Performance comparison to other GPU benchmarks +---------------------------------------------- + +Using the bundled application to estimate the fitting speed per second of 10 million fits for various CUDA capable +graphics cards of various architectures (on different computers with different versions of graphics drivers) we can +compare to the results of Passmark G3D. By and large, the results seem to correlate, i.e. a high Passmark G3D score +also relates to a high Gpufit fitting speed. + +.. 
figure:: /images/Gpufit_PassmarkG3D_relative_performance.png + :width: 14 cm + :align: center + + Performance of Gpufit vs Passmark G3D \ No newline at end of file diff --git a/docs/bindings.rst b/docs/bindings.rst new file mode 100644 index 0000000..ff3d914 --- /dev/null +++ b/docs/bindings.rst @@ -0,0 +1,413 @@ +.. _external-bindings: + +================= +External bindings +================= + +This section describes the Gpufit bindings to other programming languages. The bindings (e.g. to Python or Matlab) aim to +emulate the :ref:`c-interface` as closely as possible. + +Most high level languages feature multidimensional numerical arrays. In the bindings implemented for Matlab and Python, +we adopt the convention that the input data should be organized as a 2D array, with one dimension corresponding to the +number of data points per fit, and the other corresponding to the number of fits. Internally, in memory, these arrays should +always be ordered such that the data values for each fit are kept together. In Matlab, for example, this means storing the +data in an array with dimensions [number_points_per_fit, number_fits]. In this manner, the data in memory is ordered in the +same way that is expected by the Gpufit C interface, and there is no need to copy or otherwise re-organize the data +before passing it to the GPU. The same convention is used for the weights, the initial model parameters, and the output parameters. + +Unlike the C interface, the external bindings do not require the number of fits and the number of data points per fit to be +specified explicitly. Instead, these numbers are inferred from the dimensions of the 2D input arrays. + +Optional parameters with default values +--------------------------------------- + +The external bindings make some input parameters optional. The optional parameters are shown here. 
+ +:tolerance: + default value 1e-4 +:max_n_iterations: + default value 25 iterations +:estimator_id: + the default estimator is LSE as defined in gpufit.h_ +:parameters_to_fit: + by default all parameters are fit + +For instructions on how to specify these parameters explicitly, see the sections below. + +Python +------ + +The Gpufit binding for Python is a project named pyGpufit. This project contains a Python package named pygpufit, which +contains a module gpufit, and this module implements a method called fit. Calling this method is equivalent to +calling the C interface function *gpufit()* of |GF|. The package expects the input data to be +stored as NumPy array. NumPy follows row-major order by default. + +Installation +++++++++++++ + +Wheel files for Python 2.X and 3.X on Windows 32/64 bit are included in the binary package. NumPy is required. + +Install the wheel file with. + +.. code-block:: bash + + pip install --no-index --find-links=LocalPathToWheelFile pyGpufit + +Python Interface +++++++++++++++++ + +Optional parameters are passed in as None. The numbers of points, fits and parameters is deduced from the dimensions of +the input data and initial parameters arrays. + +The signature of the gpufit method is + +.. code-block:: python + + def fit(data, weights, model_id:ModelID, initial_parameters, tolerance:float=None, max_number_iterations:int=None, parameters_to_fit=None, estimator_id:EstimatorID=None, user_info=None): + +*Input parameters* + +:data: Data + 2D NumPy array of shape (number_fits, number_points) and data type np.float32 +:weights: Weights + 2D NumPy array of shape (number_fits, number_points) and data type np.float32 (same as data) + + :special: None indicates that no weights are available +:tolerance: Fit tolerance + + :type: float + :special: If None, the default value will be used. +:max_number_iterations: Maximal number of iterations + + :type: int + :special: If None, the default value will be used. 
+:estimator_id: estimator ID + + :type: EstimatorID which is an Enum in the same module and defined analogously to gpufit.h_. + :special: If None, the default value is used. +:model_id: model ID + + :type: ModelID which is an Enum in the same module and defined analogously to gpufit.h_. +:initial_parameters: Initial parameters + 2D NumPy array of shape (number_fits, number_parameter) + + :array data type: np.float32 +:parameters_to_fit: parameters to fit + 1D NumPy array of length number_parameter + A zero indicates that this parameter should not be fitted, everything else means it should be fitted. + + :array data type: np.int32 + :special: If None, the default value is used. +:user_info: user info + 1D NumPy array of arbitrary type. The length in bytes is deduced automatically. + + :special: If None, no user_info is assumed. + +*Output parameters* + +:parameters: Fitted parameters for each fit + 2D NumPy array of shape (number_fits, number_parameter) and data type np.float32 +:states: Fit result states for each fit + 1D NumPy array of length number_fits of data type np.int32 + As defined in gpufit.h_. +:chi_squares: :math:`\chi^2` values for each fit + 1D NumPy array of length number_fits of data type np.float32 +:n_iterations: Number of iterations done for each fit + 1D NumPy array of length number_fits of data type np.int32 +:time: Execution time of call to fit + In seconds. + +Errors are raised if checks on parameters fail or if the execution of fit failed. + +Python Examples ++++++++++++++++ + +2D Gaussian peak example +........................ + +An example can be found at `Python Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`. + +The essential imports are: + +.. code-block:: python + + import numpy as np + import pygpufit.gpufit as gf + +The true parameters describing an example 2D Gaussian peak function are: + +.. 
code-block:: python + + # true parameters + true_parameters = np.array((10, 5.5, 5.5, 3, 10), dtype=np.float32) + +A 2D grid of x and y positions can conveniently be generated using the np.meshgrid function: + +.. code-block:: python + + # generate x and y values + g = np.arange(size_x) + yi, xi = np.meshgrid(g, g, indexing='ij') + xi = xi.astype(np.float32) + yi = yi.astype(np.float32) + +Using these positions and the true parameter values a model function can be calculated as + +.. code-block:: python + + def generate_gauss_2d(p, xi, yi): + """ + Generates a 2D Gaussian peak. + http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d + + :param p: Parameters (amplitude, x,y center position, width, offset) + :param xi: x positions + :param yi: y positions + :return: The Gaussian 2D peak. + """ + + arg = -(np.square(xi - p[1]) + np.square(yi - p[2])) / (2*p[3]*p[3]) + y = p[0] * np.exp(arg) + p[4] + + return y + +The model function can be repeated and noise can be added using the np.tile and np.random.poisson functions. + +.. code-block:: python + + # generate data + data = generate_gauss_2d(true_parameters, xi, yi) + data = np.reshape(data, (1, number_points)) + data = np.tile(data, (number_fits, 1)) + + # add Poisson noise + data = np.random.poisson(data) + data = data.astype(np.float32) + +The model and estimator IDs can be set as + +.. code-block:: python + + # estimator ID + estimator_id = gf.EstimatorID.MLE + + # model ID + model_id = gf.ModelID.GAUSS_2D + +When all input parameters are set we can call the C interface of Gpufit. + +.. code-block:: python + + # run Gpufit + parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations, None, estimator_id, None) + +And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the +fitted parameters are limited to those fits that converged. + +.. 
code-block:: python + + # print fit results + + # get fit states + converged = states == 0 + number_converged = np.sum(converged) + print('ratio converged {:6.2f} %'.format(number_converged / number_fits * 100)) + print('ratio max it. exceeded {:6.2f} %'.format(np.sum(states == 1) / number_fits * 100)) + print('ratio singular hessian {:6.2f} %'.format(np.sum(states == 2) / number_fits * 100)) + print('ratio neg curvature MLE {:6.2f} %'.format(np.sum(states == 3) / number_fits * 100)) + print('ratio gpu not read {:6.2f} %'.format(np.sum(states == 4) / number_fits * 100)) + + # mean, std of fitted parameters + converged_parameters = parameters[converged, :] + converged_parameters_mean = np.mean(converged_parameters, axis=0) + converged_parameters_std = np.std(converged_parameters, axis=0) + + for i in range(number_parameters): + print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i])) + + # print summary + print('model ID: {}'.format(model_id)) + print('number of fits: {}'.format(number_fits)) + print('fit size: {} x {}'.format(size_x, size_x)) + print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged]))) + print('iterations: {:.2f}'.format(np.mean(number_iterations[converged]))) + print('time: {:.2f} s'.format(execution_time)) + + +Matlab +------ + +The Matlab binding for Gpufit is a Matlab script (gpufit.m_). This script checks the input data, sets default parameters, and +calls the C interface of |GF|, via a compiled .mex file. + +Please note, that before using the Matlab binding, the path to gpufit.m_ must be added to the Matlab path. + +If other GPU-based computations are to be performed with Matlab in the same session, please use the Matlab GPU computing +functionality first (for example with a call to gpuDevice or gpuArray) before calling the Gpufit Matlab binding. 
If this is not +done, Matlab will throw an error (Error using gpuArray An unexpected error occurred during CUDA execution. +The CUDA error was: cannot set while device is active in this process). + +Matlab Interface +++++++++++++++++ + +Optional parameters are passed in as empty matrices (``[]``). The numbers of points, fits and parameters are deduced from the dimensions of +the input data and initial parameters matrices. + +The signature of the gpufit function is + +.. code-block:: matlab + + function [parameters, states, chi_squares, n_iterations, time] = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info) + +*Input parameters* + +:data: Data + 2D matrix of size [number_points, number_fits] and data type single +:weights: Weights + 2D matrix of size [number_points, number_fits] and data type single (same as data) + + :special: An empty matrix ([]) indicates that no weights are available +:tolerance: Fit tolerance + + :type: single + :special: If empty ([]), the default value will be used. +:max_n_iterations: Maximal number of iterations + Will be converted to int32 if necessary + + :special: If empty ([]), the default value will be used. +:estimator_id: estimator ID + + :type: EstimatorID which is defined in EstimatorID.m analogously to gpufit.h_. + :special: If empty ([]), the default value is used. +:model_id: model ID + + :type: ModelID which is defined in ModelID.m analogously to gpufit.h_. +:initial_parameters: Initial parameters + 2D matrix of size: [number_parameter, number_fits] + + :type: single +:parameters_to_fit: parameters to fit + vector of length number_parameter, will be converted to int32 if necessary + A zero indicates that this parameter should not be fitted, everything else means it should be fitted. + + :special: If empty ([]), the default value is used. +:user_info: user info + vector of arbitrary type. The length in bytes is deduced automatically. 
+ +*Output parameters* + +:parameters: Fitted parameters for each fit + 2D matrix of size: [number_parameter, number_fits] of data type single +:states: Fit result states for each fit + vector of length number_fits of data type int32 + As defined in gpufit.h_. +:chi_squares: :math:`\chi^2` values for each fit + vector of length number_fits of data type single +:n_iterations: Number of iterations done for each fit + vector of length number_fits of data type int32 +:time: Execution time of call to gpufit + In seconds. + +Errors are raised if checks on parameters fail or if the execution of gpufit fails. + +Matlab Examples ++++++++++++++++ + +Simple example +.............. + +The simplest example is the `Matlab simple example`_. It is equivalent to :ref:`c-example-simple` and additionally +relies on default values for optional arguments. + +2D Gaussian peak example +........................ + +An example can be found at `Matlab Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`. + +The true parameters describing an example 2D Gaussian peak function are: + +.. code-block:: matlab + + % true parameters + true_parameters = single([10, 5.5, 5.5, 3, 10]); + +A 2D grid of x and y positions can conveniently be generated using the ndgrid function: + +.. code-block:: matlab + + % generate x and y values + g = single(0 : size_x - 1); + [x, y] = ndgrid(g, g); + +Using these positions and the true parameter values a model function can be calculated as + +.. code-block:: matlab + + function g = gaussian_2d(x, y, p) + % Generates a 2D Gaussian peak. + % http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d + % + % x,y - x and y grid position values + % p - parameters (amplitude, x,y center position, width, offset) + + g = p(1) * exp(-((x - p(2)).^2 + (y - p(3)).^2) / (2 * p(4)^2)) + p(5); + + end + +The model function can be repeated and noise can be added using the repmat and poissrnd functions. + +.. 
code-block:: matlab + + % generate data with Poisson noise + data = gaussian_2d(x, y, true_parameters); + data = repmat(data(:), [1, number_fits]); + data = poissrnd(data); + +The model and estimator IDs can be set as + +.. code-block:: matlab + + % estimator id + estimator_id = EstimatorID.MLE; + + % model ID + model_id = ModelID.GAUSS_2D; + +When all input parameters are set we can call the C interface of |GF|. + +.. code-block:: matlab + + %% run Gpufit + [parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []); + +And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the +fitted parameters are limited to those fits that converged. + +.. code-block:: matlab + + %% displaying results + + % get fit states + converged = states == 0; + number_converged = sum(converged); + fprintf(' ratio converged %6.2f %%\n', number_converged / number_fits * 100); + fprintf(' ratio max it. 
exceeded %6.2f %%\n', sum(states == 1) / number_fits * 100); + fprintf(' ratio singular hessian %6.2f %%\n', sum(states == 2) / number_fits * 100); + fprintf(' ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100); + fprintf(' ratio gpu not read %6.2f %%\n', sum(states == 4) / number_fits * 100); + + % mean and std of fitted parameters + converged_parameters = parameters(:, converged); + converged_parameters_mean = mean(converged_parameters, 2); + converged_parameters_std = std(converged_parameters, [], 2); + for i = 1 : number_parameters + fprintf(' p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i)); + end + + % print summary + fprintf('model ID: %d\n', model_id); + fprintf('number of fits: %d\n', number_fits); + fprintf('fit size: %d x %d\n', size_x, size_x); + fprintf('mean chi-square: %6.2f\n', mean(chi_squares(converged))); + fprintf('iterations: %6.2f\n', mean(n_iterations(converged))); + fprintf('time: %6.2f s\n', time); diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..fe55fe3 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,457 @@ +# -*- coding: utf-8 -*- +import sphinx_rtd_theme +# +# RTD Spielwiese documentation build configuration file, created by +# sphinx-quickstart on Tue Oct 04 12:39:10 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.4' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', + 'sphinx.ext.todo' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Gpufit: An open-source toolkit for GPU-accelerated curve fitting' +copyright = 'All rights reserved.' +author = 'Adrian Przybylski, Björn Thiel, Jan Keller-Findeisen, Bernd Stock, and Mark Bates' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'1.0' +# The full version, including alpha/beta/rc tags. +release = u'1.0.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. 
+# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# read epilog.rst +with open('epilog.txt') as f: + rst_epilog = f.read() + +# default highlight language is cpp +highlight_language = 'cpp' + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + +numfig = True + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +#html_theme_options = { +# 'collapse_navigation': False, +# 'display_version': False, +# 'navigation_depth': 3, +#} + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. 
+# +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +# html_title = u'RTD Spielwiese v1' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. 
+# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'Gpufit' + +# -- Options for LaTeX output --------------------------------------------- + + +# make code smaller in latex output +# see also: http://stackoverflow.com/questions/9899283/how-do-you-change-the-code-example-font-size-in-latex-pdf-output-with-sphinx +from sphinx.highlighting import PygmentsBridge +from pygments.formatters.latex import LatexFormatter + +class CustomLatexFormatter(LatexFormatter): + def __init__(self, **options): + super(CustomLatexFormatter, self).__init__(**options) + self.verboptions = r"formatcom=\footnotesize" + +PygmentsBridge.latex_formatter = CustomLatexFormatter + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + 'papersize': 'a4paper,oneside', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Gpufit.tex', 'Gpufit Documentation', + 'Gpufit', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +latex_show_pagerefs = True + +# If true, show URL addresses after external links. +# +# latex_show_urls = 'footnote' +latex_show_urls = 'no' + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. 
+# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'gpufit', 'Gpufit Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +# +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Gpufit', 'Gpufit Documentation', + author, 'Gpufit', 'Levenberg Marquardt curve fitting in CUDA', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# +# texinfo_appendices = [] + +# If false, no module index is generated. +# +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# +# texinfo_no_detailmenu = False + + +# -- Options for Epub output ---------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project +epub_author = author +epub_publisher = author +epub_copyright = copyright + +# The basename for the epub file. It defaults to the project name. +# epub_basename = project + +# The HTML theme for the epub output. Since the default themes are not +# optimized for small screen space, using the same theme for HTML and epub +# output is usually not wise. This defaults to 'epub', a theme designed to save +# visual space. +# +# epub_theme = 'epub' + +# The language of the text. It defaults to the language option +# or 'en' if the language is not set. 
+# +# epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +# epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A tuple containing the cover image and cover page html template filenames. +# +# epub_cover = () + +# A sequence of (type, uri, title) tuples for the guide element of content.opf. +# +# epub_guide = () + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +# +# epub_pre_files = [] + +# HTML files that should be inserted after the pages created by sphinx. +# The format is a list of tuples containing the path and title. +# +# epub_post_files = [] + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# The depth of the table of contents in toc.ncx. +# +# epub_tocdepth = 3 + +# Allow duplicate toc entries. +# +# epub_tocdup = True + +# Choose between 'default' and 'includehidden'. +# +# epub_tocscope = 'default' + +# Fix unsupported image types using the Pillow. +# +# epub_fix_images = False + +# Scale large images. +# +# epub_max_image_width = 0 + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# epub_show_urls = 'inline' + +# If false, no index is generated. +# +# epub_use_index = True diff --git a/docs/customization.rst b/docs/customization.rst new file mode 100644 index 0000000..4fcfec5 --- /dev/null +++ b/docs/customization.rst @@ -0,0 +1,299 @@ +.. _gpufit-customization: + +============= +Customization +============= + +This sections explains how to add custom fit model functions and custom fit estimators within |GF|. +Functions calculating the estimator and model values are defined in CUDA header files using the CUDA C syntax. +For each function and estimator there exists a separate file. 
Therefore, to add an additional model or estimator a new +CUDA header file containing the new model or estimator function must be created and included in the library. + +Please note that in order to add a model function or estimator, it is necessary to rebuild the Gpufit library +from source. In future releases of Gpufit, it may be possible to include new fit functions or estimators at runtime. + + +Add a new fit model function +---------------------------- + +To add a new fit model, the model function itself as well as analytic expressions for its partial derivatives +must be known. A function calculating the values of the model as well as a function calculating the +values of the partial derivatives of the model, with respect to the model parameters and possible grid +coordinates, must be implemented. + +Additionally, a new model ID must be defined and included in the list of available model IDs, and the number +of model parameters must be specified as well. + +Detailed step by step instructions for adding a model function are given below. + +1. Define an additional model ID in file gpufit.h_ +2. Implement a CUDA device function within a newly created .cuh file according to the following template. + +.. code-block:: cuda + + __device__ void ... 
( // function name + float const * parameters, + int const n_fits, + int const n_points, + int const n_parameters, + float * values, + float * derivatives, + int const chunk_index, + char * user_info, + std::size_t const user_info_size) + { + ///////////////////////////// indices ///////////////////////////// + int const n_fits_per_block = blockDim.x / n_points; + int const fit_in_block = threadIdx.x / n_points; + int const point_index = threadIdx.x - (fit_in_block*n_points); + int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block; + + ///////////////////////////// values ////////////////////////////// + float* current_value = &values[fit_index*n_points]; + float const * current_parameters = &parameters[fit_index*n_parameters]; + + current_value[point_index] = ... ; // formula calculating fit model values + + /////////////////////////// derivatives /////////////////////////// + float * current_derivative = &derivatives[fit_index * n_points*n_parameters]; + + current_derivative[0 * n_points + point_index] = ... ; // formula calculating derivative values with respect to parameters[0] + current_derivative[1 * n_points + point_index] = ... ; // formula calculating derivative values with respect to parameters[1] + . + . + . + } + +This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates model +function values and partial derivative values of the model function for a particular set of parameters. See for example linear_1d.cuh_. + +3. Include the newly created .cuh file in cuda_kernels.cu_ +4. Add an if branch in the CUDA global function ``cuda_calc_curve()`` in file cuda_kernels.cu_ to allow calling the added model function + +.. code-block:: cpp + + if (model_id == GAUSS_1D) + calculate_gauss1d + (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size); + . + . + . + else if (model_id == ...) // model ID + ... 
// function name + (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size); + +Compare model_id with the model ID defined for the new model and call the function which calculates the values of your model. + +5. Add a switch case in function set_number_of_parameters in file interface.cpp_ + +.. code-block:: cpp + + switch (model_id) + { + case GAUSS_1D: + n_parameters_ = 4; + break; + . + . + . + case ... : // model ID + n_parameters_ = ... ; // number of model parameters + break; + default: + break; + } + +Add a new fit estimator +------------------------ + +To extend |GF| by additional estimators, three CUDA device functions must be defined and integrated. The sections requiring modification are +the functions which calculate the estimator function values, and its gradient and hessian values. Also, a new estimator ID must be defined. +Detailed step by step instructions for adding an additional estimator are given below. + +1. Define an additional estimator ID in gpufit.h_ +2. Implement three functions within a newly created .cuh file calculating :math:`\chi^2` values and + its gradient and hessian according to the following template. + +.. code-block:: cuda + + ///////////////////////////// Chi-square ///////////////////////////// + __device__ void ... ( // function name Chi-square + volatile float * chi_square, + int const point_index, + float const * data, + float const * value, + float const * weight, + int * state, + char * user_info, + std::size_t const user_info_size) + { + chi_square[point_index] = ... ; // formula calculating Chi-square summands + } + + ////////////////////////////// gradient ////////////////////////////// + __device__ void ... 
( // function name gradient + volatile float * gradient, + int const point_index, + int const parameter_index, + float const * data, + float const * value, + float const * derivative, + float const * weight, + char * user_info, + std::size_t const user_info_size) + { + gradient[point_index] = ... ; // formula calculating summands of the gradient of Chi-square + } + + ////////////////////////////// hessian /////////////////////////////// + __device__ void ... ( // function name hessian + double * hessian, + int const point_index, + int const parameter_index_i, + int const parameter_index_j, + float const * data, + float const * value, + float const * derivative, + float const * weight, + char * user_info, + std::size_t const user_info_size) + { + *hessian += ... ; // formula calculating summands of the hessian of Chi-square + } + +This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates the estimator +and the hessian values of the estimator given. For a concrete example, see lse.cuh_. + +3. Include the newly created .cuh file in cuda_kernels.cu_ + +.. code-block:: cpp + + #include "....cuh" // filename + +4. Add an if branch in 3 CUDA global functions in the file cuda_kernels.cu_ + + .. code-block:: cuda + + __global__ void cuda_calculate_chi_squares( + . + . + . + if (estimator_id == LSE) + { + calculate_chi_square_lse( + shared_chi_square, + point_index, + current_data, + current_value, + current_weight, + current_state, + user_info, + user_info_size); + } + . + . + . + else if (estimator_id == ...) // estimator ID + { + ...( // function name Chi-square + shared_chi_square, + point_index, + current_data, + current_value, + current_weight, + current_state, + user_info, + user_info_size); + } + . + . + . + + + .. code-block:: cuda + + __global__ void cuda_calculate_gradients( + . + . + . 
+ if (estimator_id == LSE) + { + calculate_gradient_lse( + shared_gradient, + point_index, + derivative_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + . + . + . + else if (estimator_id == ...) // estimator ID + { + ...( // function name gradient + shared_gradient, + point_index, + derivative_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + . + . + . + + .. code-block:: cuda + + __global__ void cuda_calculate_hessians( + . + . + . + if (estimator_id == LSE) + { + calculate_hessian_lse( + &sum, + point_index, + derivative_index_i + point_index, + derivative_index_j + point_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + . + . + . + else if (estimator_id == ...) // estimator ID + { + ...( // function name hessian + &sum, + point_index, + derivative_index_i + point_index, + derivative_index_j + point_index, + current_data, + current_value, + current_derivative, + current_weight, + user_info, + user_info_size); + } + . + . + . + +Future releases +--------------- + +A disadvantage of the Gpufit library, when compared with established CPU-based curve fitting packages, +is that in order to add or modify a fit model function or a fit estimator, the library must be recompiled. +We anticipate that this limitation can be overcome in future releases of the library, by employing +run-time compilation of the CUDA code. diff --git a/docs/epilog.txt b/docs/epilog.txt new file mode 100644 index 0000000..ee243c1 --- /dev/null +++ b/docs/epilog.txt @@ -0,0 +1,48 @@ + +.. + The content of this file will be appended to every documentation file. Put common substitutions and links here. + +.. |GF| replace:: the Gpufit library +.. |GF_version| replace:: 1.0.0 + +.. _CUDA: http://developer.nvidia.com/cuda-zone +.. 
_CUDA_SELECT_NVCC_ARCH_FLAGS: http://cmake.org/cmake/help/v3.7/module/FindCUDA.html + +.. _CMake: http://www.cmake.org +.. _Boost: http://www.boost.org +.. _MATLAB: http://www.mathworks.com/products/matlab.html +.. _Python: http://www.python.org + +.. _`Gpufit on Github`: https://github.com/gpufit/Gpufit +.. _`Gpufit release location`: https://github.com/gpufit/Gpufit/releases +.. _Gpufit-master.zip: https://github.com/gpufit/Gpufit/archive/master.zip + +.. _gpufit.h: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gpufit.h +.. _interface.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/interface.cpp + +.. _gauss_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_1d.cuh +.. _gauss_2d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d.cuh +.. _gauss_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_elliptic.cuh +.. _gauss_2d_rotated.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_rotated.cuh +.. _cauchy_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cauchy2delliptic.cuh +.. _linear_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/linear_1d.cuh +.. _lse.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/lse.cuh +.. _mle.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/mle.cuh +.. _cuda_kernels.cu: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cuda_kernels.cu + +.. _Tests: https://github.com/gpufit/Gpufit/tree/master/Gpufit/tests +.. _Examples: https://github.com/gpufit/Gpufit/tree/master/Gpufit/examples +.. _Simple_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Simple_Example.cpp +.. _Gauss_Fit_2D_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Gauss_Fit_2D_Example.cpp +.. _Linear_Regression_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Linear_Regression_Example.cpp + +.. 
_GpufitMex.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/GpufitMex.cpp +.. _gpufit.m: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/gpufit.m + +.. _`Matlab simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/simple.m +.. _`Matlab Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d.m +.. _`Matlab Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d_plot.m + +.. _`Python simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/simple.py +.. _`Python Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d.py +.. _`Python Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d_plot.py \ No newline at end of file diff --git a/docs/examples.rst b/docs/examples.rst new file mode 100644 index 0000000..da54114 --- /dev/null +++ b/docs/examples.rst @@ -0,0 +1,394 @@ +======== +Examples +======== + +C++ Examples_ are part of the library code base and can be built and run through the project environment. Here they are +described and important steps are highlighted. + +Please note, that additionally, the C++ Tests_ contained in the code base also demonstrate the usage of |GF|. However, a +detailed description of the tests is not provided. + +.. _c-example-simple: + +Simple skeleton example +----------------------- + +This example shows the minimal code providing all required parameters and the call to the C interface. It is contained +in Simple_Example.cpp_ and can be built and executed within the project environment. Please note that this code does +not do anything other than call gpufit().
+ +In the first section of the code, the model ID is set, space for initial parameters and data values is reserved (in a normal +application, however, the data array would already exist), the fit tolerance is set, the maximal number of iterations is set, +the estimator ID is set, and the parameters to fit array is initialized to indicate that all parameters should be fit. + +.. code-block:: cpp + + // number of fits, number of points per fit + size_t const number_fits = 10; + size_t const number_points = 10; + + // model ID and number of parameters + int const model_id = GAUSS_1D; + size_t const number_parameters = 4; + + // initial parameters + std::vector< float > initial_parameters(number_fits * number_parameters); + + // data + std::vector< float > data(number_points * number_fits); + + // tolerance + float const tolerance = 0.001f; + + // maximal number of iterations + int const max_number_iterations = 10; + + // estimator ID + int const estimator_id = LSE; + + // parameters to fit (all of them) + std::vector< int > parameters_to_fit(number_parameters, 1); + +In a next step, sufficient memory is reserved for all four output parameters. + +.. code-block:: cpp + + // output parameters + std::vector< float > output_parameters(number_fits * number_parameters); + std::vector< int > output_states(number_fits); + std::vector< float > output_chi_square(number_fits); + std::vector< int > output_number_iterations(number_fits); + +Finally, there is a call to the C interface of Gpufit (in this example, the optional +inputs *weights* and *user info* are not used) and a check of the return status. +If an error occurred, the last error message is obtained and an exception is thrown. + +.. 
code-block:: cpp + + // call to gpufit (C interface) + int const status = gpufit + ( + number_fits, + number_points, + data.data(), + 0, + model_id, + initial_parameters.data(), + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + 0, + 0, + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + + // check status + if (status != STATUS_OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } + +This simple example can easily be adapted to real applications by: + +- choosing your own model ID +- choosing your own estimator ID +- choosing your own fit tolerance and maximal number of iterations +- filling the data structure with the data values to be fitted +- filling the initial parameters structure with suitable estimates of the true parameters +- processing the output data + +The following two examples show how |GF| can be used to fit real data. + +.. _c-example-2d-gaussian: + +Fit 2D Gaussian functions example +--------------------------------- + +This example features: + +- Multiple fits using a 2D Gaussian function +- Noisy data and random initial guesses for the fit parameters +- A Poisson noise adapted maximum likelihood estimator + +It is contained in Gauss_Fit_2D_Example.cpp_ and can be built and executed within the project environment. The optional +inputs to gpufit(), *weights* and *user info*, are not used. + +In this example, a 2D Gaussian curve is fit to 10\ :sup:`4` noisy data sets having a size of 20 x 20 points each. +The model function and the model parameters are described in :ref:`gauss-2d`. + +In this example the true parameters used to generate the Gaussian data are set to + +.. 
code-block:: cpp + + // true parameters + std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f}; // amplitude, center x/y positions, width, offset + +which defines a 2D Gaussian peak centered at the middle of the grid (position 9.5, 9.5), with a width (standard deviation) of 3.0, an amplitude of 10 +and a background of 10. + +The guesses for the initial parameters are drawn from the true parameters with a uniformly distributed deviation +of about 20%. The initial guesses for the center coordinates are chosen with a deviation relative to the width of the Gaussian. + +.. code-block:: cpp + + // initial parameters (randomized) + std::vector< float > initial_parameters(number_fits * number_parameters); + for (size_t i = 0; i < number_fits; i++) + { + for (size_t j = 0; j < number_parameters; j++) + { + if (j == 1 || j == 2) + { + initial_parameters[i * number_parameters + j] = true_parameters[j] + true_parameters[3] * (-0.2f + 0.4f * uniform_dist(rng)); + } + else + { + initial_parameters[i * number_parameters + j] = true_parameters[j] * (0.8f + 0.4f*uniform_dist(rng)); + } + } + } + +The 2D grid of x and y values (each ranging from 0 to 19 with an increment of 1) is computed with a double for loop. + +.. code-block:: cpp + + // generate x and y values + std::vector< float > x(number_points); + std::vector< float > y(number_points); + for (size_t i = 0; i < size_x; i++) + { + for (size_t j = 0; j < size_x; j++) { + x[i * size_x + j] = static_cast<float>(j); + y[i * size_x + j] = static_cast<float>(i); + } + } + +Then a 2D Gaussian peak model function (without noise) is calculated once for the true parameters + +.. 
code-block:: cpp + + void generate_gauss_2d(std::vector<float> &x, std::vector<float> &y, std::vector<float> &g, std::vector<float>::iterator &p) + { + // generates a Gaussian 2D peak function on a set of x and y values with some parameters p (size 5) + // we assume that x.size == y.size == g.size, no checks done + + // given x and y values and parameters p computes a model function g + for (size_t i = 0; i < x.size(); i++) + { + float arg = -((x[i] - p[1]) * (x[i] - p[1]) + (y[i] - p[2]) * (y[i] - p[2])) / (2 * p[3] * p[3]); + g[i] = p[0] * exp(arg) + p[4]; + } + } + +Stored in variable temp, it is then used in every fit to generate Poisson distributed random numbers. + +.. code-block:: cpp + + // generate data with noise + std::vector< float > temp(number_points); + // compute the model function + generate_gauss_2d(x, y, temp, true_parameters.begin()); + + std::vector< float > data(number_fits * number_points); + for (size_t i = 0; i < number_fits; i++) + { + // generate Poisson random numbers + for (size_t j = 0; j < number_points; j++) + { + std::poisson_distribution< int > poisson_dist(temp[j]); + data[i * number_points + j] = static_cast<float>(poisson_dist(rng)); + } + } + +Thus, in this example the data for each fit differ only in the random noise. This, and the +randomized initial guesses for each fit, result in each fit returning slightly different best-fit parameters. + +We set the model and estimator IDs for the fit accordingly. + +.. code-block:: cpp + + // estimator ID + int const estimator_id = MLE; + + // model ID + int const model_id = GAUSS_2D; + +And call the gpufit :ref:`c-interface`. Parameters weights, user_info and user_info_size are set to 0, indicating that they +won't be used during the fits. + +.. 
code-block:: cpp + + // call to gpufit (C interface) + int const status = gpufit + ( + number_fits, + number_points, + data.data(), + 0, + model_id, + initial_parameters.data(), + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + 0, + 0, + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + + // check status + if (status != STATUS_OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } + +After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics +about the fits are displayed. + +Output statistics ++++++++++++++++++ + +A histogram of all possible fit states (see :ref:`api-output-parameters`) is obtained by iterating over the state of each fit. + +.. code-block:: cpp + + // get fit states + std::vector< int > output_states_histogram(5, 0); + for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it) + { + output_states_histogram[*it]++; + } + +In the computation of the mean and standard deviation only converged fits are taken into account. Here is an example of computing +the means of the output parameters iterating over all fits and all parameters. + +.. code-block:: cpp + + // compute mean of fitted parameters for converged fits + std::vector< float > output_parameters_mean(number_parameters, 0); + for (size_t i = 0; i != number_fits; i++) + { + if (output_states[i] == STATE_CONVERGED) + { + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_mean[j] += output_parameters[i * number_parameters + j]; + } + } + } + // normalize + for (size_t j = 0; j < number_parameters; j++) + { + output_parameters_mean[j] /= output_states_histogram[0]; + } + +.. 
_linear-regression-example: + +Linear Regression Example +------------------------- + +This example features: + +- Multiple fits of a 1D Linear curve +- Noisy data and random initial guesses for the parameters +- Unequally spaced x position values given as custom user info + +It is contained in Linear_Regression_Example.cpp_ and can be built and executed within the project environment. + +In this example, a straight line is fitted to 10\ :sup:`4` noisy data sets. Each data set includes 20 data points. +Locations of data points are spaced non-linearly (exponentially). The user information contains the x positions of the data +sets. The fits are unweighted and the model function and the model parameters are described in :ref:`linear-1d`. + +The custom x positions of the linear model are stored in the user_info. + +.. code-block:: cpp + + // custom x positions for the data points of every fit, stored in user info + std::vector< float > user_info(number_points); + for (size_t i = 0; i < number_points; i++) + { + user_info[i] = static_cast<float>(pow(2, i)); + } + + // size of user info in bytes + size_t const user_info_size = number_points * sizeof(float); + +Because only number_points values are specified, this means that the same custom x position values are used for every fit. + +The initial parameters for every fit are set to random values uniformly distributed around the true parameter value. + +.. 
code-block:: cpp + + // true parameters + std::vector< float > true_parameters { 5, 2 }; // offset, slope + + // initial parameters (randomized) + std::vector< float > initial_parameters(number_fits * number_parameters); + for (size_t i = 0; i != number_fits; i++) + { + // random offset + initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng)); + // random slope + initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng)); + } + +The data is generated as the value of a linear function and some additive normally distributed noise term. + +.. code-block:: cpp + + // generate data + std::vector< float > data(number_points * number_fits); + for (size_t i = 0; i != data.size(); i++) + { + size_t j = i / number_points; // the fit + size_t k = i % number_points; // the position within a fit + + float x = user_info[k]; + float y = true_parameters[0] + x * true_parameters[1]; + data[i] = y + normal_dist(rng); + } + +We set the model and estimator IDs for the fit accordingly. + +.. code-block:: cpp + + // estimator ID + int const estimator_id = LSE; + + // model ID + int const model_id = LINEAR_1D; + +And call the gpufit :ref:`c-interface`. The weights parameter is set to 0, indicating that weights won't be used during the fits. + +.. code-block:: cpp + + // call to gpufit (C interface) + int const status = gpufit + ( + number_fits, + number_points, + data.data(), + 0, + model_id, + initial_parameters.data(), + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + user_info_size, + reinterpret_cast< char * >( user_info.data() ), + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + +After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics +about the fits are displayed (see `Output statistics`_).
diff --git a/docs/fit_estimator_functions.rst b/docs/fit_estimator_functions.rst new file mode 100644 index 0000000..fcee030 --- /dev/null +++ b/docs/fit_estimator_functions.rst @@ -0,0 +1,54 @@ +.. _estimator-functions: + +Estimator functions +------------------- + +.. _estimator-lse: + +Least squares estimator ++++++++++++++++++++++++ + +The least squares estimator computes the weighted sum of the squared deviation between the data values and the model at +the positions of the data points. The ID for this estimator is ``LSE``. It's implemented in lse.cuh_. + +Least squares estimation is a common method, and the standard Levenberg-Marquardt algorithm described by Marquardt makes +use of minimal least squares. The estimator is described as follows. + +.. math:: + + {\chi^2}(\vec{p}) = \sum_{n=0}^{N-1}{ \left(f_{n}(\vec{p})-z_{n}\right)^2\cdot w_n } + +:`n`: The index of the data points (:math:`0,..,N-1`) + +:`f_n`: The model function values at data position :math:`n` + +:`z_n`: Data values at data position :math:`n` + +:`\vec{p}`: Fit model function parameters + +:`w_n`: Weight values for data at position :math:`n` + + +.. _estimator-mle: + +Maximum likelihood estimator for data subject to Poisson statistics ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The maximum likelihood estimator (MLE) for Poisson distributed noise is relatively simple to implement. In the case of data with Poisson noise, +it provides a more precise estimate when compared to an LSE estimator. The ID for this estimator is ``MLE``. It's implemented in mle.cuh_. + +The estimator is described as follows. + +.. 
math:: + + {\chi^2}(\vec{p}) = 2\sum_{n=0}^{N-1}{(f_{n}(\vec{p})-z_{n})}-2\sum_{n=0,z_n\neq0}^{N-1}{z_n \ln \left(\frac{f_{n}(\vec{p})}{z_n}\right)} + +:`n`: The index of the data points (:math:`0,..,N-1`) + +:`f_n`: The model function values at data position :math:`n` + +:`z_n`: Data values at data position :math:`n` + +:`\vec{p}`: Actual model function parameters + +Note that this estimator does not provide any means to weight the data values. Rather, noise in the data is assumed to be purely Poissonian. \ No newline at end of file diff --git a/docs/fit_model_functions.rst b/docs/fit_model_functions.rst new file mode 100644 index 0000000..620c821 --- /dev/null +++ b/docs/fit_model_functions.rst @@ -0,0 +1,193 @@ +.. _fit-model-functions: + +Fit Model functions +------------------- + +This section describes the fit model functions which are included with the Gpufit library. The headings are the names +of the ModelID parameter used in the gpufit()_ call. They are defined in gpufit.h_. + +Note that additional model functions may be added as described in the documentation, see :ref:`gpufit-customization`. + +.. _linear-1d: + +Linear regression ++++++++++++++++++ + +A 1D linear function defined by two parameters (offset and slope). The user information data may be used to specify the +X coordinate of each data point. The model ID of this function is ``LINEAR_1D``, and it is implemented in linear_1d.cuh_. + +.. math:: + + g(x,\vec{p})=p_0+p_1 x + +:`x`: (independent variable) *X* coordinate + + The X coordinate values may be specified in the user information data. + For details on how to do this, see the linear regression code example, :ref:`linear-regression-example`. + + If no independent variables are provided, the *X* coordinate of the first data value is assumed to be (0.0). + In this case, for a fit size of *M* data points, the *X* coordinates of the data are simply the corresponding array + indices of the data array, starting from zero (i.e. 
:math:`0, 1, 2, ...`). + +:`p_0`: offset + +:`p_1`: slope + + +.. _gauss-1d: + +1D Gaussian function +++++++++++++++++++++ + +A 1D Gaussian function defined by four parameters. Its model ID is ``GAUSS_1D`` and it is implemented in gauss_1d.cuh_. +Here, p is the vector of parameters (p0..p3) and the model function g exists for each x coordinate of the input data. + +.. math:: + + g(x,\vec{p})=p_0 e^{-\left(x-p_1\right)^2/\left(2p_2^2\right)}+p_3 + +:`x`: (independent variable) *X* coordinate + + No independent variables are passed to this model function. + Hence, the *X* coordinate of the first data value is assumed to be (0.0). For a fit size of *M* data points, + the *X* coordinates of the data are simply the corresponding array indices of the data array, starting from + zero (i.e. :math:`0, 1, 2, ...`). + +:`p_0`: amplitude + +:`p_1`: center coordinate + +:`p_2`: width (standard deviation) + +:`p_3`: offset + + +.. _gauss-2d: + +2D Gaussian function (cylindrical symmetry) ++++++++++++++++++++++++++++++++++++++++++++ + +A 2D Gaussian function defined by five parameters. Its model ID is ``GAUSS_2D`` and it is implemented in gauss_2d.cuh_. +Here, p is the vector of parameters (p0..p4) and the model function g exists for each x,y coordinate of the input data. + +.. math:: + + g(x,y,p)=p_0 e^{-\left(\left(x-p_1\right)^2+\left(y-p_2\right)^2\right)/\left(2p_3^2\right)}+p_4 + +:`x,y`: (independent variables) *X,Y* coordinates + + No independent variables are passed to this model function. + Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`). + For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding 2D array + indices of the data array, starting from zero. + +:`p_0`: amplitude + +:`p_1`: center coordinate x + +:`p_2`: center coordinate y + +:`p_3`: width (standard deviation; equal width in x and y dimensions) + +:`p_4`: offset + + +.. 
_gauss-2d-elliptic: + +2D Gaussian function (elliptical) ++++++++++++++++++++++++++++++++++ + +A 2D elliptical Gaussian function defined by six parameters. Its model ID is ``GAUSS_2D_ELLIPTIC`` and it is implemented +in gauss_2d_elliptic.cuh_. Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y coordinate of the input data. + +.. math:: + + g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left(x-p_1\right)^2}{p_3^2}+\frac{\left(y-p_2\right)^2}{p_4^2}\right)}+p_5 + +:`x,y`: (independent variables) *X,Y* coordinates + + No independent variables are passed to this model function. + Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`). + For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding + 2D array indices of the data array, starting from zero. + +:`p_0`: amplitude + +:`p_1`: center coordinate x + +:`p_2`: center coordinate y + +:`p_3`: width x (standard deviation) + +:`p_4`: width y (standard deviation) + +:`p_5`: offset + + +.. _gauss-2d-rotated: + +2D Gaussian function (elliptical, rotated) +++++++++++++++++++++++++++++++++++++++++++ + +A 2D elliptical Gaussian function whose principal axis may be rotated with respect to the X and Y coordinate axes, +defined by seven parameters. Its model ID is ``GAUSS_2D_ROTATED`` and it is implemented in gauss_2d_rotated.cuh_. +Here, p is the vector of parameters (p0..p6) and the model function g exists for each x,y coordinate of the input data. + +.. math:: + + g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left((x-p_1)\cos{p_6}-(y-p_2)\sin{p_6}\right)^2}{p_3^2}+\frac{\left((x-p_1)\sin{p_6}+(y-p_2)\cos{p_6}\right)^2}{p_4^2}\right)}+p_5 + +:`x,y`: (independent variables) *X,Y* coordinates + + No independent variables are passed to this model function. + Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`). 
+ For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding + 2D array indices of the data array, starting from zero. + +:`p_0`: amplitude + +:`p_1`: center coordinate x + +:`p_2`: center coordinate y + +:`p_3`: width x (standard deviation) + +:`p_4`: width y (standard deviation) + +:`p_5`: offset + +:`p_6`: rotation angle [radians] + + +.. _cauchy-2d-elliptic: + +2D Cauchy function (elliptical) ++++++++++++++++++++++++++++++++ + +A 2D elliptical Cauchy function defined by six parameters. Its model ID is ``CAUCHY_2D_ELLIPTIC`` and it is implemented +in cauchy_2d_elliptic.cuh_. Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y +coordinate of the input data. + +.. math:: + + g(x,y,\vec{p})=p_0 \frac{1}{\left(\frac{x-p_1}{p_3}\right)^2+1} \frac{1}{\left(\frac{y-p_2}{p_4}\right)^2+1} + p_5 + +:`x,y`: (independent variables) *X,Y* coordinates + + No independent variables are passed to this model function. + Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`). + For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding + 2D array indices of the data array, starting from zero. + +:`p_0`: amplitude + +:`p_1`: center coordinate x + +:`p_2`: center coordinate y + +:`p_3`: width x (standard deviation) + +:`p_4`: width y (standard deviation) + +:`p_5`: offset + diff --git a/docs/gpufit_api.rst b/docs/gpufit_api.rst new file mode 100644 index 0000000..ce6695d --- /dev/null +++ b/docs/gpufit_api.rst @@ -0,0 +1,377 @@ +.. _api-description: + +====================== +Gpufit API description +====================== + +The Gpufit source code compiles to a dynamic-link library (DLL), providing a C interface. +In the sections below, the C interface and its arguments are described in detail. + +.. _c-interface: + +C Interface +----------- + +The C interface is defined in the Gpufit header file: gpufit.h_. 
+ +gpufit() +++++++++ + +This is the main fit function. A single call to the *gpufit()* function executes a block of *N* fits. +The inputs to *gpufit()* are scalars and pointers to arrays, and the outputs are also array pointers. + +The inputs to the *gpufit()* function are: + +- the number of fits (*N*), +- the number of data points per fit (each fit has equal size), +- the fit data, +- an array of weight values that are used to weight the individual data points in the fit (optional), +- an ID number which specifies the fit model function, +- an array of initial parameters for the model functions, +- a tolerance value which determines when the fit has converged, +- the maximum number of iterations per fit, +- an array of flags which allow one or more fit parameters to be held constant, +- an ID number which specifies the fit estimator (e.g. least squares, etc.), +- the size of the user info data, +- the user info data, which may have multiple uses, for example to pass additional parameters to the fit functions, + or to include independent variables (e.g. X values) with the fit data. + +The outputs of *gpufit()* are: + +- the best fit model parameters for each fit, +- an array of flags indicating, for example, whether each fit converged, +- the final value of :math:`\chi^2` for each fit, +- the number of iterations needed for each fit to converge. + +The *gpufit()* function call is defined below. + +.. code-block:: cpp + + int gpufit + ( + size_t n_fits, + size_t n_points, + float * data, + float * weights, + int model_id, + float * initial_parameters, + float tolerance, + int max_n_iterations, + int * parameters_to_fit, + int estimator_id, + size_t user_info_size, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations + ) ; + +.. _api-input-parameters: + +Description of input parameters +............................... 
+ +:n_fits: Number of fits to be performed + + :type: size_t + +:n_points: Number of data points per fit + + Gpufit is designed such that each fit must have the same number of data points per fit. + + :type: size_t + +:data: Pointer to data values + + A pointer to the data values. The data must be passed in as a 1D array of floating point values, with the data + for each fit concatenated one after another. In the case of multi-dimensional data, the data must be flattened + to a 1D array. The number of elements in the array is equal to the product n_fits * n_points. + + :type: float * + :length: n_points * n_fits + +:weights: Pointer to weights + + The weights array includes unique weighting values for each fit. It is used only by the least squares estimator (LSE). + The size of the weights array and its organization is identical to that for the data array. + For statistical weighting, this parameter should be set equal to the inverse of the variance of the data + (i.e. weights = 1.0 / variance ). The weights array is an optional input. + + :type: float * + :length: n_points * n_fits + :special: Use a NULL pointer to indicate that no weights are provided. In this case all data values will be weighted equally. + +:model_id: Model ID + + Determines the model which is used for all fits in this call. See :ref:`fit-model-functions` for more details. + + As defined in gpufit.h_: + + :0: GAUSS_1D + :1: GAUSS_2D + :2: GAUSS_2D_ELLIPTIC + :3: GAUSS_2D_ROTATED + :4: CAUCHY_2D_ELLIPTIC + :5: LINEAR_1D + + :type: int + +:initial_parameters: Pointer to initial parameter values + + A 1D array containing the initial model parameter values for each fit. If the number of parameters of the fit model + is defined by *n_parameters*, then the size of this array is *n_fits * n_parameters*. + + The parameter values for each fit are concatenated one after another. 
If there are *M* parameters per fit, + the parameters array is organized as follows: [(parameter 1), (parameter 2), ..., (parameter M), (parameter 1), + (parameter 2), ..., (parameter M), ...]. + + :type: float * + :length: n_fits * n_parameters + +:tolerance: Fit tolerance threshold + + The fit tolerance determines when the fit has converged. After each fit iteration, the change in the absolute value + of :math:`\chi^2` is calculated. The fit has converged when one of two conditions are met. First, if the change + in the absolute value of :math:`\chi^2` is less than the tolerance value, the fit has converged. + Alternatively, if the change in :math:`\chi^2` is less than the product of tolerance and the absolute value of + :math:`\chi^2` [tolerance * abs(:math:`\chi^2`)], then the fit has converged. + + Setting a lower value for the tolerance results in more precise values for the fit parameters, but requires more fit + iterations to reach convergence. + + A typical value for the tolerance settings is between 1.0E-3 and 1.0E-6. + + :type: float + +:max_n_iterations: Maximum number of iterations + + The maximum number of fit iterations permitted. If the fit has not converged after this number of iterations, + the fit returns with a status value indicating that the maximum number of iterations was reached. + + :type: int + +:parameters_to_fit: Pointer to array indicating which model parameters should be held constant during the fit + + This is an array of ones or zeros, with a length equal to the number of parameters of the fit model function. + Each entry in the array is a flag which determines whether or not the corresponding model parameter will be held + constant during the fit. To allow a parameter to vary during the fit, set the entry in *parameters_to_fit* equal + to one. To hold the value constant, set the entry to zero. + + An array of ones, e.g. [1,1,1,1,1,...] will allow all parameters to vary during the fit. 
+ :type: int * + :length: n_parameters + +:estimator_id: Estimator ID + + Determines the fit estimator which is used. See :ref:`estimator-functions` for more details. + + As defined in gpufit.h_: + + :0: LSE + :1: MLE + + :type: int + +:user_info_size: Size of user information data + + Size of the user information data array, in bytes. + + :type: size_t + +:user_info: Pointer to user information data + + This parameter is intended to provide flexibility to the Gpufit interface. The user information data is a generic + block of memory which is passed in to the *gpufit()* function, and which is accessible in shared GPU memory by the + fit model functions. Possible uses for the user information data are to pass in values for independent variables + (e.g. X values) or to supply additional data to the fit model function. For a coded example which makes use of + the user information data, see :ref:`linear-regression-example`. The user information data is an optional parameter + - if no user information is required this parameter may be set to NULL. + + :type: char * + :length: user_info_size + :special: Use a NULL pointer to indicate that no user information is available. + +.. _api-output-parameters: + +Description of output parameters +................................ + +:output_parameters: Pointer to array of best-fit model parameters + + For each fit, this array contains the best-fit model parameters. The array is organized identically to the input + parameters array. + + :type: float * + :length: n_fits * n_parameters + +:output_states: Pointer to array of fit result state IDs + + For each fit the result of the fit is indicated by a state ID. The state ID codes are defined below. + A state ID of 0 indicates that the fit converged successfully. 
+ + As defined in gpufit.h_: + + :0: The fit converged, tolerance is satisfied, the maximum number of iterations is not exceeded + :1: Maximum number of iterations exceeded + :2: During the Gauss-Jordan elimination the Hessian matrix is indicated as singular + :3: Non-positive curve values have been detected while using MLE (MLE requires only positive curve values) + :4: State not read from GPU Memory + + :type: int * + :length: n_fits + +:output_chi_squares: Pointer to array of :math:`\chi^2` values + + For each fit, this array contains the final :math:`\chi^2` value. + + :type: float * + :length: n_fits + +:output_n_iterations: Pointer to array of iteration counts + + For each fit, this array contains the number of fit iterations which were performed. + + :type: int * + :length: n_fits + +:return value: Status code + + The return value of the function call indicates whether an error occurred. + + :0: No error + :-1: Error + +gpufit_portable_interface() ++++++++++++++++++++++++++++ + +This function is a simple wrapper around the *gpufit()* function, providing an alternative means of passing the function parameters. + +.. code-block:: cpp + + int gpufit_portable_interface(int argc, void *argv[]); + +Description of parameters +......................... + +:argc: The length of the argv pointer array + +:argv: Array of pointers to *gpufit* parameters, as defined above. For reference, the type of each element of the *argv* array is listed below. 
+ + :argv[0]: Number of fits + + :type: size_t * + + :argv[1]: Number of points per fit + + :type: size_t * + + :argv[2]: Fit data + + :type: float * + + :argv[3]: Fit weights + + :type: float * + + :argv[4]: Fit model ID + + :type: int * + + :argv[5]: Initial parameters + + :type: float * + + :argv[6]: Fit tolerance + + :type: float * + + :argv[7]: Maximum number of iterations + + :type: int * + + :argv[8]: Parameters to fit + + :type: int * + + :argv[9]: Fit estimator ID + + :type: int * + + :argv[10]: User info size + + :type: size_t * + + :argv[11]: User info data + + :type: char * + + :argv[12]: Output parameters + + :type: float * + + :argv[13]: Output states + + :type: int * + + :argv[14]: Output :math:`\chi^2` values + + :type: float * + + :argv[15]: Output number of iterations + + :type: int * + + +:return value: This function simply returns the *gpufit()* return status code. + +gpufit_get_last_error() ++++++++++++++++++++++++ + +A function that returns a string representation of the last error. + +.. code-block:: cpp + + char const * gpufit_get_last_error(); + +:return value: Error message corresponding to the most recent error, or an empty string if no error occurred. + + 'CUDA driver version is insufficient for CUDA runtime version' + The graphics driver version installed on the computer is not supported by the CUDA Toolkit version which was used + to build Gpufit.dll. Update the graphics driver or re-build Gpufit using a compatible CUDA Toolkit version. + +gpufit_cuda_available() ++++++++++++++++++++++++ + +A function that calls a simple CUDA function to check if CUDA is available. + +.. code-block:: cpp + + int gpufit_cuda_available(); + +:return value: Returns 0 if CUDA is not available (no suitable device found, or driver version insufficient). + Use the function *gpufit_get_last_error()* to check the error message. Returns 1 if CUDA is available and CUDA runtime version and driver version are compatible. 
+ +gpufit_get_cuda_version() ++++++++++++++++++++++++++ + +A function that returns the CUDA runtime version in *runtime_version* and the +installed CUDA driver version in *driver_version*. + +.. code-block:: cpp + + int gpufit_get_cuda_version(int * runtime_version, int * driver_version); + +:runtime_version: Pointer to the CUDA runtime version number (is 0 if the CUDA runtime version is incompatible with the installed CUDA driver version) + + +:driver_version: Pointer to the CUDA driver version number (is 0 if no CUDA-enabled graphics card was detected) + +:return value: Returns 0 if an error occurred during collecting of the version information. Use the function + *gpufit_get_last_error()* to check the error message. Returns 1 if collecting of the version + information was successful. + + + + diff --git a/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png new file mode 100644 index 0000000..8617237 Binary files /dev/null and b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png differ diff --git a/docs/images/GPUfit_PassmarkG3D_relative_performance.png b/docs/images/GPUfit_PassmarkG3D_relative_performance.png new file mode 100644 index 0000000..8f2e17e Binary files /dev/null and b/docs/images/GPUfit_PassmarkG3D_relative_performance.png differ diff --git a/docs/images/algorithm_gpufit_flowchart.png b/docs/images/algorithm_gpufit_flowchart.png new file mode 100644 index 0000000..b95d7cb Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.png differ diff --git a/docs/images/algorithm_gpufit_flowchart.vsdx b/docs/images/algorithm_gpufit_flowchart.vsdx new file mode 100644 index 0000000..1b6bddb Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.vsdx differ diff --git a/docs/images/gpufit_program_flow_skeleton_v2.png b/docs/images/gpufit_program_flow_skeleton_v2.png new file mode 100644 index 0000000..d454681 Binary files /dev/null and 
b/docs/images/gpufit_program_flow_skeleton_v2.png differ diff --git a/docs/images/gpufit_program_flow_v2.png b/docs/images/gpufit_program_flow_v2.png new file mode 100644 index 0000000..8ead94a Binary files /dev/null and b/docs/images/gpufit_program_flow_v2.png differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..6f89dc0 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,22 @@ +.. Gpufit documentation master file + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Gpufit Documentation +==================== + +.. toctree:: + :maxdepth: 3 + + introduction + installation + gpufit_api + fit_model_functions + fit_estimator_functions + examples + customization + bindings + appendix + license + + diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..8af76ba --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,220 @@ +.. _installation-and-testing: + +======================== +Installation and Testing +======================== + +The Gpufit library can be used in several ways. When using a pre-compiled +binary version of Gpufit, the Gpufit functions may be accessed directly via +a dynamic linked library (e.g. Gpufit.dll) or via the external bindings to +Gpufit (e.g. the Matlab or Python bindings). For more information on the +Gpufit interface, see :ref:`api-description`, or for details of the external +bindings see :ref:`external-bindings`. + +This section describes how to compile Gpufit, including generating its +external bindings, from source code. Building from source is necessary when +a fit model function is added or changed, or if a new fit estimator is required. +Building the library may also be useful for compiling the code using a +specific version of the CUDA toolkit, or for a particular CUDA compute +capability. 
+ +Gpufit binary distribution +++++++++++++++++++++++++++ + +A binary distribution of the Gpufit library is available for **Windows**. +Use of this distribution requires only a CUDA-capable graphics card, and an +updated Nvidia graphics driver. The binary package contains: + +- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and + the Gpufit header file which contains the function definitions. The Gpufit + SDK is intended to be used when calling Gpufit from an external application + written in e.g. C code. +- The performance test application, which serves to test that Gpufit is + correctly installed, and to check the performance of the CPU and GPU hardware. +- Matlab 32 bit and 64 bit bindings, with Matlab examples. +- Python version 2.x and version 3.x bindings (compiled as wheel files) and + Python examples. +- This manual in PDF format. + +To re-build the binary distribution, see the instructions located in +package/README.md. + +Building from source code ++++++++++++++++++++++++++ + +This section describes how to build Gpufit from source code. Note that as of +the initial release of Gpufit, the source code has been tested only with the +Microsoft Visual Studio compiler. + +Prerequisites +------------- + +The following tools are required in order to build Gpufit from source. + +*Required* + +* CMake_ 3.7 or later +* A C/C++ Compiler + + * Linux: GCC 4.7 + * Windows: Visual Studio 2013 or 2015 + +* CUDA_ Toolkit 6.5 or later [#]_ + +.. [#] Note that it is recommended to use the newest available stable release of the CUDA Toolkit which is compatible + with the compiler (e.g. Visual Studio 2015 is required in order to use CUDA Toolkit 8.0). Some older graphics cards + may only be supported by CUDA Toolkit version 6.5 or earlier. Also, when using CUDA Toolkit version 6.5, please use + the version with support for GTX9xx GPUs, available `here `__. 
+ +*Optional* + +* Boost_ 1.58 or later (required if you want to build the tests) +* MATLAB_ if building the MATLAB bindings (minimum version Matlab 2012a) +* Python_ if building the Python bindings (Python version 2.x or 3.x) + +Source code availability +------------------------ + +The source code is available in an open repository hosted at Github, at the +following URL. + +.. code-block:: bash + + https://github.com/gpufit/Gpufit.git + +To obtain the code, Git may be used to clone the repository, or a current +snapshot may be downloaded directly from Github as Gpufit-master.zip_. + +Compiler configuration via CMake +-------------------------------- + +CMake is an open-source tool designed to build, test, and package software. +It is used to control the software compilation process using compiler +independent configuration files, and generate native makefiles and workspaces +that can be used in the compiler environment. In this section we provide a +simple example of how to use CMake in order to generate the input files for the +compiler (e.g. the Visual Studio solution file), which can then be used to +compile Gpufit. + +First, identify the directory which contains the Gpufit source code +(for example, on a Windows computer the Gpufit source code may be stored in +*C:\\Sources\\Gpufit*). Next, create a build directory outside the +source code source directory (e.g. *C:\\Sources\\Gpufit-build-64*). Finally, +run cmake to configure and generate the compiler input files. The following +commands, executed from the command prompt, assume that the cmake executable +(e.g. *C:\\Program Files\\CMake\\bin\\cmake.exe*) is automatically found +via the PATH environment variable (if not, the full path to cmake.exe must be +specified). This example also assumes that the source and build directories +have been set up as specified above. + +.. 
code-block:: bash + + cd C:\Sources\Gpufit-build-64 + cmake -G "Visual Studio 12 2013 Win64" C:\Sources\Gpufit + +Note that in this example the *-G* flag has been used to specify the +64-bit version of the Visual Studio 12 compiler. This flag should be changed +depending on the compiler used, and the desired architecture +(e.g. 32- or 64-bit). Further details of the CMake command line arguments +can be found `here `__. + +There is also a graphical user interface available for CMake, which simplifies +the configuration and generation steps. For further details, see +`Running CMake `_. + +Common issues encountered during CMake configuration +---------------------------------------------------- + +**Boost NOT found - skipping tests!** + +If you want to build the tests and Boost is not found automatically, set the +CMake variable BOOST_ROOT to the corresponding directory, and configure again. + +**Specify CUDA_ARCHITECTURES set** + +If you need a specific CUDA architecture, set CUDA_ARCHITECTURES according +to CUDA_SELECT_NVCC_ARCH_FLAGS_. + +**CMake finds lowest installed CUDA version by default** + +If there are multiple CUDA toolkits installed on the computer, CMake 3.7.1 +seems to find by default the lowest installed version. Set the desired CUDA +version manually (e.g. by editing the CUDA_TOOLKIT_ROOT_DIR variable in CMake). + +**Specify CUDA version to use** + +Set CUDA_BIN_PATH before running CMake or CUDA_TOOLKIT_ROOT_DIR after +first CMAKE configuration to the installation folder of the desired +CUDA version. + +**Required CUDA version** + +When using Microsoft Visual Studio 2015, the minimum required CUDA Toolkit +version is 8.0. + +**Python launcher** + +Set Python_WORKING_DIRECTORY to a valid directory, it will be added to the +Python path. + +**Matlab launcher** + +Set Matlab_WORKING_DIRECTORY to a valid directory, it will be added to +the Matlab path. 
+ +Compiling Gpufit on Windows +--------------------------- + +After configuring and generating the solution files using CMake, go to the +desired build directory and open Gpufit.sln using Visual Studio. Select the +"Debug" or "Release" build options, as appropriate. Select the build target +"ALL_BUILD", and build this target. If the build process completes +without errors, the Gpufit binary files will be created in the corresponding +"Debug" or "Release" folders in the build directory. + +The unit tests can be executed by building the target "RUN_TESTS" or by +starting the created executables in the output directory from +the command line. + +Linux +----- + +Gpufit has not yet been officially tested on a computer running a Linux variant +with a CUDA capable graphics card. However, satisfying the Prerequisites_ and +using CMake, we estimate that the library should build in principle and one +should also be able to run the examples on Linux. + +MacOS +----- + +Gpufit has not yet been officially tested on a computer running MacOS with a +CUDA capable graphics card. However, satisfying the Prerequisites_ and using +CMake, we estimate that the library should build in principle and one +should also be able to run the examples on MacOS. + +Running the performance test +++++++++++++++++++++++++++++ + +The Gpufit performance test is a program which verifies the correct function +of Gpufit, and tests the fitting speed in comparison with the same algorithm +executed on the CPU. + +If Gpufit was built from source, running the build target +GPUFIT_CPUFIT_Performance_Comparison will run the test, which executes the +fitting process multiple times, varying the number of fits per function call. +The execution time is measured in each case and the relative speed improvement +between the GPU and the CPU is calculated. A successful run of the performance +test also indicates that Gpufit is functioning correctly. 
+ +The performance comparison is also included in the Gpufit binary distribution +as a console application. An example of the program's output is +shown in :numref:`installation-gpufit-cpufit-performance-comparison`. + +.. _installation-gpufit-cpufit-performance-comparison: + +.. figure:: /images/GPUFIT_CPUFIT_Performance_Comparison.png + :width: 10 cm + :align: center + + Output of the GPUFIT vs CPUFIT performance comparison + diff --git a/docs/introduction.rst b/docs/introduction.rst new file mode 100644 index 0000000..2a6fc1f --- /dev/null +++ b/docs/introduction.rst @@ -0,0 +1,87 @@ +============ +Introduction +============ + +Gpufit is a GPU-accelerated CUDA implementation of the Levenberg-Marquardt +algorithm. It was developed to meet the need for a high performance, +general-purpose nonlinear curve fitting software library which is publicly available +and open source. + +Optimization algorithms are ubiquitous tools employed in many fields of science +and technology. One such algorithm for numerical, non-linear optimization is the +Levenberg-Marquardt algorithm (LMA). The LMA combines elements of the method of +steepest descent and Newton's method, and has become a standard algorithm for +least-squares fitting. + +Although the LMA is, in itself, an efficient optimization algorithm, +applications requiring many iterations of this procedure may encounter +limitations due to the sheer number of calculations involved. The time required +for the convergence of a fit, or a set of fits, can determine an application's +feasibility, e.g. in the context of real-time data processing and feedback +systems. Alternatively, in the case of very large datasets, the time required +to solve a particular optimization problem may prove impractical. + +In recent years, advanced graphics processing units (GPUs) and the development +of general purpose GPU programming have enabled fast and parallelized computing +by shifting calculations from the CPU to the GPU. 
The large number of +independent computing units available on a modern GPU enables the rapid +execution of many instructions in parallel, with an overall computation power +far exceeding that of a CPU. Languages such as CUDA C and OpenCL allow +GPU-based programs to be developed in a manner similar to conventional software, +but with an inherently parallelized structure. These developments have led to the +creation of new GPU-accelerated tools, such as Gpufit. + +This manual describes how to install and build the Gpufit library and its +external bindings. Furthermore, it details how to extend Gpufit by adding +custom model functions as well as custom fit estimator functions. + +The documentation includes: + +- Instructions for building and installing Gpufit +- A detailed description of the C interface +- A description of the built-in model functions +- A description of the built-in goodness-of-fit estimator functions +- A detailed description of the external bindings to Matlab and Python +- Usage examples for C, Matlab, and Python +- Instructions for adding custom model functions or custom estimator functions + +The current version of the Gpufit library is |GF_version| +(`see homepage <https://github.com/gpufit/Gpufit>`_). This manual was compiled +on |today|. + +Hardware requirements +--------------------- + +Because the fit algorithm is implemented in CUDA C, a CUDA_-compatible graphics +card is required to run Gpufit. The minimum supported compute capability is +2.0. More advanced GPU hardware will result in higher fitting performance. + +Software requirements +--------------------- + +In addition to a compatible GPU, the graphics card driver installed on the +host computer must be compatible with the version of the CUDA toolkit which +was used to compile Gpufit. This may present an issue for older graphics +cards or for computers running outdated graphics drivers. + +At the time of its initial release, Gpufit was compiled with CUDA toolkit +version 8.0. 
Therefore, the Nvidia graphics driver installed on the host PC +must be at least version 367.48 (released July 2016) in order to be compatible +with the binary files generated in this build. + +When compatibility issues arise, there are two possible solutions. The best +option is to update the graphics driver to a version which is compatible with +the CUDA toolkit used to build Gpufit. The second option is to re-compile +Gpufit from source code, using an earlier version of the CUDA toolkit which is +compatible with the graphics driver in question. However, this solution is +likely to result in slower performance of the Gpufit code, since older versions +of the CUDA toolkit are not as efficient. + +Note that all CUDA-supported graphics cards should be compatible with +CUDA toolkit version 6.5. This is the last version of CUDA which supported +GPUs with compute capability 1.x. In other words, an updated Nvidia graphics +driver should be available for all CUDA-enabled GPUs which is compatible with +toolkit version 6.5. + +If you are unsure if your graphics card is CUDA-compatible, a list of +CUDA-supported GPUs can be found `here `_. 
diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000..1223cbc --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,25 @@ +======================= +Gpufit software license +======================= + +MIT License + +Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6f53cb2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,281 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . 
+if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. epub3 to make an epub3 + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + echo. dummy to check syntax errors of document sources + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\RTDSpielwiese.qhcp + echo.To view the help file: + REM qcollectiongenerator writes a .qhc collection file, so that is the name to pass to assistant + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\RTDSpielwiese.qhc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "epub3" ( + %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. 
+ goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +if "%1" == "dummy" ( + %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. Dummy builder generates no files. 
+ goto end +) + +:end diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000..b8c2751 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,20 @@ + +# Applications + +function( add_example modules name ) + set( target ${name} ) + add_executable( ${target} ${name}.cpp + ${PROJECT_SOURCE_DIR}/Tests/utils.h + ${PROJECT_SOURCE_DIR}/Tests/utils.cpp + ) + target_include_directories( ${target} PRIVATE ${PROJECT_SOURCE_DIR} ) + target_link_libraries( ${target} ${modules} ) + set_property( TARGET ${target} + PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) + set_property( TARGET ${target} PROPERTY FOLDER GpufitCpufitExamples ) +# install( TARGETS ${target} RUNTIME DESTINATION bin ) +endfunction() + +add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Performance_Comparison ) + +add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Nvidia_Profiler_Test ) diff --git a/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp new file mode 100644 index 0000000..41f72e2 --- /dev/null +++ b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp @@ -0,0 +1,340 @@ +/* + * Runs 100k fits on the CPU and 2m fits on the GPU, used with the Nvidia profiler to obtain + * running time information on the different CUDA kernels. + */ + +#include "Cpufit/cpufit.h" +#include "Gpufit/gpufit.h" +#include "Tests/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _USE_MATH_DEFINES +#include + + +/* + Names of the parameters of the 2D Gaussian peak model; the five float members are kept in the order in which the fit routines receive them (amplitude, center x, center y, width, background) +*/ +struct Parameters +{ + float amplitude; + float center_x; + float center_y; + float width; + float background; +}; + +/* +Prints some statistics and the speed (fits/second) of a run: the run time in seconds, the number of fits per second, the standard deviation of the fitted x centers with respect to the true x centers and the mean number of iterations. 
+*/ +void print_result( + std::string const name, + std::vector const & estimated_parameters, + std::vector const & test_parameters, + std::vector states, + std::vector const & n_iterations, + std::size_t const n_fits, + std::size_t const n_parameters, + std::chrono::milliseconds::rep const duration_in_ms) +{ + /* collect the fitted and the true x center of every fit; the x center is stored at offset 1 of each parameter set */ + std::vector estimated_x_centers(n_fits); + std::vector test_x_centers(n_fits); + + for (std::size_t i = 0; i < n_fits; i++) + { + estimated_x_centers[i] = estimated_parameters[i*n_parameters + 1]; + test_x_centers[i] = test_parameters[i].center_x; + } + /* states is handed on to helpers defined outside this file, presumably to select which fits enter the statistics -- TODO confirm */ + double const std_dev_x = calculate_standard_deviation(estimated_x_centers, test_x_centers, states); + + double const mean_n_iterations = calculate_mean(n_iterations, states); + /* duration_in_ms is given in milliseconds, hence the factor 1000 to obtain fits per second */ + double fits_per_second = static_cast(n_fits) / duration_in_ms * 1000; + + // output + std::cout << std::fixed; + + std::cout << std::setw(5) << std::endl << "***" << name << "***"; + + std::cout << std::setprecision(3); + std::cout << std::setw(12) << duration_in_ms / 1000.0 << " s "; + + std::cout << std::setprecision(2); + std::cout << std::setw(12) << fits_per_second << " fits/s" << std::endl; + + std::cout << std::setprecision(6); + std::cout << "x precision: " << std_dev_x << " px "; + + std::cout << std::setprecision(2); + std::cout << "mean iterations: " << mean_n_iterations << std::endl; +} + +/* +Fills parameters_set with start values for the fits: the five values of fit i are the members of parameters[i], each multiplied by its own random factor a + b * u with u drawn from uniform_dist(0, 1), i.e. a factor between 0.9 and 1.1. The number of fits is parameters_set.size() / n_parameters. 
+*/ +void generate_initial_parameters(std::vector & parameters_set, std::vector const & parameters) +{ + std::uniform_real_distribution< float> uniform_dist(0, 1); + + float const a = 0.9f; + float const b = 0.2f; + /* number of floats in one Parameters object (the struct consists of float members only) */ + int const n_parameters = sizeof(Parameters) / sizeof(float); + for (std::size_t i = 0; i < parameters_set.size() / n_parameters; i++) + { + parameters_set[0 + i * n_parameters] = parameters[i].amplitude * (a + b * uniform_dist(rng)); + parameters_set[1 + i * n_parameters] = parameters[i].center_x * (a + b * uniform_dist(rng)); + parameters_set[2 + i * n_parameters] = 
parameters[i].center_y * (a + b * uniform_dist(rng)); + parameters_set[3 + i * n_parameters] = parameters[i].width * (a + b * uniform_dist(rng)); + parameters_set[4 + i * n_parameters] = parameters[i].background * (a + b * uniform_dist(rng)); + } +} + +/* +Fills every element of target with a randomized copy of source: each member is multiplied by its own random factor a + b * u with u drawn from uniform_dist(0, 1), i.e. a factor between 0.9 and 1.1. The number of generated parameter sets is target.size(). +*/ +void generate_test_parameters(std::vector & target, Parameters const source) +{ + std::size_t const n_fits = target.size(); + + std::uniform_real_distribution< float> uniform_dist(0, 1); + + float const a = 0.9f; + float const b = 0.2f; + + for (std::size_t i = 0; i < n_fits; i++) + { + target[i].amplitude = source.amplitude * (a + b * uniform_dist(rng)); + target[i].center_x = source.center_x * (a + b * uniform_dist(rng)); + target[i].center_y = source.center_y * (a + b * uniform_dist(rng)); + target[i].width = source.width * (a + b * uniform_dist(rng)); + target[i].background = source.background * (a + b * uniform_dist(rng)); + } +} + +/* +Adds normally distributed noise with mean 0 to every element of vec, in place. The standard deviation is mean_amplitude / snr, where mean_amplitude is 2 * pi * amplitude * width * width divided by fit_area and fit_area is the square of gauss_fwtm = 4.292 * width. 
+*/ +void add_gauss_noise(std::vector & vec, Parameters const & parameters, float const snr) +{ + float const gauss_fwtm = 4.292f * parameters.width; //only valid for circular gaussian + float const fit_area = gauss_fwtm*gauss_fwtm; + + float const mean_amplitude = 2.f * float(M_PI) * parameters.amplitude * parameters.width * parameters.width / fit_area; + + float const std_dev = mean_amplitude / snr; + + std::normal_distribution distribution(0.0, std_dev); + + for (std::size_t i = 0; i < vec.size(); i++) + { + vec[i] += distribution(rng); + } +} + +/* +Writes n_fits synthetic 2D Gaussian peaks into data, n_points consecutive values per fit. Each value is amplitude * exp(-0.5 * ((ix - center_x) / width)^2) * exp(-0.5 * ((iy - center_y) / width)^2) + background, taken from parameters[i] for fit i. A progress bar framed by two rows of 50 dashes is printed to std::cout; one bar character is printed each time the counter reaches n_fits / 50. +*/ +void generate_gauss2d( + std::size_t const n_fits, + std::size_t const n_points, + std::vector & data, + std::vector const & parameters) +{ + std::cout << "generating " << n_fits << " fits ..." 
<< std::endl; + for (int i = 0; i < 50; i++) + std::cout << "-"; + std::cout << std::endl; + std::size_t progress = 0; + + for (std::size_t i = 0; i < n_fits; i++) + { + float const amplitude = parameters[i].amplitude; + float const x00 = parameters[i].center_x; + float const y00 = parameters[i].center_y; + float const width = parameters[i].width; + float const background = parameters[i].background; + + std::size_t const fit_index = i * n_points; + /* the points of one fit form a square grid with sqrt(n_points) points per side, stored row by row */ + for (int iy = 0; iy < sqrt(n_points); iy++) + { + for (int ix = 0; ix < sqrt(n_points); ix++) + { + std::size_t const point_index = iy * std::size_t(sqrt(n_points)) + ix; + std::size_t const absolute_index = fit_index + point_index; + + float const argx + = exp(-0.5f * ((ix - x00) / width) * ((ix - x00) / width)); + float const argy + = exp(-0.5f * ((iy - y00) / width) * ((iy - y00) / width)); + + data[absolute_index] = amplitude * argx * argy + background; + } + } + + progress += 1; + if (progress >= n_fits / 50) + { + progress = 0; + std::cout << "|"; + } + } + std::cout << std::endl; + for (int i = 0; i < 50; i++) + std::cout << "-"; + std::cout << std::endl; +} + +/* +Runs Gpufit vs. 
Cpufit for various number of fits and compares the speed + +No weights, Model: Gauss_2D, Estimator: LSE +*/ +int main(int argc, char * argv[]) +{ + // check for CUDA availability + if (!gpufit_cuda_available()) + { + std::cout << "CUDA not available" << std::endl; + return -1; + } + + // all numbers of fits + std::size_t const n_fits_gpu = 2000000; + std::size_t const n_fits_cpu = 100000; + std::size_t const size_x = 15; + std::size_t const n_points = size_x * size_x; + + // fit parameters constant for every run + std::size_t const n_parameters = 5; + std::vector parameters_to_fit(n_parameters, 1); + float const tolerance = 0.0001f; + int const max_n_iterations = 10; + + // initial parameters + Parameters true_parameters; + true_parameters.amplitude = 500.f; + true_parameters.center_x = static_cast(size_x) / 2.f - 0.5f; + true_parameters.center_y = static_cast(size_x) / 2.f - 0.5f; + true_parameters.width = 2.f; + true_parameters.background = 10.f; + + // test parameters + std::cout << "generate test parameters" << std::endl; + std::vector test_parameters(n_fits_gpu); + generate_test_parameters(test_parameters, true_parameters); + + // test data + std::vector data(n_fits_gpu * n_points); + generate_gauss2d(n_fits_gpu, n_points, data, test_parameters); + std::cout << "add noise" << std::endl; + add_gauss_noise(data, true_parameters, 10.f); + + // initial parameter set + std::vector initial_parameters(n_parameters * n_fits_gpu); + generate_initial_parameters(initial_parameters, test_parameters); + + std::cout << std::endl; + std::cout << n_fits_cpu << " fits on the CPU" << std::endl; + + // Cpufit output + std::vector cpufit_parameters(n_fits_cpu * n_parameters); + std::vector cpufit_states(n_fits_cpu); + std::vector cpufit_chi_squares(n_fits_cpu); + std::vector cpufit_n_iterations(n_fits_cpu); + + // run Cpufit and measure time + std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now(); + int const cpu_status + = cpufit + ( + 
n_fits_cpu, + n_points, + data.data(), + 0, + GAUSS_2D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + cpufit_parameters.data(), + cpufit_states.data(), + cpufit_chi_squares.data(), + cpufit_n_iterations.data() + ); + std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count(); + + if (cpu_status != 0) + { + // error in cpufit, should actually not happen + std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl; + } + else + { + // print + print_result("Cpufit", cpufit_parameters, test_parameters, cpufit_states, cpufit_n_iterations, n_fits_cpu, n_parameters, dt_cpufit); + } + + std::cout << std::endl; + std::cout << n_fits_gpu << " fits on the GPU" << std::endl; + + // Gpufit output parameters + std::vector gpufit_parameters(n_fits_gpu * n_parameters); + std::vector gpufit_states(n_fits_gpu); + std::vector gpufit_chi_squares(n_fits_gpu); + std::vector gpufit_n_iterations(n_fits_gpu); + + // run Gpufit and measure time + t0 = std::chrono::high_resolution_clock::now(); + int const gpu_status + = gpufit + ( + n_fits_gpu, + n_points, + data.data(), + 0, + GAUSS_2D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + gpufit_parameters.data(), + gpufit_states.data(), + gpufit_chi_squares.data(), + gpufit_n_iterations.data() + ); + std::chrono::milliseconds::rep const dt_gpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count(); + + if (gpu_status != 0) + { + // error in gpufit + std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl; + } + else + { + // print results + print_result("Gpufit", gpufit_parameters, test_parameters, gpufit_states, gpufit_n_iterations, n_fits_gpu, n_parameters, dt_gpufit); + } + + std::cout << "\nPERFORMANCE GAIN Gpufit/Cpufit \t" << std::setw(10) << static_cast(dt_cpufit) / dt_gpufit * 
n_fits_gpu / n_fits_cpu << std::endl; + + return 0; +} \ No newline at end of file diff --git a/examples/Gpufit_Cpufit_Performance_Comparison.cpp b/examples/Gpufit_Cpufit_Performance_Comparison.cpp new file mode 100644 index 0000000..b25dd90 --- /dev/null +++ b/examples/Gpufit_Cpufit_Performance_Comparison.cpp @@ -0,0 +1,450 @@ +#include "Cpufit/cpufit.h" +#include "Gpufit/gpufit.h" +#include "Tests/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _USE_MATH_DEFINES +#include + + +/* + Names of the parameters of the 2D Gaussian peak model; the five float members are kept in the order in which the fit routines receive them (amplitude, center x, center y, width, background) +*/ +struct Parameters +{ + float amplitude; + float center_x; + float center_y; + float width; + float background; +}; + +/* + Fills parameters_set with start values for the fits: the five values of fit i are the members of parameters[i], each multiplied by its own random factor a + b * u with u drawn from uniform_dist(0, 1), i.e. a factor between 0.9 and 1.1. The number of fits is parameters_set.size() / n_parameters. 
+*/ +void generate_initial_parameters(std::vector & parameters_set, std::vector const & parameters) +{ + std::uniform_real_distribution< float> uniform_dist(0, 1); + + float const a = 0.9f; + float const b = 0.2f; + /* number of floats in one Parameters object (the struct consists of float members only) */ + int const n_parameters = sizeof(Parameters) / sizeof(float); + for (std::size_t i = 0; i < parameters_set.size() / n_parameters; i++) + { + parameters_set[0 + i * n_parameters] = parameters[i].amplitude * (a + b * uniform_dist(rng)); + parameters_set[1 + i * n_parameters] = parameters[i].center_x * (a + b * uniform_dist(rng)); + parameters_set[2 + i * n_parameters] = parameters[i].center_y * (a + b * uniform_dist(rng)); + parameters_set[3 + i * n_parameters] = parameters[i].width * (a + b * uniform_dist(rng)); + parameters_set[4 + i * n_parameters] = parameters[i].background * (a + b * uniform_dist(rng)); + } +} + +/* + Fills every element of target with a randomized copy of source: each member is multiplied by its own random factor a + b * u with u drawn from uniform_dist(0, 1), i.e. a factor between 0.9 and 1.1. The number of generated parameter sets is target.size(). A labelled progress bar of progress_width characters is printed to std::cout; one bar character is printed each time the counter reaches n_fits / progress_width. 
+*/ +void generate_test_parameters(std::vector & target, Parameters const source) +{ + std::size_t const n_fits = target.size(); + + std::uniform_real_distribution< float> uniform_dist(0, 1); + + float const a = 0.9f; + float const b = 0.2f; + + int const text_width = 30; + int const progress_width = 25; + + std::cout << std::setw(text_width) << " "; + for 
(int i = 0; i < progress_width; i++) + std::cout << "-"; + std::cout << std::endl; + std::cout << std::setw(text_width) << std::left << "Generating test parameters"; + + std::size_t progress = 0; + + for (std::size_t i = 0; i < n_fits; i++) + { + target[i].amplitude = source.amplitude * (a + b * uniform_dist(rng)); + target[i].center_x = source.center_x * (a + b * uniform_dist(rng)); + target[i].center_y = source.center_y * (a + b * uniform_dist(rng)); + target[i].width = source.width * (a + b * uniform_dist(rng)); + target[i].background = source.background * (a + b * uniform_dist(rng)); + + progress += 1; + if (progress >= n_fits / progress_width) + { + progress = 0; + std::cout << "|"; + } + } + + std::cout << std::endl; + std::cout << std::setw(text_width) << " "; + for (int i = 0; i < progress_width; i++) + std::cout << "-"; + std::cout << std::endl; +} + +/* +Adds normally distributed noise with mean 0 to every element of vec, in place. The standard deviation is mean_amplitude / snr, where mean_amplitude is 2 * pi * amplitude * width * width divided by fit_area and fit_area is the square of gauss_fwtm = 4.292 * width. A labelled progress bar of progress_width characters is printed to std::cout; one bar character is printed each time the counter reaches vec.size() / progress_width. 
+*/ +void add_gauss_noise(std::vector & vec, Parameters const & parameters, float const snr) +{ + float const gauss_fwtm = 4.292f * parameters.width; //only valid for circular gaussian + float const fit_area = gauss_fwtm*gauss_fwtm; + + float const mean_amplitude = 2.f * float(M_PI) * parameters.amplitude * parameters.width * parameters.width / fit_area; + + float const std_dev = mean_amplitude / snr; + + std::normal_distribution distribution(0.0, std_dev); + + int const text_width = 30; + int const progress_width = 25; + + std::cout << std::setw(text_width) << " "; + for (int i = 0; i < progress_width; i++) + std::cout << "-"; + std::cout << std::endl; + std::cout << std::setw(text_width) << std::left << "Adding noise"; + + std::size_t progress = 0; + + for (std::size_t i = 0; i < vec.size(); i++) + { + vec[i] += distribution(rng); + + progress += 1; + if (progress >= vec.size() / progress_width) + { + progress = 0; + std::cout << "|"; + } + } + + std::cout << std::endl; + std::cout << std::setw(text_width) << " "; + for (int i = 0; i < progress_width; i++) + std::cout << "-"; + std::cout << std::endl; 
+} + +/* +Writes n_fits synthetic 2D Gaussian peaks into data, n_points consecutive values per fit. Each value is amplitude * exp(-0.5 * ((ix - center_x) / width)^2) * exp(-0.5 * ((iy - center_y) / width)^2) + background, taken from parameters[i] for fit i. A labelled progress bar of progress_width characters is printed to std::cout; one bar character is printed each time the counter reaches n_fits / progress_width. +*/ +void generate_gauss2d( + std::size_t const n_fits, + std::size_t const n_points, + std::vector & data, + std::vector const & parameters) +{ + int const text_width = 30; + int const progress_width = 25; + + std::cout << std::setw(text_width) << " "; + for (int i = 0; i < progress_width; i++) + std::cout << "-"; + std::cout << std::endl; + std::cout << std::setw(text_width) << std::left << "Generating data"; + + std::size_t progress = 0; + + for (std::size_t i = 0; i < n_fits; i++) + { + float const amplitude = parameters[i].amplitude; + float const x00 = parameters[i].center_x; + float const y00 = parameters[i].center_y; + float const width = parameters[i].width; + float const background = parameters[i].background; + + std::size_t const fit_index = i * n_points; + /* the points of one fit form a square grid with sqrt(n_points) points per side, stored row by row */ + for (int iy = 0; iy < sqrt(n_points); iy++) + { + for (int ix = 0; ix < sqrt(n_points); ix++) + { + std::size_t const point_index = iy * std::size_t(sqrt(n_points)) + ix; + std::size_t const absolute_index = fit_index + point_index; + + float const argx + = exp(-0.5f * ((ix - x00) / width) * ((ix - x00) / width)); + float const argy + = exp(-0.5f * ((iy - y00) / width) * ((iy - y00) / width)); + + data[absolute_index] = amplitude * argx * argy + background; + } + } + + progress += 1; + if (progress >= n_fits / progress_width) + { + progress = 0; + std::cout << "|"; + } + } + std::cout << std::endl; + std::cout << std::setw(text_width) << " "; + for (int i = 0; i < progress_width; i++) + 
std::cout << "-"; + std::cout << std::endl; +} + +/* +Runs Gpufit vs. Cpufit for various number of fits and compares the speed + +No weights, Model: Gauss_2D, Estimator: LSE +*/ +int main(int argc, char * argv[]) +{ + // title + std::cout << "----------------------------------------" << std::endl; + std::cout << "Performance comparison Gpufit vs. 
Cpufit" << std::endl; + std::cout << "----------------------------------------" << std::endl << std::endl; + + std::cout << "Please note that execution speed test results depend on" << std::endl; + std::cout << "the details of the CPU and GPU hardware." << std::endl; + std::cout << std::endl; + + + // check for CUDA availability + int cuda_runtime_version = 0; + int cuda_driver_version = 0; + bool const version_available = gpufit_get_cuda_version(&cuda_runtime_version, &cuda_driver_version) != 0; + int const cuda_runtime_major = cuda_runtime_version / 1000; + int const cuda_runtime_minor = cuda_runtime_version % 1000 / 10; + int const cuda_driver_major = cuda_driver_version / 1000; + int const cuda_driver_minor = cuda_driver_version % 1000 / 10; + + bool do_gpufits = false; + if (version_available) + { + std::cout << "CUDA runtime version: "; + std::cout << cuda_runtime_major << "." << cuda_runtime_minor << std::endl; + std::cout << "CUDA driver version: "; + std::cout << cuda_driver_major << "." << cuda_driver_minor << std::endl; + std::cout << std::endl; + + bool const cuda_available = cuda_driver_version > 0; + if (cuda_available) + { + bool const version_compatible + = cuda_driver_version >= cuda_runtime_version + && cuda_runtime_version > 0; + if (version_compatible) + { + do_gpufits = true; + } + else + { + std::cout << "The CUDA runtime version is not compatible with the" << std::endl; + std::cout << "current graphics driver. Please update the driver, or" << std::endl; + std::cout << "re - build Gpufit from source using a compatible version" << std::endl; + std::cout << "of the CUDA toolkit." << std::endl; + std::cout << std::endl; + } + } + else + { + std::cout << "No CUDA enabled graphics card detected." << std::endl; + std::cout << std::endl; + } + } + else + { + std::cout << "CUDA error detected. 
Error string: "; + std::cout << gpufit_get_last_error() << std::endl; + std::cout << std::endl; + } + if (!do_gpufits) + { + std::cout << "Skipping Gpufit computations." << std::endl << std::endl; + } + + // all numbers of fits + std::vector n_fits_all; + if (sizeof(void*) < 8) + { + n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000}; + } + else + { + n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 }; + } + + std::size_t const max_n_fits = n_fits_all.back(); + + // fit parameters constant for every run + std::size_t const size_x = 5; + std::size_t const n_points = size_x * size_x; + std::size_t const n_parameters = 5; + std::vector parameters_to_fit(n_parameters, 1); + float const tolerance = 0.0001f; + int const max_n_iterations = 10; + + // initial parameters + Parameters true_parameters; + true_parameters.amplitude = 500.f; + true_parameters.center_x = static_cast(size_x) / 2.f - 0.5f; + true_parameters.center_y = static_cast(size_x) / 2.f - 0.5f; + true_parameters.width = 1.f; + true_parameters.background = 10.f; + + // test parameters + std::vector test_parameters(max_n_fits); + generate_test_parameters(test_parameters, true_parameters); + + // test data + std::vector data(max_n_fits * n_points); + generate_gauss2d(max_n_fits, n_points, data, test_parameters); + add_gauss_noise(data, true_parameters, 10.f); + + // initial parameter set + std::vector initial_parameters(n_parameters * max_n_fits); + generate_initial_parameters(initial_parameters, test_parameters); + + // print collumn identifiers + std::cout << std::endl << std::right; + std::cout << std::setw(8) << "Number" << std::setw(3) << "|"; + std::cout << std::setw(13) << "Cpufit speed" << std::setw(3) << "|"; + std::cout << std::setw(13) << "Gpufit speed" << std::setw(3) << "|"; + std::cout << std::setw(12) << "Performance"; + std::cout << std::endl; + std::cout << std::setw(8) << "of fits" << std::setw(3) << "|"; + std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|"; + 
std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|"; + std::cout << std::setw(12) << "gain factor"; + std::cout << std::endl; + std::cout << "-------------------------------------------------------"; + std::cout << std::endl; + + // loop over number of fits + for (std::size_t fit_index = 0; fit_index < n_fits_all.size(); fit_index++) + { + // number of fits + std::size_t n_fits = n_fits_all[fit_index]; + std::cout << std::setw(8) << n_fits << std::setw(3) << "|"; + + // Cpufit output + std::vector cpufit_parameters(n_fits * n_parameters); + std::vector cpufit_states(n_fits); + std::vector cpufit_chi_squares(n_fits); + std::vector cpufit_n_iterations(n_fits); + + // run Cpufit and measure time + std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now(); + int const cpu_status + = cpufit + ( + n_fits, + n_points, + data.data(), + 0, + GAUSS_2D, + initial_parameters.data(), + tolerance, + max_n_iterations, + parameters_to_fit.data(), + LSE, + 0, + 0, + cpufit_parameters.data(), + cpufit_states.data(), + cpufit_chi_squares.data(), + cpufit_n_iterations.data() + ); + std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count(); + + if (cpu_status != 0) + { + // error in cpufit, should actually not happen + std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl; + } + + std::chrono::milliseconds::rep dt_gpufit = 0; + + // if we do not do gpufit, we skip the rest of the loop + if (do_gpufits) + { + // Gpufit output parameters + std::vector gpufit_parameters(n_fits * n_parameters); + std::vector gpufit_states(n_fits); + std::vector gpufit_chi_squares(n_fits); + std::vector gpufit_n_iterations(n_fits); + + // run Gpufit and measure time + t0 = std::chrono::high_resolution_clock::now(); + int const gpu_status + = gpufit + ( + n_fits, + n_points, + data.data(), + 0, + GAUSS_2D, + initial_parameters.data(), + tolerance, + max_n_iterations, + 
parameters_to_fit.data(), + LSE, + 0, + 0, + gpufit_parameters.data(), + gpufit_states.data(), + gpufit_chi_squares.data(), + gpufit_n_iterations.data() + ); + dt_gpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count(); + + if (gpu_status != 0) + { + // error in gpufit + std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl; + do_gpufits = false; + } + } + + // print the calculation speed in fits/s + std::cout << std::fixed << std::setprecision(0); + if (dt_cpufit) + { + std::cout << std::setw(13) << static_cast(n_fits) / static_cast(dt_cpufit)* 1000.0 << std::setw(3) << "|"; + } + else + { + std::cout << std::setw(13) << "inf" << std::setw(3) << "|"; + } + if (dt_gpufit) + { + std::cout << std::setw(13) << static_cast(n_fits) / static_cast(dt_gpufit)* 1000.0 << std::setw(3) << "|"; + std::cout << std::fixed << std::setprecision(2); + std::cout << std::setw(12) << static_cast(dt_cpufit) / static_cast(dt_gpufit); + } + else if (!do_gpufits) + { + std::cout << std::setw(13) << "--" << std::setw(3) << "|"; + std::cout << std::setw(12) << "--"; + } + else + { + std::cout << std::setw(13) << "inf" << std::setw(3) << "|"; + std::cout << std::setw(12) << "inf"; + } + + std::cout << std::endl; + } + std::cout << std::endl << "Test completed!" << std::endl; + std::cout << "Press ENTER to exit" << std::endl; + std::getchar(); + + return 0; +} \ No newline at end of file diff --git a/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt new file mode 100644 index 0000000..92339af --- /dev/null +++ b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt @@ -0,0 +1,106 @@ +Example application for the Gpufit library (https://github.com/gpufit/Gpufit) +which implements Levenberg Marquardt curve fitting in CUDA. 
+ +Requirements +------------ + +- A CUDA capable graphics card with a recent Nvidia graphics driver + (at least 367.48 / July 2016) +- Windows +- >1.5 GB of free RAM + +Running +------- + +Start "Gpufit_Cpufit_Performance_Comparison.exe" to see a speed comparison of +the GPU and CPU implementations. + +Output +------ + +A successful run of the performance comparison example shows the version +number of the installed CUDA driver and of the CUDA runtime Gpufit was built with. + +EXAMPLE: + CUDA runtime version: 8.0 + CUDA driver version: 9.0 + +In the next step the successful generation of test data is indicated by three +full progress bars. + +EXAMPLE: + + ------------------------- + Generating test parameters ||||||||||||||||||||||||| + ------------------------- + ------------------------- + Generating data ||||||||||||||||||||||||| + ------------------------- + ------------------------- + Adding noise ||||||||||||||||||||||||| + ------------------------- + +The results of the performance comparison between Gpufit and Cpufit are shown +in a table. The results demonstrate the performance benefit of Gpufit compared +to Cpufit executing the fitting process for various numbers of fits in a range +of 10 - 10000000. The execution speed is expressed in fits per second. If the +execution time was too short to be measured, the speed is shown as inf. + +EXAMPLE: + + Number | Cpufit speed | Gpufit speed | Performance + of fits | (fits/s) | (fits/s) | gain factor + ------------------------------------------------------- + 10 | inf | 92 | 0.00 + 100 | inf | 6667 | 0.00 + 1000 | 66667 | inf | inf + 10000 | 58480 | 666667 | 11.40 + 100000 | 59916 | 2173913 | 36.28 + 1000000 | 59898 | 2469136 | 41.22 + 10000000 | 60957 | 3038590 | 49.85 + +Troubleshooting +--------------- + +MESSAGE: + + CUDA runtime version: 0.0 + CUDA driver version: 7.5 + + The CUDA runtime version is not compatible with the current graphics driver.
+ Please update the driver, or re-build Gpufit from source using a compatible + version of the CUDA toolkit. + + Skipping Gpufit computations. + +BEHAVIOR: + + The example executes Cpufit skipping Gpufit. Only the computation speed of Cpufit + is shown in the results table. + +SOLUTION: + + A common reason for this error message is an outdated Nvidia graphics driver. + In most cases updating the graphics card driver will solve this error. For + older graphics cards which are not supported by the CUDA toolkit used for + building Gpufit, re-compile Gpufit using an earlier version of the CUDA + toolkit which is compatible with the graphics driver. + +MESSAGE: + + CUDA runtime version: 0.0 + CUDA driver version: 0.0 + + No CUDA enabled graphics card detected. + + Skipping Gpufit computations. + +BEHAVIOR: + + The example executes Cpufit skipping Gpufit. Only the computation speed of Cpufit + is shown in the results table. + +SOLUTION: + + The execution of Gpufit requires a CUDA enabled graphics card. + Ensure that the host PC has a CUDA enabled graphics card installed. \ No newline at end of file diff --git a/package/README.md b/package/README.md new file mode 100644 index 0000000..ebf9279 --- /dev/null +++ b/package/README.md @@ -0,0 +1,48 @@ +# Creating a binary package + +The binary package bundles different build outputs into a single distributable binary package containing the Gpufit dll, +the performance comparison example, the Matlab bindings and the Python bindings. + +## Calling the script + +create_package.bat %1 %2 %3 + +with + +- %1 is the BUILD_BASE_PATH (the path containing the various (see below) CMake generated Visual Studio projects) + +- %2 is the VERSION (e.g. 1.0.0) + +- %3 is the SOURCE_BASE_PATH (the path containing the sources) + +The output is a folder (`BUILD_BASE_PATH/Gpufit_VERSION_win32_win64_buildDATECODE`, where DATECODE is the build date and time) which is also zipped if 7-Zip is available. + +## Requirements + +Note: The script has no way of checking that the requirements are fulfilled!
+ +See also [Build from sources](http://Gpufit.readthedocs.io/en/latest/installation.html#build-from-sources) for instructions. + +CMake + +- CUDA_ARCHITECTURE must be set to All (it is by default) + +- CUDA toolkit 8.0 is used for all builds (must be installed before) + +- Build directory for MSVC14 Win64 is BUILD_BASE_PATH/VC14x64-8.0 + +- Build directory for MSVC14 Win32 is BUILD_BASE_PATH/VC14x32-8.0 + +- Matlab and Python must be available + +Build + +- Configuration RelWithDebInfo is used for all builds! + +- With MSVC14 Win64 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example + +- With MSVC14 Win32 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example + +Documentation + +- An up-to-date version of the documentation must exist at SOURCE_BASE_PATH\docs\_build\latex\Gpufit.pdf (must be created before). \ No newline at end of file diff --git a/package/create_package.bat b/package/create_package.bat new file mode 100644 index 0000000..75ba751 --- /dev/null +++ b/package/create_package.bat @@ -0,0 +1,170 @@ +@ECHO OFF + +REM create package for Gpufit, assumes everything is compiled + +if "%1" == "" ( + echo specify build base path + goto end +) + +if "%2" == "" ( + echo specify version + goto end +) + +if "%3" == "" ( + echo specify source base path + goto end +) + +REM date and time from https://stackoverflow.com/a/30343827/1536976 + +@SETLOCAL ENABLEDELAYEDEXPANSION + +@REM Use WMIC to retrieve date and time +@echo off +FOR /F "skip=1 tokens=1-6" %%A IN ('WMIC Path Win32_LocalTime Get Day^,Hour^,Minute^,Month^,Second^,Year /Format:table') DO ( + IF NOT "%%~F"=="" ( + SET /A SortDate = 10000 * %%F + 100 * %%D + %%A + set YEAR=!SortDate:~0,4! + set MON=!SortDate:~4,2! + set DAY=!SortDate:~6,2! + @REM Add 1000000 so as to force a prepended 0 if hours less than 10 + SET /A SortTime = 1000000 + 10000 * %%B + 100 * %%C + %%E + set HOUR=!SortTime:~1,2! 
+ set MIN=!SortTime:~3,2! + set SEC=!SortTime:~5,2! + ) +) + +set DATECODE=!YEAR!!MON!!DAY!!HOUR!!MIN! +echo %DATECODE% + +REM define paths + +set BUILD_BASE=%1 +set VERSION=%2 +set SOURCE_BASE=%3 + +set OUTPUT_NAME=Gpufit_%VERSION%_win32_win64_build%DATECODE% +set ROOT_INSTALL=%BUILD_BASE%\%OUTPUT_NAME% +set OUTPUT_ZIP=%BUILD_BASE%\%OUTPUT_NAME%.zip + +set PERFORMANCE_TEST_INSTALL=%ROOT_INSTALL%\gpufit_performance_test +set PYTHON_INSTALL=%ROOT_INSTALL%\python +set x32_MATLAB_INSTALL=%ROOT_INSTALL%\matlab32 +set x64_MATLAB_INSTALL=%ROOT_INSTALL%\matlab64 +set SDK_INSTALL_ROOT=%ROOT_INSTALL%\gpufit_sdk + +set x64_BUILD=%BUILD_BASE%\VC14x64-8.0\RelWithDebInfo +set x64_BUILD_LIB=%BUILD_BASE%\VC14x64-8.0\Gpufit\RelWithDebInfo +set x32_BUILD=%BUILD_BASE%\VC14x32-8.0\RelWithDebInfo +set x32_BUILD_LIB=%BUILD_BASE%\VC14x32-8.0\Gpufit\RelWithDebInfo + +set x64_PYTHON_BUILD=%x64_BUILD%\pyGpufit\dist +set x32_PYTHON_BUILD=%x32_BUILD%\pyGpufit\dist + +set x64_MATLAB_BUILD=%x64_BUILD%\matlab +set x32_MATLAB_BUILD=%x32_BUILD%\matlab + +set EXAMPLES_SOURCE=%SOURCE_BASE%\examples +set PYTHON_SOURCE=%SOURCE_BASE%\Gpufit\python +set MATLAB_SOURCE=%SOURCE_BASE%\Gpufit\matlab +set SDK_README_SOURCE=%SOURCE_BASE%\package\sdk_readme.txt + +set MANUAL_SOURCE=%SOURCE_BASE%\docs\_build\latex\Gpufit.pdf +set MANUAL_INSTALL=%ROOT_INSTALL%\Gpufit_%VERSION%_Manual.pdf + +REM clean up (if necessary) + +if exist "%ROOT_INSTALL%" rmdir /s /q "%ROOT_INSTALL%" +if exist "%OUTPUT_ZIP%" del "%OUTPUT_ZIP%" + +REM create root folder + +echo create root directory +mkdir "%ROOT_INSTALL%" + +REM copy main readme (is markdown, written as txt) and license + +copy "%SOURCE_BASE%\README.md" "%ROOT_INSTALL%\README.txt" +copy "%SOURCE_BASE%\LICENSE.txt" "%ROOT_INSTALL%" + +REM copy manual + +if not exist "%MANUAL_SOURCE%" ( + echo file %MANUAL_SOURCE% required, does not exist + goto end +) +copy "%MANUAL_SOURCE%" "%MANUAL_INSTALL%" + +REM copy performance test + +echo collect performance test application 
+mkdir "%PERFORMANCE_TEST_INSTALL%" +copy "%EXAMPLES_SOURCE%\Gpufit_Cpufit_Performance_Comparison_readme.txt" "%PERFORMANCE_TEST_INSTALL%\README.txt" + +mkdir "%PERFORMANCE_TEST_INSTALL%\win64" +copy "%x64_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win64" +copy "%x64_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64" +copy "%x64_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64" + +mkdir "%PERFORMANCE_TEST_INSTALL%\win32" +copy "%x32_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win32" +copy "%x32_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32" +copy "%x32_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32" + +REM copy Python packages + +echo collect python +mkdir "%PYTHON_INSTALL%" +copy "%x64_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win_amd64.whl" +copy "%x32_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win32.whl" +copy "%PYTHON_SOURCE%\README.txt" "%PYTHON_INSTALL%" +xcopy "%PYTHON_SOURCE%\examples" "%PYTHON_INSTALL%\examples" /i /q + +REM copy Matlab 32 bit + +echo collect matlab32 +mkdir "%x32_MATLAB_INSTALL%" +xcopy "%x32_MATLAB_BUILD%" "%x32_MATLAB_INSTALL%" /q +xcopy "%MATLAB_SOURCE%\examples" "%x32_MATLAB_INSTALL%\examples" /i /q + +REM copy Matlab 64 bit + +echo collect matlab64 +mkdir "%x64_MATLAB_INSTALL%" +xcopy "%x64_MATLAB_BUILD%" "%x64_MATLAB_INSTALL%" /q +xcopy "%MATLAB_SOURCE%\examples" "%x64_MATLAB_INSTALL%\examples" /i /q + +REM copy SDK_INSTALL_ROOT + +echo collect SDK +mkdir "%SDK_INSTALL_ROOT%" +copy "%SDK_README_SOURCE%" "%SDK_INSTALL_ROOT%\README.txt" + +mkdir "%SDK_INSTALL_ROOT%\include" +copy "%SOURCE_BASE%\Gpufit\gpufit.h" "%SDK_INSTALL_ROOT%\include" + +mkdir "%SDK_INSTALL_ROOT%\win32" +copy "%x32_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win32" +copy "%x32_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win32" + +mkdir 
"%SDK_INSTALL_ROOT%\win64" +copy "%x64_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win64" +copy "%x64_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win64" + +REM zip content of temp folder with 7-Zip if available + +set ZIP=C:\Program Files\7-Zip\7z.exe + +if not exist "%ZIP%" ( + echo 7-Zip not installed, zip manually + goto end +) ELSE ( + echo zip result + "%ZIP%" a -y -r -mem=AES256 "%OUTPUT_ZIP%" "%ROOT_INSTALL%%" > nul +) + +:end +PAUSE \ No newline at end of file diff --git a/package/sdk_readme.txt b/package/sdk_readme.txt new file mode 100644 index 0000000..59fc094 --- /dev/null +++ b/package/sdk_readme.txt @@ -0,0 +1,10 @@ +Software development kit for the Gpufit library (https://github.com/gpufit/Gpufit) +which implements Levenberg Marquardt curve fitting in CUDA. + +Compiled with the Microsoft Visual Studio 2015 C++ compiler and CUDA toolkit 8.0. + +Folder include contains the gpufit.h header file representing the C API. + +Folder win32 contains the 32 bit compiled dynamic link library and import library. + +Folder win64 contains the 64 bit compiled dynamic link library and import library.
\ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..c524ac3 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,4 @@ + +# Tests + +add_boost_test( "Cpufit;Gpufit" Consistency ) diff --git a/tests/Consistency.cpp b/tests/Consistency.cpp new file mode 100644 index 0000000..feb1032 --- /dev/null +++ b/tests/Consistency.cpp @@ -0,0 +1,220 @@ +#define BOOST_TEST_MODULE Gpufit + +#include "Cpufit/cpufit.h" +#include "Gpufit/gpufit.h" +#include "Tests/utils.h" + +#include + +#include + +void generate_input_linear_fit_1d(FitInput & i) +{ + // number fits, points, parameters + i.n_fits = 1; + i.n_points = 2; + i.n_parameters = 2; // LINEAR_1D has two parameters + + // data and weights + i.data = { 0, 1 }; + i.weights_ = { 1, 1 }; + + // model id and estimator id + i.model_id = LINEAR_1D; + i.estimator_id = LSE; + + // initial parameters and parameters to fit + i.initial_parameters = { 0, 0 }; + i.parameters_to_fit = { 1, 1 }; + + // tolerance and max_n_iterations + i.tolerance = 0.001f; + i.max_n_iterations = 10; + + // user info + i.user_info_ = { 0.f, 1.f }; +} + +void generate_input_gauss_fit_1d(FitInput & i) +{ + // number fits, points, parameters + i.n_fits = 1; + i.n_points = 5; + i.n_parameters = 4; // GAUSS_1D has four parameters + + // data and weights + clean_resize(i.data, i.n_fits * i.n_points); + std::vector< float > const true_parameters{ { 4.f, 2.f, 0.5f, 1.f } }; + generate_gauss_1d(i.data, true_parameters); + i.weights_.clear(); // no weights + + // model id and estimator id + i.model_id = GAUSS_1D; + i.estimator_id = LSE; + + // initial parameters and parameters to fit + i.initial_parameters = { 2.f, 1.5f, 0.3f, 0.f }; + i.parameters_to_fit = { 1, 1, 1, 1 }; + + // tolerance and max_n_iterations + i.tolerance = 0.001f; + i.max_n_iterations = 10; + + // user info + i.user_info_.clear(); // no user info +} + +void generate_input_gauss_fit_2d(FitInput & i) +{ + // number fits, points, 
parameters + i.n_fits = 1; + i.n_points = 25; + i.n_parameters = 5; // GAUSS_2D has five parameters + + // data and weights + clean_resize(i.data, i.n_fits * i.n_points); + std::vector< float > const true_parameters{ { 4.f, 1.8f, 2.2f, 0.5f, 1.f } }; + generate_gauss_2d(i.data, true_parameters); + i.weights_.clear(); // no weights + + // model id and estimator id + i.model_id = GAUSS_2D; + i.estimator_id = LSE; + + // initial parameters and parameters to fit + i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.4f, 0.f }; + i.parameters_to_fit = { 1, 1, 1, 1, 1 }; + + // tolerance and max_n_iterations + i.tolerance = 0.0001f; + i.max_n_iterations = 20; + + // user info + i.user_info_.clear(); // no user info +} + +void generate_input_gauss_fit_2d_elliptic(FitInput & i) +{ + // number fits, points, parameters + i.n_fits = 1; + std::size_t const size_x = 5; + i.n_points = size_x * size_x; + i.n_parameters = 6; // GAUSS_2D_ELLIPTIC has six parameters + + // data and weights + clean_resize(i.data, i.n_fits * i.n_points); + + float const center_x = (static_cast(size_x) - 1.f) / 2.f; + std::vector< float > const true_parameters{ { 4.f, center_x, center_x, 0.4f, 0.6f, 1.f} }; + generate_gauss_2d_elliptic(i.data, true_parameters); + i.weights_.clear(); // no weights + + // model id and estimator id + i.model_id = GAUSS_2D_ELLIPTIC; + i.estimator_id = LSE; + + // initial parameters and parameters to fit + i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f }; + i.parameters_to_fit = { 1, 1, 1, 1, 1, 1 }; // one flag per parameter (n_parameters = 6) + + // tolerance and max_n_iterations + i.tolerance = 0.001f; + i.max_n_iterations = 10; + + // user info + i.user_info_.clear(); // no user info +} + +void perform_cpufit_gpufit_and_check(void (*func)(FitInput &)) +{ + // generate the data + FitInput i; + func(i); + + // sanity checks (we don't want to introduce faulty data) + BOOST_CHECK(i.sanity_check()); + + // reset output variables + FitOutput gpu, cpu; + clean_resize(gpu.parameters, i.n_fits * i.n_parameters); +
clean_resize(gpu.states, i.n_fits); + clean_resize(gpu.chi_squares, i.n_fits); + clean_resize(gpu.n_iterations, i.n_fits); + + clean_resize(cpu.parameters, i.n_fits * i.n_parameters); + clean_resize(cpu.states, i.n_fits); + clean_resize(cpu.chi_squares, i.n_fits); + clean_resize(cpu.n_iterations, i.n_fits); + + + // call to cpufit, store output + int const cpu_status + = cpufit + ( + i.n_fits, + i.n_points, + i.data.data(), + i.weights(), + i.model_id, + i.initial_parameters.data(), + i.tolerance, + i.max_n_iterations, + i.parameters_to_fit.data(), + i.estimator_id, + i.user_info_size(), + i.user_info(), + cpu.parameters.data(), + cpu.states.data(), + cpu.chi_squares.data(), + cpu.n_iterations.data() + ); + + BOOST_CHECK(cpu_status == 0); + + // call to gpufit, store output + int const gpu_status + = gpufit + ( + i.n_fits, + i.n_points, + i.data.data(), + i.weights(), + i.model_id, + i.initial_parameters.data(), + i.tolerance, + i.max_n_iterations, + i.parameters_to_fit.data(), + i.estimator_id, + i.user_info_size(), + i.user_info(), + gpu.parameters.data(), + gpu.states.data(), + gpu.chi_squares.data(), + gpu.n_iterations.data() + ); + + BOOST_CHECK(gpu_status == 0); + + // check both output for equality + BOOST_CHECK(cpu.states == gpu.states); + BOOST_CHECK(cpu.n_iterations == gpu.n_iterations); + BOOST_CHECK(close_or_equal(cpu.parameters, gpu.parameters)); + BOOST_CHECK(close_or_equal(cpu.chi_squares, gpu.chi_squares)); + +} + +BOOST_AUTO_TEST_CASE( Consistency ) +{ + BOOST_TEST_MESSAGE( "linear_fit_1d" ); + perform_cpufit_gpufit_and_check(&generate_input_linear_fit_1d); + + BOOST_TEST_MESSAGE( "gauss_fit_1d" ); + perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_1d); + + BOOST_TEST_MESSAGE( "gauss_fit_2d" ); + perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_2d); + + BOOST_TEST_MESSAGE("gauss_fit_2d_elliptic"); + perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_2d_elliptic); + +} diff --git a/tests/utils.cpp b/tests/utils.cpp new file 
mode 100644 index 0000000..16f3970 --- /dev/null +++ b/tests/utils.cpp @@ -0,0 +1,60 @@ +#include "utils.h" + +// initialize random number generator +std::mt19937 rng(0); + +/* + Given a parameter vector p with 4 entries, constructs a 1D Gaussian peak function with x values 0,..,v.size() - 1 +*/ +void generate_gauss_1d(std::vector< float > & v, std::vector< float > const & p) +{ + for (std::size_t i = 0; i < v.size(); i++) + { + float const argx = ((i - p[1]) * (i - p[1])) / (2.f * p[2] * p[2]); + float const ex = exp(-argx); + v[i] = p[0] * ex + p[3]; + } +} + +/* + Given a parameters vector p with 5 entries (amplitude, center x, center y, width, offset), constructs a 2D Gaussian peak function with x, y values 0, .., sqrt(v.size()) - 1 +*/ +void generate_gauss_2d(std::vector< float > & v, std::vector< float > const & p) +{ + std::size_t const n = static_cast(std::sqrt(v.size())); + if (n * n != v.size()) + { + throw std::runtime_error("v.size() is not a perfect square number"); + } + + for (std::size_t j = 0; j < n; j++) + { + float const argy = ((j - p[2]) * (j - p[2])); + for (std::size_t i = 0; i < n; i++) + { + float const argx = ((i - p[1]) * (i - p[1])); + float const ex = exp(-(argx + argy) / (2.f * p[3] * p[3])); + v[j * n + i] = p[0] * ex + p[4]; // p[4] is the constant offset (p[3] is the width) + } + } +} + +void generate_gauss_2d_elliptic(std::vector< float > & v, std::vector< float > const & p) +{ + std::size_t const n = static_cast(std::sqrt(v.size())); + if (n * n != v.size()) + { + throw std::runtime_error("v.size() is not a perfect square number"); + } + + for (std::size_t j = 0; j < n; j++) + { + float const argy = ((j - p[2]) * (j - p[2])) / (2.f * p[4] * p[4]); + for (std::size_t i = 0; i < n; i++) + { + float const argx = ((i - p[1]) * (i - p[1])) / (2.f * p[3] * p[3]); + float const ex = exp(-(argx + argy)); + v[j * n + i] = p[0] * ex + p[5]; // p[5] is the constant offset (p[3], p[4] are the x and y widths) + } + } +} \ No newline at end of file diff --git a/tests/utils.h b/tests/utils.h new file mode 100644 index 0000000..dd0caa7 --- /dev/null +++ b/tests/utils.h @@ -0,0 +1,176 @@ +#ifndef
TEST_UTILS_H_INCLUDED +#define TEST_UTILS_H_INCLUDED + +#include +#include + +#define CHK(x) if (!x) return false + +extern std::mt19937 rng; + +/* +Just to make sure that the content is erased after the resize. +*/ +template void clean_resize(std::vector & v, std::size_t const n) +{ + v.resize(n); + std::fill(v.begin(), v.end(), (T)0); +} + +template double max_relative_difference(std::vector const & a, std::vector const & b) +{ + double v = 0; + + auto it_a = a.begin(); + auto it_b = b.begin(); + + while (it_a !=a.end()) + { + T va = *it_a++; + T vb = *it_b++; + double d = static_cast(std::abs(va - vb)) / (std::abs(va) + std::abs(vb)); + v = std::max(v, d); + } + return v; +} + +template double max_absolute_difference(std::vector const & a, std::vector const & b) +{ + double v = 0; + + auto it_a = a.begin(); + auto it_b = b.begin(); + + while (it_a != a.end()) + { + T va = *it_a++; + T vb = *it_b++; + double d = static_cast(std::abs(va - vb)); + v = std::max(v, d); + } + return v; +} + +template bool close_or_equal(std::vector const & a, std::vector const & b, double relative_threshold = 1e-3, double absolute_threshold = 1e-6) +{ + if (a.empty() && b.empty()) + { + return true; + } + if (a.size() != b.size()) + { + return false; + } + double rd = max_relative_difference(a, b); + double ad = max_absolute_difference(a, b); + return rd < relative_threshold || ad < absolute_threshold; +} + +/* +Calculates the standard deviation of a vector whose values are the differences of values of two others vectors of equal length. +Only use values if use[i] == 0. 
+*/ +template double calculate_standard_deviation(std::vector const & a, std::vector const & b, std::vector const & use) +{ + std::size_t n = 0; + double sq_diff = 0; + + for (std::size_t i = 0; i < a.size(); i++) + { + if (use[i] == 0) + { + n++; + sq_diff += static_cast((a[i] - b[i])) * (a[i] - b[i]); + } + } + + double std_dev = std::sqrt(sq_diff / n); + return std_dev; +} + +template double calculate_mean(std::vector const & a, std::vector const & use) +{ + std::size_t n = 0; + double s = 0; + + for (std::size_t i = 0; i < a.size(); i++) + { + if (use[i] == 0) + { + n++; + s += static_cast(a[i]); + } + } + return s / n; +} + +void generate_gauss_1d(std::vector< float > & v, std::vector< float > const & p); + +void generate_gauss_2d(std::vector< float > & v, std::vector< float > const & p); + +void generate_gauss_2d_elliptic(std::vector< float > & v, std::vector< float > const & p); + +struct FitInput +{ + std::size_t n_fits; + std::size_t n_points; + std::size_t n_parameters; + + std::vector< float > data; + std::vector< float > weights_; // size 0 means no weights + + int model_id; + int estimator_id; + + std::vector< float > initial_parameters; + std::vector< int > parameters_to_fit; + + float tolerance; + int max_n_iterations; + + std::vector< float > user_info_; // user info is float + + float * weights() + { + if (!this->weights_.empty()) + { + return this->weights_.data(); + } + return 0; + } + + char * user_info() + { + if (!this->user_info_.empty()) + { + return reinterpret_cast(this->user_info_.data()); + } + return 0; + } + + std::size_t user_info_size() + { + return this->user_info_.size() * sizeof(float); // type of user_info is float + } + + bool sanity_check() + { + CHK(this->data.size() == this->n_fits * this->n_points); + if (!this->weights_.empty()) + { + CHK(this->weights_.size() == this->n_fits * this->n_points); + } + CHK(this->initial_parameters.size() == this->n_fits * this->n_parameters); + return true; + } +}; + +struct FitOutput +{ + 
std::vector< float > parameters; + std::vector< int > states; + std::vector< float > chi_squares; + std::vector< int > n_iterations; +}; + +#endif \ No newline at end of file