diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1a7c293
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# Python
+**/.idea
+__pycache__
+
+# docs
+/docs/_build
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9590a65
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,159 @@
+# Levenberg Marquardt curve fitting in CUDA
+# https://github.com/gpufit/Gpufit
+# see also CMake configuration in /docs/installation.rst
+
+# CMake
+
+cmake_minimum_required( VERSION 3.7 )
+set_property( GLOBAL PROPERTY USE_FOLDERS ON )
+
+if( NOT PROJECT_NAME )
+ project( Gpufit VERSION 1.0.0 )
+ include( CTest )
+endif()
+
+# On MSVC, switch from the DLL runtime (/MD, /MDd) to the static runtime (/MT, /MTd):
+# for every configuration named in CMAKE_CONFIGURATION_TYPES and CMAKE_BUILD_TYPE the
+# "/MD" substring of the C and C++ flags is replaced by "/MT" and the result is forced
+# back into the cache, re-using the cache entry's existing help string.
+if( MSVC ) # link runtime statically
+ foreach( type ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} )
+ string( TOUPPER ${type} TYPE )
+ foreach( flags CMAKE_C_FLAGS_${TYPE} CMAKE_CXX_FLAGS_${TYPE} )
+ get_property( help CACHE ${flags} PROPERTY HELPSTRING )
+ string( REPLACE "/MD" "/MT" ${flags} "${${flags}}" )
+ set( ${flags} "${${flags}}" CACHE STRING "${help}" FORCE )
+ endforeach()
+ endforeach()
+endif()
+
+# add_launcher( <target> <executable> <arguments> <working_directory> )
+#
+# Writes ${CMAKE_CURRENT_BINARY_DIR}/<target>.vcxproj.user so that starting the
+# debugger on <target> in Visual Studio runs <executable> with <arguments> in
+# <working_directory>. Does nothing unless MSVC12 or MSVC14 is set.
+#
+# The file has to be a well-formed MSBuild project fragment; the three values are
+# stored in the LocalDebugger* properties of a single PropertyGroup.
+function( add_launcher target executable arguments working_directory )
+  if( MSVC12 OR MSVC14 )
+    file( WRITE ${CMAKE_CURRENT_BINARY_DIR}/${target}.vcxproj.user
+"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+"<Project ToolsVersion=\"14.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\n"
+"  <PropertyGroup>\n"
+"    <LocalDebuggerCommand>${executable}</LocalDebuggerCommand>\n"
+"    <LocalDebuggerCommandArguments>${arguments}</LocalDebuggerCommandArguments>\n"
+"    <LocalDebuggerWorkingDirectory>${working_directory}</LocalDebuggerWorkingDirectory>\n"
+"  </PropertyGroup>\n"
+"</Project>\n"
+    )
+  endif()
+endfunction()
+
+# Boost
+
+find_package( Boost 1.58.0 )
+if( NOT Boost_FOUND )
+  # The unit tests need Boost; without it testing is switched off.
+  set( BUILD_TESTING OFF )
+  message( WARNING "Boost NOT found - skipping tests! (set BOOST_ROOT manually)" )
+else()
+  # add_boost_test( <modules> <name> )
+  #
+  # Builds <name>.cpp together with the shared Tests/utils sources into an executable
+  # called <modules joined by "_">_Test_<name>, links it against <modules> and
+  # Boost::boost, places the binary in PROJECT_BINARY_DIR and registers it with CTest.
+  function( add_boost_test modules name )
+    string( REPLACE ";" "_" module_prefix "${modules}" )
+    set( test_target ${module_prefix}_Test_${name} )
+
+    add_executable( ${test_target}
+      ${name}.cpp
+      ${PROJECT_SOURCE_DIR}/Tests/utils.h
+      ${PROJECT_SOURCE_DIR}/Tests/utils.cpp
+    )
+    target_include_directories( ${test_target} PRIVATE ${PROJECT_SOURCE_DIR} )
+    target_link_libraries( ${test_target} ${modules} Boost::boost )
+    set_target_properties( ${test_target} PROPERTIES
+      RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}"
+      FOLDER Tests
+    )
+
+    add_test( NAME ${test_target}
+      COMMAND ${test_target} --build_info --log_level=all --report_level=detailed )
+  endfunction()
+endif()
+
+# MATLAB
+
+# Locate MATLAB; when found, look for the matlab executable below Matlab_ROOT_DIR/bin
+# (also trying the win32 and win64 sub-directories) and define add_matlab_launcher.
+find_package( Matlab )
+if( Matlab_FOUND )
+ find_program( Matlab_EXECUTABLE matlab
+ PATHS "${Matlab_ROOT_DIR}/bin" PATH_SUFFIXES win32 win64 NO_DEFAULT_PATH )
+ # add_matlab_launcher( <target> [<path>...] )
+ # Creates a launcher for <target> (see add_launcher) that starts MATLAB with
+ # "-r addpath('<p1>','<p2>',...);addpath(genpath(pwd))". The path list is the
+ # per-configuration binary directory followed by the extra arguments; the last
+ # entry of that list is also used as the working directory.
+ function( add_matlab_launcher target )
+ # $(Configuration) is left unexpanded by CMake and ends up verbatim in the launcher
+ set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} )
+ list( GET paths -1 working_directory )
+ # turn the CMake list a;b;c into the MATLAB argument list a','b','c
+ string( REPLACE ";" "','" paths "${paths}" )
+ set( arguments "-r addpath('${paths}');addpath(genpath(pwd))" )
+ add_launcher( ${target} "${Matlab_EXECUTABLE}" "${arguments}" "${working_directory}" )
+ endfunction()
+endif()
+
+# Python
+
+# Locate a Python interpreter; when found, define add_python_launcher.
+find_package( PythonInterp )
+if( PYTHONINTERP_FOUND )
+ # add_python_launcher( <target> [<path>...] )
+ # Creates a launcher for <target> (see add_launcher) that starts an interactive
+ # Python session (-i) whose start-up command imports sys and appends every path
+ # to sys.path. The path list is the per-configuration binary directory followed
+ # by the extra arguments; the last entry is also used as the working directory.
+ function( add_python_launcher target )
+ # $(Configuration) is left unexpanded by CMake and ends up verbatim in the launcher
+ set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} )
+ list( GET paths -1 working_directory )
+ # each list separator becomes the end of one sys.path.append call and the start of the next
+ string( REPLACE ";" "')\nsys.path.append('" paths "${paths}" )
+ set( arguments "-i -c \"import sys\nsys.path.append('${paths}')\"" )
+ add_launcher( ${target} "${PYTHON_EXECUTABLE}" "${arguments}" "${working_directory}" )
+ endfunction()
+endif()
+
+# Cpufit
+
+add_subdirectory( Cpufit )
+
+# Gpufit
+
+add_subdirectory( Gpufit )
+
+# Examples using Gpufit and Cpufit
+
+add_subdirectory( examples )
+
+# Launcher
+#
+# Uses the following variables:
+#
+# Matlab_WORKING_DIRECTORY (Default: user home directory)
+# -- Working directory for MATLAB applications using Cpufit and Gpufit.
+# Python_WORKING_DIRECTORY (Default: user home directory)
+# -- Working directory for Python applications using Gpufit.
+
+# Determine the user's home directory; it is the default working directory of the
+# launchers below. On Windows USERPROFILE is used because HOMEPATH holds the path
+# without its drive letter (it is only complete together with HOMEDRIVE).
+if( WIN32 )
+  file( TO_CMAKE_PATH "$ENV{USERPROFILE}" home )
+else()
+  file( TO_CMAKE_PATH "$ENV{HOME}" home )
+endif()
+
+# RUN_MATLAB: utility target that depends on both MEX targets and carries a launcher
+# starting MATLAB with the Cpufit and Gpufit MATLAB folders on its path. The target is
+# only created while Matlab_WORKING_DIRECTORY is set to a non-empty value.
+if( Matlab_FOUND )
+  set( Matlab_WORKING_DIRECTORY "${home}" CACHE PATH "MATLAB working directory" )
+  if( Matlab_WORKING_DIRECTORY )
+    add_custom_target( RUN_MATLAB )
+    set_target_properties( RUN_MATLAB PROPERTIES FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_MATLAB
+      CpufitMex
+      GpufitMex
+    )
+    add_matlab_launcher( RUN_MATLAB
+      "${CMAKE_SOURCE_DIR}/Cpufit/matlab"
+      "${CMAKE_SOURCE_DIR}/Gpufit/matlab"
+      "${Matlab_WORKING_DIRECTORY}"
+    )
+  endif()
+endif()
+
+# RUN_PYTHON: utility target that depends on Gpufit and carries a launcher starting
+# Python with the Gpufit Python folder on sys.path. The target is only created while
+# Python_WORKING_DIRECTORY is set to a non-empty value.
+if( PYTHONINTERP_FOUND )
+  set( Python_WORKING_DIRECTORY "${home}" CACHE PATH "Python working directory" )
+  if( Python_WORKING_DIRECTORY )
+    add_custom_target( RUN_PYTHON )
+    set_target_properties( RUN_PYTHON PROPERTIES FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_PYTHON Gpufit )
+    add_python_launcher( RUN_PYTHON
+      "${CMAKE_SOURCE_DIR}/Gpufit/python"
+      "${Python_WORKING_DIRECTORY}"
+    )
+  endif()
+endif()
+
+# Tests
+
+# Only descend into the test suite when testing is enabled (BUILD_TESTING is
+# switched off above when Boost is missing).
+# NOTE(review): this uses the directory name "tests" while add_boost_test refers to
+# ${PROJECT_SOURCE_DIR}/Tests/utils.* - confirm the spelling on case-sensitive file systems.
+if( BUILD_TESTING )
+ add_subdirectory( tests )
+endif()
+
+# Package
+
+#set( CPACK_PACKAGE_VERSION ${PROJECT_VERSION} )
+#set( CPACK_GENERATOR ZIP )
+
+#include( CPack )
diff --git a/Cpufit/CMakeLists.txt b/Cpufit/CMakeLists.txt
new file mode 100644
index 0000000..9af1643
--- /dev/null
+++ b/Cpufit/CMakeLists.txt
@@ -0,0 +1,29 @@
+
+# Cpufit
+
+# Header and source files of the Cpufit shared library. The file names must match
+# the files on disk exactly (cpufit.h / cpufit.cpp are lower case), otherwise the
+# configuration fails on case-sensitive file systems.
+set( CpuHeaders
+  cpufit.h
+  info.h
+  lm_fit.h
+  interface.h
+)
+
+set( CpuSources
+  cpufit.cpp
+  info.cpp
+  lm_fit.cpp
+  lm_fit_cpp.cpp
+  interface.cpp
+  Cpufit.def
+)
+
+# Cpufit.def is the module-definition file listing the exported functions.
+add_library( Cpufit SHARED
+  ${CpuHeaders}
+  ${CpuSources}
+)
+# Place the runtime binary directly in the project's binary directory.
+set_property( TARGET Cpufit
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Cpufit RUNTIME DESTINATION bin )
+
+add_subdirectory( matlab )
diff --git a/Cpufit/Cpufit.def b/Cpufit/Cpufit.def
new file mode 100644
index 0000000..07c1849
--- /dev/null
+++ b/Cpufit/Cpufit.def
@@ -0,0 +1,4 @@
+LIBRARY "Cpufit"
+EXPORTS
+ cpufit @1
+ cpufit_get_last_error @2
\ No newline at end of file
diff --git a/Cpufit/README.md b/Cpufit/README.md
new file mode 100644
index 0000000..cee0619
--- /dev/null
+++ b/Cpufit/README.md
@@ -0,0 +1 @@
+# Cpufit
\ No newline at end of file
diff --git a/Cpufit/cpufit.cpp b/Cpufit/cpufit.cpp
new file mode 100644
index 0000000..c8c74cb
--- /dev/null
+++ b/Cpufit/cpufit.cpp
@@ -0,0 +1,76 @@
+#include "cpufit.h"
+#include "interface.h"
+
+#include
+
+std::string last_error ;
+
+// C entry point of the library: runs all fits on the CPU.
+//
+// The arguments are forwarded unchanged to FitInterface, which performs the fits
+// for the model selected by model_id. Any exception raised on the way is caught
+// here; its message is stored and can be read with cpufit_get_last_error().
+//
+// Returns STATUS_OK on success and STATUS_ERROR if an exception was caught.
+int cpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+try
+{
+    // FitInterface takes the number of points as int, so larger values are rejected.
+    // Plain int replaces the MSVC-specific __int32 type.
+    if (n_points > static_cast<size_t>(std::numeric_limits<int>::max()))
+    {
+        throw std::runtime_error("maximum number of data points per fit exceeded");
+    }
+    int const n_points_32 = static_cast<int>(n_points);
+
+    FitInterface fi(
+        data,
+        weights,
+        n_fits,
+        n_points_32,
+        tolerance,
+        max_n_iterations,
+        estimator_id,
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK;
+}
+catch (std::exception const & exception)
+{
+    last_error = exception.what();
+
+    return STATUS_ERROR;
+}
+catch (...)
+{
+    last_error = "Unknown Error";
+
+    return STATUS_ERROR;
+}
+
+// Returns the message recorded by the most recent failed cpufit() call.
+// The pointer refers to the internal last_error string, so it is only valid
+// until that string is modified again.
+char const * cpufit_get_last_error()
+{
+ return last_error.c_str();
+}
diff --git a/Cpufit/cpufit.h b/Cpufit/cpufit.h
new file mode 100644
index 0000000..1575636
--- /dev/null
+++ b/Cpufit/cpufit.h
@@ -0,0 +1,56 @@
+#ifndef CPU_FIT_H_INCLUDED
+#define CPU_FIT_H_INCLUDED
+
+// fitting model ID
+#define GAUSS_1D 0
+#define GAUSS_2D 1
+#define GAUSS_2D_ELLIPTIC 2
+#define GAUSS_2D_ROTATED 3
+#define CAUCHY_2D_ELLIPTIC 4
+#define LINEAR_1D 5
+
+// estimator ID
+#define LSE 0
+#define MLE 1
+
+// fit state
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+
+// cpufit return state
+#define STATUS_OK 0
+#define STATUS_ERROR -1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Runs n_fits independent fits with n_points data values each.
+//
+//   n_fits, n_points    number of fits and number of data values per fit
+//   data                input values, n_points consecutive floats per fit
+//   weights             optional (may be a null pointer); laid out like data
+//   model_id            one of the "fitting model ID" constants above
+//   initial_parameters  start values, one consecutive set of model parameters per fit
+//   tolerance           passed on to the fit routine (presumably the convergence
+//                       threshold - TODO confirm)
+//   max_n_iterations    iteration limit stored for each fit
+//   parameters_to_fit   one flag per model parameter, shared by all fits;
+//                       a zero entry excludes that parameter from the fit
+//   estimator_id        LSE or MLE
+//   user_info_size      size of user_info in bytes
+//   user_info           optional extra data (may be a null pointer); LINEAR_1D reads
+//                       it as an array of float x coordinates
+//   output_parameters   receives one set of model parameters per fit
+//   output_states, output_chi_squares, output_n_iterations
+//                       receive one entry per fit
+//
+// Returns STATUS_OK, or STATUS_ERROR if an error occurred; the error text is
+// then available from cpufit_get_last_error().
+int cpufit
+(
+ size_t n_fits,
+ size_t n_points,
+ float * data,
+ float * weights,
+ int model_id,
+ float * initial_parameters,
+ float tolerance,
+ int max_n_iterations,
+ int * parameters_to_fit,
+ int estimator_id,
+ size_t user_info_size,
+ char * user_info,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations
+) ;
+
+// Returns the message of the last error recorded by cpufit().
+char const * cpufit_get_last_error() ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CPU_FIT_H_INCLUDED
diff --git a/Cpufit/info.cpp b/Cpufit/info.cpp
new file mode 100644
index 0000000..dcd5085
--- /dev/null
+++ b/Cpufit/info.cpp
@@ -0,0 +1,30 @@
+#include "info.h"
+
+// Creates an Info object with every field set to zero.
+// The initialisers are listed in the order in which the members are declared.
+Info::Info() :
+    n_parameters_(0),
+    n_parameters_to_fit_(0),
+    n_fits_(0),
+    n_points_(0),
+    max_n_iterations_(0),
+    model_id_(0),
+    estimator_id_(0),
+    user_info_size_(0)
+{}
+
+// Nothing to release.
+Info::~Info()
+{}
+
+// Derives n_parameters_to_fit_ from the flag array: the result is n_parameters_
+// minus the number of zero entries among the first n_parameters_ flags.
+// n_parameters_ has to be set before this is called.
+void Info::set_number_of_parameters_to_fit(int const * parameters_to_fit)
+{
+    int n_fixed = 0;
+
+    for (int index = 0; index < n_parameters_; ++index)
+    {
+        if (parameters_to_fit[index] == 0)
+        {
+            ++n_fixed;
+        }
+    }
+
+    n_parameters_to_fit_ = n_parameters_ - n_fixed;
+}
\ No newline at end of file
diff --git a/Cpufit/info.h b/Cpufit/info.h
new file mode 100644
index 0000000..0faa764
--- /dev/null
+++ b/Cpufit/info.h
@@ -0,0 +1,28 @@
+#ifndef CPUFIT_PARAMETERS_H_INCLUDED
+#define CPUFIT_PARAMETERS_H_INCLUDED
+
+#include
+
+// Plain record describing one fitting job (sizes, model and estimator selection).
+// All members are public and are filled in by the caller after construction.
+class Info
+{
+public:
+ // Sets every member to zero.
+ Info();
+ virtual ~Info();
+ // Computes n_parameters_to_fit_ from n_parameters_ and the flag array
+ // (one int per parameter, zero meaning "do not fit").
+ void set_number_of_parameters_to_fit(int const * parameters_to_fit);
+
+private:
+
+public:
+ int n_parameters_; // number of parameters of the selected model
+ int n_parameters_to_fit_; // number of parameters whose flag is non-zero
+ std::size_t n_fits_; // number of fits
+ std::size_t n_points_; // number of data points per fit
+ int max_n_iterations_; // iteration limit
+ int model_id_; // fitting model ID
+ int estimator_id_; // estimator ID
+ std::size_t user_info_size_; // size of the user info buffer in bytes
+
+private:
+};
+
+#endif
diff --git a/Cpufit/interface.cpp b/Cpufit/interface.cpp
new file mode 100644
index 0000000..50dc01d
--- /dev/null
+++ b/Cpufit/interface.cpp
@@ -0,0 +1,118 @@
+#include "cpufit.h"
+#include "interface.h"
+
+// Stores the caller's buffers and settings; no work is done until fit() is called.
+// n_parameters_ starts at 0 and is set by fit() once the model is known.
+// The initialisers are listed in the order in which the members are declared.
+FitInterface::FitInterface(
+    float const * data,
+    float const * weights,
+    std::size_t n_fits,
+    int n_points,
+    float tolerance,
+    int max_n_iterations,
+    int estimator_id,
+    float const * initial_parameters,
+    int const * parameters_to_fit,
+    char * user_info,
+    std::size_t user_info_size,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations)
+    : n_parameters_(0)
+    , data_(data)
+    , weight_(weights)
+    , n_fits_(n_fits)
+    , n_points_(n_points)
+    , tolerance_(tolerance)
+    , max_n_iterations_(max_n_iterations)
+    , estimator_id_(estimator_id)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , user_info_(user_info)
+    , user_info_size_(user_info_size)
+    , output_parameters_(output_parameters)
+    , output_states_(output_states)
+    , output_chi_squares_(output_chi_squares)
+    , output_n_iterations_(output_n_iterations)
+{
+}
+
+// The interface owns none of the buffers, so there is nothing to release.
+FitInterface::~FitInterface()
+{
+}
+
+// Verifies that the byte sizes of the data array (n_fits * n_points floats) and of
+// the parameter array (n_fits * n_parameters floats) are representable in std::size_t.
+//
+// Throws std::runtime_error if a size is too large, or if n_points_ or
+// n_parameters_ is not positive. The latter check also keeps the divisions
+// below from dividing by zero.
+void FitInterface::check_sizes()
+{
+    if (n_points_ <= 0)
+    {
+        throw std::runtime_error("number of data points per fit must be positive");
+    }
+
+    if (n_parameters_ <= 0)
+    {
+        throw std::runtime_error("number of model parameters must be positive");
+    }
+
+    std::size_t const maximum_size = std::numeric_limits< std::size_t >::max();
+
+    if (n_fits_ > maximum_size / static_cast< std::size_t >(n_points_) / sizeof(float))
+    {
+        throw std::runtime_error("maximum absolute number of data points exceeded");
+    }
+
+    if (n_fits_ > maximum_size / static_cast< std::size_t >(n_parameters_) / sizeof(float))
+    {
+        throw std::runtime_error("maximum number of fits and/or parameters exceeded");
+    }
+}
+
+// Copies the job description held by this interface into info and lets info
+// count the parameters that are actually fitted.
+void FitInterface::configure_info(Info & info, int const model_id)
+{
+    // sizes
+    info.n_parameters_ = n_parameters_;
+    info.n_fits_ = n_fits_;
+    info.n_points_ = n_points_;
+    info.user_info_size_ = user_info_size_;
+
+    // fit settings
+    info.max_n_iterations_ = max_n_iterations_;
+    info.model_id_ = model_id;
+    info.estimator_id_ = estimator_id_;
+
+    // requires info.n_parameters_, which was assigned above
+    info.set_number_of_parameters_to_fit(parameters_to_fit_);
+}
+
+// Sets n_parameters_ to the number of parameters of the model selected by model_id.
+//
+// Throws std::runtime_error for an unknown model ID. Without this, n_parameters_
+// would silently stay at its previous value (0 after construction) and the
+// following size check would divide by it.
+void FitInterface::set_number_of_parameters(int const model_id)
+{
+    switch (model_id)
+    {
+    case GAUSS_1D:
+        n_parameters_ = 4;
+        break;
+    case GAUSS_2D:
+        n_parameters_ = 5;
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case GAUSS_2D_ROTATED:
+        n_parameters_ = 7;
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case LINEAR_1D:
+        n_parameters_ = 2;
+        break;
+    default:
+        throw std::runtime_error("unknown model ID");
+    }
+}
+
+// Runs all fits for the given model: determines the parameter count, validates
+// the array sizes, assembles the Info record and hands everything to LMFit.
+void FitInterface::fit(int const model_id)
+{
+    // The order matters: the size check and the Info record both need n_parameters_.
+    set_number_of_parameters(model_id);
+    check_sizes();
+
+    Info fit_info;
+    configure_info(fit_info, model_id);
+
+    LMFit fitter(
+        data_, weight_, fit_info,
+        initial_parameters_, parameters_to_fit_, user_info_,
+        output_parameters_, output_states_, output_chi_squares_, output_n_iterations_);
+
+    fitter.run(tolerance_);
+}
diff --git a/Cpufit/interface.h b/Cpufit/interface.h
new file mode 100644
index 0000000..09bdc11
--- /dev/null
+++ b/Cpufit/interface.h
@@ -0,0 +1,57 @@
+#ifndef CPUFIT_INTERFACE_H_INCLUDED
+#define CPUFIT_INTERFACE_H_INCLUDED
+
+#include "lm_fit.h"
+
+// Bundles the arguments of one cpufit() call and drives the fitting run.
+// The object keeps only pointers to the caller's buffers; it copies no data.
+class FitInterface
+{
+public:
+ // Stores all arguments; n_points is the number of data points per fit.
+ FitInterface(
+ float const * data,
+ float const * weights,
+ std::size_t n_fits,
+ int n_points,
+ float tolerance,
+ int max_n_iterations,
+ int estimator_id,
+ float const * initial_parameters,
+ int const * parameters_to_fit,
+ char * user_info,
+ std::size_t user_info_size,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations);
+
+ virtual ~FitInterface();
+
+ // Performs all fits using the model identified by model_id.
+ void fit(int const model_id);
+
+private:
+ // Sets n_parameters_ according to the model.
+ void set_number_of_parameters(int const model_id);
+ // Throws std::runtime_error if the array sizes cannot be represented.
+ void check_sizes();
+ // Fills info with the settings stored in this object.
+ void configure_info(Info & info, int const model_id);
+
+public:
+
+private:
+ int n_parameters_; // parameter count of the model, set by fit()
+ float const * const data_;
+ float const * const weight_;
+ std::size_t const n_fits_;
+ int const n_points_;
+ float const tolerance_;
+ int const max_n_iterations_;
+ int const estimator_id_;
+ float const * const initial_parameters_;
+ int const * const parameters_to_fit_;
+ char * const user_info_;
+ std::size_t const user_info_size_; // size of user_info_ in bytes
+
+ // output buffers provided by the caller
+ float * output_parameters_;
+ int * output_states_;
+ float * output_chi_squares_;
+ int * output_n_iterations_;
+};
+
+#endif
diff --git a/Cpufit/lm_fit.cpp b/Cpufit/lm_fit.cpp
new file mode 100644
index 0000000..e6fa64f
--- /dev/null
+++ b/Cpufit/lm_fit.cpp
@@ -0,0 +1,57 @@
+#include "lm_fit.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Stores the input and output buffers together with a reference to info.
+// Only the reference is kept, so info has to outlive this object.
+LMFit::LMFit(
+ float const * const data,
+ float const * const weights,
+ Info const & info,
+ float const * const initial_parameters,
+ int const * const parameters_to_fit,
+ char * const user_info,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations
+ ) :
+ data_(data),
+ weights_(weights),
+ initial_parameters_(initial_parameters),
+ parameters_to_fit_(parameters_to_fit),
+ user_info_(user_info),
+ output_parameters_(output_parameters),
+ output_states_(output_states),
+ output_chi_squares_(output_chi_squares),
+ output_n_iterations_(output_n_iterations),
+ info_(info)
+{}
+
+// Nothing to release; all buffers belong to the caller.
+LMFit::~LMFit()
+{
+}
+
+// Performs the fits one after another. For each fit the matching slices of the
+// input and output arrays are handed to a fresh LMFitCPP object, which is then run.
+void LMFit::run(float const tolerance)
+{
+    std::size_t const n_fits = info_.n_fits_;
+
+    for (std::size_t i = 0; i < n_fits; ++i)
+    {
+        // start of this fit's slice in the per-point and per-parameter arrays
+        std::size_t const point_offset = i * info_.n_points_;
+        std::size_t const parameter_offset = i * info_.n_parameters_;
+
+        // weights are optional; a null pointer is passed through unchanged
+        float const * const fit_weights = weights_ ? weights_ + point_offset : 0;
+
+        LMFitCPP single_fit(
+            tolerance,
+            i,
+            data_ + point_offset,
+            fit_weights,
+            info_,
+            initial_parameters_ + parameter_offset,
+            parameters_to_fit_,
+            user_info_,
+            output_parameters_ + parameter_offset,
+            output_states_ + i,
+            output_chi_squares_ + i,
+            output_n_iterations_ + i);
+
+        single_fit.run();
+    }
+}
\ No newline at end of file
diff --git a/Cpufit/lm_fit.h b/Cpufit/lm_fit.h
new file mode 100644
index 0000000..a5fd96d
--- /dev/null
+++ b/Cpufit/lm_fit.h
@@ -0,0 +1,137 @@
+#ifndef CPUFIT_GAUSS_FIT_H_INCLUDED
+#define CPUFIT_GAUSS_FIT_H_INCLUDED
+
+#include "info.h"
+
+class LMFitCPP;
+
+// Runs a batch of fits on the CPU by processing the fits one at a time.
+// Holds pointers to the caller's buffers and a reference to the job description.
+class LMFit
+{
+public:
+ // info is stored by reference and must outlive this object.
+ LMFit(
+ float const * data,
+ float const * weights,
+ Info const& info,
+ float const * initial_parameters,
+ int const * parameters_to_fit,
+ char * user_info,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations);
+
+ virtual ~LMFit();
+
+ // Performs all fits described by info, passing tolerance on to every single fit.
+ void run(float const tolerance);
+
+private:
+ // input buffers
+ float const * const data_;
+ float const * const weights_; // may be a null pointer
+ float const * const initial_parameters_;
+ int const * const parameters_to_fit_;
+ char * const user_info_;
+
+ // output buffers
+ float * output_parameters_;
+ int * output_states_;
+ float * output_chi_squares_;
+ int * output_n_iterations_;
+
+ Info const & info_;
+};
+
+// Performs a single fit on the CPU. The object works directly on the slices of
+// the caller's arrays that belong to this fit.
+// All numeric work buffers are std::vector<float>.
+class LMFitCPP
+{
+public:
+    // data, weight, initial_parameters and the output pointers refer to the entries
+    // of this fit only; info is stored by reference and must outlive this object.
+    LMFitCPP(
+        float const tolerance,
+        std::size_t const fit_index,
+        float const * data,
+        float const * weight,
+        Info const & info,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations);
+
+    virtual ~LMFitCPP()
+    {};
+
+    // Executes the fit.
+    void run();
+
+private:
+    void calc_curve_values();
+    void calc_coefficients();
+
+    // Evaluates the selected model and its partial derivatives for the current parameters.
+    void calc_curve_values(std::vector<float>& curve, std::vector<float>& derivatives);
+
+    // Model values and partial derivatives, one pair of functions per model.
+    // The derivative arrays are parameter-major: entry [p * n_points + point].
+    void calc_values_gauss2d(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2d(std::vector<float> & derivatives);
+
+    void calc_values_gauss2delliptic(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2delliptic(std::vector<float> & derivatives);
+
+    void calc_values_gauss2drotated(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2drotated(std::vector<float> & derivatives);
+
+    void calc_values_gauss1d(std::vector<float>& gaussian);
+    void calc_derivatives_gauss1d(std::vector<float> & derivatives);
+
+    void calc_values_cauchy2delliptic(std::vector<float>& cauchy);
+    void calc_derivatives_cauchy2delliptic(std::vector<float> & derivatives);
+
+    void calc_values_linear1d(std::vector<float>& line);
+    void calc_derivatives_linear1d(std::vector<float> & derivatives);
+
+    // Fills hessian_ for the parameters that are fitted.
+    void calculate_hessian(std::vector<float> const & derivatives,
+        std::vector<float> const & curve);
+
+    // Fills gradient_ for the parameters that are fitted.
+    void calc_gradient(std::vector<float> const & derivatives,
+        std::vector<float> const & curve);
+
+    void calc_chi_square(
+        std::vector<float> const & curve);
+
+    void modify_step_width();
+    void gauss_jordan();
+    void update_parameters();
+
+    bool check_for_convergence();
+    void evaluate_iteration(int const iteration);
+    void prepare_next_iteration();
+
+public:
+
+private:
+
+    std::size_t const fit_index_;
+    float const * const data_;
+    float const * const weight_;            // may be a null pointer
+    float const * const initial_parameters_;
+    int const * const parameters_to_fit_;
+
+    bool converged_;
+    float * parameters_;                    // output parameters of this fit
+    int * state_;
+    float * chi_square_;
+    int * n_iterations_;
+
+    std::vector<float> prev_parameters_;
+    Info const & info_;
+
+    float lambda_;
+    std::vector<float> curve_;
+    std::vector<float> derivatives_;
+    std::vector<float> hessian_;
+    std::vector<float> modified_hessian_;
+    std::vector<float> gradient_;
+    std::vector<float> delta_;
+    float prev_chi_square_;
+    float const tolerance_;
+
+    char * const user_info_;
+};
+
+#endif
\ No newline at end of file
diff --git a/Cpufit/lm_fit_cpp.cpp b/Cpufit/lm_fit_cpp.cpp
new file mode 100644
index 0000000..7eaae9d
--- /dev/null
+++ b/Cpufit/lm_fit_cpp.cpp
@@ -0,0 +1,711 @@
+#include "cpufit.h"
+#include "lm_fit.h"
+
+#include
+#include
+#include
+
+// Sets up the state of a single fit. The work buffers are sized from info:
+// one value per point for the curve, n_points * n_parameters derivatives, and
+// n_parameters_to_fit (squared for the matrices) entries for the solver buffers.
+// The damping factor lambda_ starts at 0.001.
+// The initialisers are listed in the order in which the members are declared;
+// every one of them reads constructor arguments only.
+LMFitCPP::LMFitCPP(
+    float const tolerance,
+    std::size_t const fit_index,
+    float const * data,
+    float const * weight,
+    Info const & info,
+    float const * initial_parameters,
+    int const * parameters_to_fit,
+    char * user_info,
+    float * output_parameters,
+    int * output_state,
+    float * output_chi_square,
+    int * output_n_iterations
+    )
+    : fit_index_(fit_index)
+    , data_(data)
+    , weight_(weight)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , converged_(false)
+    , parameters_(output_parameters)
+    , state_(output_state)
+    , chi_square_(output_chi_square)
+    , n_iterations_(output_n_iterations)
+    , prev_parameters_(info.n_parameters_to_fit_)
+    , info_(info)
+    , lambda_(0.001f)
+    , curve_(info.n_points_)
+    , derivatives_(info.n_points_*info.n_parameters_)
+    , hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_)
+    , modified_hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_)
+    , gradient_(info.n_parameters_to_fit_)
+    , delta_(info.n_parameters_to_fit_)
+    , prev_chi_square_(0)
+    , tolerance_(tolerance)
+    , user_info_(user_info)
+{
+}
+
+// Partial derivatives of the symmetric 2D Gaussian
+//   p0 * exp(-((x - p1)^2 + (y - p2)^2) / (2 * p3^2)) + p4
+// with respect to p0..p4, evaluated on a square grid whose side length is
+// sqrt(n_points). derivatives is parameter-major: [p * n_points + y * size + x].
+void LMFitCPP::calc_derivatives_gauss2d(
+    std::vector<float> & derivatives)
+{
+    std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_));
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            std::size_t const pixel = y*fit_size_x + x;
+
+            float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[3] * parameters_[3]);
+            float const ex = exp(-(argx + argy));
+
+            // amplitude
+            derivatives[0 * info_.n_points_ + pixel]
+                = ex;
+            // centre x, centre y
+            derivatives[1 * info_.n_points_ + pixel]
+                = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]);
+            derivatives[2 * info_.n_points_ + pixel]
+                = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[3] * parameters_[3]);
+            // width
+            derivatives[3 * info_.n_points_ + pixel]
+                = (parameters_[0]
+                * ((x - parameters_[1])*(x - parameters_[1])
+                + (y - parameters_[2])*(y - parameters_[2]))*ex)
+                / (parameters_[3] * parameters_[3] * parameters_[3]);
+            // offset
+            derivatives[4 * info_.n_points_ + pixel]
+                = 1;
+        }
+}
+
+// Partial derivatives of the elliptic 2D Gaussian
+//   p0 * exp(-(x - p1)^2 / (2 * p3^2) - (y - p2)^2 / (2 * p4^2)) + p5
+// with respect to p0..p5, evaluated on a square grid whose side length is
+// sqrt(n_points). derivatives is parameter-major: [p * n_points + y * size + x].
+void LMFitCPP::calc_derivatives_gauss2delliptic(
+    std::vector<float> & derivatives)
+{
+    std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_));
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            std::size_t const pixel = y*fit_size_x + x;
+
+            float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[4] * parameters_[4]);
+            float const ex = exp(-(argx +argy));
+
+            // amplitude
+            derivatives[0 * info_.n_points_ + pixel]
+                = ex;
+            // centre x, centre y
+            derivatives[1 * info_.n_points_ + pixel]
+                = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]);
+            derivatives[2 * info_.n_points_ + pixel]
+                = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[4] * parameters_[4]);
+            // width x, width y
+            derivatives[3 * info_.n_points_ + pixel]
+                = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[3] * parameters_[3] * parameters_[3]);
+            derivatives[4 * info_.n_points_ + pixel]
+                = (parameters_[0] * (y - parameters_[2])*(y - parameters_[2])*ex) / (parameters_[4] * parameters_[4] * parameters_[4]);
+            // offset
+            derivatives[5 * info_.n_points_ + pixel]
+                = 1;
+        }
+}
+
+// Partial derivatives of the rotated elliptic 2D Gaussian with respect to its
+// seven parameters (amplitude, x0, y0, sigma x, sigma y, background, rotation
+// angle), evaluated on a square grid whose side length is sqrt(n_points).
+// derivatives is parameter-major: [p * n_points + y * size + x].
+void LMFitCPP::calc_derivatives_gauss2drotated(
+    std::vector<float> & derivatives)
+{
+    std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_));
+
+    float const amplitude = parameters_[0];
+    float const x0 = parameters_[1];
+    float const y0 = parameters_[2];
+    float const sig_x = parameters_[3];
+    float const sig_y = parameters_[4];
+    // parameters_[5] (background) does not enter any derivative expression
+    float const rot_sin = sin(parameters_[6]);
+    float const rot_cos = cos(parameters_[6]);
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            std::size_t const pixel = y*fit_size_x + x;
+
+            // coordinates relative to the centre, rotated by the angle
+            float const arga = ((x - x0) * rot_cos) - ((y - y0) * rot_sin);
+            float const argb = ((x - x0) * rot_sin) + ((y - y0) * rot_cos);
+            float const ex = exp((-0.5f) * (((arga / sig_x) * (arga / sig_x)) + ((argb / sig_y) * (argb / sig_y))));
+
+            derivatives[0 * info_.n_points_ + pixel]
+                = ex;
+            derivatives[1 * info_.n_points_ + pixel]
+                = ex * (amplitude * rot_cos * arga / (sig_x*sig_x) + amplitude * rot_sin *argb / (sig_y*sig_y));
+            derivatives[2 * info_.n_points_ + pixel]
+                = ex * (-amplitude * rot_sin * arga / (sig_x*sig_x) + amplitude * rot_cos *argb / (sig_y*sig_y));
+            derivatives[3 * info_.n_points_ + pixel]
+                = ex * amplitude * arga * arga / (sig_x*sig_x*sig_x);
+            derivatives[4 * info_.n_points_ + pixel]
+                = ex * amplitude * argb * argb / (sig_y*sig_y*sig_y);
+            derivatives[5 * info_.n_points_ + pixel]
+                = 1;
+            derivatives[6 * info_.n_points_ + pixel]
+                = ex * amplitude * arga * argb * (1.0f / (sig_x*sig_x) - 1.0f / (sig_y*sig_y));
+        }
+}
+
+// Partial derivatives of the 1D Gaussian
+//   p0 * exp(-(x - p1)^2 / (2 * p2^2)) + p3
+// with respect to p0..p3; the point index is used as x coordinate.
+// derivatives is parameter-major: [p * n_points + x].
+void LMFitCPP::calc_derivatives_gauss1d(
+    std::vector<float> & derivatives)
+{
+    for (std::size_t x = 0; x < info_.n_points_; x++)
+    {
+        float argx = ((x - parameters_[1])*(x - parameters_[1])) / (2 * parameters_[2] * parameters_[2]);
+        float ex = exp(-argx);
+
+        derivatives[0 * info_.n_points_ + x] = ex;
+        derivatives[1 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[2] * parameters_[2]);
+        derivatives[2 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[2] * parameters_[2] * parameters_[2]);
+        derivatives[3 * info_.n_points_ + x] = 1;
+    }
+}
+
+// Partial derivatives of the elliptic 2D Cauchy function
+//   p0 / ((((p1 - x) / p3)^2 + 1) * (((p2 - y) / p4)^2 + 1)) + p5
+// with respect to p0..p5, evaluated on a square grid whose side length is
+// sqrt(n_points). derivatives is parameter-major: [p * n_points + y * size + x].
+void LMFitCPP::calc_derivatives_cauchy2delliptic(
+    std::vector<float> & derivatives)
+{
+    std::size_t const fit_size_x = std::size_t(std::sqrt(info_.n_points_));
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            std::size_t const pixel = y*fit_size_x + x;
+
+            float const argx =
+                ((parameters_[1] - x) / parameters_[3])
+                *((parameters_[1] - x) / parameters_[3]) + 1.f;
+            float const argy =
+                ((parameters_[2] - y) / parameters_[4])
+                *((parameters_[2] - y) / parameters_[4]) + 1.f;
+
+            derivatives[0 * info_.n_points_ + pixel]
+                = 1.f / (argx*argy);
+            derivatives[1 * info_.n_points_ + pixel] =
+                -2.f * parameters_[0] * (parameters_[1] - x)
+                / (parameters_[3] * parameters_[3] * argx*argx*argy);
+            derivatives[2 * info_.n_points_ + pixel] =
+                -2.f * parameters_[0] * (parameters_[2] - y)
+                / (parameters_[4] * parameters_[4] * argy*argy*argx);
+            derivatives[3 * info_.n_points_ + pixel] =
+                2.f * parameters_[0] * (parameters_[1] - x) * (parameters_[1] - x)
+                / (parameters_[3] * parameters_[3] * parameters_[3] * argx*argx*argy);
+            derivatives[4 * info_.n_points_ + pixel] =
+                2.f * parameters_[0] * (parameters_[2] - y) * (parameters_[2] - y)
+                / (parameters_[4] * parameters_[4] * parameters_[4] * argy*argy*argx);
+            derivatives[5 * info_.n_points_ + pixel]
+                = 1.f;
+        }
+}
+
+// Partial derivatives of the line p0 + p1 * x: 1 with respect to p0 and x with
+// respect to p1. derivatives is parameter-major: [p * n_points + point].
+//
+// The x coordinates come from user_info_, read as an array of floats:
+//   - null pointer: the point index is used as x;
+//   - exactly n_points floats: the same coordinates are used for every fit;
+//   - more than n_points floats: the slice starting at fit_index_ * n_points is used.
+// NOTE(review): when fewer than n_points floats are supplied none of the branches
+// applies and x keeps its previous value (0 at the start) - confirm this is intended.
+void LMFitCPP::calc_derivatives_linear1d(
+    std::vector<float> & derivatives)
+{
+    float * user_info_float = (float*)user_info_;
+    float x = 0.f;
+
+    for (std::size_t point_index = 0; point_index < info_.n_points_; point_index++)
+    {
+        if (!user_info_float)
+        {
+            x = float(point_index);
+        }
+        else if (info_.user_info_size_ / sizeof(float) == info_.n_points_)
+        {
+            x = user_info_float[point_index];
+        }
+        else if (info_.user_info_size_ / sizeof(float) > info_.n_points_)
+        {
+            std::size_t const fit_begin = fit_index_ * info_.n_points_;
+            x = user_info_float[fit_begin + point_index];
+        }
+
+        derivatives[0 * info_.n_points_ + point_index] = 1.f;
+        derivatives[1 * info_.n_points_ + point_index] = x;
+    }
+}
+
+// Evaluates the elliptic 2D Cauchy function
+//   p0 / ((((p1 - x) / p3)^2 + 1) * (((p2 - y) / p4)^2 + 1)) + p5
+// on a square grid whose side length is sqrt(n_points); results are stored row by row.
+void LMFitCPP::calc_values_cauchy2delliptic(std::vector<float>& cauchy)
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_)));
+    int const size_y = size_x;
+
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float const argx =
+                ((parameters_[1] - ix) / parameters_[3])
+                *((parameters_[1] - ix) / parameters_[3]) + 1.f;
+            float const argy =
+                ((parameters_[2] - iy) / parameters_[4])
+                *((parameters_[2] - iy) / parameters_[4]) + 1.f;
+
+            cauchy[iy*size_x + ix] = parameters_[0] / (argx * argy) + parameters_[5];
+        }
+    }
+}
+
+// Evaluates the symmetric 2D Gaussian
+//   p0 * exp(-((x - p1)^2 + (y - p2)^2) / (2 * p3^2)) + p4
+// on a square grid whose side length is sqrt(n_points); results are stored row by row.
+void LMFitCPP::calc_values_gauss2d(std::vector<float>& gaussian)
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_)));
+    int const size_y = size_x;
+
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[3] * parameters_[3]);
+            float ex = exp(-(argx +argy));
+
+            gaussian[iy*size_x + ix] = parameters_[0] * ex + parameters_[4];
+        }
+    }
+}
+
+// Evaluates the elliptic 2D Gaussian
+//   p0 * exp(-(x - p1)^2 / (2 * p3^2) - (y - p2)^2 / (2 * p4^2)) + p5
+// on a square grid whose side length is sqrt(n_points); results are stored row by row.
+void LMFitCPP::calc_values_gauss2delliptic(std::vector<float>& gaussian)
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_)));
+    int const size_y = size_x;
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[4] * parameters_[4]);
+            float ex = exp(-(argx + argy));
+
+            gaussian[iy*size_x + ix]
+                = parameters_[0] * ex + parameters_[5];
+        }
+    }
+}
+
+// Evaluates the rotated elliptic 2D Gaussian (amplitude, x0, y0, sigma x, sigma y,
+// background, rotation angle) on a square grid whose side length is sqrt(n_points);
+// results are stored row by row.
+void LMFitCPP::calc_values_gauss2drotated(std::vector<float>& gaussian)
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_)));
+    int const size_y = size_x;
+
+    float amplitude = parameters_[0];
+    float background = parameters_[5];
+    float x0 = parameters_[1];
+    float y0 = parameters_[2];
+    float sig_x = parameters_[3];
+    float sig_y = parameters_[4];
+    float rot_sin = sin(parameters_[6]);
+    float rot_cos = cos(parameters_[6]);
+
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            int const pixel_index = iy*size_x + ix;
+
+            // coordinates relative to the centre, rotated by the angle
+            float arga = ((ix - x0) * rot_cos) - ((iy - y0) * rot_sin);
+            float argb = ((ix - x0) * rot_sin) + ((iy - y0) * rot_cos);
+
+            float ex
+                = exp((-0.5f) * (((arga / sig_x) * (arga / sig_x)) + ((argb / sig_y) * (argb / sig_y))));
+
+            gaussian[pixel_index] = amplitude * ex + background;
+        }
+    }
+}
+
+// Evaluates the 1D Gaussian p0 * exp(-(x - p1)^2 / (2 * p2^2)) + p3 at every
+// point; the point index is used as x coordinate.
+void LMFitCPP::calc_values_gauss1d(std::vector<float>& gaussian)
+{
+    for (std::size_t ix = 0; ix < info_.n_points_; ix++)
+    {
+        float argx
+            = ((ix - parameters_[1])*(ix - parameters_[1]))
+            / (2 * parameters_[2] * parameters_[2]);
+        float ex = exp(-argx);
+        gaussian[ix] = parameters_[0] * ex + parameters_[3];
+    }
+}
+
+// Evaluates the line p0 + p1 * x at every point.
+//
+// The x coordinates come from user_info_, read as an array of floats:
+//   - null pointer: the point index is used as x;
+//   - exactly n_points floats: the same coordinates are used for every fit;
+//   - more than n_points floats: the slice starting at fit_index_ * n_points is used.
+// NOTE(review): when fewer than n_points floats are supplied none of the branches
+// applies and x keeps its previous value (0 at the start) - confirm this is intended.
+void LMFitCPP::calc_values_linear1d(std::vector<float>& line)
+{
+    float * user_info_float = (float*)user_info_;
+    float x = 0.f;
+    for (std::size_t point_index = 0; point_index < info_.n_points_; point_index++)
+    {
+        if (!user_info_float)
+        {
+            x = float(point_index);
+        }
+        else if (info_.user_info_size_ / sizeof(float) == info_.n_points_)
+        {
+            x = user_info_float[point_index];
+        }
+        else if (info_.user_info_size_ / sizeof(float) > info_.n_points_)
+        {
+            std::size_t const fit_begin = fit_index_ * info_.n_points_;
+            x = user_info_float[fit_begin + point_index];
+        }
+        line[point_index] = parameters_[0] + parameters_[1] * x;
+    }
+}
+
+// Evaluates the model selected by info_.model_id_ and its partial derivatives
+// for the current parameters. An unrecognised model ID leaves both vectors untouched.
+void LMFitCPP::calc_curve_values(std::vector<float>& curve, std::vector<float>& derivatives)
+{
+    switch (info_.model_id_)
+    {
+    case GAUSS_1D:
+        calc_values_gauss1d(curve);
+        calc_derivatives_gauss1d(derivatives);
+        break;
+    case GAUSS_2D:
+        calc_values_gauss2d(curve);
+        calc_derivatives_gauss2d(derivatives);
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        calc_values_gauss2delliptic(curve);
+        calc_derivatives_gauss2delliptic(derivatives);
+        break;
+    case GAUSS_2D_ROTATED:
+        calc_values_gauss2drotated(curve);
+        calc_derivatives_gauss2drotated(derivatives);
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        calc_values_cauchy2delliptic(curve);
+        calc_derivatives_cauchy2delliptic(derivatives);
+        break;
+    case LINEAR_1D:
+        calc_values_linear1d(curve);
+        calc_derivatives_linear1d(derivatives);
+        break;
+    default:
+        break;
+    }
+}
+
+// Fills hessian_, a square matrix with one row and column per fitted parameter.
+// Parameters whose flag in parameters_to_fit_ is zero are skipped; ihessian and
+// jhessian count the fitted parameters and index the matrix.
+//
+// For each pair of fitted parameters (i, j) with i <= j the entry is the sum over
+// all points of derivative_i * derivative_j, multiplied by
+//   - the weight of the point (LSE with weights), or
+//   - data / curve^2 (MLE).
+// The sum is accumulated in double precision and mirrored to the symmetric entry.
+void LMFitCPP::calculate_hessian(
+    std::vector<float> const & derivatives,
+    std::vector<float> const & curve)
+{
+    for (int jp = 0, jhessian = 0; jp < info_.n_parameters_; jp++)
+    {
+        if (parameters_to_fit_[jp])
+        {
+            for (int ip = 0, ihessian = 0; ip < jp + 1; ip++)
+            {
+                if (parameters_to_fit_[ip])
+                {
+                    std::size_t const ijhessian
+                        = ihessian * info_.n_parameters_to_fit_ + jhessian;
+                    std::size_t const jihessian
+                        = jhessian * info_.n_parameters_to_fit_ + ihessian;
+                    std::size_t const derivatives_index_i = ip*info_.n_points_;
+                    std::size_t const derivatives_index_j = jp*info_.n_points_;
+
+                    double sum = 0.0;
+                    for (std::size_t pixel_index = 0; pixel_index < info_.n_points_; pixel_index++)
+                    {
+                        if (info_.estimator_id_ == LSE)
+                        {
+                            if (!weight_)
+                            {
+                                sum
+                                    += derivatives[derivatives_index_i + pixel_index]
+                                    * derivatives[derivatives_index_j + pixel_index];
+                            }
+                            else
+                            {
+                                sum
+                                    += derivatives[derivatives_index_i + pixel_index]
+                                    * derivatives[derivatives_index_j + pixel_index]
+                                    * weight_[pixel_index];
+                            }
+                        }
+                        else if (info_.estimator_id_ == MLE)
+                        {
+                            sum
+                                += data_[pixel_index] / (curve[pixel_index] * curve[pixel_index])
+                                * derivatives[derivatives_index_i + pixel_index]
+                                * derivatives[derivatives_index_j + pixel_index];
+                        }
+                    }
+                    hessian_[ijhessian] = float(sum);
+                    if (ijhessian != jihessian)
+                    {
+                        hessian_[jihessian]
+                            = hessian_[ijhessian];
+                    }
+                    ihessian++;
+                }
+            }
+            jhessian++;
+        }
+    }
+
+}
+
+// Fills gradient_ with one entry per fitted parameter (those flagged in
+// parameters_to_fit_). Each entry is accumulated in double precision over all
+// points, using the estimator selected by info_.estimator_id_ and, for LSE,
+// the optional per-point weights.
+//
+//   derivatives: one run of n_points_ values per model parameter
+//   curve:       model values at the current parameters
+void LMFitCPP::calc_gradient(
+    std::vector const & derivatives,
+    std::vector const & curve)
+{
+    int gradient_index = 0;
+    for (int ip = 0; ip < info_.n_parameters_; ip++)
+    {
+        if (!parameters_to_fit_[ip])
+        {
+            continue;
+        }
+
+        std::size_t const offset = ip*info_.n_points_;
+        double sum = 0.0;
+        for (std::size_t point = 0; point < info_.n_points_; point++)
+        {
+            float residual = data_[point] - curve[point];
+
+            if (info_.estimator_id_ == LSE)
+            {
+                if (weight_)
+                {
+                    sum
+                        += residual * derivatives[offset + point] * weight_[point];
+                }
+                else
+                {
+                    sum
+                        += residual * derivatives[offset + point];
+                }
+            }
+            else if (info_.estimator_id_ == MLE)
+            {
+                sum
+                    += -derivatives[offset + point] * (1 - data_[point] / curve[point]);
+            }
+        }
+        gradient_[gradient_index] = float(sum);
+        gradient_index++;
+    }
+}
+
+// Computes the goodness-of-fit measure for the model values in `values` and
+// stores it in *chi_square_.
+//
+// LSE: sum of squared deviations, multiplied by weight_ when weights are set.
+// MLE: sum of 2 * (deviation - data * log(value / data)); the log term is
+//      dropped for points whose data value is zero. A model value <= 0 sets
+//      *state_ to 3 and returns without updating *chi_square_.
+void LMFitCPP::calc_chi_square(
+    std::vector const & values)
+{
+    double sum = 0.0;
+    for (size_t point = 0; point < values.size(); point++)
+    {
+        float deviant = values[point] - data_[point];
+
+        if (info_.estimator_id_ == LSE)
+        {
+            if (weight_)
+            {
+                sum += deviant * deviant * weight_[point];
+            }
+            else
+            {
+                sum += deviant * deviant;
+            }
+            continue;
+        }
+
+        if (info_.estimator_id_ == MLE)
+        {
+            if (values[point] <= 0.f)
+            {
+                // the logarithm below is undefined: flag the fit and stop
+                (*state_) = 3;
+                return;
+            }
+            if (data_[point] == 0.f)
+            {
+                sum += 2 * deviant;
+            }
+            else
+            {
+                sum
+                    += 2 * (deviant - data_[point] * logf(values[point] / data_[point]));
+            }
+        }
+    }
+    *chi_square_ = float(sum);
+}
+
+// Convenience overload: evaluates the model into the member buffers curve_
+// and derivatives_.
+void LMFitCPP::calc_curve_values()
+{
+    calc_curve_values(curve_, derivatives_);
+}
+
+// Recomputes *chi_square_ from curve_. The Hessian and gradient are rebuilt
+// only when chi-square dropped below prev_chi_square_, or when
+// prev_chi_square_ is still zero.
+void LMFitCPP::calc_coefficients()
+{
+    calc_chi_square(curve_);
+
+    bool const improved = (*chi_square_) < prev_chi_square_;
+    bool const no_previous_value = prev_chi_square_ == 0;
+
+    if (improved || no_previous_value)
+    {
+        calculate_hessian(derivatives_, curve_);
+        calc_gradient(derivatives_, curve_);
+    }
+}
+
+// Solves modified_hessian_ * x = gradient_ by Gauss-Jordan elimination and
+// leaves the solution in delta_. modified_hessian_ is overwritten in the
+// process.
+//
+// In every step the pivot is the entry of largest magnitude among the rows
+// and columns that have not been used as a pivot yet. If that search finds
+// no candidate (all remaining entries are NaN) or the chosen pivot is zero,
+// *state_ is set to 2 and the elimination stops.
+void LMFitCPP::gauss_jordan()
+{
+    delta_ = gradient_;
+
+    std::vector & alpha = modified_hessian_;
+    std::vector & beta = delta_;
+
+    // ipiv[k] becomes 1 once row/column k has served as a pivot
+    std::vector ipiv(info_.n_parameters_to_fit_, 0);
+
+    for (int kp = 0; kp < info_.n_parameters_to_fit_; kp++)
+    {
+        // -1 marks "no pivot found"; the original left these uninitialized
+        int icol = -1;
+        int irow = -1;
+        float big = 0.0;
+
+        for (int jp = 0; jp < info_.n_parameters_to_fit_; jp++)
+        {
+            if (ipiv[jp] != 1)
+            {
+                for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++)
+                {
+                    if (ipiv[ip] == 0)
+                    {
+                        if (fabs(alpha[jp*info_.n_parameters_to_fit_ + ip]) >= big)
+                        {
+                            big = fabs(alpha[jp*info_.n_parameters_to_fit_ + ip]);
+                            irow = jp;
+                            icol = ip;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (icol < 0)
+        {
+            // every comparison failed (NaN entries): treat the matrix as singular
+            (*state_) = 2;
+            break;
+        }
+        ++(ipiv[icol]);
+
+        // move the pivot onto the diagonal by swapping rows
+        if (irow != icol)
+        {
+            for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++)
+            {
+                std::swap(alpha[irow*info_.n_parameters_to_fit_ + ip], alpha[icol*info_.n_parameters_to_fit_ + ip]);
+            }
+            std::swap(beta[irow], beta[icol]);
+        }
+
+        if (alpha[icol*info_.n_parameters_to_fit_ + icol] == 0.0)
+        {
+            (*state_) = 2;
+            break;
+        }
+
+        // normalize the pivot row
+        float const pivinv = 1.0f / alpha[icol*info_.n_parameters_to_fit_ + icol];
+        alpha[icol*info_.n_parameters_to_fit_ + icol] = 1.0;
+        for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++)
+        {
+            alpha[icol*info_.n_parameters_to_fit_ + ip] *= pivinv;
+        }
+        beta[icol] *= pivinv;
+
+        // eliminate the pivot column from all other rows
+        for (int jp = 0; jp < info_.n_parameters_to_fit_; jp++)
+        {
+            if (jp != icol)
+            {
+                float const dum = alpha[jp*info_.n_parameters_to_fit_ + icol];
+                alpha[jp*info_.n_parameters_to_fit_ + icol] = 0.0;
+                for (int ip = 0; ip < info_.n_parameters_to_fit_; ip++)
+                {
+                    alpha[jp*info_.n_parameters_to_fit_ + ip] -= alpha[icol*info_.n_parameters_to_fit_ + ip] * dum;
+                }
+                beta[jp] -= beta[icol] * dum;
+            }
+        }
+    }
+}
+
+// Applies the step in delta_ to every fitted parameter. The value before the
+// step is saved in prev_parameters_. delta_ holds one entry per fitted
+// parameter, so it is walked with its own counter.
+void LMFitCPP::update_parameters()
+{
+    int delta_index = 0;
+    for (int parameter_index = 0; parameter_index < info_.n_parameters_; parameter_index++)
+    {
+        if (!parameters_to_fit_[parameter_index])
+        {
+            continue;
+        }
+        prev_parameters_[parameter_index] = parameters_[parameter_index];
+        parameters_[parameter_index] = parameters_[parameter_index] + delta_[delta_index];
+        delta_index++;
+    }
+}
+
+// Returns true when the change in chi-square since the previous accepted
+// value is below max(tolerance_, tolerance_ * |chi-square|).
+bool LMFitCPP::check_for_convergence()
+{
+    return std::abs(*chi_square_ - prev_chi_square_)
+        < std::max(tolerance_, tolerance_ * std::abs(*chi_square_));
+}
+
+// Records the outcome once the fit stops: stores the number of iterations
+// performed and, if the iteration limit was hit without convergence, sets
+// *state_ to 1. Does nothing while the fit is still running.
+void LMFitCPP::evaluate_iteration(int const iteration)
+{
+    bool const last_allowed_iteration = (iteration == info_.max_n_iterations_ - 1);
+
+    if (!converged_ && !last_allowed_iteration)
+    {
+        return;
+    }
+
+    (*n_iterations_) = iteration + 1;
+    if (!converged_)
+    {
+        (*state_) = 1;
+    }
+}
+
+// Accepts or rejects the step just taken.
+//
+// If chi-square decreased, the step is kept: lambda_ is divided by 10 and the
+// new chi-square becomes the reference value. Otherwise the step is
+// discarded: lambda_ is multiplied by 10, chi-square is reset to the previous
+// value and every fitted parameter is restored from prev_parameters_.
+void LMFitCPP::prepare_next_iteration()
+{
+    if ((*chi_square_) < prev_chi_square_)
+    {
+        lambda_ *= 0.1f;
+        prev_chi_square_ = (*chi_square_);
+    }
+    else
+    {
+        lambda_ *= 10.f;
+        (*chi_square_) = prev_chi_square_;
+        for (int parameter_index = 0; parameter_index < info_.n_parameters_; parameter_index++)
+        {
+            if (parameters_to_fit_[parameter_index])
+            {
+                parameters_[parameter_index] = prev_parameters_[parameter_index];
+            }
+        }
+    }
+}
+
+// Copies hessian_ into modified_hessian_ and scales every diagonal element by
+// (1 + lambda_). The matrix dimension is taken as the square root of the
+// number of stored elements.
+void LMFitCPP::modify_step_width()
+{
+    modified_hessian_ = hessian_;
+
+    size_t const dimension = (size_t)(sqrt((float)(hessian_.size())));
+    for (size_t d = 0; d < dimension; d++)
+    {
+        size_t const diagonal_index = d*dimension + d;
+        modified_hessian_[diagonal_index] *= (1.0f + (lambda_));
+    }
+}
+
+// Runs one complete fit: copies the initial parameters, evaluates the
+// starting point, then iterates until the fit converges or *state_ becomes
+// non-zero (set by the helpers on error or when the iteration limit is
+// reached).
+void LMFitCPP::run()
+{
+    for (int i = 0; i < info_.n_parameters_; i++)
+    {
+        parameters_[i] = initial_parameters_[i];
+    }
+
+    // starting point
+    (*state_) = 0;
+    calc_curve_values();
+    calc_coefficients();
+    prev_chi_square_ = (*chi_square_);
+
+    int iteration = 0;
+    while ((*state_) == 0)
+    {
+        // compute and apply a step
+        modify_step_width();
+        gauss_jordan();
+        update_parameters();
+
+        // evaluate the model at the new parameters
+        calc_curve_values();
+        calc_coefficients();
+
+        // judge the step, then accept or roll it back
+        converged_ = check_for_convergence();
+        evaluate_iteration(iteration);
+        prepare_next_iteration();
+
+        if (converged_ || (*state_) != 0)
+        {
+            break;
+        }
+        iteration++;
+    }
+}
diff --git a/Cpufit/matlab/CMakeLists.txt b/Cpufit/matlab/CMakeLists.txt
new file mode 100644
index 0000000..46276bd
--- /dev/null
+++ b/Cpufit/matlab/CMakeLists.txt
@@ -0,0 +1,62 @@
+
+# MATLAB Cpufit binding
+#
+# Builds the CpufitMex MEX library and adds the Cpufit files to the MATLAB
+# package directory. Skipped entirely when MATLAB is not found.
+
+find_package( Matlab COMPONENTS MX_LIBRARY )
+
+if( NOT Matlab_FOUND )
+  message( STATUS "Matlab and/or MX_Library NOT found - skipping Cpufit Matlab binding!" )
+  return()
+endif()
+
+# Matlab MEX FILE
+
+set( Headers
+  )
+
+set( Sources
+  mex/CpufitMex.cpp
+  )
+
+add_library( CpufitMex SHARED
+  ${Headers}
+  ${Sources}
+  )
+set_property( TARGET CpufitMex
+  PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} )
+set_property( TARGET CpufitMex
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+target_include_directories( CpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+target_link_libraries( CpufitMex PRIVATE Cpufit ${Matlab_LIBRARIES} )
+
+# Export the MEX entry point. The flag is appended to this target only, so
+# the linker flags inherited from CMAKE_SHARED_LINKER_FLAGS stay intact.
+if( WIN32 )
+  set_property( TARGET CpufitMex
+    APPEND_STRING PROPERTY LINK_FLAGS " /export:mexFunction" )
+endif()
+
+add_matlab_launcher( CpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" )
+
+# MATLAB Cpufit + Gpufit PACKAGE
+
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" )
+set( package_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/cpufit.m"
+)
+# Paths of the built binaries, resolved at generate time.
+# NOTE(review): the generator expressions were missing in the reviewed text
+# (only "$" remained); TARGET_FILE of Cpufit / CpufitMex is assumed - confirm.
+set( binary_gpufit $<TARGET_FILE:Cpufit> )
+set( binary_mex $<TARGET_FILE:CpufitMex> )
+
+add_custom_target( MATLAB_CPUFIT_GPUFIT_PACKAGE ALL
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${package_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_gpufit} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_mex} ${build_directory}
+  COMMENT "Adding Cpufit to Matlab package"
+)
+set_property( TARGET MATLAB_CPUFIT_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+add_dependencies( MATLAB_CPUFIT_GPUFIT_PACKAGE MATLAB_GPUFIT_PACKAGE Cpufit CpufitMex )
+
diff --git a/Cpufit/matlab/README.md b/Cpufit/matlab/README.md
new file mode 100644
index 0000000..a2dc84c
--- /dev/null
+++ b/Cpufit/matlab/README.md
@@ -0,0 +1,3 @@
+Matlab binding for Cpufit, the control CPU implementation of
+the [Gpufit library](https://github.com/gpufit/Gpufit) which
+implements Levenberg Marquardt curve fitting in CUDA
\ No newline at end of file
diff --git a/Cpufit/matlab/cpufit.m b/Cpufit/matlab/cpufit.m
new file mode 100644
index 0000000..243c654
--- /dev/null
+++ b/Cpufit/matlab/cpufit.m
@@ -0,0 +1,119 @@
+function [parameters, states, chi_squares, n_iterations, time]...
+    = cpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+% Wrapper around the Cpufit mex file.
+%
+% Optional arguments can be given as empty matrix [].
+%
+% Inputs
+%   data               - single, size (n_points, n_fits)
+%   weights            - single, same size as data, or []
+%   model_id           - numeric model identifier
+%   initial_parameters - single, size (n_parameters, n_fits)
+%   tolerance          - optional, default 1e-4 (cast to single)
+%   max_n_iterations   - optional, default 25 (cast to int32)
+%   parameters_to_fit  - optional, n_parameters rows, default all ones
+%                        (cast to int32)
+%   estimator_id       - optional, default EstimatorID.LSE
+%   user_info          - optional, passed through together with its size
+%                        in bytes
+%
+% Outputs
+%   parameters   - reshaped to (n_parameters, n_fits)
+%   states, chi_squares, n_iterations - as returned by CpufitMex
+%   time         - duration of the CpufitMex call in seconds
+
+%% size checks
+
+% number of input parameter (variable): the first four are mandatory, every
+% missing trailing argument is treated like an empty matrix
+assert(nargin >= 4, 'Not enough parameters');
+if nargin < 5
+    tolerance = [];
+end
+if nargin < 6
+    max_n_iterations = [];
+end
+if nargin < 7
+    parameters_to_fit = [];
+end
+if nargin < 8
+    estimator_id = [];
+end
+if nargin < 9
+    user_info = [];
+end
+
+% data is 2D and read number of points and fits
+data_size = size(data);
+assert(length(data_size) == 2, 'data is not two-dimensional');
+n_points = data_size(1);
+n_fits = data_size(2);
+
+% consistency with weights (if given)
+if ~isempty(weights)
+    assert(isequal(data_size, size(weights)), 'Dimension mismatch between data and weights')
+end
+
+% initial parameters is 2D and read number of parameters
+initial_parameters_size = size(initial_parameters);
+assert(length(initial_parameters_size) == 2, 'initial_parameters is not two-dimensional');
+n_parameters = initial_parameters_size(1);
+assert(n_fits == initial_parameters_size(2), 'Dimension mismatch in number of fits between data and initial_parameters');
+
+% consistency with parameters_to_fit (if given)
+if ~isempty(parameters_to_fit)
+    assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit');
+end
+
+%% default values
+
+% tolerance
+if isempty(tolerance)
+    tolerance = 1e-4;
+end
+
+% max_n_iterations
+if isempty(max_n_iterations)
+    max_n_iterations = 25;
+end
+
+% estimator_id
+if isempty(estimator_id)
+    estimator_id = EstimatorID.LSE;
+end
+
+% parameters_to_fit
+if isempty(parameters_to_fit)
+    parameters_to_fit = ones(n_parameters, 1, 'int32');
+end
+
+% now only weights and user_info could be not given (empty matrix)
+
+%% type checks
+
+% data, weights (if given), initial_parameters are all single
+assert(isa(data, 'single'), 'Type of data is not single');
+if ~isempty(weights)
+    assert(isa(weights, 'single'), 'Type of weights is not single');
+end
+assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single');
+
+% parameters_to_fit is int32 (cast to int32 if incorrect type)
+if ~isa(parameters_to_fit, 'int32')
+    parameters_to_fit = int32(parameters_to_fit);
+end
+
+% max_n_iterations must be int32 (cast if incorrect type)
+if ~isa(max_n_iterations, 'int32')
+    max_n_iterations = int32(max_n_iterations);
+end
+
+% tolerance must be single (cast if incorrect type)
+if ~isa(tolerance, 'single')
+    tolerance = single(tolerance);
+end
+
+% model_id and estimator_id are read by the mex file as double values, so
+% any other numeric type (e.g. int32) must be converted first
+if ~isa(model_id, 'double')
+    model_id = double(model_id);
+end
+if ~isa(estimator_id, 'double')
+    estimator_id = double(estimator_id);
+end
+
+% we don't check type of user_info, but we extract the size in bytes of it
+if ~isempty(user_info)
+    user_info_info = whos('user_info');
+    user_info_size = user_info_info.bytes;
+else
+    user_info_size = 0;
+end
+
+
+%% run Cpufit taking the time
+tic;
+[parameters, states, chi_squares, n_iterations] ...
+    = CpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size);
+
+time = toc;
+
+% reshape the output parameters array to have dimensions
+% (n_parameters,n_fits)
+parameters = reshape(parameters,n_parameters,n_fits);
+
+end
diff --git a/Cpufit/matlab/examples/gauss2d.m b/Cpufit/matlab/examples/gauss2d.m
new file mode 100644
index 0000000..3cb91f0
--- /dev/null
+++ b/Cpufit/matlab/examples/gauss2d.m
@@ -0,0 +1,182 @@
+function gauss2d()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in C/C++
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Runs two demonstrations one after the other; each calls the cpufit
+% wrapper and prints a summary of its results.
+
+% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak
+fit_gauss2d();
+
+% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak
+fit_gauss2d_rotated();
+
+end
+function fit_gauss2d()
+% Fits number_fits noisy copies of one symmetric 2D Gaussian peak with
+% cpufit (model GAUSS_2D, estimator MLE, no weights, all parameters fitted)
+% and prints a summary via display_results.
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 5;
+
+%% set input arguments
+
+% true parameters
+% (amplitude, x center, y center, width, offset - see gaussian_2d)
+true_parameters = single([20, 9.5, 9.5, 3, 10]);
+
+% initialize random number generator
+% (fixed seed, so every run draws the same random numbers)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+% (shift by -20% .. +20% of the width)
+initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits));
+% randomize relative for other parameters
+% (scale by a factor between 0.8 and 1.2)
+initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+% (one column per fit; the noise is drawn independently for every element)
+data = gaussian_2d(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D;
+
+%% run Cpufit
+% (empty weights, parameters_to_fit and user_info select the defaults)
+[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+end
+
+function fit_gauss2d_rotated()
+% Fits number_fits noisy copies of one rotated elliptic 2D Gaussian peak
+% with cpufit (model GAUSS_2D_ROTATED, estimator MLE, no weights, all
+% parameters fitted) and prints a summary via display_results.
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 7;
+
+%% set input arguments
+
+% true parameters
+% (amplitude, x center, y center, width 1, width 2, offset, rotation angle
+% - see gaussian_2d_rotated)
+true_parameters = single([200, 9.5, 9.5, 3, 4, 10, 0.5]);
+
+% initialize random number generator
+% (fixed seed, so every run draws the same random numbers)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+% (each center coordinate is shifted by -20% .. +20% of one of the widths)
+initial_parameters(2, :) = initial_parameters(2, :) + true_parameters(4) * (-0.2 + 0.4 * rand(1, number_fits));
+initial_parameters(3, :) = initial_parameters(3, :) + true_parameters(5) * (-0.2 + 0.4 * rand(1, number_fits));
+% randomize relative for other parameters
+% (scale by a factor between 0.8 and 1.2)
+initial_parameters([1,4,5,6,7], :) = initial_parameters([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+% (one column per fit; the noise is drawn independently for every element)
+data = gaussian_2d_rotated(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D_ROTATED;
+
+%% run Cpufit
+% (empty weights, parameters_to_fit and user_info select the defaults)
+[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D rotated Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+
+end
+
+function g = gaussian_2d(x, y, p)
+% Evaluates a symmetric 2D Gaussian peak on a grid.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x, y - grid position values (arrays of equal size)
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) width, p(5) offset
+%
+% g    - peak values, same size as x and y
+
+% squared distance of every grid point from the peak center
+squared_distance = (x - p(2)).^2 + (y - p(3)).^2;
+
+g = p(1) * exp(-squared_distance / (2 * p(4)^2)) + p(5);
+
+end
+
+function g = gaussian_2d_rotated(x, y, p)
+% Evaluates a rotated elliptic 2D Gaussian peak on a grid.
+% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak
+%
+% x, y - grid position values (arrays of equal size)
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) width along the first rotated axis,
+%        p(5) width along the second rotated axis,
+%        p(6) offset, p(7) rotation angle (argument of cos/sin)
+%
+% g    - peak values, same size as x and y
+
+% cosine and sine of rotation angle
+cos_angle = cos(p(7));
+sin_angle = sin(p(7));
+
+% coordinates relative to the peak center
+dx = x - p(2);
+dy = y - p(3);
+
+% rotate into the peak's own axes and scale by the respective width
+scaled_a = (dx .* cos_angle - dy .* sin_angle) / p(4);
+scaled_b = (dx .* sin_angle + dy .* cos_angle) / p(5);
+
+g = p(1) .* exp(-0.5 .* ((scaled_a .* scaled_a) + (scaled_b .* scaled_b))) + p(6);
+
+end
+
+function display_results(name, model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations)
+% Prints a report of a batch of fits to the command window: a summary,
+% the share of fits per state value, and per parameter the true value
+% next to mean and standard deviation over the converged fits.
+
+% fits with state 0 count as converged
+is_converged = (states == 0);
+
+% share (in percent) of the fits selected by a logical mask
+percentage = @(mask) sum(mask) / number_fits * 100;
+
+%% summary
+fprintf('\nCpufit of %s\n', name);
+
+fprintf('\nmodel ID: %d\n', model_id);
+fprintf('number of fits: %d\n', number_fits);
+fprintf('fit size: %d x %d\n', size_x, size_x);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares(is_converged)));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations(is_converged)));
+fprintf('time: %6.2f s\n', time);
+
+%% fit states
+fprintf('\nratio converged %6.2f %%\n', percentage(is_converged));
+fprintf('ratio max it. exceeded %6.2f %%\n', percentage(states == 1));
+fprintf('ratio singular hessian %6.2f %%\n', percentage(states == 2));
+fprintf('ratio neg curvature MLE %6.2f %%\n', percentage(states == 3));
+
+%% statistics of the fitted parameters (converged fits only)
+fitted = parameters(:, is_converged);
+fitted_mean = mean(fitted, 2);
+fitted_std = std(fitted, [], 2);
+
+fprintf('\nparameters of %s\n', name);
+for k = 1 : number_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', k, true_parameters(k), fitted_mean(k), fitted_std(k));
+end
+
+end
\ No newline at end of file
diff --git a/Cpufit/matlab/examples/gauss2d_plot.m b/Cpufit/matlab/examples/gauss2d_plot.m
new file mode 100644
index 0000000..8d34707
--- /dev/null
+++ b/Cpufit/matlab/examples/gauss2d_plot.m
@@ -0,0 +1,117 @@
+function gauss2d_plot()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in C/C++
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% repeated for a different total number of fits each time and plotting the
+% results
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% NOTE(review): the noise added below is drawn with randn (normally
+% distributed), not Poisson distributed as stated above.
+
+%% number of fit points
+size_x = 5;
+n_points = size_x * size_x;
+
+%% set input arguments
+
+% mean true parameters
+% (amplitude, x center, y center, width, offset - see gaussians_2d)
+mean_true_parameters = single([100, 3, 3, 1, 10]);
+
+% average noise level
+average_noise_level = 10;
+
+% initialize random number generator
+% (fixed seed, so every run draws the same random numbers)
+rng(0);
+
+% tolerance
+tolerance = 1e-4;
+
+% max number of iterations
+max_n_iterations = 10;
+
+% model id
+model_id = ModelID.GAUSS_2D;
+
+%% loop over different number of fits
+% (20 logarithmically spaced values between 1e2 and 1e6)
+n_fits_all = round(logspace(2, 6, 20));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% loop
+speed = zeros(length(n_fits_all), 1);
+for i = 1:length(n_fits_all)
+ n_fits = n_fits_all(i);
+
+ % vary positions of 2D Gaussians peaks slightly
+ test_parameters = repmat(mean_true_parameters', [1, n_fits]);
+ test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+
+ % generate data
+ data = gaussians_2d(x, y, test_parameters);
+ data = reshape(data, [n_points, n_fits]);
+
+ % add noise
+ data = data + average_noise_level * randn(size(data), 'single');
+
+ % initial parameters (randomized)
+ initial_parameters = repmat(mean_true_parameters', [1, n_fits]);
+ % randomize relative to width for positions
+ initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+ % randomize relative for other parameters
+ initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits));
+
+ % run Cpufit
+ % (the omitted trailing arguments take the defaults of the cpufit wrapper)
+ [parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations);
+
+ % analyze result
+ converged = states == 0;
+ speed(i) = n_fits / time;
+ % NOTE(review): precision_x0 is computed but never printed or used
+ precision_x0 = std(parameters(2, converged) - test_parameters(2, converged));
+
+ % display result
+ fprintf(' iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ...
+ mean(n_iterations(converged)), time, speed(i));
+end
+
+%% plot
+% fits per second against the number of fits, logarithmic x axis
+figure();
+semilogx(n_fits_all, speed, 'bo-')
+xlabel('number of fits per function call')
+ylabel('fits per second')
+legend('Cpufit', 'Location', 'NorthWest')
+grid on;
+xlim(n_fits_all([1,end]));
+
+end
+
+function g = gaussians_2d(x, y, p)
+% Evaluates one symmetric 2D Gaussian peak per column of p.
+%
+% x, y - grid position values (arrays of equal size)
+% p    - one parameter set per column: amplitude, x center, y center,
+%        width, offset
+%
+% g    - single array of size [size(x), number of columns of p]
+%
+% While running, a text progress bar is printed; it is erased at the end
+% and replaced by the number of fits.
+
+n_fits = size(p, 2);
+msg = sprintf('generating %d fits ', n_fits);
+fprintf(msg);
+
+g = zeros([size(x), n_fits], 'single');
+
+bar_length = 50;      % length of progressbar
+bars_printed = 0;
+since_last_bar = 0;
+for fit_index = 1 : n_fits
+
+    q = p(:, fit_index);
+    g(:, :, fit_index) = q(1) * exp(-((x - q(2)).^2 + (y - q(3)).^2) / (2 * q(4)^2)) + q(5);
+
+    % print one bar whenever another 1/bar_length of the fits is done
+    since_last_bar = since_last_bar + 1;
+    if since_last_bar >= n_fits / bar_length
+        since_last_bar = 0;
+        fprintf('|');
+        bars_printed = bars_printed + 1;
+    end
+end
+
+% erase the message and the bars, then print the final count
+fprintf(repmat('\b', [1, length(msg) + bars_printed]));
+fprintf('%7d fits', n_fits);
+
+end
diff --git a/Cpufit/matlab/mex/CpufitMex.cpp b/Cpufit/matlab/mex/CpufitMex.cpp
new file mode 100644
index 0000000..3a10184
--- /dev/null
+++ b/Cpufit/matlab/mex/CpufitMex.cpp
@@ -0,0 +1,145 @@
+#include "Cpufit/cpufit.h"
+
+#include
+
+#include
+#include
+
+/*
+ Get a arbitrary scalar (non complex) and check for class id.
+ https://www.mathworks.com/help/matlab/apiref/mxclassid.html
+
+ p:  the MATLAB array to read
+ v:  receives the value; left unchanged when the check fails
+ id: the class ID the array must have
+
+ Returns true if p is numeric, not complex, has exactly one element and
+ its class ID equals id; returns false otherwise.
+*/
+template inline bool get_scalar(const mxArray *p, T &v, const mxClassID id)
+{
+ if (mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1 && mxGetClassID(p) == id)
+ {
+ v = *static_cast(mxGetData(p));
+ return true;
+ }
+ else {
+ return false;
+ }
+}
+
+/*
+    MEX entry point of the Cpufit Matlab binding.
+
+    Expects exactly 13 inputs and 4 outputs:
+
+    prhs[0]  data                 (read as float)
+    prhs[1]  weights              (read as float)
+    prhs[2]  n_fits               (read as double)
+    prhs[3]  n_points             (read as double)
+    prhs[4]  tolerance            (must be a single scalar)
+    prhs[5]  max_n_iterations     (must be an int32 scalar)
+    prhs[6]  estimator_id         (read as double)
+    prhs[7]  initial_parameters   (read as float)
+    prhs[8]  parameters_to_fit    (read as int)
+    prhs[9]  model_id             (read as double)
+    prhs[10] n_parameters         (read as double)
+    prhs[11] user_info            (passed on as raw bytes)
+    prhs[12] user_info_size       (read as double)
+
+    plhs[0]  parameters    single, 1 x (n_fits * n_parameters)
+    plhs[1]  states        int32,  1 x n_fits
+    plhs[2]  chi_squares   single, 1 x n_fits
+    plhs[3]  n_iterations  int32,  1 x n_fits
+
+    Only tolerance and max_n_iterations are type checked here; all other
+    inputs are reinterpreted as listed above. Errors are raised with the
+    identifier "Cpufit:Mex".
+*/
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
+{
+    int const expected_nrhs = 13;
+    int const expected_nlhs = 4;
+
+    // mexErrMsgIdAndTxt takes a printf style format and does not return
+    if (nrhs != expected_nrhs)
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "%d input arguments required.", expected_nrhs);
+    }
+    if (nlhs != expected_nlhs)
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "%d output arguments required.", expected_nlhs);
+    }
+
+    // input parameters
+    float * data = (float*)mxGetPr(prhs[0]);
+    // NOTE(review): for an empty weights matrix this is expected to be NULL,
+    // which presumably selects unweighted fitting - confirm
+    float * weights = (float*)mxGetPr(prhs[1]);
+    std::size_t n_fits = (std::size_t)*mxGetPr(prhs[2]);
+    std::size_t n_points = (std::size_t)*mxGetPr(prhs[3]);
+
+    // tolerance
+    float tolerance = 0;
+    if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS))
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "tolerance is not a single");
+    }
+
+    // max_n_iterations
+    int max_n_iterations = 0;
+    if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS))
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "max_n_iteration is not int32");
+    }
+
+    int estimator_id = (int)*mxGetPr(prhs[6]);
+    float * initial_parameters = (float*)mxGetPr(prhs[7]);
+    int * parameters_to_fit = (int*)mxGetPr(prhs[8]);
+    int model_id = (int)*mxGetPr(prhs[9]);
+    int n_parameters = (int)*mxGetPr(prhs[10]);
+    int * user_info = (int*)mxGetPr(prhs[11]);
+    std::size_t user_info_size = (std::size_t)*mxGetPr(prhs[12]);
+
+    // output parameters
+    mxArray * mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL);
+    float * output_parameters = (float*)mxGetData(mx_parameters);
+    plhs[0] = mx_parameters;
+
+    mxArray * mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_states = (int*)mxGetData(mx_states);
+    plhs[1] = mx_states;
+
+    mxArray * mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL);
+    float * output_chi_squares = (float*)mxGetData(mx_chi_squares);
+    plhs[2] = mx_chi_squares;
+
+    mxArray * mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_n_iterations = (int*)mxGetData(mx_n_iterations);
+    plhs[3] = mx_n_iterations;
+
+    // call to cpufit
+    int const status
+        = cpufit
+        (
+            n_fits,
+            n_points,
+            data,
+            weights,
+            model_id,
+            initial_parameters,
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit,
+            estimator_id,
+            user_info_size,
+            reinterpret_cast< char * >( user_info ),
+            output_parameters,
+            output_states,
+            output_chi_squares,
+            output_n_iterations
+        ) ;
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        std::string const error = cpufit_get_last_error() ;
+        // pass the text as an argument so that a '%' in it is printed literally
+        mexErrMsgIdAndTxt( "Cpufit:Mex", "%s", error.c_str() ) ;
+    }
+}
diff --git a/Gpufit/CMakeLists.txt b/Gpufit/CMakeLists.txt
new file mode 100644
index 0000000..76da81e
--- /dev/null
+++ b/Gpufit/CMakeLists.txt
@@ -0,0 +1,160 @@
+
+# CUDA
+#
+# Uses the following variables:
+#
+# CUDA_ARCHITECTURES (Default All)
+# -- Argument passed to CUDA_SELECT_NVCC_ARCH_FLAGS(...)
+# resulting in code_generation_flags
+# (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+# CUDA_ARCHITECTURES: Auto | Common | All | ARCH_AND_PTX ...
+# Auto: Detects local machine GPU architecture.
+# Common: Covers common subset of architectures.
+# All: Covers all known architectures.
+# ARCH_AND_PTX: NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+# NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+# NUM: Any number.
+# Only those pairs are currently accepted by NVCC though:
+# 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
+# Examples:
+# 2.1(2.0) results in
+# -gencode;arch=compute_20,code=sm_21
+# Kepler+Tesla results in
+# -gencode;arch=compute_37,code=sm_37
+# 6.2+PTX results in
+# -gencode;arch=compute_62,code=sm_62;-gencode;arch=compute_62,code=compute_62
+#
+# CUDA_NVCC_FLAGS (Default ${code_generation_flags})
+# -- Additional NVCC command line arguments
+# (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+# NOTE that multiple arguments must be semi-colon delimited
+# (e.g. --compiler-options;-Wall)
+#
+# Multiple CUDA versions installed, specify which version to use
+# Set CUDA_BIN_PATH before running CMake or CUDA_TOOLKIT_ROOT_DIR after first configuration
+# to installation folder of desired CUDA version
+
+# CUDA 6.5 or newer is mandatory for this directory.
+find_package( CUDA 6.5 REQUIRED )
+
+# User-selectable architecture list; see the description at the top of this file.
+set( CUDA_ARCHITECTURES All CACHE STRING
+ "Auto | Common | All | ... see CUDA_SELECT_NVCC_ARCH_FLAGS(...)" )
+
+if( CUDA_ARCHITECTURES STREQUAL Auto )
+ # Auto: compile and run a small program that prints "major.minor " for
+ # every CUDA device of this machine. Its exit code is non-zero when the
+ # device count cannot be queried or no device is present.
+ set( file ${PROJECT_BINARY_DIR}/detect_cuda_architectures.cpp )
+ file( WRITE ${file} ""
+ "#include \n"
+ "#include \n"
+ "int main()\n"
+ "{\n"
+ " int count = 0;\n"
+ " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+ " if (count == 0) return -1;\n"
+ " for (int device = 0; device < count; ++device)\n"
+ " {\n"
+ " cudaDeviceProp prop;\n"
+ " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+ " std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+ " }\n"
+ " return 0;\n"
+ "}\n"
+ )
+ try_run( run_result compile_result ${PROJECT_BINARY_DIR} ${file}
+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}"
+ LINK_LIBRARIES ${CUDA_LIBRARIES}
+ RUN_OUTPUT_VARIABLE architectures
+ )
+ # Use the detected list only if the program ran successfully; otherwise
+ # CUDA_ARCHITECTURES keeps the value "Auto".
+ if( run_result EQUAL 0 )
+ # 2.1 is requested as 2.1(2.0), the form shown in the header comment
+ string( REPLACE "2.1" "2.1(2.0)" architectures "${architectures}" )
+ # Older toolkits: substitute the listed architectures by a fixed one plus PTX
+ if( CUDA_VERSION VERSION_LESS "7.0" )
+ string( REGEX REPLACE "3\\.[27]|5\\.[23]|6\\.[01]" "5.2+PTX" architectures "${architectures}" )
+ elseif( CUDA_VERSION VERSION_LESS "8.0" )
+ string( REGEX REPLACE "5\\.3|6\\.[01]" "5.3+PTX" architectures "${architectures}" )
+ endif()
+ set( CUDA_ARCHITECTURES "${architectures}" )
+ endif()
+elseif( CUDA_ARCHITECTURES STREQUAL All )
+# All does not include the latest PTX!
+# Therefore the list is built here explicitly, depending on the CUDA
+# version, and "+PTX" is appended to the end of the list string, i.e. to
+# its last entry.
+ set( CUDA_ARCHITECTURES "2.1(2.0)" "3.0" "3.5" "5.0" "5.2" )
+ if( CUDA_VERSION VERSION_GREATER "6.5" )
+ list( APPEND CUDA_ARCHITECTURES "3.2" "3.7" "5.3" )
+ endif()
+ if( CUDA_VERSION VERSION_GREATER "7.5" )
+ list( APPEND CUDA_ARCHITECTURES "6.0" "6.1" )
+ endif()
+ string( APPEND CUDA_ARCHITECTURES "+PTX" )
+endif()
+# Translate the architecture list into nvcc -gencode flags and add them to
+# the flags used for the CUDA sources.
+CUDA_SELECT_NVCC_ARCH_FLAGS( code_generation_flags "${CUDA_ARCHITECTURES}" )
+list( APPEND CUDA_NVCC_FLAGS ${code_generation_flags} )
+message( STATUS "CUDA_NVCC_FLAGS=${code_generation_flags}" )
+
+# Gpufit
+
+# Host-side headers of the Gpufit library
+set( GpuHeaders
+  gpufit.h
+  definitions.h
+  info.h
+  lm_fit.h
+  interface.h
+)
+
+# Host-side sources. The module definition file is listed with the exact
+# spelling of the file on disk (Gpufit.def) so that it is also found on
+# case-sensitive file systems.
+set( GpuSources
+  gpufit.cpp
+  info.cpp
+  lm_fit.cpp
+  lm_fit_cuda.cpp
+  interface.cpp
+  Gpufit.def
+)
+
+# CUDA headers: model functions, estimators and kernels
+set( GpuCudaHeaders
+  linear_1d.cuh
+  gauss_1d.cuh
+  gauss_2d.cuh
+  gauss_2d_rotated.cuh
+  gauss_2d_elliptic.cuh
+  cauchy_2d_elliptic.cuh
+  lse.cuh
+  mle.cuh
+  cuda_gaussjordan.cuh
+  cuda_kernels.cuh
+  gpu_data.cuh
+)
+
+# CUDA sources
+set( GpuCudaSources
+  lm_fit_cuda.cu
+  cuda_gaussjordan.cu
+  cuda_kernels.cu
+  info.cu
+  gpu_data.cu
+)
+
+# Show the CUDA files in their own groups in IDE project views
+source_group("CUDA Source Files" FILES ${GpuCudaSources})
+source_group("CUDA Header Files" FILES ${GpuCudaHeaders})
+
+cuda_add_library( Gpufit SHARED
+  ${GpuHeaders}
+  ${GpuSources}
+  ${GpuCudaHeaders}
+  ${GpuCudaSources}
+)
+
+set_property( TARGET Gpufit
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Gpufit RUNTIME DESTINATION bin )
+
+# Examples
+
+add_subdirectory( examples )
+
+# Tests
+
+if( BUILD_TESTING )
+  add_subdirectory( tests )
+endif()
+
+# Bindings
+
+add_subdirectory( matlab )
+add_subdirectory( python )
+
diff --git a/Gpufit/Gpufit.def b/Gpufit/Gpufit.def
new file mode 100644
index 0000000..0e3b9db
--- /dev/null
+++ b/Gpufit/Gpufit.def
@@ -0,0 +1,7 @@
+LIBRARY "Gpufit"
+EXPORTS
+ gpufit @1
+ gpufit_get_last_error @2
+ gpufit_get_cuda_version @3
+ gpufit_cuda_available @4
+ gpufit_portable_interface @5
\ No newline at end of file
diff --git a/Gpufit/cauchy_2d_elliptic.cuh b/Gpufit/cauchy_2d_elliptic.cuh
new file mode 100644
index 0000000..b1c2a4e
--- /dev/null
+++ b/Gpufit/cauchy_2d_elliptic.cuh
@@ -0,0 +1,107 @@
+#ifndef GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED
+#define GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED
+
+/* Description of the calculate_cauchy2delliptic function
+* =======================================================
+*
+* This function calculates the values of two-dimensional elliptic cauchy model
+* functions and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function. Hence, the
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: amplitude
+* p[1]: center coordinate x
+* p[2]: center coordinate y
+* p[3]: width x (half width at half maximum)
+* p[4]: width y (half width at half maximum)
+* p[5]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_cauchy2delliptic function
+* ===============================================
+*
+* This __device__ function can be only called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Evaluates the two-dimensional elliptic Cauchy model and its six partial
+// derivatives for the single data point that belongs to the calling thread.
+//
+// The thread derives its fit and its data point from threadIdx.x and blockIdx.x,
+// reads the six parameters of that fit from `parameters`, and writes one model
+// value to `values` and six derivative values to `derivatives`.
+// n_fits, chunk_index, user_info and user_info_size are not used.
+__device__ void calculate_cauchy2delliptic(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // the data of one fit is treated as a square grid; its edge length is the
+    // square root of n_points, truncated to an integer
+    int const n_points_x = sqrt((float)n_points);
+
+    // locate the fit and the data point handled by this thread
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block*n_points);
+    int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block;
+
+    // (x, y) coordinate of the data point: the index pair within the grid
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - (point_index_y*n_points_x);
+
+    float* current_value = &values[fit_index*n_points];
+    float const * p = &parameters[fit_index*n_parameters];
+
+    // argx = 1 + ((x0 - x) / wx)^2 and argy = 1 + ((y0 - y) / wy)^2
+    float const argx = ((p[1] - point_index_x) / p[3]) *((p[1] - point_index_x) / p[3]) + 1;
+    float const argy = ((p[2] - point_index_y) / p[4]) *((p[2] - point_index_y) / p[4]) + 1;
+
+    // model value: amplitude / (argx * argy) + offset
+    current_value[point_index] = p[0] * 1 / argx * 1 / argy + p[5];
+
+    //////////////////////////////////////////////////////////////////////////////
+
+    // the derivatives of one fit are stored parameter by parameter, n_points
+    // values per parameter
+    float * current_derivative = &derivatives[fit_index * n_points*n_parameters];
+
+    // derivative with respect to the amplitude
+    current_derivative[0 * n_points + point_index]
+        = 1 / (argx*argy);
+    // derivatives with respect to the center coordinates x and y
+    current_derivative[1 * n_points + point_index]
+        = -2 * p[0] * (p[1] - point_index_x) * 1 / (p[3] * p[3] * argx*argx*argy);
+    current_derivative[2 * n_points + point_index]
+        = -2 * p[0] * (p[2] - point_index_y) * 1 / (p[4] * p[4] * argy*argy*argx);
+    // derivatives with respect to the widths x and y
+    current_derivative[3 * n_points + point_index]
+        = 2 * p[0] * (p[1] - point_index_x) * (p[1] - point_index_x)
+        / (p[3] * p[3] * p[3] * argx * argx * argy);
+    current_derivative[4 * n_points + point_index]
+        = 2 * p[0] * (p[2] - point_index_y) * (p[2] - point_index_y)
+        / (p[4] * p[4] * p[4] * argy * argy * argx);
+    // derivative with respect to the offset
+    current_derivative[5 * n_points + point_index]
+        = 1;
+}
+
+#endif
diff --git a/Gpufit/cuda_gaussjordan.cu b/Gpufit/cuda_gaussjordan.cu
new file mode 100644
index 0000000..c6519bc
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cu
@@ -0,0 +1,279 @@
+/* CUDA implementation of Gauss-Jordan elimination algorithm.
+*
+* Gauss-Jordan elimination method
+* ===============================
+*
+* This function solves a set of linear equations using the Gauss-Jordan elimination method.
+* Considering a set of N equations with N unknowns, this can be written in matrix form as
+* an NxN matrix of coefficients and a Nx1 column vector of right-hand side values.
+*
+* For example, consider the following problem with 3 equations and 3 unknowns (N=3):
+*
+* A x + B y + C z = MM
+* D x + E y + F z = NN
+* G x + H y + I z = PP
+*
+* We can write this as follows in matrix form:
+*
+* [ A B C ] [ x ] = [ MM ]
+* [ D E F ] [ y ] = [ NN ]
+* [ G H I ] [ z ] = [ PP ]
+*
+* or, [A]*[X] = [B] where [A] is the matrix of coefficients and [B] is the vector of
+* right-hand side values.
+*
+* The Gauss Jordan elimination method solves the system of equations in the following
+* manner. First, we form the augmented matrix (A|B):
+*
+* [ A B C | MM ]
+* [ D E F | NN ]
+* [ G H I | PP ]
+*
+* and then the augmented matrix is manipulated until its left side has the reduced
+* row-echelon form. That is to say that any individual row may be multiplied
+* by a scalar factor, and any linear combination of rows may be added to another
+* row. Finally, two rows may be swapped without affecting the solution.
+*
+* When the manipulations are complete and the left side of the matrix has the desired
+* form, the right side then corresponds to the solution of the system.
+*
+*
+* Description of the cuda_gaussjordan function
+* ============================================
+*
+* This algorithm is designed to perform many solutions of the Gauss Jordan elimination
+* method in parallel. One limitation of the algorithm implemented here is that for
+* each solution the number of equations and unknowns (N) must be identical.
+*
+* Parameters:
+*
+* alpha: Coefficients matrices. The matrix of coefficients for a single solution is
+* a vector of NxN, where N is the number of equations. This array stores the
+* coefficients for the entire set of M input problems, concatenated end to end,
+* and hence the total size of the array is MxNxN.
+*
+* beta: Vector of right hand side values, concatenated together for all input problems.
+* For a set of M inputs, the size of the vector is MxN. This vector is only read;
+* the results vector X of each solution is written to the output vector delta (size MxN).
+*
+* skip_calculation: An input vector which allows the calculation to be skipped for
+* a particular solution. For a set of M inputs, the size of this
+* vector is M.
+*
+* singular: An output vector used to report whether a given solution is singular. For
+* a set of M inputs, this vector has size M. Memory needs to be allocated
+* by the calling function.
+*
+* n_equations: The number of equations and unknowns for a single solution. This is
+* equal to the size N.
+*
+* n_equations_pow2: The next highest power of 2 greater than n_equations.
+*
+*
+* Calling the cuda_gaussjordan function
+* =====================================
+*
+* When calling the function, the blocks and threads must be set up correctly, as well
+* as the shared memory space, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_equations + 1;
+* threads.y = n_equations;
+* blocks.x = n_solutions;
+* blocks.y = 1;
+*
+* int const shared_size = sizeof(float) *
+* ( (threads.x * threads.y) + n_equations_pow2 + n_equations_pow2 );
+*
+* int * singular;
+* CUDA_CHECK_STATUS(cudaMalloc((void**)&singular, n_solutions * sizeof(int)));
+*
+* cuda_gaussjordan<<< blocks, threads, shared_size >>>(
+* delta,
+* beta,
+* alpha,
+* skip_calculation,
+* singular,
+* n_equations,
+* n_equations_pow2);
+*
+*/
+
+#include "cuda_gaussjordan.cuh"
+
+/* Solves one linear system per thread block by Gauss-Jordan elimination.
+*
+* The block stages the augmented matrix (alpha | beta) of the problem selected by
+* blockIdx.x in shared memory, one thread per matrix element, reduces it row by
+* row and writes the solution vector to delta. singular[blockIdx.x] is set to 1
+* if a row containing only zeros is encountered and to 0 otherwise. Blocks whose
+* skip_calculation entry is non-zero return without writing delta or singular.
+*
+* Every shared memory location that is written by one thread and read by another
+* is protected by __syncthreads(), so the routine does not rely on the threads of
+* a matrix row executing in lockstep.
+*/
+__global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2)
+{
+    extern __shared__ float extern_array[];     //shared memory between threads of a single block,
+                                                //used for storing the calculation_matrix, the
+                                                //abs_row vector, and the abs_row_index vector
+
+    // In this routine we will store the augmented matrix (A|B), referred to here
+    // as the calculation matrix in a shared memory space which is visible to all
+    // threads within a block. Also stored in shared memory are two vectors which
+    // are used to find the largest element in each row (the pivot). These vectors
+    // are called abs_row and abs_row_index.
+    //
+    // Sizes of data stored in shared memory:
+    //
+    //      calculation_matrix: n_equations * (n_equations+1)
+    //      abs_row:            n_equations_pow2
+    //      abs_row_index:      n_equations_pow2
+    //
+    // Note that each thread represents an element of the augmented matrix, with
+    // the column and row indicated by the x and y index of the thread. Each
+    // solution is calculated within one block, and the solution index is the
+    // block index x value.
+
+    int const col_index = threadIdx.x;                  //column index in the calculation_matrix
+    int const row_index = threadIdx.y;                  //row index in the calculation_matrix
+    int const solution_index = blockIdx.x;
+
+    int const n_col = blockDim.x;                       //number of columns in calculation matrix (=threads.x)
+    int const n_row = blockDim.y;                       //number of rows in calculation matrix (=threads.y)
+    int const alpha_size = blockDim.y * blockDim.y;     //number of entries in alpha matrix for one solution (NxN)
+
+    //the decision is the same for all threads of the block, so either all of them
+    //or none of them reach the barriers below
+    if (skip_calculation[solution_index])
+        return;
+
+    float p;                                            //local variable used in pivot calculation
+
+    float * calculation_matrix = extern_array;          //point to the shared memory
+
+    float * abs_row = extern_array + n_equations * (n_equations + 1);  //abs_row is located after the calculation_matrix
+                                                                       //within the shared memory
+
+    int * abs_row_index = (int *)abs_row + n_equations_pow2;           //abs_row_index is located after abs_row
+                                                                       //
+                                                                       //note that although the shared memory is defined as
+                                                                       //float, we are storing data of type int in this
+                                                                       //part of the shared memory
+
+    //initialize the singular vector
+    if (col_index == 0 && row_index == 0)
+    {
+        singular[solution_index] = 0;
+    }
+
+    //initialize abs_row and abs_row_index, using only the threads on the diagonal;
+    //this fills the last n_equations entries of both vectors, which includes the
+    //padding entries between n_equations and n_equations_pow2
+    if (col_index == row_index)
+    {
+        abs_row[col_index + (n_equations_pow2 - n_equations)] = 0.0f;
+        abs_row_index[col_index + (n_equations_pow2 - n_equations)] = col_index + (n_equations_pow2 - n_equations);
+    }
+
+    //initialize the calculation_matrix (alpha and beta, concatenated, for one solution)
+    if (col_index != n_equations)
+        calculation_matrix[row_index*n_col + col_index] = alpha[solution_index * alpha_size + row_index * n_equations + col_index];
+    else
+        calculation_matrix[row_index*n_col + col_index] = beta[solution_index * n_equations + row_index];
+
+    //wait for thread synchronization
+
+    __syncthreads();
+
+    //start of main outer loop over the rows of the calculation matrix
+
+    for (int current_row = 0; current_row < n_equations; current_row++)
+    {
+        //the pivot search is done by the threads of the current row, skipping the
+        //last column (the right-hand side)
+        bool const searches_pivot = (row_index == current_row && col_index != n_equations);
+
+        if (searches_pivot)
+        {
+            //save the absolute values of the current row
+            abs_row[col_index] = abs(calculation_matrix[row_index * n_col + col_index]);
+
+            //save the column indices
+            abs_row_index[col_index] = col_index;
+        }
+
+        //the reduction below reads entries written by other threads
+        __syncthreads();
+
+        //find the largest absolute value in the current row and write its index in abs_row_index[0];
+        //the barrier is placed outside the branch so that all threads of the block reach it and
+        //every reduction step sees the results of the previous one
+        for (int n = 2; n <= n_equations_pow2; n = n * 2)
+        {
+            if (searches_pivot && col_index < (n_equations_pow2 / n))
+            {
+                if (abs_row[abs_row_index[col_index]] < abs_row[abs_row_index[col_index + (n_equations_pow2 / n)]])
+                {
+                    abs_row_index[col_index] = abs_row_index[col_index + (n_equations_pow2 / n)];
+                }
+            }
+
+            __syncthreads();
+        }
+
+        //column of the pivot element of the current row
+        int const pivot_col = abs_row_index[0];
+
+        //singularity check - if all values in the row are zero, no solution exists
+        if (searches_pivot)
+        {
+            if (abs_row[pivot_col] == 0.0f)
+            {
+                singular[solution_index] = 1;
+            }
+        }
+
+        //read the pivot value before any thread modifies the current row; the thread
+        //of the pivot column overwrites this element in the division below
+        float const pivot_value = calculation_matrix[current_row * n_col + pivot_col];
+
+        __syncthreads();
+
+        //divide the row by the biggest value in the row
+        if (row_index == current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index]
+                = calculation_matrix[row_index * n_col + col_index] / pivot_value;
+        }
+
+        __syncthreads();
+
+        //The value of the largest element of the current row was found, and then current
+        //row was divided by this value such that the largest value of the current row
+        //is equal to one.
+        //
+        //Next, the matrix is manipulated to reduce to zero all other entries in the column
+        //in which the largest value was found. To do this, the values in the current row
+        //are scaled appropriately and subtracted from the other rows of the matrix.
+        //
+        //For each element of the matrix that is not in the current row, calculate the value
+        //to be subtracted and let each thread store this value in the scalar variable p.
+
+        p = calculation_matrix[current_row * n_col + col_index] * calculation_matrix[row_index * n_col + pivot_col];
+        __syncthreads();
+
+        if (row_index != current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index] = calculation_matrix[row_index * n_col + col_index] - p;
+        }
+        __syncthreads();
+
+    }
+
+    //At this point, if the solution exists, the calculation matrix has been reduced to the
+    //identity matrix on the left side, and the solution vector on the right side. However
+    //we have not swapped rows during the procedure, so the identity matrix is out of order.
+    //
+    //For example, starting with the following augmented matrix as input:
+    //
+    //  [  3  2 -4 |  4 ]
+    //  [  2  3  3 | 15 ]
+    //  [  5 -3  1 | 14 ]
+    //
+    //we will obtain:
+    //
+    //  [  0  0  1 |  2 ]
+    //  [  0  1  0 |  1 ]
+    //  [  1  0  0 |  3 ]
+    //
+    //Which needs to be re-arranged to obtain the correct solution vector. In the final
+    //step, each thread checks to see if its value equals 1, and if so it assigns the value
+    //in its rightmost column to the appropriate entry in the delta vector. The solution is
+    //stored in delta upon completion.
+
+    if (col_index != n_equations && calculation_matrix[row_index * n_col + col_index] == 1)
+        delta[n_row * solution_index + col_index] = calculation_matrix[row_index * n_col + n_equations];
+
+    __syncthreads();
+}
diff --git a/Gpufit/cuda_gaussjordan.cuh b/Gpufit/cuda_gaussjordan.cuh
new file mode 100644
index 0000000..2d41cda
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cuh
@@ -0,0 +1,15 @@
+#ifndef GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+#define GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+
+#include <cstddef> // std::size_t used in the declaration below
+
+// Kernel solving one linear system per thread block by Gauss-Jordan elimination.
+// Reads the coefficient matrices (alpha) and right-hand sides (beta), writes the
+// solution vectors to delta and flags singular systems in singular. Systems whose
+// skip_calculation entry is non-zero are left untouched.
+extern __global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2);
+
+#endif
\ No newline at end of file
diff --git a/Gpufit/cuda_kernels.cu b/Gpufit/cuda_kernels.cu
new file mode 100644
index 0000000..2661a7e
--- /dev/null
+++ b/Gpufit/cuda_kernels.cu
@@ -0,0 +1,1081 @@
+#include "gpufit.h"
+#include "cuda_kernels.cuh"
+#include "definitions.h"
+#include "linear_1d.cuh"
+#include "gauss_1d.cuh"
+#include "gauss_2d.cuh"
+#include "gauss_2d_elliptic.cuh"
+#include "gauss_2d_rotated.cuh"
+#include "cauchy_2d_elliptic.cuh"
+#include "lse.cuh"
+#include "mle.cuh"
+
+/* Description of the cuda_calc_curve_values function
+* ===================================================
+*
+* This function calls one of the fitting curve functions depending on the input
+* parameter model_id. The fitting curve function calculates the values of
+* the fitting curves and its partial derivatives with respect to the fitting
+* curve parameters. Multiple fits are calculated in parallel.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*
+* n_fits: The number of fits.
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of curve parameters.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+* fits.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* n_fits_per_block: The number of fits calculated by each threadblock.
+*
+* model_id: The fitting model ID.
+*
+* chunk_index: The chunk index.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calc_curve_values function
+* ===========================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* cuda_calc_curve_values<<< blocks, threads >>>(
+* parameters,
+* n_fits,
+* n_points,
+* n_parameters,
+* finished,
+* values,
+* derivatives,
+* n_fits_per_block,
+* model_id,
+* chunk_index,
+* user_info,
+* user_info_size);
+*
+*/
+
+// Dispatches to the model function selected by model_id. The selected function
+// computes the model value and the partial derivatives for the data point of the
+// calling thread. Threads of fits flagged in `finished` do nothing, and an
+// unknown model_id leaves values and derivatives untouched.
+__global__ void cuda_calc_curve_values(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    int const * finished,
+    float * values,
+    float * derivatives,
+    int const n_fits_per_block,
+    int const model_id,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // position of this thread: which fit of the block, and which point of that fit
+    int const local_fit = threadIdx.x / n_points;
+    int const local_point = threadIdx.x - local_fit * n_points;
+    int const global_fit = blockIdx.x * n_fits_per_block + local_fit;
+
+    // nothing to do for finished fits or for threads beyond the data of a fit
+    if (finished[global_fit] || local_point >= n_points)
+    {
+        return;
+    }
+
+    if (model_id == GAUSS_1D)
+    {
+        calculate_gauss1d(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+    else if (model_id == GAUSS_2D)
+    {
+        calculate_gauss2d(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+    else if (model_id == GAUSS_2D_ELLIPTIC)
+    {
+        calculate_gauss2delliptic(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+    else if (model_id == GAUSS_2D_ROTATED)
+    {
+        calculate_gauss2drotated(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+    else if (model_id == CAUCHY_2D_ELLIPTIC)
+    {
+        calculate_cauchy2delliptic(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+    else if (model_id == LINEAR_1D)
+    {
+        calculate_linear1d(
+            parameters, n_fits, n_points, n_parameters,
+            values, derivatives, chunk_index, user_info, user_info_size);
+    }
+}
+
+/* Description of the sum_up_floats function
+* ==========================================
+*
+* This function sums up a vector of float values and stores the result at the
+* first place of the vector.
+*
+* Parameters:
+*
+* shared_array: An input vector of float values. The vector must be stored
+* on the shared memory of the GPU. The size of this vector must be a
+* power of two. Use zero padding to extend it to the next highest
+* power of 2 greater than the number of elements.
+*
+* size: The number of elements in the input vector considering zero padding.
+*
+* Calling the sum_up_floats function
+* ==================================
+*
+* This __device__ function can be only called from a __global__ function or
+* another __device__ function. When calling the function, the blocks and threads
+* of the __global__ function must be set up correctly, as shown in the following
+* example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = size * vectors_per_block;
+* blocks.x = n_vectors / vectors_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Adds up the `size` floats of shared_array in place by repeatedly folding the
+// upper half onto the lower half; the total ends up in shared_array[0].
+// Contains block-wide barriers.
+__device__ void sum_up_floats(volatile float* shared_array, int const size)
+{
+    // position of this thread within its vector of `size` elements
+    int const vector_in_block = threadIdx.x / size;
+    int const element = threadIdx.x - (vector_in_block * size);
+
+    // wait until all elements have been written
+    __syncthreads();
+
+    for (int half = size >> 1; half != 0; half >>= 1)
+    {
+        if (element < half)
+        {
+            shared_array[element] += shared_array[element + half];
+        }
+        // finish this folding step before the next one reads its results
+        __syncthreads();
+    }
+}
+
+/* Description of the cuda_calculate_chi_squares function
+* ========================================================
+*
+* This function calculates the chi-square values calling a __device__ function.
+* The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* chi_squares: An output vector of concatenated chi-square values.
+*
+* states: An output vector of values which indicate whether the fitting process
+* was carried out correctly or which problem occurred. In this function
+* it is only used for MLE. It is set to 3 if a fitting curve value is
+* negative. This vector includes the states for multiple fits.
+*
+* iteration_falied: An output vector which indicates whether the chi-square values
+* calculated by the current iteration decreased compared to the
+* previous iteration.
+*
+* prev_chi_squares: An input vector of concatenated chi-square values calculated
+* by the previous iteration.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+* while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+* fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_chi_squares function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = power_of_two_n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* cuda_calculate_chi_squares<<< blocks, threads >>>(
+* chi_squares,
+* states,
+* iteration_falied,
+* prev_chi_squares,
+* data,
+* values,
+* weight,
+* n_points,
+* estimator_id,
+* finished,
+* n_fits_per_block,
+* user_info,
+* user_info_size);
+*
+*/
+
+// Computes the chi-square value of every unfinished fit and flags the fits whose
+// chi-square did not decrease compared to prev_chi_squares. The per-point
+// contributions are produced by the LSE or MLE device function and added up in
+// shared memory.
+__global__ void cuda_calculate_chi_squares(
+    float * chi_squares,
+    int * states,
+    int * iteration_falied,
+    float const * prev_chi_squares,
+    float const * data,
+    float const * values,
+    float const * weights,
+    int const n_points,
+    int const estimator_id,
+    int const * finished,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // every fit of the block owns a slice of threads and of shared memory
+    int const slice_size = blockDim.x / n_fits_per_block;
+    int const slice = threadIdx.x / slice_size;
+    int const fit_index = blockIdx.x * n_fits_per_block + slice;
+    int const lane = threadIdx.x - slice * slice_size;
+    int const offset = fit_index * n_points;
+
+    if (finished[fit_index])
+    {
+        return;
+    }
+
+    extern __shared__ float extern_array[];
+
+    volatile float * partial_sums = &extern_array[slice * slice_size];
+
+    if (lane < n_points)
+    {
+        // this thread handles a real data point
+        float const * fit_data = &data[offset];
+        float const * fit_values = &values[offset];
+        float const * fit_weights = weights ? &weights[offset] : NULL;
+        int * fit_state = &states[fit_index];
+
+        if (estimator_id == LSE)
+        {
+            calculate_chi_square_lse(
+                partial_sums, lane,
+                fit_data, fit_values, fit_weights, fit_state,
+                user_info, user_info_size);
+        }
+        else if (estimator_id == MLE)
+        {
+            calculate_chi_square_mle(
+                partial_sums, lane,
+                fit_data, fit_values, fit_weights, fit_state,
+                user_info, user_info_size);
+        }
+    }
+    else
+    {
+        // padding element: must not contribute to the sum
+        partial_sums[lane] = 0.f;
+    }
+
+    sum_up_floats(partial_sums, slice_size);
+    chi_squares[fit_index] = partial_sums[0];
+
+    // a previous value of zero means that no previous chi-square is available yet
+    bool const has_previous = prev_chi_squares[fit_index] != 0;
+    bool const not_decreased = (chi_squares[fit_index] >= prev_chi_squares[fit_index]);
+
+    iteration_falied[fit_index] = (has_previous && not_decreased) ? 1 : 0;
+}
+
+/* Description of the cuda_calculate_gradients function
+* ========================================================
+*
+* This function calculates the gradient values of the chi-square function calling
+* a __device__ function. The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* gradients: An output vector of concatenated sets of gradient vector values.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* derivatives: An input vector of concatenated sets of model function partial
+* derivatives.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+* while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* n_parameters_to_fit: The number of fitting curve parameters, that are not held
+* fixed.
+*
+* parameters_to_fit_indices: An input vector of indices of fitting curve parameters,
+* that are not held fixed.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+* fits.
+*
+* skip: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_gradients function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = power_of_two_n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* cuda_calculate_gradients<<< blocks, threads >>>(
+* gradients,
+* data,
+* values,
+* derivatives,
+* weight,
+* n_points,
+* n_parameters,
+* n_parameters_to_fit,
+* parameters_to_fit_indices,
+* estimator_id,
+* finished,
+* skip,
+* n_fits_per_block,
+* user_info,
+* user_info_size);
+*
+*/
+
+// Computes the gradient vector of the chi-square function for every fit that is
+// neither finished nor skipped. For each parameter to fit, the per-point
+// contributions are produced by the LSE or MLE device function, added up in
+// shared memory and stored in gradients[fit_index * n_parameters_to_fit + k].
+__global__ void cuda_calculate_gradients(
+    float * gradients,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * finished,
+    int const * skip,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // every fit of the block owns shared_size threads and shared_size floats
+    int const shared_size = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / shared_size;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    int const point_index = threadIdx.x - fit_in_block * shared_size;
+    int const first_point = fit_index * n_points;
+
+    if (finished[fit_index] || skip[fit_index])
+    {
+        return;
+    }
+
+    float const * current_data = &data[first_point];
+    float const * current_weight = weights ? &weights[first_point] : NULL;
+    float const * current_derivative = &derivatives[first_point * n_parameters];
+    float const * current_value = &values[first_point];
+
+    extern __shared__ float extern_array[];
+
+    volatile float * shared_gradient = &extern_array[fit_in_block * shared_size];
+
+    // padding elements must not contribute to the sums
+    if (point_index >= n_points)
+    {
+        shared_gradient[point_index] = 0.f;
+    }
+
+    for (int parameter_index = 0; parameter_index < n_parameters_to_fit; parameter_index++)
+    {
+        if (point_index < n_points)
+        {
+            // the derivatives of one fit are stored parameter by parameter
+            int const derivative_index = parameters_to_fit_indices[parameter_index] * n_points + point_index;
+
+            if (estimator_id == LSE)
+            {
+                calculate_gradient_lse(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+            else if (estimator_id == MLE)
+            {
+                calculate_gradient_mle(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+        }
+        sum_up_floats(shared_gradient, shared_size);
+        gradients[fit_index * n_parameters_to_fit + parameter_index] = shared_gradient[0];
+
+        // All threads have to finish reading shared_gradient[0] before the shared
+        // memory is reused for the next parameter (the estimator functions
+        // presumably write the contribution of their point to
+        // shared_gradient[point_index]); without this barrier a fast thread could
+        // overwrite the sum while a slow thread is still reading it.
+        __syncthreads();
+    }
+}
+
+/* Description of the cuda_calculate_hessians function
+* ========================================================
+*
+* This function calculates the hessian matrix values of the chi-square function
+* calling __device__ functions. The calculation is performed for multiple fits
+* in parallel.
+*
+* Parameters:
+*
+* hessians: An output vector of concatenated sets of hessian matrix values.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* derivatives: An input vector of concatenated sets of model function partial
+* derivatives.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+* while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* n_parameters_to_fit: The number of fitting curve parameters, that are not held
+* fixed.
+*
+* parameters_to_fit_indices: An input vector of indices of fitting curve parameters,
+* that are not held fixed.
+*
+* estimator_id: The estimator ID.
+*
+* skip: An input vector which allows the calculation to be skipped for single fits.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+* fits.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_hessians function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_parameters_to_fit;
+* threads.y = n_parameters_to_fit;
+* blocks.x = n_fits;
+*
+* cuda_calculate_hessians<<< blocks, threads >>>(
+* hessians,
+* data,
+* values,
+* derivatives,
+* weight,
+* n_points,
+* n_parameters,
+* n_parameters_to_fit,
+* parameters_to_fit_indices,
+* estimator_id,
+* skip,
+* finished,
+* user_info,
+* user_info_size);
+*
+*/
+
+// Computes the hessian matrix of the chi-square function for every fit that is
+// neither finished nor skipped. One block handles one fit; the thread
+// (threadIdx.x, threadIdx.y) accumulates the matrix element for that pair of
+// parameters over all data points and stores it in `hessians`.
+__global__ void cuda_calculate_hessians(
+    float * hessians,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * skip,
+    int const * finished,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const fit_index = blockIdx.x;
+
+    if (finished[fit_index] || skip[fit_index])
+    {
+        return;
+    }
+
+    // matrix element handled by this thread
+    int const row = threadIdx.x;
+    int const col = threadIdx.y;
+
+    // data belonging to this fit
+    int const offset = fit_index * n_points;
+    float const * fit_data = &data[offset];
+    float const * fit_values = &values[offset];
+    float const * fit_derivatives = &derivatives[offset * n_parameters];
+    float const * fit_weights = weights ? &weights[offset] : NULL;
+    float * fit_hessian = &hessians[fit_index * n_parameters_to_fit * n_parameters_to_fit];
+
+    // start of the derivatives with respect to the two parameters
+    int const first_derivative_i = parameters_to_fit_indices[row] * n_points;
+    int const first_derivative_j = parameters_to_fit_indices[col] * n_points;
+
+    // accumulate in double precision, store as float
+    double sum = 0.0;
+    for (int point_index = 0; point_index < n_points; point_index++)
+    {
+        int const derivative_i = first_derivative_i + point_index;
+        int const derivative_j = first_derivative_j + point_index;
+
+        if (estimator_id == LSE)
+        {
+            calculate_hessian_lse(
+                &sum, point_index, derivative_i, derivative_j,
+                fit_data, fit_values, fit_derivatives, fit_weights,
+                user_info, user_info_size);
+        }
+        else if (estimator_id == MLE)
+        {
+            calculate_hessian_mle(
+                &sum, point_index, derivative_i, derivative_j,
+                fit_data, fit_values, fit_derivatives, fit_weights,
+                user_info, user_info_size);
+        }
+    }
+
+    fit_hessian[row * n_parameters_to_fit + col] = sum;
+}
+
+/* Description of the cuda_modify_step_widths function
+* ====================================================
+*
+* This function modifies the diagonal elements of the hessian matrices by multiplying
+* them by the factor (1+ lambda). This operation controls the step widths of the
+* iteration. If the last iteration failed, before modifying the hessian, the diagonal
+* elements of the hessian are calculated back to represent unmodified values.
+*
+* hessians: An input and output vector of hessian matrices, which are modified by
+* the lambda values.
+*
+* lambdas: An input vector of values for modifying the hessians.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* iteration_failed: An input vector which indicates whether the previous iteration
+* failed.
+*
+* finished: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* Calling the cuda_modify_step_widths function
+* ============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_parameters_to_fit * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* cuda_modify_step_widths<<< blocks, threads >>>(
+* hessians,
+* lambdas,
+* n_parameters,
+* iteration_failed,
+* finished,
+* n_fits_per_block);
+*
+*/
+
+__global__ void cuda_modify_step_widths(
+    float * hessians,
+    float const * lambdas,
+    unsigned int const n_parameters,
+    int const * iteration_failed,
+    int const * finished,
+    int const n_fits_per_block)
+{
+    // The threads of a block are divided evenly among the fits it handles;
+    // each thread is responsible for one diagonal element of one hessian.
+    int const threads_per_fit = blockDim.x / n_fits_per_block;
+    int const local_fit = threadIdx.x / threads_per_fit;
+    int const diagonal = threadIdx.x - local_fit * threads_per_fit;
+    int const fit = blockIdx.x * n_fits_per_block + local_fit;
+
+    if (finished[fit])
+    {
+        return;
+    }
+
+    float const lambda = lambdas[fit];
+
+    float * fit_hessian = hessians + fit * n_parameters * n_parameters;
+    float & element = fit_hessian[diagonal * n_parameters + diagonal];
+
+    if (iteration_failed[fit])
+    {
+        // Divide out the factor (1 + lambda / 10) first, so that the scaling
+        // below is applied to the unmodified diagonal value.
+        element /= (1.0f + lambda / 10.f);
+    }
+
+    // Scale the diagonal element by (1 + lambda) to control the step width.
+    element *= (1.0f + lambda);
+}
+
+/* Description of the cuda_update_parameters function
+* ===================================================
+*
+* This function stores the fitting curve parameter values in prev_parameters and
+* updates them after each iteration.
+*
+* Parameters:
+*
+* deltas: An input vector of concatenated delta values, which are added to the
+* model parameters.
+*
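+* parameters: An input and output vector of concatenated sets of model
+* parameters.
+*
+* prev_parameters: An output vector of concatenated sets of model parameters,
+* which receives the parameter values as they were before the update.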
+*
+* n_parameters_to_fit: The number of fitted curve parameters.
+*
+* parameters_to_fit_indices: The indices of fitted curve parameters.
+*
+* finished: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each threadblock.
+*
+* Calling the cuda_update_parameters function
+* ===========================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_parameters * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* cuda_update_parameters<<< blocks, threads >>>(
+* parameters,
+* prev_parameters,
+* deltas,
+* n_parameters_to_fit,
+* parameters_to_fit_indices,
+* finished,
+* n_fits_per_block);
+*
+*/
+
+__global__ void cuda_update_parameters(
+    float * parameters,
+    float * prev_parameters,
+    float const * deltas,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const * finished,
+    int const n_fits_per_block)
+{
+    // blockDim.x spans n_fits_per_block fits, so each fit is served by
+    // n_parameters threads, one per model parameter.
+    int const n_parameters = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / n_parameters;
+    int const parameter_index = threadIdx.x - fit_in_block * n_parameters;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    float * current_parameters = &parameters[fit_index * n_parameters];
+    float * current_prev_parameters = &prev_parameters[fit_index * n_parameters];
+
+    // Back up the current value of every parameter (fitted or not), also for
+    // fits that are already finished.
+    current_prev_parameters[parameter_index] = current_parameters[parameter_index];
+
+    // The update below writes to parameters[parameters_to_fit_indices[k]],
+    // an element that a different thread of the same fit has just read for the
+    // backup above. Wait until every thread of the block has completed its
+    // backup before any parameter is modified. All threads reach this barrier
+    // because it precedes every early return.
+    __syncthreads();
+
+    if (finished[fit_index])
+    {
+        return;
+    }
+
+    // Only the first n_parameters_to_fit threads of a fit apply a delta.
+    if (parameter_index >= n_parameters_to_fit)
+    {
+        return;
+    }
+
+    float const * current_deltas = &deltas[fit_index * n_parameters_to_fit];
+
+    // deltas are stored per fitted parameter; map back to the model parameter.
+    current_parameters[parameters_to_fit_indices[parameter_index]] += current_deltas[parameter_index];
+}
+
+/* Description of the cuda_update_state_after_gaussjordan function
+* ================================================================
+*
+* This function interprets the singular flag vector of the Gauss Jordan function
+* according to this LM implementation.
+*
+* Parameters:
+*
+* n_fits: The number of fits.
+*
+* singular_checks: An input vector used to report whether a fit is singular.
+*
+* states: An output vector of values which indicate whether the fitting process
+* was carried out correctly or which problem occurred. If a hessian
+* matrix of a fit is singular, it is set to 2.
+*
+* Calling the cuda_update_state_after_gaussjordan function
+* ========================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* int const example_value = 256;
+*
+* threads.x = min(n_fits, example_value);
+* blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+* cuda_update_state_after_gaussjordan<<< blocks, threads >>>(
+* n_fits,
+* singular_checks,
+* states);
+*
+*/
+
+
+__global__ void cuda_update_state_after_gaussjordan(
+    int const n_fits,
+    int const * singular_checks,
+    int * states)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // A singular flag of 1 marks the fit's state as a singular hessian;
+    // any other flag value leaves the state untouched.
+    if (fit < n_fits && singular_checks[fit] == 1)
+    {
+        states[fit] = STATE_SINGULAR_HESSIAN;
+    }
+}
+
+/* Description of the cuda_check_for_convergence function
+* =======================================================
+*
+* This function checks after each iteration whether the fits are converged or not.
+* It also checks whether the set maximum number of iterations is reached.
+*
+* Parameters:
+*
+* finished: An input and output vector which allows the calculation to be skipped
+* for single fits.
+*
+* tolerance: The tolerance value for the convergence set by user.
+*
+* states: An output vector of values which indicate whether the fitting process
+* was carried out correctly or which problem occurred. If the maximum
+* number of iterations is reached without converging, it is set to 1. If
+* the fit converged it keeps its initial value of 0.
+*
+* chi_squares: An input vector of chi-square values for multiple fits. Used for the
+* convergence check.
+*
+* prev_chi_squares: An input vector of chi-square values for multiple fits calculated
+* in the previous iteration. Used for the convergence check.
+*
+* iteration: The value of the current iteration. It is compared to the value
+* of the maximum number of iteration set by user.
+*
+* max_n_iterations: The maximum number of iterations set by user.
+*
+* n_fits: The number of fits.
+*
+* Calling the cuda_check_for_convergence function
+* ===============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* int const example_value = 256;
+*
+* threads.x = min(n_fits, example_value);
+* blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+* cuda_check_for_convergence<<< blocks, threads >>>(
+* finished,
+* tolerance,
+* states,
+* chi_squares,
+* prev_chi_squares,
+* iteration,
+* max_n_iterations,
+* n_fits);
+*
+*/
+
+__global__ void cuda_check_for_convergence(
+    int * finished,
+    float const tolerance,
+    int * states,
+    float const * chi_squares,
+    float const * prev_chi_squares,
+    int const iteration,
+    int const max_n_iterations,
+    int const n_fits)
+{
+    // One thread per fit.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Surplus threads and fits that are already finished have nothing to check.
+    if (fit >= n_fits || finished[fit])
+    {
+        return;
+    }
+
+    float const chi_square = chi_squares[fit];
+
+    // Converged when the chi-square change is below the tolerance, taken
+    // relative to chi-square but never relative to less than 1.
+    bool const converged
+        = abs(chi_square - prev_chi_squares[fit]) < tolerance * fmaxf(1, chi_square);
+
+    bool const last_iteration = (iteration == max_n_iterations - 1);
+
+    if (converged)
+    {
+        finished[fit] = 1;
+    }
+    else if (last_iteration)
+    {
+        // Not converged and no iterations left.
+        states[fit] = STATE_MAX_ITERATION;
+    }
+}
+
+/* Description of the cuda_evaluate_iteration function
+* ====================================================
+*
+* This function evaluates the current iteration.
+* - It marks a fit as finished if a problem occurred.
+* - It saves the needed number of iterations if a fit finished.
+* - It checks if all fits finished
+*
+* Parameters:
+*
+* all_finished: An output flag, that indicates whether all fits finished.
+*
+* n_iterations: An output vector of needed iterations for each fit.
+*
+* finished: An input and output vector which allows the evaluation to be skipped
+* for single fits
+*
+* iteration: The values of the current iteration.
+*
+* states: An input vector of values which indicate whether the fitting process
+* was carried out correctly or which problem occurred.
+*
+* n_fits: The number of fits.
+*
+* Calling the cuda_evaluate_iteration function
+* ============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* int const example_value = 256;
+*
+* threads.x = min(n_fits, example_value);
+* blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+* cuda_evaluate_iteration<<< blocks, threads >>>(
+* all_finished,
+* n_iterations,
+* finished,
+* iteration,
+* states,
+* n_fits);
+*
+*/
+
+__global__ void cuda_evaluate_iteration(
+    int * all_finished,
+    int * n_iterations,
+    int * finished,
+    int const iteration,
+    int const * states,
+    int const n_fits)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+    if (fit >= n_fits)
+    {
+        return;
+    }
+
+    // Any state other than STATE_CONVERGED ends the fit.
+    if (states[fit] != STATE_CONVERGED)
+    {
+        finished[fit] = 1;
+    }
+
+    if (finished[fit])
+    {
+        // Store the iteration count only while the entry is still 0, i.e.
+        // the first time the fit is seen as finished.
+        if (n_iterations[fit] == 0)
+        {
+            n_iterations[fit] = iteration + 1;
+        }
+    }
+    else
+    {
+        // At least one fit is still running.
+        *all_finished = 0;
+    }
+}
+
+/* Description of the cuda_prepare_next_iteration function
+* ========================================================
+*
+* This function prepares the next iteration. It either updates chi-square values
+* or sets chi-squares and curve parameters to previous values. This function also
+* updates lambda values.
+*
+* Parameters:
+*
+* lambdas: An output vector of values which control the step width by modifying
+* the diagonal elements of the hessian matrices.
+*
+* chi_squares: An input vector of chi-square values for multiple fits.
+*
+* prev_chi_squares: An input vector of chi-square values for multiple fits calculated
+* in the previous iteration.
+*
+* parameters: An output vector of concatenated sets of model parameters.
+*
+* prev_parameters: An input vector of concatenated sets of model parameters
+* calculated in the previous iteration.
+*
+* n_fits: The number of fits.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* Calling the cuda_prepare_next_iteration function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* int const example_value = 256;
+*
+* threads.x = min(n_fits, example_value);
+* blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+* cuda_prepare_next_iteration<<< blocks, threads >>>(
+* lambdas,
+* chi_squares,
+* prev_chi_squares,
+* parameters,
+* prev_parameters,
+* n_fits,
+* n_parameters);
+*
+*/
+
+__global__ void cuda_prepare_next_iteration(
+    float * lambdas,
+    float * chi_squares,
+    float * prev_chi_squares,
+    float * parameters,
+    float const * prev_parameters,
+    int const n_fits,
+    int const n_parameters)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+    if (fit >= n_fits)
+    {
+        return;
+    }
+
+    bool const improved = chi_squares[fit] < prev_chi_squares[fit];
+
+    if (!improved)
+    {
+        // Step rejected: increase lambda and roll chi-square and all model
+        // parameters back to the values of the previous iteration.
+        lambdas[fit] *= 10.f;
+        chi_squares[fit] = prev_chi_squares[fit];
+
+        float * current = parameters + fit * n_parameters;
+        float const * previous = prev_parameters + fit * n_parameters;
+        for (int k = 0; k < n_parameters; ++k)
+        {
+            current[k] = previous[k];
+        }
+        return;
+    }
+
+    // Step accepted: decrease lambda and remember the new chi-square.
+    lambdas[fit] *= 0.1f;
+    prev_chi_squares[fit] = chi_squares[fit];
+}
diff --git a/Gpufit/cuda_kernels.cuh b/Gpufit/cuda_kernels.cuh
new file mode 100644
index 0000000..6836480
--- /dev/null
+++ b/Gpufit/cuda_kernels.cuh
@@ -0,0 +1,108 @@
+#ifndef GPUFIT_CUDA_KERNELS_CUH_INCLUDED
+#define GPUFIT_CUDA_KERNELS_CUH_INCLUDED
+
+// Declarations of the CUDA kernels used by the fit. The parameter
+// descriptions and the required launch configurations are documented next to
+// the kernel definitions.
+
+#include <cstddef> // std::size_t
+
+extern __global__ void cuda_calculate_chi_squares(
+    float * chi_squares,
+    int * states,
+    int * iteration_failed,
+    float const * prev_chi_squares,
+    float const * data,
+    float const * values,
+    float const * weights,
+    int const n_points,
+    int const estimator_id,
+    int const * finished,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size);
+
+extern __global__ void cuda_calculate_gradients(
+    float * gradients,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * finished,
+    int const * skip,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size);
+
+// Note: unlike cuda_calculate_gradients, skip precedes finished here.
+extern __global__ void cuda_calculate_hessians(
+    float * hessians,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * skip,
+    int const * finished,
+    char * user_info,
+    std::size_t const user_info_size);
+
+// Scales the diagonal elements of the hessians by (1 + lambda).
+extern __global__ void cuda_modify_step_widths(
+    float * hessians,
+    float const * lambdas,
+    unsigned int const n_parameters,
+    int const * iteration_failed,
+    int const * finished,
+    int const n_fits_per_block);
+
+extern __global__ void cuda_calc_curve_values(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    int const * finished,
+    float * values,
+    float * derivatives,
+    int const n_fits_per_block,
+    int const model_id,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size);
+
+// Saves the parameters to prev_parameters and adds the deltas to them.
+extern __global__ void cuda_update_parameters(
+    float * parameters,
+    float * prev_parameters,
+    float const * deltas,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const * finished,
+    int const n_fits_per_block);
+
+// Marks converged fits as finished and flags fits that ran out of iterations.
+extern __global__ void cuda_check_for_convergence(
+    int * finished,
+    float const tolerance,
+    int * states,
+    float const * chi_squares,
+    float const * prev_chi_squares,
+    int const iteration,
+    int const max_n_iterations,
+    int const n_fits);
+
+// Finishes failed fits, records iteration counts and clears *all_finished
+// while any fit is still running.
+extern __global__ void cuda_evaluate_iteration(
+    int * all_finished,
+    int * n_iterations,
+    int * finished,
+    int const iteration,
+    int const * states,
+    int const n_fits);
+
+// Accepts or rejects the last step per fit and adapts lambda accordingly.
+extern __global__ void cuda_prepare_next_iteration(
+    float * lambdas,
+    float * chi_squares,
+    float * prev_chi_squares,
+    float * function_parameters,
+    float const * prev_parameters,
+    int const n_fits,
+    int const n_parameters);
+
+// Sets the state of fits whose singular flag is 1 to STATE_SINGULAR_HESSIAN.
+extern __global__ void cuda_update_state_after_gaussjordan(
+    int const n_fits,
+    int const * singular_checks,
+    int * states);
+
+#endif
diff --git a/Gpufit/definitions.h b/Gpufit/definitions.h
new file mode 100644
index 0000000..348220d
--- /dev/null
+++ b/Gpufit/definitions.h
@@ -0,0 +1,12 @@
+#ifndef GPUFIT_DEFINITIONS_H_INCLUDED
+#define GPUFIT_DEFINITIONS_H_INCLUDED
+
+#include <stdexcept> // std::runtime_error
+
+// Status
+//
+// Evaluates a CUDA call that yields a cudaError_t and, if the result is
+// non-zero, throws std::runtime_error carrying the text returned by
+// cudaGetErrorString for that status. The CUDA runtime declarations
+// (cudaError_t, cudaGetErrorString) must be visible where the macro is used.
+#define CUDA_CHECK_STATUS( cuda_function_call ) \
+    if (cudaError_t const status = cuda_function_call) \
+    { \
+        throw std::runtime_error( cudaGetErrorString( status ) ) ; \
+    }
+
+#endif
diff --git a/Gpufit/examples/CMakeLists.txt b/Gpufit/examples/CMakeLists.txt
new file mode 100644
index 0000000..bb4902f
--- /dev/null
+++ b/Gpufit/examples/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+# add_example( <module> <name> )
+#
+# Creates the executable target <name> from <name>.cpp in the current source
+# directory and links it against <module>. The executable is placed in
+# PROJECT_BINARY_DIR and grouped in the IDE folder "GpufitExamples".
+#
+# Example:
+#   add_example( Gpufit Simple_Example )
+function( add_example module name )
+    add_executable( ${name} ${name}.cpp )
+    # The module is only needed to build the example itself, hence PRIVATE.
+    target_link_libraries( ${name} PRIVATE ${module} )
+    set_property( TARGET ${name}
+        PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+    set_property( TARGET ${name} PROPERTY FOLDER GpufitExamples )
+endfunction()
+
+# Examples
+
+add_example( Gpufit Simple_Example )
+add_example( Gpufit Linear_Regression_Example )
+add_example( Gpufit Gauss_Fit_2D_Example )
diff --git a/Gpufit/examples/Gauss_Fit_2D_Example.cpp b/Gpufit/examples/Gauss_Fit_2D_Example.cpp
new file mode 100644
index 0000000..8e628c7
--- /dev/null
+++ b/Gpufit/examples/Gauss_Fit_2D_Example.cpp
@@ -0,0 +1,260 @@
+#include "../gpufit.h"
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+// Evaluates a 2D Gaussian peak on a set of coordinates.
+//
+// x, y: coordinates of the data points
+// g:    output, receives the model value for every point
+// p:    model parameters (size 5): amplitude, center x, center y, width, offset
+//
+// x, y and g are assumed to have the same size; this is not checked.
+void generate_gauss_2d(
+    std::vector<float> const & x,
+    std::vector<float> const & y,
+    std::vector<float> & g,
+    std::vector<float> const & p)
+{
+    // g = p[0] * exp(-((x - p[1])^2 + (y - p[2])^2) / (2 * p[3]^2)) + p[4]
+    for (size_t i = 0; i < x.size(); i++)
+    {
+        float arg = -((x[i] - p[1]) * (x[i] - p[1]) + (y[i] - p[2]) * (y[i] - p[2])) / (2 * p[3] * p[3]);
+        g[i] = p[0] * exp(arg) + p[4];
+    }
+}
+
+void gauss_fit_2d_example()
+{
+    /*
+        This example generates test data in form of 10000 two dimensional Gaussian
+        peaks with the size of 20x20 data points per peak. It is noised by Poisson
+        distributed noise. The initial guesses were randomized, within a specified
+        range of the true value. The GAUSS_2D model is fitted to the test data sets
+        using the MLE estimator.
+
+        The console output shows
+        - the execution time,
+        - the ratio of converged fits including ratios of not converged fits for
+          different reasons,
+        - the values of the true parameters and the mean values of the fitted
+          parameters including their standard deviation,
+        - the mean chi square value
+        - and the mean number of iterations needed.
+
+        True parameters and noise and number of fits is the same as for the Matlab/Python 2D Gaussian examples.
+    */
+
+
+    // number of fits, fit points and parameters
+    size_t const number_fits = 10000;
+    size_t const size_x = 20;
+    size_t const number_points = size_x * size_x;
+    size_t const number_parameters = 5;
+
+    // true parameters (amplitude, center x position, center y position, width, offset)
+    std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f};
+
+    // initialize random number generator
+    std::mt19937 rng;
+    rng.seed(0);
+    std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+    // initial parameters (randomized)
+    std::vector< float > initial_parameters(number_fits * number_parameters);
+    for (size_t i = 0; i < number_fits; i++)
+    {
+        for (size_t j = 0; j < number_parameters; j++)
+        {
+            if (j == 1 || j == 2)
+            {
+                // center positions: true value +/- 20 % of the true width
+                initial_parameters[i * number_parameters + j]
+                    = true_parameters[j] + true_parameters[3]
+                    * (-0.2f + 0.4f * uniform_dist(rng));
+            }
+            else
+            {
+                // all other parameters: 80 % to 120 % of the true value
+                initial_parameters[i * number_parameters + j]
+                    = true_parameters[j] * (0.8f + 0.4f * uniform_dist(rng));
+            }
+        }
+    }
+
+    // generate x and y values
+    std::vector< float > x(number_points);
+    std::vector< float > y(number_points);
+    for (size_t i = 0; i < size_x; i++)
+    {
+        for (size_t j = 0; j < size_x; j++) {
+            x[i * size_x + j] = static_cast<float>(j);
+            y[i * size_x + j] = static_cast<float>(i);
+        }
+    }
+
+    // generate test data with Poisson noise
+    std::vector< float > temp(number_points);
+    generate_gauss_2d(x, y, temp, true_parameters);
+
+    std::vector< float > data(number_fits * number_points);
+    for (size_t i = 0; i < number_fits; i++)
+    {
+        for (size_t j = 0; j < number_points; j++)
+        {
+            // the noise-free model value is the mean of the Poisson distribution
+            std::poisson_distribution< int > poisson_dist(temp[j]);
+            data[i * number_points + j] = static_cast<float>(poisson_dist(rng));
+        }
+    }
+
+    // tolerance
+    float const tolerance = 0.001f;
+
+    // maximal number of iterations
+    int const max_number_iterations = 20;
+
+    // estimator ID
+    int const estimator_id = MLE;
+
+    // model ID
+    int const model_id = GAUSS_2D;
+
+    // parameters to fit (all of them)
+    std::vector< int > parameters_to_fit(number_parameters, 1);
+
+    // output parameters
+    std::vector< float > output_parameters(number_fits * number_parameters);
+    std::vector< int > output_states(number_fits);
+    std::vector< float > output_chi_square(number_fits);
+    std::vector< int > output_number_iterations(number_fits);
+
+    // call to gpufit (C interface), timed
+    std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now();
+    int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            0,
+            0,
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+    std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now();
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        throw std::runtime_error(gpufit_get_last_error());
+    }
+
+    // print execution time
+    std::cout
+        << "execution time "
+        << std::chrono::duration_cast<std::chrono::milliseconds>(time_1 - time_0).count() << " ms\n";
+
+    // get fit states (one histogram bin per state value 0..4)
+    std::vector< int > output_states_histogram(5, 0);
+    for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+    {
+        output_states_histogram[*it]++;
+    }
+
+    std::cout << "ratio converged " << (float)output_states_histogram[0] / number_fits << "\n";
+    std::cout << "ratio max iteration exceeded " << (float)output_states_histogram[1] / number_fits << "\n";
+    std::cout << "ratio singular hessian " << (float)output_states_histogram[2] / number_fits << "\n";
+    std::cout << "ratio neg curvature MLE " << (float)output_states_histogram[3] / number_fits << "\n";
+    std::cout << "ratio gpu not read " << (float)output_states_histogram[4] / number_fits << "\n";
+
+    // compute mean of fitted parameters for converged fits
+    std::vector< float > output_parameters_mean(number_parameters, 0);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            for (size_t j = 0; j < number_parameters; j++)
+            {
+                output_parameters_mean[j] += output_parameters[i * number_parameters + j];
+            }
+        }
+    }
+    // normalize by the number of converged fits
+    for (size_t j = 0; j < number_parameters; j++)
+    {
+        output_parameters_mean[j] /= output_states_histogram[0];
+    }
+
+    // compute std of fitted parameters for converged fits
+    std::vector< float > output_parameters_std(number_parameters, 0);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            for (size_t j = 0; j < number_parameters; j++)
+            {
+                output_parameters_std[j]
+                    += (output_parameters[i * number_parameters + j] - output_parameters_mean[j])
+                    * (output_parameters[i * number_parameters + j] - output_parameters_mean[j]);
+            }
+        }
+    }
+    // normalize and take square root
+    for (size_t j = 0; j < number_parameters; j++)
+    {
+        output_parameters_std[j] = sqrt(output_parameters_std[j] / output_states_histogram[0]);
+    }
+
+    // print true value, fitted mean and std for every parameter
+    for (size_t j = 0; j < number_parameters; j++)
+    {
+        std::cout
+            << "parameter " << j
+            << " true " << true_parameters[j]
+            << " fitted mean " << output_parameters_mean[j]
+            << " std " << output_parameters_std[j] << "\n";
+    }
+
+    // compute mean chi-square for those converged
+    float output_chi_square_mean = 0;
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            output_chi_square_mean += output_chi_square[i];
+        }
+    }
+    output_chi_square_mean /= static_cast<float>(output_states_histogram[0]);
+    std::cout << "mean chi square " << output_chi_square_mean << "\n";
+
+    // compute mean number of iterations for those converged
+    float output_number_iterations_mean = 0;
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            output_number_iterations_mean += static_cast<float>(output_number_iterations[i]);
+        }
+    }
+    // normalize
+    output_number_iterations_mean /= static_cast<float>(output_states_histogram[0]);
+    std::cout << "mean number of iterations " << output_number_iterations_mean << "\n";
+
+}
+
+// Runs the 2D Gaussian fit example and waits for ENTER before exiting.
+// The command line arguments are not used.
+int main(int argc, char *argv[])
+{
+    gauss_fit_2d_example();
+
+    std::cout << std::endl;
+    std::cout << "Example completed!" << std::endl;
+    std::cout << "Press ENTER to exit" << std::endl;
+
+    // Block until the user presses ENTER.
+    std::getchar();
+
+    return 0;
+}
diff --git a/Gpufit/examples/Linear_Regression_Example.cpp b/Gpufit/examples/Linear_Regression_Example.cpp
new file mode 100644
index 0000000..e70e05d
--- /dev/null
+++ b/Gpufit/examples/Linear_Regression_Example.cpp
@@ -0,0 +1,207 @@
+#include "../gpufit.h"
+
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+void linear_regression_example()
+{
+    /*
+        This example generates test data in form of 10000 one dimensional linear
+        curves with the size of 20 data points per curve. It is noised by normal
+        distributed noise. The initial guesses were randomized, within a specified
+        range of the true value. The LINEAR_1D model is fitted to the test data sets
+        using the LSE estimator. The optional parameter user_info is used to pass
+        custom x positions of the data sets. The same x position values are used for
+        every fit.
+
+        The console output shows
+        - the ratio of converged fits including ratios of not converged fits for
+          different reasons,
+        - the values of the true parameters and the mean values of the fitted
+          parameters including their standard deviation,
+        - the mean chi square value
+        - and the mean number of iterations needed.
+    */
+
+    // number of fits, fit points and parameters
+    size_t const number_fits = 10000;
+    size_t const number_points = 20;
+    size_t const number_parameters = 2;
+
+    // custom x positions for the data points of every fit, stored in user info
+    // (powers of two: 1, 2, 4, ...)
+    std::vector< float > user_info(number_points);
+    for (size_t i = 0; i < number_points; i++)
+    {
+        user_info[i] = static_cast<float>(pow(2, i));
+    }
+
+    // size of user info in bytes
+    size_t const user_info_size = number_points * sizeof(float);
+
+    // initialize random number generator
+    std::mt19937 rng;
+    rng.seed(0);
+    std::uniform_real_distribution< float > uniform_dist(0, 1);
+    std::normal_distribution< float > normal_dist(0, 1);
+
+    // true parameters
+    std::vector< float > true_parameters { 5, 2 }; // offset, slope
+
+    // initial parameters (randomized to 80 % .. 120 % of the true value)
+    std::vector< float > initial_parameters(number_fits * number_parameters);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        // random offset
+        initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng));
+        // random slope (based on the true slope, not on the true offset)
+        initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng));
+    }
+
+    // generate data
+    std::vector< float > data(number_points * number_fits);
+    for (size_t i = 0; i != data.size(); i++)
+    {
+        size_t k = i % number_points; // the position within a fit
+
+        float x = user_info[k];
+        float y = true_parameters[0] + x * true_parameters[1];
+        data[i] = y + normal_dist(rng);
+    }
+
+    // tolerance
+    float const tolerance = 0.001f;
+
+    // maximal number of iterations
+    int const max_number_iterations = 20;
+
+    // estimator ID
+    int const estimator_id = LSE;
+
+    // model ID
+    int const model_id = LINEAR_1D;
+
+    // parameters to fit (all of them)
+    std::vector< int > parameters_to_fit(number_parameters, 1);
+
+    // output parameters
+    std::vector< float > output_parameters(number_fits * number_parameters);
+    std::vector< int > output_states(number_fits);
+    std::vector< float > output_chi_square(number_fits);
+    std::vector< int > output_number_iterations(number_fits);
+
+    // call to gpufit (C interface)
+    int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            user_info_size,
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        throw std::runtime_error(gpufit_get_last_error());
+    }
+
+    // get fit states (one histogram bin per state value 0..4)
+    std::vector< int > output_states_histogram(5, 0);
+    for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+    {
+        output_states_histogram[*it]++;
+    }
+
+    std::cout << "ratio converged " << (float) output_states_histogram[0] / number_fits << "\n";
+    std::cout << "ratio max iteration exceeded " << (float) output_states_histogram[1] / number_fits << "\n";
+    std::cout << "ratio singular hessian " << (float) output_states_histogram[2] / number_fits << "\n";
+    std::cout << "ratio neg curvature MLE " << (float) output_states_histogram[3] / number_fits << "\n";
+    std::cout << "ratio gpu not read " << (float) output_states_histogram[4] / number_fits << "\n";
+
+    // compute mean fitted parameters for converged fits
+    std::vector< float > output_parameters_mean(number_parameters, 0);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            // add offset
+            output_parameters_mean[0] += output_parameters[i * number_parameters + 0];
+            // add slope
+            output_parameters_mean[1] += output_parameters[i * number_parameters + 1];
+        }
+    }
+    output_parameters_mean[0] /= output_states_histogram[0];
+    output_parameters_mean[1] /= output_states_histogram[0];
+
+    // compute std of fitted parameters for converged fits
+    std::vector< float > output_parameters_std(number_parameters, 0);
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            // add squared deviation for offset
+            output_parameters_std[0] += (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]) * (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]);
+            // add squared deviation for slope
+            output_parameters_std[1] += (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]) * (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]);
+        }
+    }
+    // divide and take square root
+    output_parameters_std[0] = sqrt(output_parameters_std[0] / output_states_histogram[0]);
+    output_parameters_std[1] = sqrt(output_parameters_std[1] / output_states_histogram[0]);
+
+    // print mean and std
+    std::cout << "offset true " << true_parameters[0] << " mean " << output_parameters_mean[0] << " std " << output_parameters_std[0] << "\n";
+    std::cout << "slope true " << true_parameters[1] << " mean " << output_parameters_mean[1] << " std " << output_parameters_std[1] << "\n";
+
+    // compute mean chi-square for those converged
+    float output_chi_square_mean = 0;
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            output_chi_square_mean += output_chi_square[i];
+        }
+    }
+    output_chi_square_mean /= static_cast<float>(output_states_histogram[0]);
+    std::cout << "mean chi square " << output_chi_square_mean << "\n";
+
+    // compute mean number of iterations for those converged
+    float output_number_iterations_mean = 0;
+    for (size_t i = 0; i != number_fits; i++)
+    {
+        if (output_states[i] == STATE_CONVERGED)
+        {
+            output_number_iterations_mean += static_cast<float>(output_number_iterations[i]);
+        }
+    }
+
+    // normalize
+    output_number_iterations_mean /= static_cast<float>(output_states_histogram[0]);
+    std::cout << "mean number of iterations " << output_number_iterations_mean << "\n";
+}
+
+
+// Runs the linear regression example and waits for ENTER before exiting.
+// The command line arguments are not used.
+int main(int argc, char *argv[])
+{
+    linear_regression_example();
+
+    std::cout << std::endl;
+    std::cout << "Example completed!" << std::endl;
+    std::cout << "Press ENTER to exit" << std::endl;
+
+    // Block until the user presses ENTER.
+    std::getchar();
+
+    return 0;
+}
diff --git a/Gpufit/examples/Simple_Example.cpp b/Gpufit/examples/Simple_Example.cpp
new file mode 100644
index 0000000..6d8ea91
--- /dev/null
+++ b/Gpufit/examples/Simple_Example.cpp
@@ -0,0 +1,94 @@
+#include "../gpufit.h"
+#include <vector>
+#include <iostream>
+
+void simple_example()
+{
+    /*
+        Minimal demonstration of the C interface: every argument required by
+        gpufit() is supplied and the program can be built and executed, but
+        the fit itself is not meaningful. No test data is generated; the
+        input data vector and the initial parameter vector are
+        value-initialized, i.e. all of their elements are 0.
+
+        The example is divided into three parts:
+            - definition of input and output parameters
+            - call to gpufit
+            - status check
+    */
+
+    /*************** definition of input and output parameters ***************/
+
+    // problem size: number of fits and number of data points of each fit
+    const size_t number_fits = 10;
+    const size_t number_points = 10;
+
+    // fit model and the number of parameters used with it
+    const int model_id = GAUSS_1D;
+    const size_t number_parameters = 4;
+
+    // start values, one parameter set per fit (all zero)
+    std::vector<float> initial_parameters(number_fits * number_parameters);
+
+    // input data, number_points values per fit (all zero)
+    std::vector<float> data(number_points * number_fits);
+
+    // tolerance handed to gpufit
+    const float tolerance = 0.001f;
+
+    // upper limit for the number of iterations
+    const int max_number_iterations = 10;
+
+    // estimator used by the fit
+    const int estimator_id = LSE;
+
+    // one flag per model parameter; 1 everywhere, i.e. all of them are fitted
+    std::vector<int> parameters_to_fit(number_parameters, 1);
+
+    // buffers receiving the results
+    std::vector<float> output_parameters(number_fits * number_parameters);
+    std::vector<int> output_states(number_fits);
+    std::vector<float> output_chi_square(number_fits);
+    std::vector<int> output_number_iterations(number_fits);
+
+    /***************************** call to gpufit ****************************/
+
+    const int status = gpufit
+    (
+        number_fits,
+        number_points,
+        data.data(),
+        0,                                  // weights: null pointer
+        model_id,
+        initial_parameters.data(),
+        tolerance,
+        max_number_iterations,
+        parameters_to_fit.data(),
+        estimator_id,
+        0,                                  // user_info_size
+        0,                                  // user_info: null pointer
+        output_parameters.data(),
+        output_states.data(),
+        output_chi_square.data(),
+        output_number_iterations.data()
+    );
+
+    /****************************** status check *****************************/
+
+    if (status != STATUS_OK)
+    {
+        throw std::runtime_error(gpufit_get_last_error());
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    // run the example, then wait for ENTER so that the console output stays visible
+    simple_example();
+    std::cout << std::endl;
+    std::cout << "Example completed!" << std::endl;
+    std::cout << "Press ENTER to exit" << std::endl;
+    std::getchar();
+    return 0;
+}
diff --git a/Gpufit/gauss_1d.cuh b/Gpufit/gauss_1d.cuh
new file mode 100644
index 0000000..5fefc55
--- /dev/null
+++ b/Gpufit/gauss_1d.cuh
@@ -0,0 +1,91 @@
+#ifndef GPUFIT_GAUSS1D_CUH_INCLUDED
+#define GPUFIT_GAUSS1D_CUH_INCLUDED
+
+/* Description of the calculate_gauss1d function
+* ==============================================
+*
+* This function calculates the values of one-dimensional gauss model functions
+* and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function. Hence, the
+* (X) coordinate of the first data value is assumed to be (0.0). For
+* a fit size of M data points, the (X) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: amplitude
+* p[1]: center coordinate
+* p[2]: width (standard deviation)
+* p[3]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss1d function
+* ======================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss1d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block * n_points);
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    // start of the function values and of the parameter set of this thread's fit
+    float * current_value = &values[fit_index * n_points];
+    float const * p = &parameters[fit_index * n_parameters];
+    // value: p[0] * exp(-(x - p[1])^2 / (2 * p[2]^2)) + p[3] with x = point_index
+    float const dx = point_index - p[1];
+    float const argx = dx * dx / (2 * p[2] * p[2]);
+    float const ex = exp(-argx);
+    current_value[point_index] = p[0] * ex + p[3];
+
+    // derivatives: the n_points values belonging to one parameter are contiguous
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;                                                     // amplitude
+    current_derivative[1 * n_points] = p[0] * ex * dx / (p[2] * p[2]);              // center
+    current_derivative[2 * n_points] = p[0] * ex * dx * dx / (p[2] * p[2] * p[2]);  // width
+    current_derivative[3 * n_points] = 1.f;                                         // offset
+}
+
+#endif
diff --git a/Gpufit/gauss_2d.cuh b/Gpufit/gauss_2d.cuh
new file mode 100644
index 0000000..0448cfa
--- /dev/null
+++ b/Gpufit/gauss_2d.cuh
@@ -0,0 +1,97 @@
+#ifndef GPUFIT_GAUSS2D_CUH_INCLUDED
+#define GPUFIT_GAUSS2D_CUH_INCLUDED
+
+/* Description of the calculate_gauss2d function
+* ==============================================
+*
+* This function calculates the values of two-dimensional gauss model functions
+* and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function. Hence, the
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: amplitude
+* p[1]: center coordinate x
+* p[2]: center coordinate y
+* p[3]: width (standard deviation; equal width in x and y dimensions)
+* p[4]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss2d function
+* ======================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss2d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_points_x = sqrt((float)n_points);   // row length: the points of a fit are treated as a square
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - fit_in_block * n_points;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - point_index_y * n_points_x;
+    // start of the function values and of the parameter set of this thread's fit
+    float * current_value = &values[fit_index * n_points];
+    float const * p = &parameters[fit_index * n_parameters];
+    // value: p[0] * exp(-((x - p[1])^2 + (y - p[2])^2) / (2 * p[3]^2)) + p[4]
+    float const dx = point_index_x - p[1];
+    float const dy = point_index_y - p[2];
+    float const argx = dx * dx / (2 * p[3] * p[3]);
+    float const argy = dy * dy / (2 * p[3] * p[3]);
+    float const ex = exp(-(argx + argy));
+    current_value[point_index] = p[0] * ex + p[4];
+    // derivatives: the n_points values belonging to one parameter are contiguous
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;                                                                     // amplitude
+    current_derivative[1 * n_points] = p[0] * ex * dx / (p[3] * p[3]);                              // center x
+    current_derivative[2 * n_points] = p[0] * ex * dy / (p[3] * p[3]);                              // center y
+    current_derivative[3 * n_points] = ex * p[0] * (dx * dx + dy * dy) / (p[3] * p[3] * p[3]);      // width
+    current_derivative[4 * n_points] = 1;                                                           // offset
+}
+
+#endif
diff --git a/Gpufit/gauss_2d_elliptic.cuh b/Gpufit/gauss_2d_elliptic.cuh
new file mode 100644
index 0000000..5417667
--- /dev/null
+++ b/Gpufit/gauss_2d_elliptic.cuh
@@ -0,0 +1,100 @@
+#ifndef GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED
+#define GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED
+
+/* Description of the calculate_gauss2delliptic function
+* ======================================================
+*
+* This function calculates the values of two-dimensional elliptic gauss model
+* functions and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function. Hence, the
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: amplitude
+* p[1]: center coordinate x
+* p[2]: center coordinate y
+* p[3]: width x (standard deviation)
+* p[4]: width y (standard deviation)
+* p[5]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss2delliptic function
+* ==============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss2delliptic(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_points_x = sqrt((float)n_points);   // row length: the points of a fit are treated as a square
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block * n_points);
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    // two-dimensional coordinates of this thread's data point
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - point_index_y * n_points_x;
+    // start of the function values and of the parameter set of this thread's fit
+    float * current_value = &values[fit_index * n_points];
+    float const * p = &parameters[fit_index * n_parameters];
+    // value: p[0] * exp(-((x - p[1])^2 / (2 * p[3]^2) + (y - p[2])^2 / (2 * p[4]^2))) + p[5]
+    float const dx = point_index_x - p[1];
+    float const dy = point_index_y - p[2];
+    float const argx = dx * dx / (2 * p[3] * p[3]);
+    float const argy = dy * dy / (2 * p[4] * p[4]);
+    float const ex = exp(-(argx + argy));
+    current_value[point_index] = p[0] * ex + p[5];
+    // derivatives: the n_points values belonging to one parameter are contiguous
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;                                                     // amplitude
+    current_derivative[1 * n_points] = p[0] * ex * dx / (p[3] * p[3]);              // center x
+    current_derivative[2 * n_points] = p[0] * ex * dy / (p[4] * p[4]);              // center y
+    current_derivative[3 * n_points] = p[0] * ex * dx * dx / (p[3] * p[3] * p[3]);  // width x
+    current_derivative[4 * n_points] = p[0] * ex * dy * dy / (p[4] * p[4] * p[4]);  // width y
+    current_derivative[5 * n_points] = 1;                                           // offset
+}
+
+#endif
diff --git a/Gpufit/gauss_2d_rotated.cuh b/Gpufit/gauss_2d_rotated.cuh
new file mode 100644
index 0000000..09d042f
--- /dev/null
+++ b/Gpufit/gauss_2d_rotated.cuh
@@ -0,0 +1,106 @@
+#ifndef GPUFIT_GAUSS2DROTATED_CUH_INCLUDED
+#define GPUFIT_GAUSS2DROTATED_CUH_INCLUDED
+
+/* Description of the calculate_gauss2drotated function
+* =====================================================
+*
+* This function calculates the values of two-dimensional elliptic gauss model
+* functions including a rotation parameter and their partial derivatives with
+* respect to the model parameters.
+*
+* No independent variables are passed to this model function. Hence, the
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0). For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: amplitude
+* p[1]: center coordinate x
+* p[2]: center coordinate y
+* p[3]: width x (standard deviation)
+* p[4]: width y (standard deviation)
+* p[5]: offset
+* p[6]: rotation angle [radians]
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss2drotated function
+* =============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss2drotated(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_points_x = sqrt((float)n_points);   // row length: the points of a fit are treated as a square
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block * n_points);
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    // two-dimensional coordinates of this thread's data point
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - point_index_y * n_points_x;
+    // start of the function values and of the parameter set of this thread's fit
+    float * current_value = &values[fit_index * n_points];
+    float const * p = &parameters[fit_index * n_parameters];
+    // sine and cosine of the rotation angle p[6]
+    float const cosp6 = cosf(p[6]);
+    float const sinp6 = sinf(p[6]);
+    // arga, argb: offset from the center (p[1], p[2]) expressed in the rotated frame
+    float const dx = point_index_x - p[1];
+    float const dy = point_index_y - p[2];
+    float const arga = dx * cosp6 - dy * sinp6;
+    float const argb = dx * sinp6 + dy * cosp6;
+    float const ex = exp(-0.5 * (((arga / p[3]) * (arga / p[3])) + ((argb / p[4]) * (argb / p[4]))));
+    current_value[point_index] = p[0] * ex + p[5];
+    // derivatives: the n_points values belonging to one parameter are contiguous
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;
+    current_derivative[1 * n_points] = (((p[0] * cosp6 * arga) / (p[3] * p[3])) + ((p[0] * sinp6 * argb) / (p[4] * p[4]))) * ex;
+    current_derivative[2 * n_points] = (((-p[0] * sinp6 * arga) / (p[3] * p[3])) + ((p[0] * cosp6 * argb) / (p[4] * p[4]))) * ex;
+    current_derivative[3 * n_points] = p[0] * arga * arga / (p[3] * p[3] * p[3]) * ex;
+    current_derivative[4 * n_points] = p[0] * argb * argb / (p[4] * p[4] * p[4]) * ex;
+    current_derivative[5 * n_points] = 1;
+    current_derivative[6 * n_points] = p[0] * arga * argb * (1.0 / (p[3] * p[3]) - 1.0 / (p[4] * p[4])) * ex;
+}
+
+#endif
diff --git a/Gpufit/gpu_data.cu b/Gpufit/gpu_data.cu
new file mode 100644
index 0000000..afbca05
--- /dev/null
+++ b/Gpufit/gpu_data.cu
@@ -0,0 +1,175 @@
+#include "gpu_data.cuh"
+#include
+#include
+
+GPUData::GPUData(Info const & info) :
+    chunk_size_(0),
+    info_(info),
+    chunk_index_(0),    // previously left uninitialized until init() was called
+    data_( info_.max_chunk_size_*info_.n_points_ ),
+    weights_( info_.use_weights_ ? info_.n_points_ * info_.max_chunk_size_ : 0 ),
+    parameters_( info_.max_chunk_size_*info_.n_parameters_ ),
+    prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ),
+    parameters_to_fit_indices_( info_.n_parameters_to_fit_ ),
+    user_info_( info_.user_info_size_ ),
+
+    chi_squares_( info_.max_chunk_size_ ),
+    prev_chi_squares_( info_.max_chunk_size_ ),
+    gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ ),
+    hessians_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_ ),
+    deltas_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ ),
+
+    values_( info_.max_chunk_size_ * info_.n_points_ ),
+    derivatives_( info_.max_chunk_size_ * info_.n_points_ * info_.n_parameters_ ),
+
+    lambdas_( info_.max_chunk_size_ ),
+    states_( info_.max_chunk_size_ ),
+    finished_( info_.max_chunk_size_ ),
+    iteration_falied_( info_.max_chunk_size_ ),
+    all_finished_( 1 ),
+    n_iterations_( info_.max_chunk_size_ )
+{
+    // nothing left to do: the Device_Array members above allocate every buffer
+}
+
+void GPUData::reset(int const chunk_size)
+{
+    chunk_size_ = chunk_size;
+    // overwrite the part of every device buffer that is used by chunk_size fits
+    set(data_, 0.f, chunk_size_ * info_.n_points_);
+    if (info_.use_weights_)
+    {
+        set(weights_, 0.f, chunk_size_ * info_.n_points_);
+    }
+    set(parameters_, 0.f, chunk_size_ * info_.n_parameters_);
+    set(prev_parameters_, 0.f, chunk_size_ * info_.n_parameters_);
+    set(parameters_to_fit_indices_, 0, info_.n_parameters_to_fit_);
+    set(chi_squares_, 0.f, chunk_size_);
+    set(prev_chi_squares_, 0.f, chunk_size_);
+    set(gradients_, 0.f, chunk_size_ * info_.n_parameters_to_fit_);
+    set(hessians_, 0.f, chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_);
+    set(deltas_, 0.f, chunk_size_ * info_.n_parameters_to_fit_);
+    set(values_, 0.f, chunk_size_ * info_.n_points_);
+    set(derivatives_, 0.f, chunk_size_ * info_.n_points_ * info_.n_parameters_);
+
+    set(lambdas_, 0.f, chunk_size_);
+    set(states_, 0, chunk_size_);
+    set(finished_, 0, chunk_size_);
+    set(iteration_falied_, 0, chunk_size_);
+    set(all_finished_, 0, 1);
+    set(n_iterations_, 0, chunk_size_);
+}
+
+void GPUData::init
+(
+    int const chunk_index,
+    float const * const data,
+    float const * const weights,
+    float const * const initial_parameters,
+    std::vector<int> const & parameters_to_fit_indices)
+{
+    chunk_index_ = chunk_index;
+    // index of the first fit of this chunk within the complete host arrays
+    std::size_t const first_fit = chunk_index_ * info_.max_chunk_size_;
+    write(data_, &data[first_fit * info_.n_points_], chunk_size_ * info_.n_points_);
+    if (info_.use_weights_)
+    {
+        write(weights_, &weights[first_fit * info_.n_points_], chunk_size_ * info_.n_points_);
+    }
+    write(
+        parameters_,
+        &initial_parameters[first_fit * info_.n_parameters_],
+        chunk_size_ * info_.n_parameters_);
+    write(parameters_to_fit_indices_, parameters_to_fit_indices);
+    // every fit of the chunk starts with the same value in lambdas_
+    set(lambdas_, 0.001f, chunk_size_);
+}
+
+void GPUData::init_user_info(char const * const user_info)
+{
+    if (info_.user_info_size_ == 0) return;    // nothing to upload
+    write(user_info_, user_info, info_.user_info_size_);
+}
+
+void GPUData::read(bool * dst, int const * src)
+{
+    int host_value = 0;    // receives the single int copied from device memory
+    CUDA_CHECK_STATUS(cudaMemcpy(&host_value, src, sizeof(int), cudaMemcpyDeviceToHost));
+    *dst = (host_value == 1);
+}
+
+void GPUData::write(float* dst, float const * src, int const count)
+{ // copies count floats from host memory (src) to device memory (dst)
+ CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(float), cudaMemcpyHostToDevice));
+}
+
+void GPUData::write(int* dst, std::vector<int> const & src)
+{
+    // all src.size() ints of the host vector are copied to device memory (dst)
+    CUDA_CHECK_STATUS(cudaMemcpy(dst, src.data(), src.size() * sizeof(int), cudaMemcpyHostToDevice));
+}
+
+void GPUData::write(char* dst, char const * src, std::size_t const count)
+{ // copies count chars from host memory (src) to device memory (dst)
+ CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(char), cudaMemcpyHostToDevice));
+}
+
+void GPUData::copy(float * dst, float const * src, std::size_t const count)
+{ // copies count floats between two device buffers
+ CUDA_CHECK_STATUS(cudaMemcpy(dst, src, count * sizeof(float), cudaMemcpyDeviceToDevice));
+}
+
+__global__ void set_kernel(int* dst, int const value, int const count)
+{
+    int const i = blockIdx.x * blockDim.x + threadIdx.x;
+    // threads beyond the end of the array have nothing to write
+    if (i < count)
+    {
+        dst[i] = value;
+    }
+}
+
+void GPUData::set(int* arr, int const value, int const count)
+{
+    // count / 256 + 1 blocks of 256 threads provide at least count threads
+    int const threads_per_block = 256;
+    int const n_blocks = (count / threads_per_block) + 1;
+    dim3 threads(threads_per_block, 1, 1);
+    dim3 blocks(n_blocks, 1, 1);
+
+    set_kernel<<< blocks, threads >>>(arr, value, count);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void GPUData::set(int* arr, int const value)
+{
+    // only arr[0] is written, so one block with one thread is sufficient
+    dim3 const threads(1, 1, 1);
+    dim3 const blocks(1, 1, 1);
+
+    set_kernel<<< blocks, threads >>>(arr, value, 1);
+
+    // check the error state left behind by the kernel launch
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+__global__ void set_kernel(float* dst, float const value, std::size_t const count)
+{
+    std::size_t const i = blockIdx.x * blockDim.x + threadIdx.x;
+    // threads beyond the end of the array have nothing to write
+    if (i < count)
+    {
+        dst[i] = value;
+    }
+}
+
+void GPUData::set(float* arr, float const value, int const count)
+{
+    // count / 256 + 1 blocks of 256 threads provide at least count threads
+    int const threads_per_block = 256;
+    int const n_blocks = (count / threads_per_block) + 1;
+    dim3 threads(threads_per_block, 1, 1);
+    dim3 blocks(n_blocks, 1, 1);
+    set_kernel<<< blocks, threads >>>(arr, value, count);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
diff --git a/Gpufit/gpu_data.cuh b/Gpufit/gpu_data.cuh
new file mode 100644
index 0000000..b35f09d
--- /dev/null
+++ b/Gpufit/gpu_data.cuh
@@ -0,0 +1,122 @@
+#ifndef GPUFIT_GPU_DATA_CUH_INCLUDED
+#define GPUFIT_GPU_DATA_CUH_INCLUDED
+
+#include "info.h"
+
+#include
+
+#include
+#include
+#include
+
+template< typename Type >
+struct Device_Array
+{
+    // Allocates device memory for size elements of Type.
+    // Throws std::runtime_error if size * sizeof( Type ) would overflow
+    // std::size_t or if cudaMalloc reports an error.
+    explicit Device_Array( std::size_t const size ) : data_( 0 )
+    {
+        std::size_t const maximum_size = std::numeric_limits< std::size_t >::max() ;
+        std::size_t const type_size = sizeof( Type ) ;
+        if (size > maximum_size / type_size)
+        {
+            throw std::runtime_error( "maximum array size exceeded" ) ;
+        }
+        cudaError_t const status = cudaMalloc( & data_, size * type_size ) ;
+        if (status != cudaSuccess)
+        {
+            throw std::runtime_error( cudaGetErrorString( status ) ) ;
+        }
+    }
+
+    // Releases the device memory.
+    ~Device_Array() { cudaFree( data_ ) ; }
+
+    // Implicit conversions to the raw device pointer.
+    operator Type * () { return static_cast< Type * >( data_ ) ; }
+    operator Type const * () const { return static_cast< Type * >( data_ ) ; }
+
+    // Copies the first size elements from the device to the host buffer to
+    // and returns the position one past the last element written.
+    Type * copy( std::size_t const size, Type * const to ) const
+    {
+        /// \todo check size parameter
+
+        std::size_t const type_size = sizeof( Type ) ;
+        cudaError_t const status
+            = cudaMemcpy( to, data_, size * type_size, cudaMemcpyDeviceToHost ) ;
+        if (status != cudaSuccess)
+        {
+            throw std::runtime_error( cudaGetErrorString( status ) ) ;
+        }
+        return to + size ;
+    }
+
+private:
+    // Not copyable: a copy would pass the same pointer to cudaFree a second time.
+    Device_Array( Device_Array const & ) ;
+    Device_Array & operator=( Device_Array const & ) ;
+    void * data_ ;
+} ;
+
+class GPUData
+{
+public:
+    GPUData(Info const & info);     // allocates all device buffers listed below
+
+    void reset(int const chunk_size);
+    void init
+    (
+        int const chunk_index,
+        float const * data,
+        float const * weights,
+        float const * initial_parameters,
+        std::vector<int> const & parameters_to_fit_indices
+    ) ;
+    void init_user_info(char const * user_info);
+
+    void read(bool * dst, int const * src);
+    void set(int* arr, int const value);
+    void copy(float * dst, float const * src, std::size_t const count);
+
+private:
+    void set(float* arr, float const value, int const count);
+    void set(int* arr, int const value, int const count);
+    void write(float* dst, float const * src, int const count);
+    void write(int* dst, std::vector<int> const & src);
+    void write(char* dst, char const * src, std::size_t const count);
+
+private:
+    int chunk_size_;        // number of fits in the current chunk, set by reset()
+    Info const & info_;
+
+public:
+    int chunk_index_;       // set by init()
+    // the members below are device buffers; they are initialized in this order
+    Device_Array< float > data_;
+    Device_Array< float > weights_;
+    Device_Array< float > parameters_;
+    Device_Array< float > prev_parameters_;
+    Device_Array< int > parameters_to_fit_indices_;
+    Device_Array< char > user_info_;
+
+    Device_Array< float > chi_squares_;
+    Device_Array< float > prev_chi_squares_;
+    Device_Array< float > gradients_;
+    Device_Array< float > hessians_;
+    Device_Array< float > deltas_;
+
+
+    Device_Array< float > values_;
+    Device_Array< float > derivatives_;
+
+    Device_Array< float > lambdas_;
+    Device_Array< int > states_;
+    Device_Array< int > finished_;
+    Device_Array< int > iteration_falied_;
+    Device_Array< int > all_finished_;
+    Device_Array< int > n_iterations_;
+};
+
+#endif
diff --git a/Gpufit/gpufit.cpp b/Gpufit/gpufit.cpp
new file mode 100644
index 0000000..e7f2d31
--- /dev/null
+++ b/Gpufit/gpufit.cpp
@@ -0,0 +1,130 @@
+#include "gpufit.h"
+#include "interface.h"
+
+#include <string>
+
+std::string last_error ;
+
+// C entry point of the library. Runs the fits described by the arguments and
+// returns STATUS_OK, or STATUS_ERROR if an exception was thrown on the way;
+// the message of the exception is kept for gpufit_get_last_error().
+int gpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+try
+{
+    // the number of points is passed on as an int, so larger values are rejected
+    if (n_points > static_cast<size_t>(std::numeric_limits<int>::max()))
+    {
+        throw std::runtime_error("maximum number of data points per fit exceeded");
+    }
+    int const n_points_32 = static_cast<int>(n_points);
+
+    FitInterface fi(
+        data,
+        weights,
+        n_fits,
+        n_points_32,
+        tolerance,
+        max_n_iterations,
+        estimator_id,
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK ;
+}
+catch( std::exception & exception )
+{
+    last_error = exception.what() ;
+
+    return STATUS_ERROR ;
+}
+catch( ... )
+{
+    last_error = "unknown error" ;
+
+    return STATUS_ERROR;
+}
+
+char const * gpufit_get_last_error()
+{ // the returned pointer refers to the global last_error string
+ return last_error.c_str() ;
+}
+
+int gpufit_cuda_available()
+{
+    // an exception raised by getDeviceCount() is reported as "not available"
+    try
+    {
+        getDeviceCount();
+    }
+    catch (std::exception & exception)
+    {
+        last_error = exception.what();
+        return 0;
+    }
+    return 1;
+}
+
+int gpufit_get_cuda_version(int * runtime_version, int * driver_version)
+{
+    // the CUDA runtime reports failures through return codes, not exceptions
+    cudaError_t status = cudaRuntimeGetVersion(runtime_version);
+    if (status == cudaSuccess)
+    {
+        status = cudaDriverGetVersion(driver_version);
+    }
+    if (status != cudaSuccess)
+    {
+        last_error = cudaGetErrorString(status);
+        return 0;
+    }
+    return 1;
+}
+
+int gpufit_portable_interface(int argc, void *argv[])
+{
+    // argv[i] carries the i-th argument of gpufit(): a pointer to the value for
+    // scalar arguments, the array pointer itself otherwise; argc is not used
+    return gpufit(
+        *static_cast<size_t *>(argv[0]),    // n_fits
+        *static_cast<size_t *>(argv[1]),    // n_points
+        static_cast<float *>(argv[2]),      // data
+        static_cast<float *>(argv[3]),      // weights
+        *static_cast<int *>(argv[4]),       // model_id
+        static_cast<float *>(argv[5]),      // initial_parameters
+        *static_cast<float *>(argv[6]),     // tolerance
+        *static_cast<int *>(argv[7]),       // max_n_iterations
+        static_cast<int *>(argv[8]),        // parameters_to_fit
+        *static_cast<int *>(argv[9]),       // estimator_id
+        *static_cast<size_t *>(argv[10]),   // user_info_size
+        static_cast<char *>(argv[11]),      // user_info
+        static_cast<float *>(argv[12]),     // output_parameters
+        static_cast<int *>(argv[13]),       // output_states
+        static_cast<float *>(argv[14]),     // output_chi_squares
+        static_cast<int *>(argv[15]));      // output_n_iterations
+}
\ No newline at end of file
diff --git a/Gpufit/gpufit.h b/Gpufit/gpufit.h
new file mode 100644
index 0000000..985e6d7
--- /dev/null
+++ b/Gpufit/gpufit.h
@@ -0,0 +1,63 @@
+#ifndef GPU_FIT_H_INCLUDED
+#define GPU_FIT_H_INCLUDED
+
+// fitting model ID (argument model_id of gpufit)
+#define GAUSS_1D 0
+#define GAUSS_2D 1
+#define GAUSS_2D_ELLIPTIC 2
+#define GAUSS_2D_ROTATED 3
+#define CAUCHY_2D_ELLIPTIC 4
+#define LINEAR_1D 5
+
+// estimator ID (argument estimator_id of gpufit)
+#define LSE 0
+#define MLE 1
+
+// fit state (the entries of output_states are compared against these values)
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+#define STATE_GPU_NOT_READY 4
+
+// return value of gpufit()
+#define STATUS_OK 0
+#define STATUS_ERROR -1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int gpufit // returns STATUS_OK or STATUS_ERROR
+(
+ size_t n_fits, // number of fits
+ size_t n_points, // number of data points per fit
+ float * data, // input data of all fits
+ float * weights, // the bundled example passes a null pointer here
+ int model_id, // one of the fitting model IDs defined above
+ float * initial_parameters, // start values of the parameters of all fits
+ float tolerance,
+ int max_n_iterations, // maximal number of iterations
+ int * parameters_to_fit, // one flag per model parameter, non-zero: fit it
+ int estimator_id, // LSE or MLE
+ size_t user_info_size, // size of user_info; 0 in the bundled example
+ char * user_info,
+ float * output_parameters, // output buffers follow
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations
+) ;
+
+char const * gpufit_get_last_error() ;
+
+int gpufit_cuda_available();
+
+int gpufit_get_cuda_version(int * runtime_version, int * driver_version);
+
+int gpufit_portable_interface(int argc, void *argv[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // GPU_FIT_H_INCLUDED
diff --git a/Gpufit/info.cpp b/Gpufit/info.cpp
new file mode 100644
index 0000000..e2fecca
--- /dev/null
+++ b/Gpufit/info.cpp
@@ -0,0 +1,124 @@
+#include "info.h"
+#include
+
+Info::Info() :      // members are listed in their declaration order, the order they are initialized in
+    n_parameters_(0),
+    n_parameters_to_fit_(0),
+    n_points_(0),
+    power_of_two_n_points_(0),
+    n_fits_(0),
+    user_info_size_(0),
+    max_n_iterations_(0),
+    max_chunk_size_(0),
+    n_fits_per_block_(0),
+    model_id_(0),
+    estimator_id_(0),
+    use_weights_(false),    // had no initializer before
+    max_threads_(0),
+    max_blocks_(0),
+    available_gpu_memory_(0)
+{}
+
+Info::~Info(void)
+{ // intentionally empty
+}
+
+void Info::set_number_of_parameters_to_fit(int const * const parameters_to_fit)
+{
+    // a parameter is fitted if its flag is non-zero
+    int n_flagged = 0;
+    for (int index = 0; index < n_parameters_; index++)
+    {
+        if (parameters_to_fit[index])
+            n_flagged++;
+    }
+
+    n_parameters_to_fit_ = n_flagged;
+}
+
+void Info::set_fits_per_block(std::size_t const current_chunk_size)
+{
+    // try 4 and 2 fits per block and keep the first candidate that divides the
+    // chunk size and stays below a quarter of max_threads_; otherwise use 1
+    for (n_fits_per_block_ = 4; n_fits_per_block_ > 1; n_fits_per_block_ /= 2)
+    {
+        bool const divides_chunk = current_chunk_size % n_fits_per_block_ == 0;
+        bool const few_enough_threads = n_fits_per_block_ * n_points_ < max_threads_ / 4;
+        if (divides_chunk && few_enough_threads)
+            break;
+    }
+}
+
+void Info::set_max_chunk_size()
+{
+    // device memory needed by a single fit, in bytes (kept in std::size_t, no narrowing)
+    std::size_t one_fit_memory
+        = sizeof(float)
+        * (2 * n_points_
+        + 2 * n_parameters_
+        + 2 * n_parameters_to_fit_
+        + 1 * n_parameters_to_fit_ * n_parameters_to_fit_
+        + 1 * n_points_ * n_parameters_
+        + 4)
+        + sizeof(int)
+        * 3;
+    if (use_weights_)
+        one_fit_memory += sizeof(float) * n_points_;
+
+    std::size_t tmp_chunk_size = available_gpu_memory_ / one_fit_memory;
+
+    if (tmp_chunk_size == 0)
+    {
+        throw std::runtime_error("not enough free GPU memory available");
+    }
+    // a chunk never holds more fits than max_blocks_
+    tmp_chunk_size = (std::min)(tmp_chunk_size, max_blocks_);
+    // limit the chunk size so that chunk size * highest_factor fits into std::size_t
+    std::size_t highest_factor = 1;
+
+    if (n_parameters_to_fit_)
+    {
+        highest_factor
+            = n_points_
+            * n_parameters_to_fit_
+            * n_parameters_to_fit_
+            * sizeof(float);
+    }
+    else
+    {
+        highest_factor = n_points_ * n_parameters_;
+    }
+
+    std::size_t const highest_size_t_value
+        = std::numeric_limits< std::size_t >::max();
+
+    if (tmp_chunk_size > highest_size_t_value / highest_factor)
+    {
+        tmp_chunk_size = highest_size_t_value / highest_factor;
+    }
+
+    max_chunk_size_ = tmp_chunk_size;
+    // round max_chunk_size_ down to a multiple of the power of ten found in i
+    int i = 1;
+    int const divisor = 10;
+    while (tmp_chunk_size > divisor)
+    {
+        i *= divisor;
+        tmp_chunk_size /= divisor;
+    }
+    max_chunk_size_ = max_chunk_size_ / i * i;
+    max_chunk_size_ = (std::min)(max_chunk_size_, n_fits_);   // parenthesized like the call above
+}
+
+
+void Info::configure()
+{
+    // smallest power of two that is not below n_points_
+    int power = 1;
+    while (power < n_points_)
+        power *= 2;
+    power_of_two_n_points_ = power;
+    // query the GPU first: set_max_chunk_size() reads the values stored by it
+    get_gpu_properties();
+    set_max_chunk_size();
+}
diff --git a/Gpufit/info.cu b/Gpufit/info.cu
new file mode 100644
index 0000000..60568f8
--- /dev/null
+++ b/Gpufit/info.cu
@@ -0,0 +1,31 @@
+#include "info.h"
+#include
+
+void Info::get_gpu_properties()
+{
+    // Read the thread and grid limits of CUDA device 0.
+    cudaDeviceProp device_properties;
+    CUDA_CHECK_STATUS(cudaGetDeviceProperties(&device_properties, 0));
+    max_threads_ = device_properties.maxThreadsPerBlock;
+    max_blocks_ = device_properties.maxGridSize[0];
+
+    // Budget a tenth of the currently free device memory.
+    std::size_t bytes_free;
+    std::size_t bytes_total;
+    CUDA_CHECK_STATUS(cudaMemGetInfo(&bytes_free, &bytes_total));
+    available_gpu_memory_ = std::size_t(double(bytes_free) * 0.1);
+
+    // The user info block has to fit into that budget with room to spare.
+    if (!(available_gpu_memory_ > user_info_size_))
+    {
+        throw std::runtime_error("maximum user info size exceeded");
+    }
+
+    available_gpu_memory_ -= user_info_size_;
+}
+
+int getDeviceCount()
+{
+    // Returns the device count reported by cudaGetDeviceCount; the call is
+    // wrapped in CUDA_CHECK_STATUS like every other CUDA runtime call here.
+    int n_devices;
+    CUDA_CHECK_STATUS(cudaGetDeviceCount(&n_devices));
+
+    return n_devices;
+}
\ No newline at end of file
diff --git a/Gpufit/info.h b/Gpufit/info.h
new file mode 100644
index 0000000..3f17623
--- /dev/null
+++ b/Gpufit/info.h
@@ -0,0 +1,48 @@
+#ifndef GPUFIT_PARAMETERS_H_INCLUDED
+#define GPUFIT_PARAMETERS_H_INCLUDED
+
+#include "definitions.h"
+#include
+
+
+// Collects the configuration of one fitting run together with values derived
+// from the GPU (thread/block limits, memory budget, chunk size).
+// The public data members are filled in by the caller before configure()
+// is called.
+class Info
+{
+public:
+ Info();
+ virtual ~Info();
+
+ // Sets n_fits_per_block_ (4, 2 or 1) for a chunk of the given size.
+ void set_fits_per_block(std::size_t const n_fits);
+ // Defined outside this header; presumably derives n_parameters_to_fit_
+ // from the given flags - TODO confirm against the implementation.
+ void set_number_of_parameters_to_fit(int const * parameters_to_fit);
+ // Computes power_of_two_n_points_, queries the GPU and sets max_chunk_size_.
+ void configure();
+
+private:
+ // Fills max_threads_, max_blocks_ and available_gpu_memory_.
+ void get_gpu_properties();
+ // Fills max_chunk_size_; throws std::runtime_error if memory is too small.
+ void set_max_chunk_size();
+
+public:
+ int n_parameters_;
+ int n_parameters_to_fit_;
+
+ int n_points_;
+ // Smallest power of two that is >= n_points_ (set by configure()).
+ int power_of_two_n_points_;
+
+ std::size_t n_fits_;
+
+ std::size_t user_info_size_;
+
+ int max_n_iterations_;
+ // Upper bound for the number of fits per chunk (set by configure()).
+ std::size_t max_chunk_size_;
+ // Number of fits sharing one block (set by set_fits_per_block()).
+ int n_fits_per_block_;
+ int model_id_;
+ int estimator_id_;
+ bool use_weights_;
+
+private:
+ // maxThreadsPerBlock of device 0.
+ int max_threads_;
+ // maxGridSize[0] of device 0.
+ std::size_t max_blocks_;
+ // A tenth of the free device memory minus user_info_size_.
+ std::size_t available_gpu_memory_;
+};
+
+int getDeviceCount();
+
+#endif
diff --git a/Gpufit/interface.cpp b/Gpufit/interface.cpp
new file mode 100644
index 0000000..e8ddac3
--- /dev/null
+++ b/Gpufit/interface.cpp
@@ -0,0 +1,123 @@
+#include "gpufit.h"
+#include "interface.h"
+
+// Stores the caller's input and output buffers and the fit settings.
+// No validation happens here; see check_sizes().
+//
+// The initializer list follows the declaration order of the members in
+// interface.h. Members are always initialized in declaration order, so
+// listing them differently is misleading and triggers -Wreorder warnings.
+FitInterface::FitInterface
+(
+    float const * data,
+    float const * weights,
+    std::size_t n_fits,
+    int n_points,
+    float tolerance,
+    int max_n_iterations,
+    int estimator_id,
+    float const * initial_parameters,
+    int * parameters_to_fit,
+    char * user_info,
+    std::size_t user_info_size,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+) :
+    data_( data ),
+    weights_( weights ),
+    initial_parameters_( initial_parameters ),
+    parameters_to_fit_( parameters_to_fit ),
+    user_info_( user_info ),
+    n_parameters_(0), // set later by set_number_of_parameters()
+    n_fits_(n_fits),
+    n_points_(n_points),
+    tolerance_(tolerance),
+    max_n_iterations_(max_n_iterations),
+    estimator_id_(estimator_id),
+    user_info_size_(user_info_size),
+    output_parameters_( output_parameters ),
+    output_states_(output_states),
+    output_chi_squares_(output_chi_squares),
+    output_n_iterations_(output_n_iterations)
+{}
+
+// Nothing to release: every pointer member refers to a caller-provided buffer
+// handed in through the constructor.
+FitInterface::~FitInterface()
+{
+}
+
+// Verifies that the byte sizes of the data array (n_fits * n_points floats)
+// and of the parameter array (n_fits * n_parameters floats) are representable
+// in a std::size_t.
+//
+// Throws std::runtime_error if a size would overflow, or if n_points_ or
+// n_parameters_ is not positive. The latter happens for an unknown model ID
+// (n_parameters_ stays 0) and would otherwise be used as a divisor below.
+void FitInterface::check_sizes()
+{
+    if (n_points_ <= 0)
+    {
+        throw std::runtime_error("invalid number of data points per fit");
+    }
+
+    if (n_parameters_ <= 0)
+    {
+        throw std::runtime_error("invalid number of model parameters");
+    }
+
+    std::size_t const maximum_size = std::numeric_limits< std::size_t >::max();
+
+    if (n_fits_ > maximum_size / n_points_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum absolute number of data points exceeded");
+    }
+
+    if (n_fits_ > maximum_size / n_parameters_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum number of fits and/or parameters exceeded");
+    }
+}
+
+// Maps a model ID to its number of model parameters. An unknown ID leaves
+// n_parameters_ untouched.
+void FitInterface::set_number_of_parameters(int const model_id)
+{
+    switch (model_id)
+    {
+    case LINEAR_1D:
+        n_parameters_ = 2;
+        break;
+    case GAUSS_1D:
+        n_parameters_ = 4;
+        break;
+    case GAUSS_2D:
+        n_parameters_ = 5;
+        break;
+    case GAUSS_2D_ELLIPTIC: // both elliptic models take six parameters
+    case CAUCHY_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case GAUSS_2D_ROTATED:
+        n_parameters_ = 7;
+        break;
+    default:
+        break;
+    }
+}
+
+// Copies the fit settings into `info` and lets it compute its derived values.
+void FitInterface::configure_info(Info & info, int const model_id)
+{
+    // model and estimator
+    info.model_id_ = model_id;
+    info.estimator_id_ = estimator_id_;
+    info.max_n_iterations_ = max_n_iterations_;
+
+    // problem dimensions
+    info.n_fits_ = n_fits_;
+    info.n_points_ = n_points_;
+    info.n_parameters_ = n_parameters_;
+    info.user_info_size_ = user_info_size_;
+
+    // weights are optional; a null pointer means an unweighted fit
+    info.use_weights_ = (weights_ != 0);
+
+    // both calls run only after all fields above have been filled in
+    info.set_number_of_parameters_to_fit(parameters_to_fit_);
+    info.configure();
+}
+
+// Runs the complete fit for the given model: determines the parameter count,
+// checks the array sizes, configures an Info object and hands everything to
+// LMFit. Exceptions raised by any of these steps propagate to the caller.
+void FitInterface::fit(int const model_id)
+{
+    // the size check relies on n_parameters_, so this call comes first
+    set_number_of_parameters(model_id);
+    check_sizes();
+
+    Info fit_info;
+    configure_info(fit_info, model_id);
+
+    LMFit fitter(
+        data_,
+        weights_,
+        fit_info,
+        initial_parameters_,
+        parameters_to_fit_,
+        user_info_,
+        output_parameters_,
+        output_states_,
+        output_chi_squares_,
+        output_n_iterations_);
+
+    fitter.run(tolerance_);
+}
diff --git a/Gpufit/interface.h b/Gpufit/interface.h
new file mode 100644
index 0000000..27814aa
--- /dev/null
+++ b/Gpufit/interface.h
@@ -0,0 +1,63 @@
+#ifndef GPUFIT_INTERFACE_H_INCLUDED
+#define GPUFIT_INTERFACE_H_INCLUDED
+
+#include "lm_fit.h"
+
+static_assert( sizeof( int ) == 4, "32 bit 'int' type required" ) ;
+
+// Entry point used to run a batch of fits: stores the caller's buffers and
+// settings, validates the sizes and delegates the work to LMFit.
+class FitInterface
+{
+public:
+ // All pointers refer to caller-owned memory; they are stored, not copied.
+ FitInterface
+ (
+ float const * data,
+ float const * weights,
+ std::size_t n_fits,
+ int n_points,
+ float tolerance,
+ int max_n_iterations,
+ int estimator_id,
+ float const * initial_parameters,
+ int * parameters_to_fit,
+ char * user_info,
+ std::size_t user_info_size,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations
+ ) ;
+
+ virtual ~FitInterface();
+ // Runs the fit for the given model ID; may throw std::runtime_error.
+ void fit(int const model_id);
+
+private:
+ // Sets n_parameters_ from the model ID (unknown IDs leave it unchanged).
+ void set_number_of_parameters(int const model_id);
+ // Throws std::runtime_error if the array sizes would overflow std::size_t.
+ void check_sizes();
+ // Copies the settings into info and calls info.configure().
+ void configure_info(Info & info, int const model_id);
+
+public:
+
+private:
+ //input
+ float const * const data_ ;
+ // Null when no weights are used.
+ float const * const weights_;
+ float const * const initial_parameters_;
+ int const * const parameters_to_fit_;
+ char * const user_info_;
+ // Number of model parameters; 0 until set_number_of_parameters() runs.
+ int n_parameters_;
+
+ std::size_t const n_fits_;
+ int const n_points_;
+ float const tolerance_;
+ int const max_n_iterations_;
+ int const estimator_id_;
+ std::size_t const user_info_size_;
+
+ //output
+ float * output_parameters_;
+ int * output_states_;
+ float * output_chi_squares_;
+ int * output_n_iterations_;
+};
+
+#endif
diff --git a/Gpufit/linear_1d.cuh b/Gpufit/linear_1d.cuh
new file mode 100644
index 0000000..0b6a5c8
--- /dev/null
+++ b/Gpufit/linear_1d.cuh
@@ -0,0 +1,103 @@
+#ifndef GPUFIT_LINEAR1D_CUH_INCLUDED
+#define GPUFIT_LINEAR1D_CUH_INCLUDED
+
+/* Description of the calculate_linear1d function
+* ===================================================
+*
+* This function calculates the values of one-dimensional linear model functions
+* and their partial derivatives with respect to the model parameters.
+*
+* This function makes use of the user information data to pass in the
+* independent variables (X values) corresponding to the data.
+*
+* Note that if no user information is provided, the (X) coordinate of the
+* first data value is assumed to be (0.0). In this case, for a fit size of
+* M data points, the (X) coordinates of the data are simply the corresponding
+* array index values of the data array, starting from zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+* p[0]: offset
+* p[1]: slope
+*
+* n_fits: The number of fits.
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+* derivatives.
+*
+* chunk_index: The chunk index. Used for indexing of user_info.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the calculate_linear1d function
+* =======================================
+*
+* This __device__ function can be only called from a __global__ function or an other
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+* dim3 threads(1, 1, 1);
+* dim3 blocks(1, 1, 1);
+*
+* threads.x = n_points * n_fits_per_block;
+* blocks.x = n_fits / n_fits_per_block;
+*
+* global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_linear1d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Work out which fit and which data point this thread handles.
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block*n_points);
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    // Determine the X coordinate of the data point. user_info_size is
+    // divided by sizeof(float) here, i.e. it is treated as a byte count.
+    float * user_info_float = (float*) user_info;
+    float x = 0.0f;
+    if (!user_info_float)
+    {
+        // no user info: the point index serves as X coordinate
+        x = point_index;
+    }
+    else if (user_info_size / sizeof(float) == n_points)
+    {
+        // one set of X coordinates shared by all fits
+        x = user_info_float[point_index];
+    }
+    else if (user_info_size / sizeof(float) > n_points)
+    {
+        // separate X coordinates for every fit, indexed by chunk and fit
+        int const chunk_begin = chunk_index * n_fits * n_points;
+        int const fit_begin = fit_index * n_points;
+        x = user_info_float[chunk_begin + fit_begin + point_index];
+    }
+    // otherwise (fewer than n_points values supplied) x remains 0
+
+    float * current_value = &values[fit_index*n_points];
+    float const * current_parameters = &parameters[fit_index * n_parameters];
+
+    // value: offset + slope * x
+    current_value[point_index] = current_parameters[0] + current_parameters[1] * x;
+
+    // derivatives, stored with a stride of n_points per parameter:
+    // d/d(offset) = 1, d/d(slope) = x
+
+    float * current_derivative = &derivatives[fit_index * n_parameters * n_points + point_index];
+    current_derivative[0] = 1.f;
+    current_derivative[1 * n_points] = x;
+}
+
+#endif
diff --git a/Gpufit/lm_fit.cpp b/Gpufit/lm_fit.cpp
new file mode 100644
index 0000000..19a658f
--- /dev/null
+++ b/Gpufit/lm_fit.cpp
@@ -0,0 +1,92 @@
+#include "lm_fit.h"
+#include
+
+// Stores the caller's buffers and a reference to the shared Info object.
+// All fits are still pending at construction (n_fits_left_ == info.n_fits_).
+//
+// The initializer list follows the declaration order of the members in
+// lm_fit.h. Members are always initialized in declaration order, so listing
+// them differently is misleading and triggers -Wreorder warnings.
+LMFit::LMFit
+(
+    float const * const data,
+    float const * const weights,
+    Info & info,
+    float const * const initial_parameters,
+    int const * const parameters_to_fit,
+    char * const user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+) :
+    data_( data ),
+    weights_( weights ),
+    initial_parameters_( initial_parameters ),
+    parameters_to_fit_( parameters_to_fit ),
+    user_info_( user_info ),
+    output_parameters_( output_parameters ),
+    output_states_( output_states ),
+    output_chi_squares_( output_chi_squares ),
+    output_n_iterations_( output_n_iterations ),
+    ichunk_(0),
+    chunk_size_(0),
+    n_fits_left_(info.n_fits_),
+    info_(info),
+    parameters_to_fit_indices_(0)
+{}
+
+// No explicit cleanup: the destructor body is intentionally empty.
+LMFit::~LMFit()
+{
+}
+
+// Collects the indices of all model parameters whose parameters_to_fit_ flag
+// is non-zero and appends them to parameters_to_fit_indices_.
+void LMFit::set_parameters_to_fit_indices()
+{
+    // the flag array holds one entry per model parameter
+    int const n_model_parameters = info_.n_parameters_;
+
+    for (int index = 0; index < n_model_parameters; ++index)
+    {
+        if (!parameters_to_fit_[index])
+        {
+            continue;
+        }
+
+        parameters_to_fit_indices_.push_back(index);
+    }
+}
+
+// Copies the results of the current chunk into the output buffers. Each output
+// pointer is replaced by the value copy() returns - presumably the position
+// for the next chunk; confirm against the copy() implementation.
+void LMFit::get_results(GPUData const & gpu_data, int const n_fits)
+{
+    // per-fit results: one value per fit
+    output_states_ = gpu_data.states_.copy( n_fits, output_states_ ) ;
+    output_chi_squares_ = gpu_data.chi_squares_.copy( n_fits, output_chi_squares_ ) ;
+    output_n_iterations_ = gpu_data.n_iterations_.copy( n_fits, output_n_iterations_ ) ;
+
+    // fitted parameters: n_parameters_ values per fit
+    output_parameters_
+        = gpu_data.parameters_.copy( n_fits*info_.n_parameters_, output_parameters_ ) ;
+}
+
+// Processes all fits chunk by chunk: every pass handles at most
+// info_.max_chunk_size_ fits, runs the CUDA fit on them and copies the results
+// back before moving on to the next chunk.
+void LMFit::run(float const tolerance)
+{
+    set_parameters_to_fit_indices();
+
+    GPUData gpu_data(info_);
+    gpu_data.init_user_info(user_info_);
+
+    // one pass per chunk; the bookkeeping happens at the end of each pass
+    for (; n_fits_left_ > 0; n_fits_left_ -= chunk_size_, ichunk_++)
+    {
+        chunk_size_ = int((std::min)(n_fits_left_, info_.max_chunk_size_));
+
+        // the launch configuration depends on the size of this chunk
+        info_.set_fits_per_block(chunk_size_);
+
+        gpu_data.reset(chunk_size_);
+        gpu_data.init(
+            ichunk_,
+            data_,
+            weights_,
+            initial_parameters_,
+            parameters_to_fit_indices_);
+
+        LMFitCUDA chunk_fit(tolerance, info_, gpu_data, chunk_size_);
+        chunk_fit.run();
+
+        get_results(gpu_data, chunk_size_);
+    }
+}
diff --git a/Gpufit/lm_fit.h b/Gpufit/lm_fit.h
new file mode 100644
index 0000000..6ee3b86
--- /dev/null
+++ b/Gpufit/lm_fit.h
@@ -0,0 +1,88 @@
+#ifndef GPUFIT_LM_FIT_H_INCLUDED
+#define GPUFIT_LM_FIT_H_INCLUDED
+
+#include "definitions.h"
+#include "info.h"
+#include "gpu_data.cuh"
+
+class LMFitCUDA;
+
+// Host-side driver of the fitting procedure: splits the fits into chunks,
+// runs LMFitCUDA on each chunk and copies the results back to the caller.
+class LMFit
+{
+public:
+    // All pointers refer to caller-owned memory; `info` is kept by reference.
+    LMFit
+    (
+        float const * data,
+        float const * weights,
+        Info & info,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations
+    ) ;
+
+    virtual ~LMFit();
+
+    // Fits all chunks using the given convergence tolerance.
+    void run(float const tolerance);
+
+private:
+    // Fills parameters_to_fit_indices_ from the parameters_to_fit_ flags.
+    void set_parameters_to_fit_indices();
+    // Copies the results of one chunk (n_fits fits) to the output buffers.
+    void get_results(GPUData const & gpu_data, int const n_fits);
+
+    float const * const data_ ;
+    float const * const weights_ ;
+    float const * const initial_parameters_ ;
+    int const * const parameters_to_fit_;
+    char const * const user_info_;
+
+    float * output_parameters_ ;
+    int * output_states_ ;
+    float * output_chi_squares_ ;
+    int * output_n_iterations_ ;
+
+    // Index and size of the chunk currently processed, and fits still pending.
+    int ichunk_;
+    int chunk_size_;
+    std::size_t n_fits_left_;
+
+    Info & info_;
+
+    // Indices of the model parameters that are fitted (int indices are
+    // pushed by set_parameters_to_fit_indices()).
+    std::vector<int> parameters_to_fit_indices_;
+};
+
+// Runs the iterative fit for one chunk of fits by launching the CUDA kernels
+// for each step of an iteration.
+class LMFitCUDA
+{
+public:
+ // info and gpu_data are kept by reference; n_fits is the chunk size.
+ LMFitCUDA(
+ float const tolerance,
+ Info const & info,
+ GPUData & gpu_data,
+ int const n_fits);
+
+ virtual ~LMFitCUDA();
+
+ // Iterates until evaluate_iteration() reports that all fits have finished.
+ void run();
+
+private:
+ // Each of the following launches the corresponding kernel(s) and checks
+ // cudaGetLastError() afterwards.
+ void calc_curve_values();
+ void calc_chi_squares();
+ void calc_gradients();
+ void calc_hessians();
+ // Checks convergence, updates all_finished_ and prepares the next iteration.
+ void evaluate_iteration(int const iteration);
+ // Modifies the step widths, runs the Gauss-Jordan elimination and updates
+ // the parameters.
+ void solve_equation_system();
+
+public:
+
+private:
+ Info const & info_;
+ GPUData & gpu_data_;
+ // Number of fits in the chunk being processed.
+ int const n_fits_;
+
+ // Host copy of the device-side "all fits finished" flag.
+ bool all_finished_;
+
+ float tolerance_;
+};
+
+#endif
diff --git a/Gpufit/lm_fit_cuda.cpp b/Gpufit/lm_fit_cuda.cpp
new file mode 100644
index 0000000..94799a0
--- /dev/null
+++ b/Gpufit/lm_fit_cuda.cpp
@@ -0,0 +1,57 @@
+#include "lm_fit.h"
+
+// Binds the fitter to the shared configuration and GPU buffers of one chunk.
+// The fit starts in the "not finished" state.
+LMFitCUDA::LMFitCUDA(
+    float const tolerance,
+    Info const & info,
+    GPUData & gpu_data,
+    int const n_fits)
+    : info_(info)
+    , gpu_data_(gpu_data)
+    , n_fits_(n_fits)
+    , all_finished_(false)
+    , tolerance_(tolerance)
+{}
+
+// No explicit cleanup: info_ and gpu_data_ are references to objects that
+// are not owned here.
+LMFitCUDA::~LMFitCUDA() {}
+
+// Runs the fit iterations for the current chunk until all fits are flagged as
+// finished.
+void LMFitCUDA::run()
+{
+    // initial evaluation: curve values, chi-squares, gradients and hessians
+    calc_curve_values();
+    calc_chi_squares();
+    calc_gradients();
+    calc_hessians();
+
+    // the initial chi-squares serve as reference for the first iteration
+    gpu_data_.copy(
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.chi_squares_,
+        n_fits_);
+
+    int iteration = 0;
+    while (!all_finished_)
+    {
+        // modify step widths, run Gauss Jordan, update fitting parameters
+        solve_equation_system();
+
+        // re-evaluate the model with the updated parameters
+        calc_curve_values();
+        calc_chi_squares();
+        calc_gradients();
+        calc_hessians();
+
+        // check for convergence, flag finished fits, store the iteration
+        // counts, and update chi-squares, parameters and lambdas
+        evaluate_iteration(iteration);
+
+        ++iteration;
+    }
+}
\ No newline at end of file
diff --git a/Gpufit/lm_fit_cuda.cu b/Gpufit/lm_fit_cuda.cu
new file mode 100644
index 0000000..8d74fb9
--- /dev/null
+++ b/Gpufit/lm_fit_cuda.cu
@@ -0,0 +1,253 @@
+#include "lm_fit.h"
+#include
+#include "cuda_kernels.cuh"
+#include "cuda_gaussjordan.cuh"
+
+// Performs one parameter update for all fits of the chunk:
+//   1. modifies the step widths (hessians/lambdas),
+//   2. solves the equation systems by Gauss-Jordan elimination,
+//   3. transfers the singularity test results into the fit states,
+//   4. updates the fit parameters with the computed deltas.
+// Every kernel launch is followed by a cudaGetLastError() check.
+void LMFitCUDA::solve_equation_system()
+{
+    dim3 threads(1, 1, 1);
+    dim3 blocks(1, 1, 1);
+
+    threads.x = info_.n_parameters_to_fit_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+    cuda_modify_step_widths<<< blocks, threads >>>(
+        gpu_data_.hessians_,
+        gpu_data_.lambdas_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    // smallest power of two that is >= n_parameters_to_fit_
+    int n_parameters_pow2 = 1;
+
+    while (n_parameters_pow2 < info_.n_parameters_to_fit_)
+    {
+        n_parameters_pow2 *= 2;
+    }
+
+    //set up to run the Gauss Jordan elimination
+    int const n_equations = info_.n_parameters_to_fit_;
+    int const n_solutions = n_fits_;
+
+    threads.x = n_equations + 1;
+    threads.y = n_equations;
+    blocks.x = n_solutions;
+    blocks.y = 1;
+
+    //set the size of the shared memory area for each block
+    int const shared_size
+        = sizeof(float) * ((threads.x * threads.y)
+        + n_parameters_pow2 + n_parameters_pow2);
+
+    //set up the singular_test vector
+    // NOTE(review): if CUDA_CHECK_STATUS throws before the cudaFree below,
+    // this buffer is not released - confirm how the macro reports errors.
+    int * singular_tests;
+    CUDA_CHECK_STATUS(cudaMalloc((void**)&singular_tests, n_fits_ * sizeof(int)));
+
+    //run the Gauss Jordan elimination
+    cuda_gaussjordan<<< blocks, threads, shared_size >>>(
+        gpu_data_.deltas_,
+        gpu_data_.gradients_,
+        gpu_data_.hessians_,
+        gpu_data_.finished_,
+        singular_tests,
+        info_.n_parameters_to_fit_,
+        n_parameters_pow2);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    //set up to update the lm_state_gpu_ variable with the Gauss Jordan results
+    threads.x = std::min(n_fits_, 256);
+    threads.y = 1;
+    // Integer ceiling division: a float quotient cannot represent fit counts
+    // above 2^24 exactly and could yield one block too few.
+    blocks.x = (n_fits_ + threads.x - 1) / threads.x;
+    blocks.y = 1;
+
+    //update the lm_state_gpu_ variable
+    cuda_update_state_after_gaussjordan<<< blocks, threads >>>(
+        n_fits_,
+        singular_tests,
+        gpu_data_.states_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    CUDA_CHECK_STATUS(cudaFree(singular_tests));
+
+    threads.x = info_.n_parameters_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+    cuda_update_parameters<<< blocks, threads >>>(
+        gpu_data_.parameters_,
+        gpu_data_.prev_parameters_,
+        gpu_data_.deltas_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Launches the kernel that evaluates the model function and its derivatives.
+// One thread per data point, n_fits_per_block_ fits per block.
+void LMFitCUDA::calc_curve_values()
+{
+    dim3 const threads(info_.n_points_ * info_.n_fits_per_block_);
+    dim3 const blocks(n_fits_ / info_.n_fits_per_block_);
+
+    cuda_calc_curve_values<<< blocks, threads >>>(
+        gpu_data_.parameters_,
+        n_fits_,
+        info_.n_points_,
+        info_.n_parameters_,
+        gpu_data_.finished_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        info_.n_fits_per_block_,
+        info_.model_id_,
+        gpu_data_.chunk_index_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Launches the chi-square kernel. The thread count per fit is
+// power_of_two_n_points_, and one float of shared memory is reserved per thread.
+void LMFitCUDA::calc_chi_squares()
+{
+    int const shared_size
+        = sizeof(float)
+        * info_.power_of_two_n_points_
+        * info_.n_fits_per_block_;
+
+    dim3 const threads(info_.power_of_two_n_points_*info_.n_fits_per_block_);
+    dim3 const blocks(n_fits_ / info_.n_fits_per_block_);
+
+    cuda_calculate_chi_squares<<< blocks, threads, shared_size >>>(
+        gpu_data_.chi_squares_,
+        gpu_data_.states_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.estimator_id_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Launches the gradient kernel with the same launch geometry and shared
+// memory size as the chi-square kernel.
+void LMFitCUDA::calc_gradients()
+{
+    int const shared_size
+        = sizeof(float)
+        * info_.power_of_two_n_points_
+        * info_.n_fits_per_block_;
+
+    dim3 const threads(info_.power_of_two_n_points_*info_.n_fits_per_block_);
+    dim3 const blocks(n_fits_ / info_.n_fits_per_block_);
+
+    cuda_calculate_gradients<<< blocks, threads, shared_size >>>(
+        gpu_data_.gradients_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.n_parameters_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        info_.estimator_id_,
+        gpu_data_.finished_,
+        gpu_data_.iteration_falied_,
+        info_.n_fits_per_block_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Launches the hessian kernel: one block per fit and a square thread layout
+// of n_parameters_to_fit_ x n_parameters_to_fit_ threads.
+void LMFitCUDA::calc_hessians()
+{
+    dim3 const threads(info_.n_parameters_to_fit_, info_.n_parameters_to_fit_);
+    dim3 const blocks(n_fits_);
+
+    cuda_calculate_hessians<<< blocks, threads >>>(
+        gpu_data_.hessians_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.n_parameters_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        info_.estimator_id_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.finished_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Evaluates the iteration that has just been computed:
+//   1. checks every fit for convergence,
+//   2. records iteration counts and determines whether all fits finished
+//      (the result is read back into all_finished_),
+//   3. prepares lambdas, chi-squares and parameters for the next iteration.
+// Every kernel launch is followed by a cudaGetLastError() check.
+void LMFitCUDA::evaluate_iteration(int const iteration)
+{
+    dim3 threads(1, 1, 1);
+    dim3 blocks(1, 1, 1);
+
+    // one thread per fit, at most 256 threads per block
+    threads.x = std::min(n_fits_, 256);
+    threads.y = 1;
+    // Integer ceiling division: a float quotient cannot represent fit counts
+    // above 2^24 exactly and could yield one block too few.
+    blocks.x = (n_fits_ + threads.x - 1) / threads.x;
+    blocks.y = 1;
+
+    cuda_check_for_convergence<<< blocks, threads >>>(
+        gpu_data_.finished_,
+        tolerance_,
+        gpu_data_.states_,
+        gpu_data_.chi_squares_,
+        gpu_data_.prev_chi_squares_,
+        iteration,
+        info_.max_n_iterations_,
+        n_fits_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    // the device-side flag is set to 1 before the kernel evaluates the fits
+    gpu_data_.set(gpu_data_.all_finished_, 1);
+
+    cuda_evaluate_iteration<<< blocks, threads >>>(
+        gpu_data_.all_finished_,
+        gpu_data_.n_iterations_,
+        gpu_data_.finished_,
+        iteration,
+        gpu_data_.states_,
+        n_fits_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    // fetch the flag; it ends the iteration loop in run()
+    gpu_data_.read(&all_finished_, gpu_data_.all_finished_);
+
+    cuda_prepare_next_iteration<<< blocks, threads >>>(
+        gpu_data_.lambdas_,
+        gpu_data_.chi_squares_,
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.parameters_,
+        gpu_data_.prev_parameters_,
+        n_fits_,
+        info_.n_parameters_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
diff --git a/Gpufit/lse.cuh b/Gpufit/lse.cuh
new file mode 100644
index 0000000..e615b01
--- /dev/null
+++ b/Gpufit/lse.cuh
@@ -0,0 +1,186 @@
+#ifndef GPUFIT_LSE_CUH_INCLUDED
+#define GPUFIT_LSE_CUH_INCLUDED
+
+/* Description of the calculate_chi_square_lse function
+* =====================================================
+*
+* This function calculates the chi-square values for the weighted LSE estimator.
+*
+* Parameters:
+*
+* chi_square: An output vector of chi-square values for each data point.
+*
+* point_index: The data point index.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* weight: An input vector of values for weighting the chi-square values.
+*
+* state: A pointer to a value which indicates whether the fitting
+* process was carried out correctly or which problem occurred.
+* In this function it is not used. It can be used in functions calculating
+* other estimators than the LSE, such as MLE. It is passed into this function
+* to provide the same interface for all estimator functions.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_chi_square_lse function
+* =============================================
+*
+* This __device__ function can be only called from a __global__ function or an other
+* __device__ function.
+*
+*/
+
+__device__ void calculate_chi_square_lse(
+    volatile float * chi_square,
+    int const point_index,
+    float const * data,
+    float const * value,
+    float const * weight,
+    int * state,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // squared residual of this data point, optionally weighted;
+    // state, user_info and user_info_size are not used by this estimator
+    float const deviation = value[point_index] - data[point_index];
+
+    chi_square[point_index]
+        = weight
+        ? deviation * deviation * weight[point_index]
+        : deviation * deviation;
+}
+
+/* Description of the calculate_hessian_lse function
+* ==================================================
+*
+* This function calculates the hessian matrix values of the weighted LSE equation.
+* The calculation is performed based on previously calculated fitting curve derivative
+* values.
+*
+* Parameters:
+*
+* hessian: An output vector of values of the hessian matrix for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index_i: Index of the hessian column.
+*
+* parameter_index_j: Index of the hessian row.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+* curve for each data point.
+*
+* weight: An input vector of values for weighting the hessian matrix values.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_hessian_lse function
+* ==========================================
+*
+* This __device__ function can be only called from a __global__ function or an other
+* __device__ function.
+*
+*/
+
+__device__ void calculate_hessian_lse(
+    double * hessian,
+    int const point_index,
+    int const parameter_index_i,
+    int const parameter_index_j,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Adds the contribution of one data point to *hessian: the product of the
+    // two derivative entries, multiplied by the weight if weights are given.
+    // data, value, user_info and user_info_size are not used here.
+    if (!weight)
+    {
+        *hessian
+            += derivative[parameter_index_i] * derivative[parameter_index_j];
+        return;
+    }
+
+    *hessian
+        += derivative[parameter_index_i] * derivative[parameter_index_j]
+        * weight[point_index];
+}
+
+/* Description of the calculate_gradient_lse function
+* ===================================================
+*
+* This function calculates the gradient values of the weighted LSE equation
+* based on previously calculated fitting curve derivative values.
+*
+* Parameters:
+*
+* gradient: An output vector of values of the gradient vector for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index: The parameter index.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+* curve for each data point.
+*
+* weight: An input vector of values for weighting gradient values.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gradient_lse function
+* ===========================================
+*
+* This __device__ function can be only called from a __global__ function or an other
+* __device__ function.
+*
+*/
+
+__device__ void calculate_gradient_lse(
+    volatile float * gradient,
+    int const point_index,
+    int const parameter_index,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // gradient contribution of this data point: derivative times residual
+    // (data - value), optionally weighted; user_info and user_info_size are
+    // not used by this estimator
+    float const deviation = data[point_index] - value[point_index];
+
+    gradient[point_index]
+        = weight
+        ? derivative[parameter_index] * deviation * weight[point_index]
+        : derivative[parameter_index] * deviation;
+}
+
+#endif
diff --git a/Gpufit/matlab/CMakeLists.txt b/Gpufit/matlab/CMakeLists.txt
new file mode 100644
index 0000000..b0c5dc8
--- /dev/null
+++ b/Gpufit/matlab/CMakeLists.txt
@@ -0,0 +1,69 @@
+
+# MATLAB Gpufit binding
+#
+# Builds the GpufitMex MEX library and provides the MATLAB_GPUFIT_PACKAGE
+# target, which collects the MATLAB sources and both binaries in one folder.
+# The whole directory is skipped when Matlab is not found.
+
+find_package( Matlab COMPONENTS MX_LIBRARY )
+
+if( NOT Matlab_FOUND )
+  message( STATUS "Matlab and/or MX_Library NOT found - skipping Gpufit Matlab binding!" )
+  return()
+endif()
+
+# MATLAB MEX FILE
+
+set( Headers
+  )
+
+set( Sources
+  mex/GpufitMex.cpp
+  )
+
+add_library( GpufitMex SHARED
+  ${Headers}
+  ${Sources}
+  )
+
+# MEX files carry a platform specific extension instead of .dll/.so
+set_property( TARGET GpufitMex
+  PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} )
+
+set_property( TARGET GpufitMex
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+target_include_directories( GpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR} )
+
+target_link_libraries( GpufitMex Gpufit ${Matlab_LIBRARIES} )
+
+if( WIN32 )
+  # Export the MEX entry point. The flag is attached to the target so that
+  # the default CMAKE_SHARED_LINKER_FLAGS are not overwritten.
+  set_property( TARGET GpufitMex
+    APPEND_STRING PROPERTY LINK_FLAGS " /export:mexFunction" )
+endif()
+
+add_matlab_launcher( GpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" )
+
+# MATLAB Gpufit PACKAGE
+
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" )
+set( package_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/EstimatorID.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/gpufit.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/ModelID.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/README.txt"
+)
+# Paths of the built binaries, resolved at generate time per configuration
+set( binary_gpufit $<TARGET_FILE:Gpufit> )
+set( binary_mex $<TARGET_FILE:GpufitMex> )
+
+# Recreates the package folder from scratch on every build of this target
+add_custom_target( MATLAB_GPUFIT_PACKAGE
+  COMMAND ${CMAKE_COMMAND} -E
+    remove_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${package_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_gpufit} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_mex} ${build_directory}
+  COMMENT "Creating Gpufit Matlab package"
+)
+set_property( TARGET MATLAB_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+add_dependencies( MATLAB_GPUFIT_PACKAGE Gpufit GpufitMex)
+
+# add launcher
diff --git a/Gpufit/matlab/EstimatorID.m b/Gpufit/matlab/EstimatorID.m
new file mode 100644
index 0000000..a853ffa
--- /dev/null
+++ b/Gpufit/matlab/EstimatorID.m
@@ -0,0 +1,6 @@
+classdef EstimatorID
+    % Numeric identifiers of the estimators that can be passed to gpufit.
+    properties (Constant)
+        LSE = 0
+        MLE = 1
+    end
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/ModelID.m b/Gpufit/matlab/ModelID.m
new file mode 100644
index 0000000..174c703
--- /dev/null
+++ b/Gpufit/matlab/ModelID.m
@@ -0,0 +1,10 @@
+classdef ModelID
+    % Numeric identifiers of the fit models that can be passed to gpufit.
+    properties (Constant)
+        GAUSS_1D = 0
+        GAUSS_2D = 1
+        GAUSS_2D_ELLIPTIC = 2
+        GAUSS_2D_ROTATED = 3
+        CAUCHY_2D_ELLIPTIC = 4
+        LINEAR_1D = 5
+    end
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/README.txt b/Gpufit/matlab/README.txt
new file mode 100644
index 0000000..02ddfd2
--- /dev/null
+++ b/Gpufit/matlab/README.txt
@@ -0,0 +1,19 @@
+Matlab binding for the [Gpufit library](https://github.com/gpufit/Gpufit) which implements Levenberg Marquardt curve fitting in CUDA
+
+Requirements
+
+- A CUDA capable graphics card with a recent Nvidia graphics driver (at least 367.48 / July 2016)
+- Windows
+- Matlab 32/64bit
+
+Installation
+
+An installation is not necessary. However, this path must be part of the Matlab path. Use `addpath` if necessary.
+
+Examples
+
+See the examples folder. The examples are fully functional only in Matlab R2014a or later.
+
+Troubleshooting
+
+A common reason for the error message 'CUDA driver version is insufficient for CUDA runtime version' is an outdated Nvidia graphics driver.
\ No newline at end of file
diff --git a/Gpufit/matlab/examples/gauss2d.m b/Gpufit/matlab/examples/gauss2d.m
new file mode 100644
index 0000000..bf478a4
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d.m
@@ -0,0 +1,182 @@
+function gauss2d()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Runs two independent demonstrations one after the other; each prints a
+% summary of its fit results to the command window.
+
+% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak
+fit_gauss2d();
+
+% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak
+fit_gauss2d_rotated();
+
+end
+function fit_gauss2d()
+% Fits number_fits noisy realizations of one symmetric 2D Gaussian peak
+% (model GAUSS_2D, 5 parameters) with the MLE estimator and prints a summary.
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 5;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([20, 9.5, 9.5, 3, 10]);
+
+% initialize random number generator
+% (fixed seed, so the random numbers drawn below are the same in every run)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+% (shift by a uniform random value within +-20% of the width)
+initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits));
+% randomize relative for other parameters
+% (scale by a uniform random factor between 0.8 and 1.2)
+initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+% (one noise-free peak, copied into every column, then noise is drawn per element)
+data = gaussian_2d(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D;
+
+%% run Gpufit
+% the empty arguments are weights, parameters_to_fit and user_info
+[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+end
+
+function fit_gauss2d_rotated()
+% Fits number_fits noisy realizations of one rotated elliptic 2D Gaussian peak
+% (model GAUSS_2D_ROTATED, 7 parameters) with the MLE estimator and prints a
+% summary.
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 7;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([200, 9.5, 9.5, 3, 4, 10, 0.5]);
+
+% initialize random number generator
+% (fixed seed, so the random numbers drawn below are the same in every run)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+% (parameter 2 is shifted relative to parameter 4, parameter 3 relative to parameter 5)
+initial_parameters(2, :) = initial_parameters(2, :) + true_parameters(4) * (-0.2 + 0.4 * rand(1, number_fits));
+initial_parameters(3, :) = initial_parameters(3, :) + true_parameters(5) * (-0.2 + 0.4 * rand(1, number_fits));
+% randomize relative for other parameters
+% (scale by a uniform random factor between 0.8 and 1.2)
+initial_parameters([1,4,5,6,7], :) = initial_parameters([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+% (one noise-free peak, copied into every column, then noise is drawn per element)
+data = gaussian_2d_rotated(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D_ROTATED;
+
+%% run Gpufit
+% the empty arguments are weights, parameters_to_fit and user_info
+[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D rotated Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+
+end
+
+function g = gaussian_2d(x, y, p)
+% Evaluates a symmetric 2D Gaussian peak at the given grid positions.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x, y - x and y grid position values
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) width, p(5) offset
+
+squared_distance = (x - p(2)).^2 + (y - p(3)).^2;
+g = p(1) * exp(-squared_distance / (2 * p(4)^2)) + p(5);
+
+end
+
+function g = gaussian_2d_rotated(x, y, p)
+% Evaluates a rotated elliptic 2D Gaussian peak at the given grid positions.
+% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak
+%
+% x, y - x and y grid position values
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) and p(5) widths along the two axes of the peak, p(6) offset,
+%        p(7) rotation angle (passed directly to cos and sin)
+
+% coordinates relative to the center of the peak
+dx = x - p(2);
+dy = y - p(3);
+
+% rotate them into the coordinate system of the two peak axes
+cos_phi = cos(p(7));
+sin_phi = sin(p(7));
+u = dx .* cos_phi - dy .* sin_phi;
+v = dx .* sin_phi + dy .* cos_phi;
+
+% distances along each axis in units of the respective width
+su = u / p(4);
+sv = v / p(5);
+
+g = p(1) .* exp(-0.5 .* ((su .* su) + (sv .* sv))) + p(6);
+
+end
+
+function display_results(name, model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations)
+% Prints a summary of a set of fit results to the command window.
+%
+% name              - label of the fitted model used in the headlines
+% model_id          - model ID that was fitted
+% number_fits       - total number of fits
+% number_parameters - number of model parameters to report
+% size_x            - edge length of the (square) fit region
+% time              - fit time in seconds
+% true_parameters   - parameters the data was generated with
+% parameters, states, chi_squares, n_iterations - outputs of gpufit
+%
+% Averages are taken only over the fits whose state is 0.
+
+is_converged = (states == 0);
+
+fprintf('\nGpufit of %s\n', name);
+
+% general summary
+fprintf('\nmodel ID: %d\n', model_id);
+fprintf('number of fits: %d\n', number_fits);
+fprintf('fit size: %d x %d\n', size_x, size_x);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares(is_converged)));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations(is_converged)));
+fprintf('time: %6.2f s\n', time);
+
+% share of each fit state in percent of all fits
+to_percent = @(count) count / number_fits * 100;
+fprintf('\nratio converged %6.2f %%\n', to_percent(sum(is_converged)));
+fprintf('ratio max it. exceeded %6.2f %%\n', to_percent(sum(states == 1)));
+fprintf('ratio singular hessian %6.2f %%\n', to_percent(sum(states == 2)));
+fprintf('ratio neg curvature MLE %6.2f %%\n', to_percent(sum(states == 3)));
+
+% mean and standard deviation of the fitted parameters per parameter
+fitted = parameters(:, is_converged);
+fitted_mean = mean(fitted, 2);
+fitted_std = std(fitted, [], 2);
+fprintf('\nparameters of %s\n', name);
+for k = 1 : number_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', k, true_parameters(k), fitted_mean(k), fitted_std(k));
+end
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/examples/gauss2d_comparison.m b/Gpufit/matlab/examples/gauss2d_comparison.m
new file mode 100644
index 0000000..39dc68b
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d_comparison.m
@@ -0,0 +1,206 @@
+function gauss2d_comparison()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% compared to a generic Matlab implementation using fminunc and supplying
+% the gradient by the user (uses quasi-newton as algorithm)
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Both runs use the same data, initial parameters, tolerance and maximum
+% number of iterations; a summary is printed after each run.
+
+%% number of fits and fit points
+number_fits = 1e3;
+size_x = 20;
+number_parameters = 5;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([10, 9.5, 9.5, 3, 10]);
+
+% initialize random number generator
+% (fixed seed, so the random numbers drawn below are the same in every run)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits));
+% randomize relative for other parameters
+initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+data = gaussian_2d(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-4;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D; % Gaussian peak in 2D
+
+%% run Gpufit
+fprintf('run Gpufit\n');
+% the empty arguments are weights, parameters_to_fit and user_info
+[gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time] = gpufit(data, [], ...
+ model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+% display results
+display_results('Gpufit', gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time, true_parameters);
+
+%% run Matlab
+
+% convert data and initial_parameters to double (otherwise causes an error
+% in fminunc)
+data = double(data);
+initial_parameters = double(initial_parameters);
+% grid positions as row vectors, matching the row vector d used per fit below
+xi = double(x(:)');
+yi = double(y(:)');
+
+% set fit options
+options = optimoptions(@fminunc,'Display', 'off', 'MaxIter', max_n_iterations, 'Algorithm', 'quasi-newton', 'TolFun', tolerance, 'GradObj', 'on', 'DerivativeCheck', 'off', 'Diagnostics', 'off');
+
+% initialize output arrays
+m_parameters = zeros(number_parameters, number_fits);
+m_states = zeros(1, number_fits);
+m_chi_squares = zeros(1, number_fits);
+m_n_iterations = zeros(1, number_fits);
+
+% loop over each fit
+fprintf('\n')
+progress = 0;
+L = 50; % length of progressbar
+tic;
+for i = 1 : number_fits
+
+ % get data and initial_parameters
+ d = data(:, i)';
+ p0 = initial_parameters(:, i);
+
+ % define minimizer function (give grid and data as implicit parameters)
+ fun = @(p) minimizer(p, xi, yi, d);
+
+ % call to fminunc
+ [p, fval, exitflag, output] = fminunc(fun, p0, options);
+
+ % copy to output
+ m_parameters(:, i) = p;
+ m_chi_squares(i) = fval;
+ % NOTE(review): the fminunc exit flag is only shifted by one here, not
+ % translated into a Gpufit state; display_results ignores this argument
+ m_states(i) = exitflag - 1;
+ m_n_iterations(i) = output.iterations;
+
+ % print one '|' each time another number_fits / L fits have finished
+ progress = progress + 1;
+ if progress >= number_fits / L
+ progress = 0;
+ fprintf('|');
+ end
+end
+time = toc;
+% erase the progress bar again (sends L backspace characters)
+fprintf(repmat('\b', [1, L]));
+
+% display results
+display_results('Matlab (one CPU kernel)', m_parameters, m_states, m_chi_squares, m_n_iterations, time, true_parameters);
+
+end
+
+function [f, g] = minimizer(p, xi, yi, d)
+% Objective function for the optimizer: evaluates the model for the current
+% parameters and returns the value of the likelihood function and, if a
+% second output is requested, its derivatives with respect to the parameters.
+%
+% p      - current parameters
+% xi, yi - grid positions
+% d      - current data
+
+if nargout <= 1
+    % only the function value is requested, skip the model derivatives
+    f = poisson_likelihood(gaussian_2d(xi, yi, p), [], d);
+    return;
+end
+
+[model, model_gradient] = gaussian_2d_with_gradient(xi, yi, p);
+[f, g] = poisson_likelihood(model, model_gradient, d);
+
+end
+
+function [f, g] = poisson_likelihood(m, mg, d)
+% Calculates the value and, if requested, the derivatives of the Poisson
+% likelihood function for given model values and model derivatives.
+%
+% m  - model values, one per data point
+% mg - model derivatives, one row per parameter and one column per data
+%      point (only read when the second output is requested)
+% d  - data values, same size as m
+
+% the logarithmic term is evaluated only where the data is positive
+positive = d > 0;
+log_term = d(positive) .* log(m(positive) ./ d(positive));
+f = 2 * (sum(m - d) - sum(log_term));
+
+if nargout < 2
+    return;
+end
+
+% derivative with respect to the model values; the model value is bounded
+% from below to avoid a division by zero
+factor = 2 * (1 - d ./ max(m, 1e-6));
+g = sum(repmat(factor, [size(mg, 1), 1]) .* mg, 2);
+
+end
+
+
+function display_results(name, parameters, ~, chi_squares, n_iterations, time, true_parameters)
+% Prints a summary of one set of fit results to the command window.
+%
+% name            - label printed as headline
+% parameters      - fitted parameters, one row per parameter and one
+%                   column per fit
+% (third argument)- fit states, ignored
+% chi_squares     - chi-square values, averaged over all fits
+% n_iterations    - numbers of iterations, averaged over all fits
+% time            - total fit time in seconds
+% true_parameters - parameters the data was generated with
+
+n_parameters = size(parameters, 1);
+n_fits = size(parameters, 2);
+
+fprintf('*%s*\n', name);
+
+% general summary
+fprintf('\nnumber of fits: %d\n', n_fits);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations));
+fprintf('time: %6.2f s\n', time);
+fprintf('fits per second: %.0f\n', n_fits / time);
+
+% mean and standard deviation of the fitted parameters per parameter
+fitted_mean = mean(parameters, 2);
+fitted_std = std(parameters, [], 2);
+fprintf('\nparameters of 2D Gaussian peak\n');
+for k = 1 : n_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', k, true_parameters(k), fitted_mean(k), fitted_std(k));
+end
+
+end
+
+function f = gaussian_2d(x, y, p)
+% Evaluates a symmetric 2D Gaussian peak at the given grid positions.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x, y - x and y grid position values
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) width, p(5) offset
+
+squared_distance = (x - p(2)).^2 + (y - p(3)).^2;
+f = p(1) * exp(-squared_distance / (2 * p(4)^2)) + p(5);
+
+end
+
+function [f, g] = gaussian_2d_with_gradient(x, y, p)
+% Evaluates a symmetric 2D Gaussian peak together with its partial
+% derivatives with respect to the five parameters.
+%
+% x, y - grid positions; the derivatives are stacked vertically, so for
+%        row vectors x and y the result g has one row per parameter
+% p    - parameters: p(1) amplitude, p(2) x center, p(3) y center,
+%        p(4) width, p(5) offset
+%
+% f - model values
+% g - partial derivatives in the order of the parameters
+
+% shared intermediate terms
+delta_x = x - p(2);
+delta_y = y - p(3);
+width_squared = p(4)^2;
+scaled_distance = (delta_x.^2 + delta_y.^2) / width_squared;
+peak_shape = exp(-0.5 * scaled_distance);
+peak = p(1) * peak_shape;
+
+f = peak + p(5);
+
+g = [ ...
+    peak_shape; ...
+    peak .* delta_x / width_squared; ...
+    peak .* delta_y / width_squared; ...
+    peak .* scaled_distance / p(4); ...
+    ones(size(x))];
+
+end
diff --git a/Gpufit/matlab/examples/gauss2d_plot.m b/Gpufit/matlab/examples/gauss2d_plot.m
new file mode 100644
index 0000000..cef6adc
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d_plot.m
@@ -0,0 +1,117 @@
+function gauss2d_plot()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with normally distributed
+% noise, repeated for a different total number of fits each time and
+% plotting the resulting fit speed
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+
+%% number of fit points
+size_x = 5;
+n_points = size_x * size_x;
+
+%% set input arguments
+
+% mean true parameters
+mean_true_parameters = single([100, 3, 3, 1, 10]);
+
+% average noise level (factor applied to the standard normal noise)
+average_noise_level = 10;
+
+% initialize random number generator
+% (fixed seed, so the random numbers drawn below are the same in every run)
+rng(0);
+
+% tolerance
+tolerance = 1e-4;
+
+% max number of iterations
+max_n_iterations = 10;
+
+% model id
+model_id = ModelID.GAUSS_2D;
+
+%% loop over different number of fits
+n_fits_all = round(logspace(2, 6, 20));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% loop
+speed = zeros(length(n_fits_all), 1);
+for i = 1:length(n_fits_all)
+    n_fits = n_fits_all(i);
+
+    % vary positions of 2D Gaussians peaks slightly
+    test_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+
+    % generate data (one column per fit)
+    data = gaussians_2d(x, y, test_parameters);
+    data = reshape(data, [n_points, n_fits]);
+
+    % add noise
+    data = data + average_noise_level * randn(size(data), 'single');
+
+    % initial parameters (randomized)
+    initial_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    % randomize relative to width for positions
+    initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+    % randomize relative for other parameters
+    initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits));
+
+    % run Gpufit (remaining optional arguments take their default values)
+    [~, states, ~, n_iterations, time] = gpufit(data, [], ...
+        model_id, initial_parameters, tolerance, max_n_iterations);
+
+    % analyze result
+    converged = states == 0;
+    speed(i) = n_fits / time;
+
+    % display result
+    fprintf(' iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ...
+        mean(n_iterations(converged)), time, speed(i));
+end
+
+%% plot
+figure();
+semilogx(n_fits_all, speed, 'bo-')
+xlabel('number of fits per function call')
+ylabel('fits per second')
+legend('Gpufit', 'Location', 'NorthWest')
+grid on;
+xlim(n_fits_all([1,end]));
+
+end
+
+function g = gaussians_2d(x, y, p)
+% Generates many 2D Gaussians peaks for a given set of parameters
+%
+% x, y - x and y grid position values
+% p    - parameters, one column per peak (amplitude, x,y center position,
+%        width, offset)
+%
+% g    - single array of size [size(x), number of peaks]
+%
+% Prints a progress bar while generating and replaces it by the number of
+% generated peaks afterwards.
+
+n_fits = size(p, 2);
+msg = sprintf('generating %d fits ', n_fits);
+% print the message as data, not as a format string
+fprintf('%s', msg);
+
+g = zeros([size(x), n_fits], 'single');
+
+progress = 0;
+L = 50; % length of progressbar
+l = 0;  % number of progress bar characters printed so far
+for i = 1 : n_fits
+
+    % parameters of the current peak (not named pi to keep the builtin constant usable)
+    p_i = p(:, i);
+    g(:, :, i) = p_i(1) * exp(-((x - p_i(2)).^2 + (y - p_i(3)).^2) / (2 * p_i(4)^2)) + p_i(5);
+
+    progress = progress + 1;
+    if progress >= n_fits / L
+        progress = 0;
+        fprintf('|');
+        l = l + 1;
+    end
+end
+% erase the message and the progress bar again
+fprintf(repmat('\b', [1, length(msg) + l]));
+fprintf('%7d fits', n_fits);
+
+end
diff --git a/Gpufit/matlab/examples/simple.m b/Gpufit/matlab/examples/simple.m
new file mode 100644
index 0000000..27487d1
--- /dev/null
+++ b/Gpufit/matlab/examples/simple.m
@@ -0,0 +1,26 @@
+function simple()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Simple example demonstrating a minimal call of all needed parameters for the Matlab interface
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Data and initial parameters are all zero, so this only demonstrates the
+% call itself; the outputs are not evaluated.
+
+% number of fits, number of points per fit
+number_fits = 10;
+number_points = 10;
+
+% model ID and number of parameter
+model_id = ModelID.GAUSS_1D;
+number_parameter = 4;
+
+% initial parameters
+% (one column per fit, type single as required by gpufit)
+initial_parameters = zeros(number_parameter, number_fits, 'single');
+
+% data
+% (one column per fit, type single as required by gpufit)
+data = zeros(number_points, number_fits, 'single');
+
+% run Gpufit
+% (the empty second argument means no weights; all optional arguments are omitted)
+[parameters, states, chi_squares, number_iterations, execution_time] = gpufit(data, [], model_id, initial_parameters);
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/gpufit.m b/Gpufit/matlab/gpufit.m
new file mode 100644
index 0000000..2e3beae
--- /dev/null
+++ b/Gpufit/matlab/gpufit.m
@@ -0,0 +1,119 @@
+function [parameters, states, chi_squares, n_iterations, time]...
+    = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+% Wrapper around the Gpufit mex file.
+%
+% Checks sizes and types of the arguments, fills in default values and
+% calls GpufitMex. The first four arguments are required; the optional
+% arguments can be omitted at the end or given as empty matrix [].
+%
+% Input
+%   data               - single, size (n_points, n_fits)
+%   weights            - single, same size as data, or []
+%   model_id           - numeric scalar, see ModelID
+%   initial_parameters - single, size (n_parameters, n_fits)
+%   tolerance          - optional, default 1e-4 (converted to single)
+%   max_n_iterations   - optional, default 25 (converted to int32)
+%   parameters_to_fit  - optional, n_parameters rows, default all ones
+%                        (converted to int32)
+%   estimator_id       - optional, default EstimatorID.LSE
+%   user_info          - optional, handed to the mex file together with
+%                        its size in bytes
+%
+% Output
+%   parameters   - size (n_parameters, n_fits)
+%   states       - returned by the mex file (1 x n_fits)
+%   chi_squares  - returned by the mex file (1 x n_fits)
+%   n_iterations - returned by the mex file (1 x n_fits)
+%   time         - duration of the mex call in seconds
+
+%% size checks
+
+% number of input parameter (variable)
+assert(nargin >= 4, 'Not enough parameters');
+if nargin < 5
+    tolerance = [];
+end
+if nargin < 6
+    max_n_iterations = [];
+end
+if nargin < 7
+    parameters_to_fit = [];
+end
+if nargin < 8
+    estimator_id = [];
+end
+if nargin < 9
+    user_info = [];
+end
+
+% data is 2D and read number of points and fits
+data_size = size(data);
+assert(length(data_size) == 2, 'data is not two-dimensional');
+n_points = data_size(1);
+n_fits = data_size(2);
+
+% consistency with weights (if given)
+if ~isempty(weights)
+    assert(isequal(data_size, size(weights)), 'Dimension mismatch between data and weights')
+end
+
+% initial parameters is 2D and read number of parameters
+initial_parameters_size = size(initial_parameters);
+assert(length(initial_parameters_size) == 2, 'initial_parameters is not two-dimensional');
+n_parameters = initial_parameters_size(1);
+assert(n_fits == initial_parameters_size(2), 'Dimension mismatch in number of fits between data and initial_parameters');
+
+% consistency with parameters_to_fit (if given)
+if ~isempty(parameters_to_fit)
+    assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit');
+end
+
+%% default values
+
+% tolerance
+if isempty(tolerance)
+    tolerance = 1e-4;
+end
+
+% max_n_iterations
+if isempty(max_n_iterations)
+    max_n_iterations = 25;
+end
+
+% estimator_id
+if isempty(estimator_id)
+    estimator_id = EstimatorID.LSE;
+end
+
+% parameters_to_fit
+if isempty(parameters_to_fit)
+    parameters_to_fit = ones(n_parameters, 1, 'int32');
+end
+
+% now only weights and user_info could be not given (empty matrix)
+
+%% type checks
+
+% data, weights (if given), initial_parameters are all single
+assert(isa(data, 'single'), 'Type of data is not single');
+if ~isempty(weights)
+    assert(isa(weights, 'single'), 'Type of weights is not single');
+end
+assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single');
+
+% parameters_to_fit is int32 (cast to int32 if incorrect type)
+if ~isa(parameters_to_fit, 'int32')
+    parameters_to_fit = int32(parameters_to_fit);
+end
+
+% max_n_iterations must be int32 (cast if incorrect type)
+if ~isa(max_n_iterations, 'int32')
+    max_n_iterations = int32(max_n_iterations);
+end
+
+% tolerance must be single (cast if incorrect type)
+if ~isa(tolerance, 'single')
+    tolerance = single(tolerance);
+end
+
+% model_id and estimator_id are read as double scalars by the mex file
+% (cast if incorrect type, e.g. if given as an integer type)
+if ~isa(model_id, 'double')
+    model_id = double(model_id);
+end
+if ~isa(estimator_id, 'double')
+    estimator_id = double(estimator_id);
+end
+
+% we don't check type of user_info, but we extract the size in bytes of it
+if ~isempty(user_info)
+    user_info_info = whos('user_info');
+    user_info_size = user_info_info.bytes;
+else
+    user_info_size = 0;
+end
+
+
+%% run Gpufit taking the time
+tic;
+[parameters, states, chi_squares, n_iterations] ...
+    = GpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size);
+
+time = toc;
+
+% reshape the output parameters array to have dimensions
+% (n_parameters,n_fits)
+parameters = reshape(parameters,n_parameters,n_fits);
+
+end
diff --git a/Gpufit/matlab/mex/GpufitMex.cpp b/Gpufit/matlab/mex/GpufitMex.cpp
new file mode 100644
index 0000000..071ed7c
--- /dev/null
+++ b/Gpufit/matlab/mex/GpufitMex.cpp
@@ -0,0 +1,150 @@
+#include "Gpufit/gpufit.h"
+
+#include <mex.h>
+
+#include <cstring>
+#include <string>
+
+/*
+    Reads a real (non complex) scalar of the class given by id from p into v.
+    Returns true on success. Returns false and leaves v untouched if p is not
+    numeric, is complex, does not hold exactly one element or has another
+    class id. The caller has to pass the class id that corresponds to T.
+    https://www.mathworks.com/help/matlab/apiref/mxclassid.html
+*/
+template<class T> inline bool get_scalar(const mxArray *p, T &v, const mxClassID id)
+{
+    if (mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1 && mxGetClassID(p) == id)
+    {
+        v = *static_cast<T*>(mxGetData(p));
+        return true;
+    }
+    return false;
+}
+
+/*
+    Entry point of the mex file. Expects exactly 13 input arguments
+    (data, weights, n_fits, n_points, tolerance, max_n_iterations,
+    estimator_id, initial_parameters, parameters_to_fit, model_id,
+    n_parameters, user_info, user_info_size) and exactly 4 output arguments
+    (parameters, states, chi_squares, n_iterations), calls gpufit with them
+    and raises a Matlab error with the id "Gpufit:Mex" on any failure.
+*/
+void mexFunction(
+    int nlhs,
+    mxArray *plhs[],
+    int nrhs,
+    mxArray const *prhs[])
+{
+    // expects a certain number of input (nrhs) and output (nlhs) arguments
+    int const expected_nrhs = 13;
+    int const expected_nlhs = 4;
+
+    // mexErrMsgIdAndTxt formats the message printf-like and does not return
+    if (nrhs != expected_nrhs)
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "%d input arguments required.", expected_nrhs);
+    }
+    if (nlhs != expected_nlhs)
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "%d output arguments required.", expected_nlhs);
+    }
+
+    // input parameters
+    // arrays are accessed with mxGetData because they are not of class double;
+    // the scalars read with mxGetPr are expected to be of class double
+    float * data = static_cast<float*>(mxGetData(prhs[0]));
+    float * weights = static_cast<float*>(mxGetData(prhs[1]));
+    std::size_t n_fits = static_cast<std::size_t>(*mxGetPr(prhs[2]));
+    std::size_t n_points = static_cast<std::size_t>(*mxGetPr(prhs[3]));
+
+    // tolerance
+    float tolerance = 0;
+    if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS))
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "tolerance is not a single");
+    }
+
+    // max_n_iterations
+    int max_n_iterations = 0;
+    if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS))
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "max_n_iteration is not int32");
+    }
+
+    int estimator_id = static_cast<int>(*mxGetPr(prhs[6]));
+    float * initial_parameters = static_cast<float*>(mxGetData(prhs[7]));
+    int * parameters_to_fit = static_cast<int*>(mxGetData(prhs[8]));
+    int model_id = static_cast<int>(*mxGetPr(prhs[9]));
+    int n_parameters = static_cast<int>(*mxGetPr(prhs[10]));
+    // user_info is handed over as raw bytes, its type is not inspected
+    char * user_info = static_cast<char*>(mxGetData(prhs[11]));
+    std::size_t user_info_size = static_cast<std::size_t>(*mxGetPr(prhs[12]));
+
+    // output parameters
+    mxArray * mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL);
+    float * output_parameters = static_cast<float*>(mxGetData(mx_parameters));
+    plhs[0] = mx_parameters;
+
+    mxArray * mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_states = static_cast<int*>(mxGetData(mx_states));
+    plhs[1] = mx_states;
+
+    mxArray * mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL);
+    float * output_chi_squares = static_cast<float*>(mxGetData(mx_chi_squares));
+    plhs[2] = mx_chi_squares;
+
+    mxArray * mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_n_iterations = static_cast<int*>(mxGetData(mx_n_iterations));
+    plhs[3] = mx_n_iterations;
+
+    // call to gpufit
+    int const status
+        = gpufit
+        (
+            n_fits,
+            n_points,
+            data,
+            weights,
+            model_id,
+            initial_parameters,
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit,
+            estimator_id,
+            user_info_size,
+            user_info,
+            output_parameters,
+            output_states,
+            output_chi_squares,
+            output_n_iterations
+        ) ;
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        // pass the error text as argument, it must not be interpreted as format
+        std::string const error = gpufit_get_last_error() ;
+        mexErrMsgIdAndTxt( "Gpufit:Mex", "%s", error.c_str() ) ;
+    }
+}
diff --git a/Gpufit/matlab/tests/gauss_fit_1d_test.m b/Gpufit/matlab/tests/gauss_fit_1d_test.m
new file mode 100644
index 0000000..412c72e
--- /dev/null
+++ b/Gpufit/matlab/tests/gauss_fit_1d_test.m
@@ -0,0 +1,35 @@
+% Equivalent/similar to tests/Gauss_Fit_1D.cpp
+%
+% Fits a single noise-free 1D Gaussian peak and checks that the fit
+% converges to the parameters the data was generated with.
+
+% constants
+n_fits = 1;
+n_points = 5;
+n_parameters = 4;
+true_parameters = single([4; 2; 0.5; 1]);
+
+% data
+% (x runs from 0 to n_points - 1)
+x = single((1:n_points)' - 1);
+y = gaussian_1d(true_parameters, x);
+data = zeros(n_points, n_fits, 'single');
+data(:, 1) = y;
+
+% model
+model_id = ModelID.GAUSS_1D;
+
+% initial_parameters
+initial_parameters = zeros(n_parameters, n_fits, 'single');
+initial_parameters(:, 1) = [2, 1.5, 0.3, 0];
+
+% call to gpufit
+% (no weights, all optional arguments take their default values)
+[parameters, states, chi_squares, n_iterations] = gpufit(data, [], model_id, initial_parameters);
+
+%% Test results
+assert(states == 0);
+assert(n_iterations < 10);
+assert(chi_squares < 1e-6);
+assert(all(abs(parameters - true_parameters) < 1e-6));
+
+function y = gaussian_1d(p, x)
+% Evaluates a 1D Gaussian peak at the positions x.
+%
+% p - parameters: p(1) amplitude, p(2) center position, p(3) width,
+%     p(4) offset
+% x - positions
+
+squared_distance = (x - p(2)).^2;
+y = p(1) * exp(-squared_distance ./ (2 * p(3).^2)) + p(4);
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/tests/run_tests.m b/Gpufit/matlab/tests/run_tests.m
new file mode 100644
index 0000000..80da345
--- /dev/null
+++ b/Gpufit/matlab/tests/run_tests.m
@@ -0,0 +1,8 @@
+function run_tests()
+% Runs all test scripts in this folder and displays the results.
+% See also: http://www.mathworks.com/help/matlab/script-based-unit-tests.html
+
+% NOTE(review): testsuite() is called without a folder argument, so this
+% presumably has to be run with this folder as current folder - confirm
+disp(run(testsuite()));
+end
\ No newline at end of file
diff --git a/Gpufit/mle.cuh b/Gpufit/mle.cuh
new file mode 100644
index 0000000..32a45a0
--- /dev/null
+++ b/Gpufit/mle.cuh
@@ -0,0 +1,179 @@
+#ifndef GPUFIT_MLE_CUH_INCLUDED
+#define GPUFIT_MLE_CUH_INCLUDED
+
+#include
+
+/* Description of the calculate_chi_square_mle function
+* =====================================================
+*
+* This function calculates the chi-square value of one data point for the
+* MLE estimator.
+*
+* Parameters:
+*
+* chi_square: An output vector of chi-square values for each data point. Only
+*             the entry at point_index is written.
+*
+* point_index: The data point index.
+*
+* data: An input vector of data.
+*
+* value: An input vector of fitting curve values.
+*
+* weight: An input vector of values for weighting chi-square values. It is not used
+*         in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* state: A pointer to a value which indicates whether the fitting process was carried
+*        out correctly or which problem occurred. It is set to 3 if the fitting curve
+*        value is negative and is left unchanged otherwise.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_chi_square_mle function
+* =============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_chi_square_mle(
+    volatile float * chi_square,
+    int const point_index,
+    float const * data,
+    float const * value,
+    float const * weight,
+    int * state,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const model = value[point_index];
+    float const measured = data[point_index];
+
+    // flag a negative fitting curve value; the chi-square is computed nevertheless
+    if (model < 0)
+    {
+        *state = 3;
+    }
+
+    float const deviation = model - measured;
+
+    // the logarithmic term is skipped for data values of exactly zero
+    chi_square[point_index] = (measured != 0)
+        ? 2 * (deviation - measured * logf(model / measured))
+        : 2 * deviation;
+}
+
+/* Description of the calculate_hessian_mle function
+* ==================================================
+*
+* This function adds the contribution of one data point to one value of the
+* hessian matrix of the MLE equation. The calculation is performed based on
+* previously calculated derivative values.
+*
+* Parameters:
+*
+* hessian: A pointer to the hessian matrix value the contribution is added to.
+*
+* point_index: The data point index.
+*
+* parameter_index_i: Index of the hessian column.
+*
+* parameter_index_j: Index of the hessian row.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivatives: An input vector of partial derivative values of the fitting
+*              curve. It is indexed with parameter_index_i and parameter_index_j.
+*
+* weight: An input vector of values for weighting hessian matrix values. It is not
+*         used in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_hessian_mle function
+* ==========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_hessian_mle(
+    double * hessian,
+    int const point_index,
+    int const parameter_index_i,
+    int const parameter_index_j,
+    float const * data,
+    float const * value,
+    float const * derivatives,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const model = value[point_index];
+
+    *hessian
+        += data[point_index]
+        / (model * model)
+        * derivatives[parameter_index_i] * derivatives[parameter_index_j];
+}
+
+/* Description of the calculate_gradient_mle function
+* ===================================================
+*
+* This function calculates the gradient value of the MLE equation for one data
+* point and one parameter based on previously calculated derivative values.
+*
+* Parameters:
+*
+* gradient: An output vector of values of the gradient vector for each data point.
+*           Only the entry at point_index is written.
+*
+* point_index: The data point index.
+*
+* parameter_index: The parameter index.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+*             curve. It is indexed with parameter_index.
+*
+* weight: An input vector of values for weighting gradient vector values. It is not
+*         used in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gradient_mle function
+* ===========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_gradient_mle(
+    volatile float * gradient,
+    int const point_index,
+    int const parameter_index,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // ratio of data value to fitting curve value at this data point
+    float const ratio = data[point_index] / value[point_index];
+
+    gradient[point_index] = -derivative[parameter_index] * (1 - ratio);
+}
+
+#endif
diff --git a/Gpufit/python/CMakeLists.txt b/Gpufit/python/CMakeLists.txt
new file mode 100644
index 0000000..1ed2b3c
--- /dev/null
+++ b/Gpufit/python/CMakeLists.txt
@@ -0,0 +1,53 @@
+
+# Python
+
+# Python package
+
+# directory the Python package is assembled in
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/pyGpufit" )
+# files copied to the top level of the package
+set( setup_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/README.txt"
+  "${CMAKE_CURRENT_SOURCE_DIR}/setup.py"
+  "${CMAKE_CURRENT_SOURCE_DIR}/setup.cfg"
+)
+# directory and files of the pygpufit module inside the package
+set( module_directory "${build_directory}/pygpufit" )
+set( module_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/pygpufit/__init__.py"
+  "${CMAKE_CURRENT_SOURCE_DIR}/pygpufit/gpufit.py"
+)
+# full path of the built Gpufit library, resolved at generate time
+set( binary $<TARGET_FILE:Gpufit> )
+
+# recreates the package directory from scratch and copies the setup files,
+# the module files and the library into it
+add_custom_target( PYTHON_PACKAGE
+  COMMAND ${CMAKE_COMMAND} -E
+    remove_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${setup_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${module_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${module_files} ${module_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary} ${module_directory}
+  COMMENT "Creating Gpufit Python package"
+)
+set_property( TARGET PYTHON_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+# the library has to be built before it can be copied
+add_dependencies( PYTHON_PACKAGE Gpufit )
+
+# without a Python interpreter only the package directory can be created
+if( NOT PYTHONINTERP_FOUND )
+  message( STATUS "Python NOT found - skipping creation of Python wheel!" )
+  return()
+endif()
+
+# Python wheel (output name is incorrect, requires platform tag, see packaging)
+
+# runs setup.py inside the package directory, first cleaning, then building the wheel
+add_custom_target( PYTHON_WHEEL ALL
+  COMMAND ${CMAKE_COMMAND} -E
+    chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py clean --all
+  COMMAND ${CMAKE_COMMAND} -E
+    chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py bdist_wheel
+  COMMENT "Preparing Python Wheel"
+)
+set_target_properties( PYTHON_WHEEL PROPERTIES FOLDER CMakePredefinedTargets )
+# the wheel is built from the assembled package directory
+add_dependencies( PYTHON_WHEEL PYTHON_PACKAGE )
+
+# add launcher to Python package
diff --git a/Gpufit/python/README.txt b/Gpufit/python/README.txt
new file mode 100644
index 0000000..2e58557
--- /dev/null
+++ b/Gpufit/python/README.txt
@@ -0,0 +1,27 @@
+Python binding for the [Gpufit library](https://github.com/gpufit/Gpufit), which implements Levenberg-Marquardt curve fitting in CUDA.
+
+Requirements
+
+- A CUDA capable graphics card with a recent Nvidia graphics driver (at least 367.48 / July 2016)
+- Windows
+- Python 2 or 3 with NumPy
+
+Installation
+
+Currently the wheel file has to be installed locally.
+
+If NumPy is not yet installed, install it using pip from the command line
+
+pip install numpy
+
+Then install pyGpufit from the local folder via:
+
+pip install --no-index --find-links=LocalPathToWheelFile pyGpufit
+
+Examples
+
+See examples folder.
+
+Troubleshooting
+
+A common reason for the error message 'CUDA driver version is insufficient for CUDA runtime version' is an outdated Nvidia graphics driver.
\ No newline at end of file
diff --git a/Gpufit/python/examples/gauss2d.py b/Gpufit/python/examples/gauss2d.py
new file mode 100644
index 0000000..435c4de
--- /dev/null
+++ b/Gpufit/python/examples/gauss2d.py
@@ -0,0 +1,112 @@
+"""
+ Example of the Python binding of the Gpufit library which implements
+ Levenberg Marquardt curve fitting in CUDA
+ https://github.com/gpufit/Gpufit
+
+ Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+ http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+ This example additionally requires numpy.
+"""
+
+import numpy as np
+import pygpufit.gpufit as gf
+
+def generate_gauss_2d(p, xi, yi):
+    """
+    Evaluates a symmetric 2D Gaussian peak on the given coordinates.
+    http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+
+    :param p: Parameters (amplitude, x,y center position, width, offset)
+    :param xi: x positions
+    :param yi: y positions
+    :return: The peak values, one per (xi, yi) position.
+    """
+    amplitude, center_x, center_y, width, offset = p[0], p[1], p[2], p[3], p[4]
+
+    squared_distance = np.square(xi - center_x) + np.square(yi - center_y)
+    exponent = -squared_distance / (2*width*width)
+    return amplitude * np.exp(exponent) + offset
+
+if __name__ == '__main__':
+
+    if not gf.cuda_available():
+        raise RuntimeError(gf.get_last_error())
+
+    # number of fits and fit points
+    number_fits = 10000
+    size_x = 12
+    number_points = size_x * size_x
+    number_parameters = 5
+
+    # set input arguments
+
+    # true parameters
+    true_parameters = np.array((10, 5.5, 5.5, 3, 10), dtype=np.float32)
+
+    # initialize random number generator
+    np.random.seed(0)
+
+    # initial parameters (relative randomized, positions relative to width)
+    initial_parameters = np.tile(true_parameters, (number_fits, 1))
+    initial_parameters[:, (1,2)] += true_parameters[3] * (-0.2 + 0.4 * np.random.rand(number_fits, 2))
+    initial_parameters[:, (0, 3, 4)] *= 0.8 + 0.4 * np.random.rand(number_fits, 3)
+
+    # generate x and y values
+    g = np.arange(size_x)
+    yi, xi = np.meshgrid(g, g, indexing='ij')
+    xi = xi.astype(np.float32)
+    yi = yi.astype(np.float32)
+
+    # generate data (one noise-free peak, repeated for every fit)
+    data = generate_gauss_2d(true_parameters, xi, yi)
+    data = np.reshape(data, (1, number_points))
+    data = np.tile(data, (number_fits, 1))
+
+    # add Poisson noise
+    data = np.random.poisson(data)
+    data = data.astype(np.float32)
+
+    # tolerance
+    tolerance = 0.0001
+
+    # maximum number of iterations
+    max_number_iterations = 20
+
+    # estimator ID
+    estimator_id = gf.EstimatorID.MLE
+
+    # model ID
+    model_id = gf.ModelID.GAUSS_2D
+
+    # run Gpufit
+    parameters, states, chi_squares, number_iterations, execution_time = gf.fit(
+        data, None, model_id, initial_parameters, tolerance, max_number_iterations, None, estimator_id, None)
+
+    # print fit results
+    converged = states == 0
+    print('*Gpufit*')
+
+    # print summary
+    print('\nmodel ID: {}'.format(model_id))
+    print('number of fits: {}'.format(number_fits))
+    print('fit size: {} x {}'.format(size_x, size_x))
+    print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged])))
+    print('iterations: {:.2f}'.format(np.mean(number_iterations[converged])))
+    print('time: {:.2f} s'.format(execution_time))
+
+    # get fit states (the leading 100.0 forces floating point division, also on Python 2)
+    number_converged = np.sum(converged)
+    print('\nratio converged {:6.2f} %'.format(100.0 * number_converged / number_fits))
+    print('ratio max it. exceeded {:6.2f} %'.format(100.0 * np.sum(states == 1) / number_fits))
+    print('ratio singular hessian {:6.2f} %'.format(100.0 * np.sum(states == 2) / number_fits))
+    print('ratio neg curvature MLE {:6.2f} %'.format(100.0 * np.sum(states == 3) / number_fits))
+
+    # mean, std of fitted parameters
+    converged_parameters = parameters[converged, :]
+    converged_parameters_mean = np.mean(converged_parameters, axis=0)
+    converged_parameters_std = np.std(converged_parameters, axis=0)
+    print('\nparameters of 2D Gaussian peak')
+    for i in range(number_parameters):
+        print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i]))
+
diff --git a/Gpufit/python/examples/gauss2d_plot.py b/Gpufit/python/examples/gauss2d_plot.py
new file mode 100644
index 0000000..d7feb8e
--- /dev/null
+++ b/Gpufit/python/examples/gauss2d_plot.py
@@ -0,0 +1,114 @@
+"""
+ Example of the Python binding of the Gpufit library which implements
+ Levenberg Marquardt curve fitting in CUDA
+ https://github.com/gpufit/Gpufit
+
+ Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+ repeated for a different total number of fits each time and plotting the results
+ http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+ This example additionally requires numpy (http://www.numpy.org/) and matplotlib (http://matplotlib.org/).
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pygpufit.gpufit as gf
+
+def gaussians_2d(x, y, p):
+    """
+    Generates many 2D Gaussians peaks for a given set of parameters
+
+    :param x: x positions, 2D array
+    :param y: y positions, same shape as x
+    :param p: Parameters, one row (amplitude, x center, y center, width, offset) per peak
+    :return: np.float32 array of shape (number of peaks, x.shape[0], x.shape[1])
+    """
+    n_fits = p.shape[0]
+    peaks = np.zeros((n_fits, x.shape[0], x.shape[1]), dtype=np.float32)
+    for i in range(n_fits):  # one peak per parameter row
+        pi = p[i, :]
+        arg = -(np.square(x - pi[1]) + np.square(y - pi[2])) / (2 * pi[3] * pi[3])
+        peaks[i, :, :] = pi[0] * np.exp(arg) + pi[4]
+    return peaks
+
+if __name__ == '__main__':
+
+    print('\n')
+
+    # number of fit points
+    size_x = 5
+    number_points = size_x * size_x
+
+    # set input arguments
+
+    # true parameters
+    mean_true_parameters = np.array((100, 2, 2, 1, 10), dtype=np.float32)
+
+    # average noise level
+    average_noise_level = 10
+
+    # initialize random number generator
+    np.random.seed(0)
+
+    # tolerance
+    tolerance = 0.0001
+
+    # maximum number of iterations
+    max_number_iterations = 10
+
+    # model ID
+    model_id = gf.ModelID.GAUSS_2D
+
+    # loop over different number of fits (builtin int, the np.int alias no longer exists in recent NumPy)
+    n_fits_all = np.around(np.logspace(2, 6, 20)).astype(int)
+
+    # generate x and y values
+    g = np.arange(size_x)
+    yi, xi = np.meshgrid(g, g, indexing='ij')
+    xi = xi.astype(np.float32)
+    yi = yi.astype(np.float32)
+
+    # loop
+    speed = np.zeros(n_fits_all.size)
+    for i in range(n_fits_all.size):
+        n_fits = n_fits_all[i]
+
+        # vary positions of 2D Gaussian peaks slightly
+        test_parameters = np.tile(mean_true_parameters, (n_fits, 1))
+        test_parameters[:, (1,2)] += mean_true_parameters[3] * (-0.2 + 0.4 * np.random.rand(n_fits, 2))
+
+        # generate data
+        data = gaussians_2d(xi, yi, test_parameters)
+        data = np.reshape(data, (n_fits, number_points))
+
+        # add noise
+        data += np.random.normal(scale=average_noise_level, size=data.shape)
+
+        # initial parameters (randomized relative (to width for position))
+        initial_parameters = np.tile(mean_true_parameters, (n_fits, 1))
+        initial_parameters[:, (1,2)] += mean_true_parameters[3] * (-0.2 + 0.4 * np.random.rand(n_fits, 2))
+        initial_parameters[:, (0,3,4)] *= 0.8 + 0.4 * np.random.rand(n_fits, 3)
+
+        # run Gpufit
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations)
+
+        # analyze result
+        converged = states == 0
+        speed[i] = n_fits / execution_time
+        precision_x0 = np.std(parameters[converged, 1] - test_parameters[converged, 1], axis=0, dtype=np.float64)
+
+        # display result
+        print('{:7} fits iterations: {:6.2f} | time: {:6.3f} s | speed: {:8.0f} fits/s'
+              .format(n_fits, np.mean(number_iterations[converged]), execution_time, speed[i]))
+
+    # plot (kept inside the guard: n_fits_all and speed only exist when the file
+    # runs as a script, so importing the module must not reach this code)
+    plt.semilogx(n_fits_all, speed, 'bo-')
+    plt.grid(True)
+    plt.xlabel('number of fits per function call')
+    plt.ylabel('fits per second')
+    plt.legend(['Gpufit'], loc='upper left')
+    ax = plt.gca()
+    ax.set_xlim(n_fits_all[0], n_fits_all[-1])
+
+    plt.show()
\ No newline at end of file
diff --git a/Gpufit/python/examples/simple.py b/Gpufit/python/examples/simple.py
new file mode 100644
index 0000000..5184001
--- /dev/null
+++ b/Gpufit/python/examples/simple.py
@@ -0,0 +1,30 @@
+"""
+ Example of the Python binding of the Gpufit library which implements
+ Levenberg Marquardt curve fitting in CUDA
+ https://github.com/gpufit/Gpufit
+
+ Simple example demonstrating a minimal call of all needed parameters for the Python interface
+ http://gpufit.readthedocs.io/en/latest/bindings.html#python
+"""
+
+import numpy as np
+import pygpufit.gpufit as gf
+
+if __name__ == '__main__':
+
+    # number of fits, number of points per fit
+    number_fits = 10
+    number_points = 10
+
+    # model ID and number of parameter (GAUSS_1D takes four: amplitude, center, width, offset)
+    model_id = gf.ModelID.GAUSS_1D
+    number_parameter = 4
+
+    # initial parameters
+    initial_parameters = np.zeros((number_fits, number_parameter), dtype=np.float32)
+
+    # data
+    data = np.zeros((number_fits, number_points), dtype=np.float32)
+
+    # run Gpufit
+    parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters)
\ No newline at end of file
diff --git a/Gpufit/python/pygpufit/__init__.py b/Gpufit/python/pygpufit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Gpufit/python/pygpufit/gpufit.py b/Gpufit/python/pygpufit/gpufit.py
new file mode 100644
index 0000000..22a889f
--- /dev/null
+++ b/Gpufit/python/pygpufit/gpufit.py
@@ -0,0 +1,201 @@
+"""
+ Python binding for Gpufit, a Levenberg Marquardt curve fitting library written in CUDA
+ See https://github.com/gpufit/Gpufit, http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+ The binding is based on ctypes.
+ See https://docs.python.org/3.5/library/ctypes.html, http://www.scipy-lectures.org/advanced/interfacing_with_c/interfacing_with_c.html
+"""
+
+import os
+import time
+from ctypes import cdll, POINTER, c_int, c_float, c_char, c_char_p, c_size_t
+import numpy as np
+
+# load the library that sits next to this module (this happens when the module is imported)
+package_dir = os.path.dirname(os.path.realpath(__file__))
+lib_path = os.path.join(package_dir, 'Gpufit.dll') # this will only work on Windows
+lib = cdll.LoadLibrary(lib_path)
+# gpufit function in the dll; fit() passes its four output arrays as the last four pointers
+gpufit_func = lib.gpufit
+gpufit_func.restype = c_int
+gpufit_func.argtypes = [
+    c_size_t, c_size_t, POINTER(c_float), POINTER(c_float), c_int, POINTER(c_float),
+    c_float, c_int, POINTER(c_int), c_int, c_size_t, POINTER(c_char),
+    POINTER(c_float), POINTER(c_int), POINTER(c_float), POINTER(c_int)]
+# gpufit_get_last_error function in the dll
+error_func = lib.gpufit_get_last_error
+error_func.restype = c_char_p
+error_func.argtypes = None
+# gpufit_cuda_available function in the dll
+cuda_available_func = lib.gpufit_cuda_available
+cuda_available_func.restype = c_int
+cuda_available_func.argtypes = None
+
+
+class ModelID:
+    """Integer IDs of the fit models; fit() hands the chosen one to the library as model_id."""
+    GAUSS_1D = 0
+    GAUSS_2D = 1
+    GAUSS_2D_ELLIPTIC = 2
+    GAUSS_2D_ROTATED = 3
+    CAUCHY_2D_ELLIPTIC = 4
+    LINEAR_1D = 5
+
+
+class EstimatorID:
+    """Integer IDs of the estimators; fit() uses LSE when no estimator_id is given."""
+    LSE = 0
+    MLE = 1
+
+
+def fit(data, weights, model_id, initial_parameters, tolerance=None, max_number_iterations=None,
+        parameters_to_fit=None, estimator_id=None, user_info=None):
+    """
+    Calls the C interface fit function in the library.
+    (see also http://gpufit.readthedocs.io/en/latest/bindings.html#python)
+
+    All 2D NumPy arrays must be in row-major order (standard in NumPy), i.e. array.flags.C_CONTIGUOUS must be True
+    (see also https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html#internal-memory-layout-of-an-ndarray)
+
+    :param data: The data - 2D NumPy array of dimension [number_fits, number_points] and data type np.float32
+    :param weights: The weights - 2D NumPy array of the same dimension and data type as parameter data or None (no weights available)
+    :param model_id: The model ID
+    :param initial_parameters: Initial values for parameters - NumPy array of dimension [number_fits, number_parameters] and data type np.float32
+    :param tolerance: The fit tolerance or None (will use default value)
+    :param max_number_iterations: The maximal number of iterations or None (will use default value)
+    :param parameters_to_fit: Which parameters to fit - NumPy array of length number_parameters and type np.int32 or None (will fit all parameters)
+    :param estimator_id: The Estimator ID or None (will use default values)
+    :param user_info: User info - NumPy array of type np.char or None (no user info available)
+    :return: parameters, states, chi_squares, number_iterations, execution_time
+    :raises RuntimeError: if an argument check fails or the library call returns a non-zero status
+    """
+
+    # check all 2D NumPy arrays for row-major memory layout (otherwise interpretation of order of dimensions fails)
+    if not data.flags.c_contiguous:
+        raise RuntimeError('Memory layout of data array mismatch.')
+
+    if weights is not None and not weights.flags.c_contiguous:
+        raise RuntimeError('Memory layout of weights array mismatch.')
+
+    if not initial_parameters.flags.c_contiguous:
+        raise RuntimeError('Memory layout of initial_parameters array mismatch.')
+
+    # size check: data is 2D and read number of points and fits
+    if data.ndim != 2:
+        raise RuntimeError('data is not two-dimensional')
+    number_points = data.shape[1]
+    number_fits = data.shape[0]
+
+    # size check: consistency with weights (if given); comparing the shape tuples checks length and content
+    if weights is not None and data.shape != weights.shape:
+        raise RuntimeError('dimension mismatch between data and weights')
+
+    # size check: initial parameters is 2D and read number of parameters
+    if initial_parameters.ndim != 2:
+        raise RuntimeError('initial_parameters is not two-dimensional')
+    number_parameters = initial_parameters.shape[1]
+    if initial_parameters.shape[0] != number_fits:
+        raise RuntimeError('dimension mismatch in number of fits between data and initial_parameters')
+
+    # size check: consistency with parameters_to_fit (if given)
+    if parameters_to_fit is not None and parameters_to_fit.shape[0] != number_parameters:
+        raise RuntimeError('dimension mismatch in number of parameters between initial_parameters and parameters_to_fit')
+
+    # default value: tolerance (only None selects the default, an explicit 0 is passed on unchanged)
+    if tolerance is None:
+        tolerance = 1e-4
+
+    # default value: max_number_iterations (only None selects the default)
+    if max_number_iterations is None:
+        max_number_iterations = 25
+
+    # default value: estimator ID
+    if estimator_id is None:
+        estimator_id = EstimatorID.LSE
+
+    # default value: parameters_to_fit
+    if parameters_to_fit is None:
+        parameters_to_fit = np.ones(number_parameters, dtype=np.int32)
+
+    # now only weights and user_info could be not given
+
+    # type check: data, weights (if given), initial_parameters are all np.float32
+    if data.dtype != np.float32:
+        raise RuntimeError('type of data is not np.float32')
+    if weights is not None and weights.dtype != np.float32:
+        raise RuntimeError('type of weights is not np.float32')
+    if initial_parameters.dtype != np.float32:
+        raise RuntimeError('type of initial_parameters is not np.float32')
+
+    # type check: parameters_to_fit is np.int32
+    if parameters_to_fit.dtype != np.int32:
+        raise RuntimeError('type of parameters_to_fit is not np.int32')
+
+    # we don't check type of user_info, but we extract the size in bytes of it
+    if user_info is not None:
+        user_info_size = user_info.nbytes
+    else:
+        user_info_size = 0
+
+    # pre-allocate output variables
+    parameters = np.zeros((number_fits, number_parameters), dtype=np.float32)
+    states = np.zeros(number_fits, dtype=np.int32)
+    chi_squares = np.zeros(number_fits, dtype=np.float32)
+    number_iterations = np.zeros(number_fits, dtype=np.int32)
+
+    # conversion to ctypes types for optional C interface parameters using NULL pointer (None) as default argument
+    if weights is not None:
+        weights_p = weights.ctypes.data_as(gpufit_func.argtypes[3])
+    else:
+        weights_p = None
+    if user_info is not None:
+        user_info_p = user_info.ctypes.data_as(gpufit_func.argtypes[11])
+    else:
+        user_info_p = None
+
+    # call into the library (measure time); time.clock is gone since Python 3.8, Python 2 has no perf_counter
+    timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock
+    t0 = timer()
+    status = gpufit_func(
+        gpufit_func.argtypes[0](number_fits),
+        gpufit_func.argtypes[1](number_points),
+        data.ctypes.data_as(gpufit_func.argtypes[2]),
+        weights_p,
+        gpufit_func.argtypes[4](model_id),
+        initial_parameters.ctypes.data_as(gpufit_func.argtypes[5]),
+        gpufit_func.argtypes[6](tolerance),
+        gpufit_func.argtypes[7](max_number_iterations),
+        parameters_to_fit.ctypes.data_as(gpufit_func.argtypes[8]),
+        gpufit_func.argtypes[9](estimator_id),
+        gpufit_func.argtypes[10](user_info_size),
+        user_info_p,
+        parameters.ctypes.data_as(gpufit_func.argtypes[12]),
+        states.ctypes.data_as(gpufit_func.argtypes[13]),
+        chi_squares.ctypes.data_as(gpufit_func.argtypes[14]),
+        number_iterations.ctypes.data_as(gpufit_func.argtypes[15]))
+    t1 = timer()
+
+    # check status
+    if status != 0:
+        # get error from last error and raise runtime error
+        error_message = error_func()
+        raise RuntimeError('status = {}, message = {}'.format(status, error_message))
+
+    # return output values (execution time in seconds)
+    return parameters, states, chi_squares, number_iterations, t1 - t0
+
+
+def get_last_error():
+    """
+    Queries the gpufit_get_last_error function of the library.
+    :return: The value returned by the library call (its return type is declared as c_char_p)
+    """
+    return error_func()
+
+
+def cuda_available():
+    """
+    Asks the library, via its gpufit_cuda_available function, whether CUDA can be used.
+    :return: True if CUDA is available, False otherwise
+    """
+    return bool(cuda_available_func())
diff --git a/Gpufit/python/requirements.txt b/Gpufit/python/requirements.txt
new file mode 100644
index 0000000..b316bf2
--- /dev/null
+++ b/Gpufit/python/requirements.txt
@@ -0,0 +1 @@
+NumPy>=1.8
\ No newline at end of file
diff --git a/Gpufit/python/setup.cfg b/Gpufit/python/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/Gpufit/python/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/Gpufit/python/setup.py b/Gpufit/python/setup.py
new file mode 100644
index 0000000..c2e2b83
--- /dev/null
+++ b/Gpufit/python/setup.py
@@ -0,0 +1,40 @@
+"""
+ setup script for pyGpufit
+
+ TODO get version, get meaningful email
+"""
+
+from setuptools import setup, find_packages
+import os
+from io import open # to have encoding as parameter of open on Python >=2.6
+
+HERE = os.path.abspath(os.path.dirname(__file__))  # absolute path of the directory containing this script
+# trove classifiers describing the package
+CLASSIFIERS = ['Development Status :: 5 - Production/Stable',
+               'Intended Audience :: End Users/Desktop',
+               'Operating System :: Microsoft :: Windows',
+               'Topic :: Scientific/Engineering',
+               'Topic :: Software Development :: Libraries']
+
+def get_long_description():
+    """Returns the text of the README.txt located next to this script, read as UTF-8."""
+    readme_path = os.path.join(HERE, 'README.txt')
+    with open(readme_path, encoding='utf-8') as readme:
+        text = readme.read()
+    return text
+
+if __name__ == "__main__":
+    setup(name='pyGpufit',
+          version='1.0.0',
+          description='Levenberg Marquardt curve fitting in CUDA',
+          long_description=get_long_description(),
+          url='https://github.com/gpufit/Gpufit',
+          author='M. Bates, A. Przybylski, B. Thiel, and J. Keller-Findeisen',
+          author_email='a@b.c',
+          license='',
+          classifiers=CLASSIFIERS,  # the trove classifiers defined at module level
+          keywords='Levenberg Marquardt, curve fitting, CUDA',
+          packages=find_packages(where=HERE),
+          package_data={'pygpufit': ['*.dll']},  # ship the library that gpufit.py loads from its own directory
+          install_requires=['NumPy>=1.8'],  # same lower bound as requirements.txt
+          zip_safe=False)
\ No newline at end of file
diff --git a/Gpufit/python/tests/run_tests.py b/Gpufit/python/tests/run_tests.py
new file mode 100644
index 0000000..5395da2
--- /dev/null
+++ b/Gpufit/python/tests/run_tests.py
@@ -0,0 +1,19 @@
+"""
+Discovers all tests and runs them. Assumes that the initial working directory is the tests directory.
+"""
+
+import sys
+import unittest
+
+if __name__ == '__main__':
+
+    # discover the test modules below the current working directory
+    tests = unittest.defaultTestLoader.discover('.')
+
+    runner = unittest.TextTestRunner()
+
+    results = runner.run(tests)
+
+    # exit code: number of failed plus errored tests; counting failures alone
+    # would signal success when a test raised an unexpected exception
+    sys.exit(len(results.failures) + len(results.errors))
\ No newline at end of file
diff --git a/Gpufit/python/tests/test_gaussian_fit_1d.py b/Gpufit/python/tests/test_gaussian_fit_1d.py
new file mode 100644
index 0000000..a2f2bd7
--- /dev/null
+++ b/Gpufit/python/tests/test_gaussian_fit_1d.py
@@ -0,0 +1,76 @@
+"""
+ Equivalent to https://github.com/gpufit/Gpufit/blob/master/Gpufit/tests/Gauss_Fit_1D.cpp
+"""
+
+import unittest
+import numpy as np
+import pygpufit.gpufit as gf
+
+def generate_gauss_1d(parameters, x):
+    """
+    Evaluates a 1D Gaussian curve at the given positions.
+
+    :param parameters: The parameters (a, x0, s, b), used as amplitude,
+        center, width and offset in the formula below
+    :param x: The x values
+    :return: a * exp(-(x - x0)^2 / (2 * s^2)) + b for every x
+    """
+
+    amplitude, center, width, offset = parameters[0], parameters[1], parameters[2], parameters[3]
+
+    # same order of operations as the formula in the docstring
+    exponent = -np.square(x - center) / (2 * width**2)
+    bell = np.exp(exponent)
+
+    return amplitude * bell + offset
+
+class Test(unittest.TestCase):
+
+    def test_gaussian_fit_1d(self):
+        # constants
+        n_fits = 1
+        n_points = 5
+        n_parameter = 4 # model will be GAUSS_1D
+
+        # true parameters
+        true_parameters = np.array((4, 2, 0.5, 1), dtype=np.float32)
+
+        # generate data (noise-free curve of the true parameters)
+        data = np.empty((n_fits, n_points), dtype=np.float32)
+        x = np.arange(n_points, dtype=np.float32)
+        data[0, :] = generate_gauss_1d(true_parameters, x)
+
+        # tolerance
+        tolerance = 0.001
+
+        # max_n_iterations
+        max_n_iterations = 10
+
+        # model id
+        model_id = gf.ModelID.GAUSS_1D
+
+        # initial parameters
+        initial_parameters = np.empty((n_fits, n_parameter), dtype=np.float32)
+        initial_parameters[0, :] = (2, 1.5, 0.3, 0)
+
+        # call to gpufit
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(
+            data, None, model_id, initial_parameters, tolerance,
+            max_n_iterations, None, None, None)
+
+        # print results
+        for i in range(n_parameter):
+            print(' p{} true {} fit {}'.format(i, true_parameters[i], parameters[0, i]))
+        print('fit state : {}'.format(states))
+        print('chi square: {}'.format(chi_squares))
+        print('iterations: {}'.format(number_iterations))
+        print('time: {} s'.format(execution_time))
+        # unittest assertions instead of assert statements, which are skipped under python -O
+        self.assertTrue(np.all(chi_squares < 1e-6))
+        self.assertTrue(np.all(states == 0))
+        self.assertTrue(np.all(number_iterations <= max_n_iterations))
+        for i in range(n_parameter):
+            self.assertLess(abs(true_parameters[i] - parameters[0, i]), 1e-6)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/Gpufit/python/tests/test_linear_regression.py b/Gpufit/python/tests/test_linear_regression.py
new file mode 100644
index 0000000..ad05ff4
--- /dev/null
+++ b/Gpufit/python/tests/test_linear_regression.py
@@ -0,0 +1,60 @@
+"""
+ Equivalent to https://github.com/gpufit/Gpufit/blob/master/Gpufit/tests/Linear_Fit_1D.cpp
+"""
+
+import unittest
+import numpy as np
+import pygpufit.gpufit as gf
+
+class Test(unittest.TestCase):
+    """Fits the LINEAR_1D model; the test method below still carries the name of the Gaussian test."""
+    def test_gaussian_fit_1d(self):
+        # constants
+        n_fits = 1
+        n_points = 2
+        n_parameter = 2
+
+        # true parameters
+        true_parameters = np.array((0, 1), dtype=np.float32)
+
+        # data values
+        data = np.empty((n_fits, n_points), dtype=np.float32)
+        data[0, :] = (0, 1)
+
+        # max number iterations
+        max_number_iterations = 10
+
+        # initial parameters
+        initial_parameters = np.empty((n_fits, n_parameter), dtype=np.float32)
+        initial_parameters[0, :] = (0, 0)
+
+        # model id
+        model_id = gf.ModelID.LINEAR_1D
+
+        # tolerance
+        tolerance = 0.001
+
+        # user info
+        user_info = np.array((0, 1), dtype=np.float32)
+
+        # call to gpufit (the iteration limit is handed over so that the check below refers to it)
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(
+            data, None, model_id, initial_parameters, tolerance,
+            max_number_iterations, None, None, user_info)
+
+        # print results
+        for i in range(n_parameter):
+            print(' p{} true {} fit {}'.format(i, true_parameters[i], parameters[0, i]))
+        print('fit state : {}'.format(states))
+        print('chi square: {}'.format(chi_squares))
+        print('iterations: {}'.format(number_iterations))
+        print('time: {} s'.format(execution_time))
+        # unittest assertions instead of assert statements, which are skipped under python -O
+        self.assertTrue(np.all(chi_squares < 1e-6))
+        self.assertTrue(np.all(states == 0))
+        self.assertTrue(np.all(number_iterations <= max_number_iterations))
+        for i in range(n_parameter):
+            self.assertLess(abs(true_parameters[i] - parameters[0, i]), 1e-6)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/Gpufit/tests/CMakeLists.txt b/Gpufit/tests/CMakeLists.txt
new file mode 100644
index 0000000..a53ba34
--- /dev/null
+++ b/Gpufit/tests/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Tests
+
+foreach( test_name
+    Error_Handling Linear_Fit_1D
+    Gauss_Fit_1D Gauss_Fit_2D
+    Gauss_Fit_2D_Elliptic Gauss_Fit_2D_Rotated
+    Cauchy_Fit_2D_Elliptic )
+    add_boost_test( Gpufit ${test_name} )
+endforeach()
diff --git a/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp b/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp
new file mode 100644
index 0000000..461c726
--- /dev/null
+++ b/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp
@@ -0,0 +1,73 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills values, read as a square row-major image, with an elliptic 2D Cauchy peak on a constant offset.
+template<std::size_t SIZE>
+void generate_cauchy_2d_elliptic(std::array< float, SIZE>& values)
+{
+    int const size_x = int(std::sqrt(SIZE));
+    int const size_y = size_x;
+
+    float const a = 4;
+    float const x0 = (float(size_x) - 1.f) / 2.f;
+    float const y0 = (float(size_y) - 1.f) / 2.f;
+    float const sx = 0.4f;
+    float const sy = 0.6f;
+    float const b = 1.f;
+    for (int point_index_y = 0; point_index_y < size_y; point_index_y++)
+    {
+        for (int point_index_x = 0; point_index_x < size_x; point_index_x++)
+        {
+            int const point_index = point_index_y * size_x + point_index_x;
+            float const argx = ((x0 - point_index_x) / sx) *((x0 - point_index_x) / sx) + 1.f;
+            float const argy = ((y0 - point_index_y) / sy) *((y0 - point_index_y) / sy) + 1.f;
+            values[point_index] = a / argx / argy + b;
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Cauchy_Fit_2D_Elliptic )
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 25 } ; // read as a 5 x 5 image by the generator
+    std::array< float, n_points > data{};
+    generate_cauchy_2d_elliptic(data);
+    std::array< float, n_points > weights{};
+    std::fill(weights.begin(), weights.end(), 1.f); // every point gets the weight 1
+    std::array< float, 6 > initial_parameters{ { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f } };
+    float tolerance{ 0.001f };
+    int max_n_iterations{ 100 };
+    std::array< int, 6 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1 } }; // all six parameters are fitted
+    std::array< float, 6 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+
+    int const status
+        = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            CAUCHY_2D_ELLIPTIC,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            LSE,
+            0, // size of the user info: none is passed
+            0, // user info pointer: none is passed
+            output_parameters.data(),
+            &output_states,
+            &output_chi_square,
+            &output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == 0 ) ; // only the return status is checked, not the fitted values
+}
diff --git a/Gpufit/tests/Error_Handling.cpp b/Gpufit/tests/Error_Handling.cpp
new file mode 100644
index 0000000..c35a078
--- /dev/null
+++ b/Gpufit/tests/Error_Handling.cpp
@@ -0,0 +1,51 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+// Expects gpufit to fail with status -1 and a specific error message for the configuration below.
+BOOST_AUTO_TEST_CASE( Error_Handling )
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 2 } ;
+    std::array< float, n_points > data{ { 0, 1 } } ;
+    std::array< float, n_points > weights{ { 1, 1 } } ;
+    std::array< float, 2 > initial_parameters{ { 0, 0 } } ;
+    float tolerance{ 0.001f } ;
+    int max_n_iterations{ 10 } ;
+    std::array< int, 2 > parameters_to_fit{ { 0, 0 } } ; // no parameter selected; presumably what triggers the error - TODO confirm
+    std::array< int, 2 > user_info{ { 0, 1 } } ;
+    std::array< float, 2 > output_parameters ;
+    int output_states ;
+    float output_chi_square ;
+    int output_n_iterations ;
+
+    int const status
+        = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            LINEAR_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            LSE,
+            n_points * sizeof( int ), // size of the user info in bytes
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            & output_states,
+            & output_chi_square,
+            & output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == - 1 ) ;
+
+    std::string const error = gpufit_get_last_error() ;
+
+    BOOST_CHECK( error == "invalid configuration argument" ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_1D.cpp b/Gpufit/tests/Gauss_Fit_1D.cpp
new file mode 100644
index 0000000..81a8c64
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_1D.cpp
@@ -0,0 +1,87 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills values with a 1D Gaussian (parameters: a, x0, s, b) evaluated at the positions 0 .. n_points - 1.
+template<std::size_t n_points>
+void generate_gauss_1d(
+    std::array< float, n_points >& values,
+    std::array< float, 4 > const & parameters )
+{
+    float const a = parameters[ 0 ];
+    float const x0 = parameters[ 1 ];
+    float const s = parameters[ 2 ];
+    float const b = parameters[ 3 ];
+    for ( int point_index = 0; point_index < n_points; point_index++ )
+    {
+        float const argx = ( ( point_index - x0 )*( point_index - x0 ) ) / ( 2.f * s * s );
+        float const ex = exp( -argx );
+        values[ point_index ] = a * ex + b;
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_1D )
+{
+    /*
+        Performs a single fit using the GAUSS_1D model.
+        - Doesn't use user_info or weights.
+        - No noise is added.
+        - Checks fitted parameters equalling the true parameters.
+    */
+
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 5 } ;
+
+    std::array< float, 4 > const true_parameters{ { 4.f, 2.f, 0.5f, 1.f } };
+
+    std::array< float, n_points > data{};
+    generate_gauss_1d( data, true_parameters );
+
+    std::array< float, 4 > initial_parameters{ { 2.f, 1.5f, 0.3f, 0.f } };
+
+    float tolerance{ 0.001f };
+
+    int max_n_iterations{ 10 };
+
+    std::array< int, 4 > parameters_to_fit{ { 1, 1, 1, 1 } };
+
+    std::array< float, 4 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+
+    int const status
+        = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            0,
+            GAUSS_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            LSE,
+            0,
+            0,
+            output_parameters.data(),
+            &output_states,
+            &output_chi_square,
+            &output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == 0 ) ;
+    BOOST_CHECK( output_states == 0 );
+    BOOST_CHECK( output_chi_square < 1e-6f );
+    BOOST_CHECK( output_n_iterations <= max_n_iterations );
+
+    for ( std::size_t i = 0; i < true_parameters.size(); i++ ) // std::fabs has a float overload; std::fabsf is not portable
+    {
+        BOOST_CHECK( std::fabs( output_parameters[ i ] - true_parameters[ i ] ) < 1e-6f );
+    }
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D.cpp b/Gpufit/tests/Gauss_Fit_2D.cpp
new file mode 100644
index 0000000..0222933
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D.cpp
@@ -0,0 +1,96 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include
+
+#include
+
+template< std::size_t SIZE >
+void generate_gauss_2d(std::array< float , SIZE>& values)
+{
+    // Fills `values`, read as a row-major square image of side floor(sqrt(SIZE)),
+    // with a noise-free symmetric 2D Gaussian peak centred on the image:
+    //     values[y * side + x] = a * exp(-(x - c)^2 / (2 s^2)) * exp(-(y - c)^2 / (2 s^2)) + b
+    // Entries past side * side (non-square SIZE) are left untouched.
+    int const side = int(std::sqrt(SIZE));
+    float const a = 4.f;                             // amplitude
+    float const center = (float(side) - 1.f) / 2.f;  // c, shared by x and y
+    float const s = 0.5f;                            // width
+    float const b = 1.f;                             // offset
+    float const two_s_squared = 2.f * s * s;         // loop-invariant denominator
+
+    for (int y = 0; y < side; y++)
+    {
+        // std::exp selects the float overload; the y factor is constant along a row
+        float const ey = std::exp(-(((y - center) * (y - center)) / two_s_squared));
+        for (int x = 0; x < side; x++)
+        {
+            float const ex = std::exp(-(((x - center) * (x - center)) / two_s_squared));
+            values[y * side + x] = a * (ex * ey) + b;
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D )
+{
+    /*
+        Runs the GAUSS_2D model with the LSE estimator twice on the same
+        noise-free 5 x 5 peak: first without weights, then with all weights
+        set to one. Only the status returned by gpufit is checked; the
+        fitted parameters, states, chi-square and iteration count are not.
+    */
+
+    std::size_t const n_fits{ 1 };
+    std::size_t const n_points{ 25 };
+
+    // synthetic input
+    std::array< float, n_points > image{};
+    generate_gauss_2d(image);
+
+    // trivial weights, used by the second run only
+    std::array< float, n_points > unit_weights{};
+    unit_weights.fill(1.f);
+
+    // fit configuration: five model parameters, all of them flagged for fitting
+    std::array< float, 5 > start_parameters{ { 2.f, 1.8f, 2.2f, 0.4f, 0.f } };
+    std::array< int, 5 > free_parameters{ { 1, 1, 1, 1, 1 } };
+    float const tolerance{ 0.001f };
+    int const max_n_iterations{ 10 };
+
+    // output arguments; both runs are handed the same ones
+    std::array< float, 5 > fitted_parameters;
+    int fit_state;
+    float chi_square;
+    int iterations_done;
+
+    // The two runs differ only in the weights pointer, so they share one call.
+    auto const run_fit = [ & ]( float * const weights_or_null ) -> int
+    {
+        return gpufit
+        (
+            n_fits,
+            n_points,
+            image.data(),
+            weights_or_null,
+            GAUSS_2D,
+            start_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            free_parameters.data(),
+            LSE,
+            0,                          // size of user_info
+            0,                          // no user_info
+            fitted_parameters.data(),
+            &fit_state,
+            &chi_square,
+            &iterations_done
+        ) ;
+    } ;
+
+    // unweighted run first, then the weighted run
+    int const status = run_fit( nullptr );
+    BOOST_CHECK( status == 0 ) ;
+
+    int const status_with_weights = run_fit( unit_weights.data() );
+    BOOST_CHECK( status_with_weights == 0 ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp
new file mode 100644
index 0000000..072169c
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp
@@ -0,0 +1,74 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include
+
+#include
+
+template< std::size_t SIZE >
+void generate_gauss_2d_elliptic(std::array< float, SIZE>& values)
+{
+    // Fills `values`, read as a row-major square image of side floor(sqrt(SIZE)),
+    // with a noise-free, axis-aligned elliptic 2D Gaussian centred on the image:
+    //     values[y * side + x] = a * exp(-(x - c)^2 / (2 sx^2)) * exp(-(y - c)^2 / (2 sy^2)) + b
+    // Entries past side * side (non-square SIZE) are left untouched.
+    int const side = int(std::sqrt(SIZE));
+
+    float const a = 4.f;                             // amplitude
+    float const center = (float(side) - 1.f) / 2.f;  // c, shared by x and y
+    float const sx = 0.4f;                           // width along x
+    float const sy = 0.6f;                           // width along y
+    float const b = 1.f;                             // offset
+
+    for (int y = 0; y < side; y++)
+    {
+        float const ey = std::exp(-(((y - center) * (y - center)) / (2.f * sy * sy)));
+        for (int x = 0; x < side; x++)
+        {
+            float const ex = std::exp(-(((x - center) * (x - center)) / (2.f * sx * sx)));
+            values[y * side + x] = a * (ex * ey) + b;
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Elliptic )
+{
+    // Single weighted LSE fit of GAUSS_2D_ELLIPTIC to a noise-free 5 x 5 peak;
+    // only the status returned by gpufit is checked.
+
+    std::size_t const n_fits{ 1 };
+    std::size_t const n_points{ 25 };
+
+    std::array< float, n_points > image{};
+    generate_gauss_2d_elliptic(image);
+
+    std::array< float, n_points > unit_weights{};
+    unit_weights.fill(1.f);
+
+    // six model parameters, all of them flagged for fitting
+    std::array< float, 6 > start_parameters{ { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f } };
+    std::array< int, 6 > free_parameters{ { 1, 1, 1, 1, 1, 1 } };
+    float const tolerance{ 0.001f };
+    int const max_n_iterations{ 10 };
+
+    // output arguments for gpufit; not inspected here
+    std::array< float, 6 > fitted_parameters;
+    int fit_state;
+    float chi_square;
+    int iterations_done;
+
+    int const status = gpufit(
+        n_fits, n_points,
+        image.data(), unit_weights.data(),
+        GAUSS_2D_ELLIPTIC,
+        start_parameters.data(),
+        tolerance, max_n_iterations,
+        free_parameters.data(),
+        LSE,
+        0, 0,                           // size of user_info, no user_info
+        fitted_parameters.data(),
+        &fit_state, &chi_square, &iterations_done );
+
+    BOOST_CHECK( status == 0 ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp
new file mode 100644
index 0000000..55cd682
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp
@@ -0,0 +1,77 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#define PI 3.1415926535897f
+
+#include "Gpufit/gpufit.h"
+
+#include
+
+#include
+
+template< std::size_t SIZE >
+void generate_gauss_2d_rotated(std::array< float, SIZE>& values)
+{
+    // Fills `values`, read as a row-major square image of side floor(sqrt(SIZE)),
+    // with a noise-free elliptic 2D Gaussian centred on the image whose axes are
+    // rotated by r. Entries past side * side (non-square SIZE) are left untouched.
+    int const side = int(std::sqrt(SIZE));
+    float const a = 10.f;                            // amplitude
+    float const center = (float(side) - 1.f) / 2.f;  // shared by x and y
+    float const sx = 0.4f;                           // width along the first rotated axis
+    float const sy = 0.5f;                           // width along the second rotated axis
+    float const b = 1.f;                             // offset
+    float const r = PI / 16.f;                       // rotation angle in radians
+    float const cos_r = cosf(r), sin_r = sinf(r);    // loop-invariant, computed once
+
+    for (int y = 0; y < side; y++)
+    {
+        for (int x = 0; x < side; x++)
+        {
+            float const u = ((x - center) * cos_r) - ((y - center) * sin_r);
+            float const v = ((x - center) * sin_r) + ((y - center) * cos_r);
+            float const ex = std::exp((-0.5f) * (((u / sx) * (u / sx)) + ((v / sy) * (v / sy))));
+            values[y * side + x] = a * ex + b;
+        }
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Rotated )
+{
+    // Single weighted LSE fit of GAUSS_2D_ROTATED to a noise-free 8 x 8 peak;
+    // only the status returned by gpufit is checked.
+
+    std::size_t const n_fits{ 1 };
+    std::size_t const n_points{ 64 };
+
+    std::array< float, n_points > image{};
+    generate_gauss_2d_rotated(image);
+
+    std::array< float, n_points > unit_weights{};
+    unit_weights.fill(1.f);
+
+    // seven model parameters, all of them flagged for fitting
+    std::array< float, 7 > start_parameters{ { 8.f, 3.4f, 3.6f, 0.4f, 0.5f, 2.f, 0.f } };
+    std::array< int, 7 > free_parameters{ { 1, 1, 1, 1, 1, 1, 1 } };
+    float const tolerance{ 0.001f };
+    int const max_n_iterations{ 10 };
+
+    // output arguments for gpufit; not inspected here
+    std::array< float, 7 > fitted_parameters;
+    int fit_state;
+    float chi_square;
+    int iterations_done;
+
+    int const status = gpufit(
+        n_fits, n_points,
+        image.data(), unit_weights.data(),
+        GAUSS_2D_ROTATED,
+        start_parameters.data(),
+        tolerance, max_n_iterations,
+        free_parameters.data(),
+        LSE,
+        0, 0,                           // size of user_info, no user_info
+        fitted_parameters.data(),
+        &fit_state, &chi_square, &iterations_done );
+
+    BOOST_CHECK( status == 0 ) ;
+}
diff --git a/Gpufit/tests/Linear_Fit_1D.cpp b/Gpufit/tests/Linear_Fit_1D.cpp
new file mode 100644
index 0000000..abd7c81
--- /dev/null
+++ b/Gpufit/tests/Linear_Fit_1D.cpp
@@ -0,0 +1,101 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include
+
+#include
+
+BOOST_AUTO_TEST_CASE( Linear_Fit_1D )
+{
+    /*
+        Performs a single fit using the Linear Fit (LINEAR_1D) model, once with LSE and once with MLE.
+        - Uses user info (n_points floats handed over as raw bytes; presumably the x positions - TODO confirm).
+        - Uses trivial weights.
+        - No noise is added.
+        - Checks fitted parameters equalling the true parameters.
+    */
+
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 2 } ;
+
+    std::array< float, 2 > const true_parameters{ { 1, 1 } };
+
+    std::array< float, n_points > data{ { 1, 2 } } ;
+    std::array< float, n_points > weights{ { 1, 1 } } ;
+    std::array< float, 2 > initial_parameters{ { 1, 0 } } ;
+    float tolerance{ 0.001f } ;
+    int max_n_iterations{ 10 } ;
+    std::array< int, 2 > parameters_to_fit{ { 1, 1 } } ;
+    std::array< float, n_points > user_info{ { 0.f, 1.f } } ;
+
+    // Outputs start from values that fail the checks below, so a call that leaves
+    // them unwritten is reported instead of being read uninitialised.
+    std::array< float, 2 > output_parameters{ { 0.f, 0.f } } ;
+    int output_states{ -1 } ;
+    float output_chi_squares{ 1.f } ;
+    int output_n_iterations{ max_n_iterations + 1 } ;
+
+    // test with LSE
+    int status = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            LINEAR_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            LSE,
+            n_points * sizeof( float ),
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            & output_states,
+            & output_chi_squares,
+            & output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == 0 ) ;
+    BOOST_CHECK( output_states == 0 );
+    BOOST_CHECK( output_n_iterations <= max_n_iterations );
+    BOOST_CHECK( output_chi_squares < 1e-6f );
+
+    BOOST_CHECK(std::fabs(output_parameters[0] - true_parameters[0]) < 1e-6f);
+    BOOST_CHECK(std::fabs(output_parameters[1] - true_parameters[1]) < 1e-6f);
+
+    // test with MLE (outputs are re-armed first so stale LSE results cannot satisfy the checks)
+    output_parameters.fill( 0.f ) ;
+    output_states = -1 ;
+    output_chi_squares = 1.f ;
+    output_n_iterations = max_n_iterations + 1 ;
+    status = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            LINEAR_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            MLE,
+            n_points * sizeof(float),
+            reinterpret_cast< char * >(user_info.data()),
+            output_parameters.data(),
+            &output_states,
+            &output_chi_squares,
+            &output_n_iterations
+        );
+
+    BOOST_CHECK(status == 0);
+    BOOST_CHECK(output_states == 0);
+    BOOST_CHECK(output_n_iterations <= max_n_iterations);
+    BOOST_CHECK(output_chi_squares < 1e-6f);
+
+    BOOST_CHECK(std::fabs(output_parameters[0] - true_parameters[0]) < 1e-6f);
+    BOOST_CHECK(std::fabs(output_parameters[1] - true_parameters[1]) < 1e-4f);
+
+}
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..6fe98c3
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..498877e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# Gpufit
+
+Levenberg Marquardt curve fitting in CUDA.
+
+Homepage: [github.com/gpufit/Gpufit](https://github.com/gpufit/Gpufit)
+
+## Quick start instructions
+
+To verify that Gpufit is working correctly on the host computer, go to the folder gpufit_performance_test of the binary package and run Gpufit_Cpufit_Performance_Comparison.exe. Further details of the test executable can be found in the documentation package.
+
+## Binary distribution
+
+The latest Gpufit binary release, supporting Windows 32-bit and 64-bit machines, can be found on the [release page](https://github.com/gpufit/Gpufit/releases).
+
+## Documentation
+
+[![Documentation Status](https://readthedocs.org/projects/gpufit/badge/?version=latest)](http://gpufit.readthedocs.io/en/latest/?badge=latest)
+
+Documentation for the Gpufit library may be found online ([latest documentation](http://gpufit.readthedocs.io/en/latest/?badge=latest)), and also
+as a PDF file in the binary distribution of Gpufit.
+
+## Building Gpufit from source code
+
+Instructions for building Gpufit are found in the documentation: [Building from source code](https://github.com/gpufit/Gpufit/blob/master/docs/installation.rst).
+
+## Using the Gpufit binary distribution
+
+Instructions for using the binary distribution may be found in the documentation. The binary package contains:
+
+- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and
+ the Gpufit header file which contains the function definitions. The Gpufit
+ SDK is intended to be used when calling Gpufit from an external application
+ written in e.g. C code.
+- Gpufit Performance test: A simple console application comparing the execution speed of curve fitting on the GPU and CPU. This program also serves as a test to ensure the correct functioning of Gpufit.
+- Matlab 32 bit and 64 bit bindings, with Matlab examples.
+- Python version 2.x and version 3.x bindings (compiled as wheel files) and
+ Python examples.
+- The Gpufit manual in PDF format
+
+## License
+
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/_static/style.css b/docs/_static/style.css
new file mode 100644
index 0000000..6c92e05
--- /dev/null
+++ b/docs/_static/style.css
@@ -0,0 +1,15 @@
+.wy-nav-content { /* caps the width of elements with class wy-nav-content at 1100px; !important makes it win over non-important rules */
+ max-width: 1100px !important;
+}
+/* Screens up to 767px wide: table cells inside .wy-table-responsive are kept on one line. */
+@media screen and (max-width: 767px) {
+ .wy-table-responsive table td {
+ white-space: nowrap;
+ }
+}
+/* Screens 768px and wider: the same table cells wrap normally. */
+@media screen and (min-width: 768px) {
+ .wy-table-responsive table td {
+ white-space: normal;
+ }
+}
\ No newline at end of file
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 0000000..b0a4480
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,4 @@
+{% extends "!layout.html" %}
+{% block extrahead %}
+
+{% endblock %}
\ No newline at end of file
diff --git a/docs/appendix.rst b/docs/appendix.rst
new file mode 100644
index 0000000..103df3e
--- /dev/null
+++ b/docs/appendix.rst
@@ -0,0 +1,31 @@
+========
+Appendix
+========
+
+Levenberg-Marquardt algorithm
+-----------------------------
+
+A flowchart of the implementation of the Levenberg-Marquardt algorithm is given in :numref:`appendix-gpufit-flowchart`.
+
+.. _appendix-gpufit-flowchart:
+
+.. figure:: /images/gpufit_program_flow_skeleton_v2.png
+ :width: 14 cm
+ :align: center
+
+ Levenberg-Marquardt algorithm flow as implemented in |GF|.
+
+
+Performance comparison to other GPU benchmarks
+----------------------------------------------
+
+Using the bundled application to estimate the fitting speed per second of 10 million fits for various CUDA capable
+graphics cards of various architectures (on different computers with different versions of graphics drivers) we can
+compare to the results of Passmark G3D. By and large, the results seem to correlate, i.e. a high Passmark G3D score
+also relates to a high Gpufit fitting speed.
+
+.. figure:: /images/Gpufit_PassmarkG3D_relative_performance.png
+ :width: 14 cm
+ :align: center
+
+ Performance of Gpufit vs Passmark G3D
\ No newline at end of file
diff --git a/docs/bindings.rst b/docs/bindings.rst
new file mode 100644
index 0000000..ff3d914
--- /dev/null
+++ b/docs/bindings.rst
@@ -0,0 +1,413 @@
+.. _external-bindings:
+
+=================
+External bindings
+=================
+
+This section describes the Gpufit bindings to other programming languages. The bindings (e.g. to Python or Matlab) aim to
+emulate the :ref:`c-interface` as closely as possible.
+
+Most high level languages feature multidimensional numerical arrays. In the bindings implemented for Matlab and Python,
+we adopt the convention that the input data should be organized as a 2D array, with one dimension corresponding to the
+number of data points per fit, and the other corresponding to the number of fits. Internally, in memory, these arrays should
+always be ordered such that the data values for each fit are kept together. In Matlab, for example, this means storing the
+data in an array with dimensions [number_points_per_fit, number_fits]. In this manner, the data in memory is ordered in the
+same way that is expected by the Gpufit C interface, and there is no need to copy or otherwise re-organize the data
+before passing it to the GPU. The same convention is used for the weights, the initial model parameters, and the output parameters.
+
+Unlike the C interface, the external bindings do not require the number of fits and the number of data points per fit to be
+specified explicitly. Instead, these numbers are inferred from the dimensions of the 2D input arrays.
+
+Optional parameters with default values
+---------------------------------------
+
+The external bindings make some input parameters optional. The optional parameters are shown here.
+
+:tolerance:
+ default value 1e-4
+:max_n_iterations:
+ default value 25 iterations
+:estimator_id:
+ the default estimator is LSE as defined in gpufit.h_
+:parameters_to_fit:
+ by default all parameters are fit
+
+For instructions on how to specify these parameters explicitly, see the sections below.
+
+Python
+------
+
+The Gpufit binding for Python is a project named pyGpufit. This project contains a Python package named pygpufit, which
+contains a module gpufit, and this module implements a method called fit. Calling this method is equivalent to
+calling the C interface function *gpufit()* of |GF|. The package expects the input data to be
+stored as NumPy array. NumPy follows row-major order by default.
+
+Installation
+++++++++++++
+
+Wheel files for Python 2.X and 3.X on Windows 32/64 bit are included in the binary package. NumPy is required.
+
+Install the wheel file with:
+
+.. code-block:: bash
+
+ pip install --no-index --find-links=LocalPathToWheelFile pyGpufit
+
+Python Interface
+++++++++++++++++
+
+Optional parameters are passed in as None. The numbers of points, fits and parameters are deduced from the dimensions of
+the input data and initial parameters arrays.
+
+The signature of the fit method is
+
+.. code-block:: python
+
+ def fit(data, weights, model_id:ModelID, initial_parameters, tolerance:float=None, max_number_iterations:int=None, parameters_to_fit=None, estimator_id:EstimatorID=None, user_info=None):
+
+*Input parameters*
+
+:data: Data
+ 2D NumPy array of shape (number_fits, number_points) and data type np.float32
+:weights: Weights
+ 2D NumPy array of shape (number_fits, number_points) and data type np.float32 (same as data)
+
+ :special: None indicates that no weights are available
+:tolerance: Fit tolerance
+
+ :type: float
+ :special: If None, the default value will be used.
+:max_number_iterations: Maximal number of iterations
+
+ :type: int
+ :special: If None, the default value will be used.
+:estimator_id: estimator ID
+
+ :type: EstimatorID which is an Enum in the same module and defined analogously to gpufit.h_.
+ :special: If None, the default value is used.
+:model_id: model ID
+
+ :type: ModelID which is an Enum in the same module and defined analogously to gpufit.h_.
+:initial_parameters: Initial parameters
+ 2D NumPy array of shape (number_fits, number_parameter)
+
+ :array data type: np.float32
+:parameters_to_fit: parameters to fit
+ 1D NumPy array of length number_parameter
+ A zero indicates that this parameter should not be fitted, everything else means it should be fitted.
+
+ :array data type: np.int32
+ :special: If None, the default value is used.
+:user_info: user info
+ 1D NumPy array of arbitrary type. The length in bytes is deduced automatically.
+
+ :special: If None, no user_info is assumed.
+
+*Output parameters*
+
+:parameters: Fitted parameters for each fit
+ 2D NumPy array of shape (number_fits, number_parameter) and data type np.float32
+:states: Fit result states for each fit
+ 1D NumPy array of length number_fits of data type np.int32
+ The state values are as defined in gpufit.h_.
+:chi_squares: :math:`\chi^2` values for each fit
+ 1D NumPy array of length number_fits of data type np.float32
+:n_iterations: Number of iterations done for each fit
+ 1D NumPy array of length number_fits of data type np.int32
+:time: Execution time of call to fit
+ In seconds.
+
+Errors are raised if checks on parameters fail or if the execution of fit failed.
+
+Python Examples
++++++++++++++++
+
+2D Gaussian peak example
+........................
+
+An example can be found at `Python Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`.
+
+The essential imports are:
+
+.. code-block:: python
+
+ import numpy as np
+ import pygpufit.gpufit as gf
+
+The true parameters describing an example 2D Gaussian peak function are:
+
+.. code-block:: python
+
+ # true parameters
+ true_parameters = np.array((10, 5.5, 5.5, 3, 10), dtype=np.float32)
+
+A 2D grid of x and y positions can conveniently be generated using the np.meshgrid function:
+
+.. code-block:: python
+
+ # generate x and y values
+ g = np.arange(size_x)
+ yi, xi = np.meshgrid(g, g, indexing='ij')
+ xi = xi.astype(np.float32)
+ yi = yi.astype(np.float32)
+
+Using these positions and the true parameter values a model function can be calculated as
+
+.. code-block:: python
+
+ def generate_gauss_2d(p, xi, yi):
+ """
+ Generates a 2D Gaussian peak.
+ http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+
+ :param p: Parameters (amplitude, x,y center position, width, offset)
+ :param xi: x positions
+ :param yi: y positions
+ :return: The Gaussian 2D peak.
+ """
+
+ arg = -(np.square(xi - p[1]) + np.square(yi - p[2])) / (2*p[3]*p[3])
+ y = p[0] * np.exp(arg) + p[4]
+
+ return y
+
+The model function can be repeated and noise can be added using the np.tile and np.random.poisson functions.
+
+.. code-block:: python
+
+ # generate data
+ data = generate_gauss_2d(true_parameters, xi, yi)
+ data = np.reshape(data, (1, number_points))
+ data = np.tile(data, (number_fits, 1))
+
+ # add Poisson noise
+ data = np.random.poisson(data)
+ data = data.astype(np.float32)
+
+The model and estimator IDs can be set as
+
+.. code-block:: python
+
+ # estimator ID
+ estimator_id = gf.EstimatorID.MLE
+
+ # model ID
+ model_id = gf.ModelID.GAUSS_2D
+
+When all input parameters are set we can call the C interface of Gpufit.
+
+.. code-block:: python
+
+ # run Gpufit
+ parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations, None, estimator_id, None)
+
+And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the
+fitted parameters are limited to those fits that converged.
+
+.. code-block:: python
+
+ # print fit results
+
+ # get fit states
+ converged = states == 0
+ number_converged = np.sum(converged)
+ print('ratio converged {:6.2f} %'.format(number_converged / number_fits * 100))
+ print('ratio max it. exceeded {:6.2f} %'.format(np.sum(states == 1) / number_fits * 100))
+ print('ratio singular hessian {:6.2f} %'.format(np.sum(states == 2) / number_fits * 100))
+ print('ratio neg curvature MLE {:6.2f} %'.format(np.sum(states == 3) / number_fits * 100))
+ print('ratio gpu not read {:6.2f} %'.format(np.sum(states == 4) / number_fits * 100))
+
+ # mean, std of fitted parameters
+ converged_parameters = parameters[converged, :]
+ converged_parameters_mean = np.mean(converged_parameters, axis=0)
+ converged_parameters_std = np.std(converged_parameters, axis=0)
+
+ for i in range(number_parameters):
+ print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i]))
+
+ # print summary
+ print('model ID: {}'.format(model_id))
+ print('number of fits: {}'.format(number_fits))
+ print('fit size: {} x {}'.format(size_x, size_x))
+ print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged])))
+ print('iterations: {:.2f}'.format(np.mean(number_iterations[converged])))
+ print('time: {:.2f} s'.format(execution_time))
+
+
+Matlab
+------
+
+The Matlab binding for Gpufit is a Matlab script (gpufit.m_). This script checks the input data, sets default parameters, and
+calls the C interface of |GF|, via a compiled .mex file.
+
+Please note, that before using the Matlab binding, the path to gpufit.m_ must be added to the Matlab path.
+
+If other GPU-based computations are to be performed with Matlab in the same session, please use the Matlab GPU computing
+functionality first (for example with a call to gpuDevice or gpuArray) before calling the Gpufit Matlab binding. If this is not
+done, Matlab will throw an error (Error using gpuArray An unexpected error occurred during CUDA execution.
+The CUDA error was: cannot set while device is active in this process).
+
+Matlab Interface
+++++++++++++++++
+
+Optional parameters are passed in as empty matrices (``[]``). The numbers of points, fits and parameters are deduced from the dimensions of
+the input data and initial parameters matrices.
+
+The signature of the gpufit function is
+
+.. code-block:: matlab
+
+ function [parameters, states, chi_squares, n_iterations, time] = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+
+*Input parameters*
+
+:data: Data
+ 2D matrix of size [number_points, number_fits] and data type single
+:weights: Weights
+ 2D matrix of size [number_points, number_fits] and data type single (same as data)
+
+ :special: An empty matrix ([]) indicates that no weights are available
+:tolerance: Fit tolerance
+
+ :type: single
+ :special: If empty ([]), the default value will be used.
+:max_n_iterations: Maximal number of iterations
+ Will be converted to int32 if necessary
+
+ :special: If empty ([]), the default value will be used.
+:estimator_id: estimator ID
+
+ :type: EstimatorID which is defined in EstimatorID.m analogously to gpufit.h_.
+ :special: If empty ([]), the default value is used.
+:model_id: model ID
+
+ :type: ModelID which is defined in ModelID.m analogously to gpufit.h_.
+:initial_parameters: Initial parameters
+ 2D matrix of size: [number_parameter, number_fits]
+
+ :type: single
+:parameters_to_fit: parameters to fit
+ vector of length number_parameter, will be converted to int32 if necessary
+ A zero indicates that this parameter should not be fitted, everything else means it should be fitted.
+
+ :special: If empty ([]), the default value is used.
+:user_info: user info
+ vector of arbitrary type. The length in bytes is deduced automatically.
+
+*Output parameters*
+
+:parameters: Fitted parameters for each fit
+ 2D matrix of size: [number_parameter, number_fits] of data type single
+:states: Fit result states for each fit
+ vector of length number_fits of data type int32
+ The state values are as defined in gpufit.h_.
+:chi_squares: :math:`\chi^2` values for each fit
+ vector of length number_fits of data type single
+:n_iterations: Number of iterations done for each fit
+ vector of length number_fits of data type int32
+:time: Execution time of call to gpufit
+ In seconds.
+
+Errors are raised if checks on parameters fail or if the execution of gpufit fails.
+
+Matlab Examples
++++++++++++++++
+
+Simple example
+..............
+
+The simplest example is the `Matlab simple example`_. It is equivalent to :ref:`c-example-simple` and additionally
+relies on default values for optional arguments.
+
+2D Gaussian peak example
+........................
+
+An example can be found at `Matlab Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`.
+
+The true parameters describing an example 2D Gaussian peak function are:
+
+.. code-block:: matlab
+
+ % true parameters
+ true_parameters = single([10, 5.5, 5.5, 3, 10]);
+
+A 2D grid of x and y positions can conveniently be generated using the ndgrid function:
+
+.. code-block:: matlab
+
+ % generate x and y values
+ g = single(0 : size_x - 1);
+ [x, y] = ndgrid(g, g);
+
+Using these positions and the true parameter values a model function can be calculated as
+
+.. code-block:: matlab
+
+ function g = gaussian_2d(x, y, p)
+ % Generates a 2D Gaussian peak.
+ % http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+ %
+ % x,y - x and y grid position values
+ % p - parameters (amplitude, x,y center position, width, offset)
+
+ g = p(1) * exp(-((x - p(2)).^2 + (y - p(3)).^2) / (2 * p(4)^2)) + p(5);
+
+ end
+
+The model function can be repeated and noise can be added using the repmat and poissrnd functions.
+
+.. code-block:: matlab
+
+ % generate data with Poisson noise
+ data = gaussian_2d(x, y, true_parameters);
+ data = repmat(data(:), [1, number_fits]);
+ data = poissrnd(data);
+
+The model and estimator IDs can be set as
+
+.. code-block:: matlab
+
+ % estimator id
+ estimator_id = EstimatorID.MLE;
+
+ % model ID
+ model_id = ModelID.GAUSS_2D;
+
+When all input parameters are set we can call the C interface of |GF|.
+
+.. code-block:: matlab
+
+ %% run Gpufit
+ [parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the
+fitted parameters are limited to those fits that converged.
+
+.. code-block:: matlab
+
+ %% displaying results
+
+ % get fit states
+ converged = states == 0;
+ number_converged = sum(converged);
+ fprintf(' ratio converged %6.2f %%\n', number_converged / number_fits * 100);
+ fprintf(' ratio max it. exceeded %6.2f %%\n', sum(states == 1) / number_fits * 100);
+ fprintf(' ratio singular hessian %6.2f %%\n', sum(states == 2) / number_fits * 100);
+ fprintf(' ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100);
+ fprintf(' ratio gpu not read %6.2f %%\n', sum(states == 4) / number_fits * 100);
+
+ % mean and std of fitted parameters
+ converged_parameters = parameters(:, converged);
+ converged_parameters_mean = mean(converged_parameters, 2);
+ converged_parameters_std = std(converged_parameters, [], 2);
+ for i = 1 : number_parameters
+ fprintf(' p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i));
+ end
+
+ % print summary
+ fprintf('model ID: %d\n', model_id);
+ fprintf('number of fits: %d\n', number_fits);
+ fprintf('fit size: %d x %d\n', size_x, size_x);
+ fprintf('mean chi-square: %6.2f\n', mean(chi_squares(converged)));
+ fprintf('iterations: %6.2f\n', mean(n_iterations(converged)));
+ fprintf('time: %6.2f s\n', time);
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..fe55fe3
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,457 @@
+# -*- coding: utf-8 -*-
+import sphinx_rtd_theme
+#
+# Gpufit documentation build configuration file, created by
+# sphinx-quickstart on Tue Oct 04 12:39:10 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.4'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.mathjax',
+ 'sphinx.ext.todo'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Gpufit: An open-source toolkit for GPU-accelerated curve fitting'
+copyright = 'All rights reserved.'
+author = 'Adrian Przybylski, Björn Thiel, Jan Keller-Findeisen, Bernd Stock, and Mark Bates'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'1.0'
+# The full version, including alpha/beta/rc tags.
+release = u'1.0.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# read epilog.txt
+with open('epilog.txt') as f:
+ rst_epilog = f.read()
+
+# default highlight language is cpp
+highlight_language = 'cpp'
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+numfig = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+#html_theme_options = {
+# 'collapse_navigation': False,
+# 'display_version': False,
+# 'navigation_depth': 3,
+#}
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = u'RTD Spielwiese v1'
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Gpufit'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+
+# make code smaller in latex output
+# see also: http://stackoverflow.com/questions/9899283/how-do-you-change-the-code-example-font-size-in-latex-pdf-output-with-sphinx
+from sphinx.highlighting import PygmentsBridge
+from pygments.formatters.latex import LatexFormatter
+
+class CustomLatexFormatter(LatexFormatter):  # LatexFormatter subclass used to shrink code listings in the LaTeX output
+ def __init__(self, **options):  # accepts and forwards the same keyword options as the base formatter
+ super(CustomLatexFormatter, self).__init__(**options)
+ self.verboptions = r"formatcom=\footnotesize"  # set after the base initializer, so it replaces whatever verboptions the base stored
+
+PygmentsBridge.latex_formatter = CustomLatexFormatter
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ 'papersize': 'a4paper,oneside',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'Gpufit.tex', 'Gpufit Documentation',
+ 'Gpufit', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+latex_show_pagerefs = True
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = 'footnote'
+latex_show_urls = 'no'
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'gpufit', 'Gpufit Documentation',
+ [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'Gpufit', 'Gpufit Documentation',
+ author, 'Gpufit', 'Levenberg Marquardt curve fitting in CUDA',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
+
+
+# -- Options for Epub output ----------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+epub_author = author
+epub_publisher = author
+epub_copyright = copyright
+
+# The basename for the epub file. It defaults to the project name.
+# epub_basename = project
+
+# The HTML theme for the epub output. Since the default themes are not
+# optimized for small screen space, using the same theme for HTML and epub
+# output is usually not wise. This defaults to 'epub', a theme designed to save
+# visual space.
+#
+# epub_theme = 'epub'
+
+# The language of the text. It defaults to the language option
+# or 'en' if the language is not set.
+#
+# epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+# epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A tuple containing the cover image and cover page html template filenames.
+#
+# epub_cover = ()
+
+# A sequence of (type, uri, title) tuples for the guide element of content.opf.
+#
+# epub_guide = ()
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# The depth of the table of contents in toc.ncx.
+#
+# epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#
+# epub_tocdup = True
+
+# Choose between 'default' and 'includehidden'.
+#
+# epub_tocscope = 'default'
+
+# Fix unsupported image types using the Pillow.
+#
+# epub_fix_images = False
+
+# Scale large images.
+#
+# epub_max_image_width = 0
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# epub_show_urls = 'inline'
+
+# If false, no index is generated.
+#
+# epub_use_index = True
diff --git a/docs/customization.rst b/docs/customization.rst
new file mode 100644
index 0000000..4fcfec5
--- /dev/null
+++ b/docs/customization.rst
@@ -0,0 +1,299 @@
+.. _gpufit-customization:
+
+=============
+Customization
+=============
+
+This section explains how to add custom fit model functions and custom fit estimators within |GF|.
+Functions calculating the estimator and model values are defined in CUDA header files using the CUDA C syntax.
+For each function and estimator there exists a separate file. Therefore, to add an additional model or estimator a new
+CUDA header file containing the new model or estimator function must be created and included in the library.
+
+Please note, that in order to add a model function or estimator, it is necessary to rebuild the Gpufit library
+from source. In future releases of Gpufit, it may be possible to include new fit functions or estimators at runtime.
+
+
+Add a new fit model function
+----------------------------
+
+To add a new fit model, the model function itself as well as analytic expressions for its partial derivatives
+must be known. A function calculating the values of the model as well as a function calculating the
+values of the partial derivatives of the model, with respect to the model parameters and possible grid
+coordinates, must be implemented.
+
+Additionally, a new model ID must be defined and included in the list of available model IDs, and the number
+of model parameters must be specified as well.
+
+Detailed step by step instructions for adding a model function are given below.
+
+1. Define an additional model ID in file gpufit.h_
+2. Implement a CUDA device function within a newly created .cuh file according to the following template.
+
+.. code-block:: cuda
+
+ __device__ void ... ( // function name
+ float const * parameters,
+ int const n_fits,
+ int const n_points,
+ int const n_parameters,
+ float * values,
+ float * derivatives,
+ int const chunk_index,
+ char * user_info,
+ std::size_t const user_info_size)
+ {
+ ///////////////////////////// indices /////////////////////////////
+ int const n_fits_per_block = blockDim.x / n_points;
+ int const fit_in_block = threadIdx.x / n_points;
+ int const point_index = threadIdx.x - (fit_in_block*n_points);
+ int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block;
+
+ ///////////////////////////// values //////////////////////////////
+ float* current_value = &values[fit_index*n_points];
+ float const * current_parameters = &parameters[fit_index*n_parameters];
+
+ current_value[point_index] = ... ; // formula calculating fit model values
+
+ /////////////////////////// derivatives ///////////////////////////
+ float * current_derivative = &derivatives[fit_index * n_points*n_parameters];
+
+ current_derivative[0 * n_points + point_index] = ... ; // formula calculating derivative values with respect to parameters[0]
+ current_derivative[1 * n_points + point_index] = ... ; // formula calculating derivative values with respect to parameters[1]
+ .
+ .
+ .
+ }
+
+This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates model
+function values and partial derivative values of the model function for a particular set of parameters. See for example linear_1d.cuh_.
+
+3. Include the newly created .cuh file in cuda_kernels.cu_
+4. Add an if branch in the CUDA global function ``cuda_calc_curve()`` in file cuda_kernels.cu_ to allow calling the added model function
+
+.. code-block:: cpp
+
+ if (model_id == GAUSS_1D)
+ calculate_gauss1d
+ (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+ .
+ .
+ .
+ else if (model_id == ...) // model ID
+ ... // function name
+ (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+
+Compare model_id with the model ID defined for the new model and call the function that calculates the values of your model.
+
+5. Add a switch case in function set_number_of_parameters in file interface.cpp_
+
+.. code-block:: cpp
+
+ switch (model_id)
+ {
+ case GAUSS_1D:
+ n_parameters_ = 4;
+ break;
+ .
+ .
+ .
+ case ... : // model ID
+ n_parameters_ = ... ; // number of model parameters
+ break;
+ default:
+ break;
+ }
+
+Add a new fit estimator
+------------------------
+
+To extend |GF| by additional estimators, three CUDA device functions must be defined and integrated. The sections requiring modification are
+the functions which calculate the estimator function values, and its gradient and hessian values. Also, a new estimator ID must be defined.
+Detailed step by step instructions for adding an additional estimator are given below.
+
+1. Define an additional estimator ID in gpufit.h_
+2. Implement three functions within a newly created .cuh file calculating :math:`\chi^2` values and
+ its gradient and hessian according to the following template.
+
+.. code-block:: cuda
+
+ ///////////////////////////// Chi-square /////////////////////////////
+ __device__ void ... ( // function name Chi-square
+ volatile float * chi_square,
+ int const point_index,
+ float const * data,
+ float const * value,
+ float const * weight,
+ int * state,
+ char * user_info,
+ std::size_t const user_info_size)
+ {
+ chi_square[point_index] = ... ; // formula calculating Chi-square summands
+ }
+
+ ////////////////////////////// gradient //////////////////////////////
+ __device__ void ... ( // function name gradient
+ volatile float * gradient,
+ int const point_index,
+ int const parameter_index,
+ float const * data,
+ float const * value,
+ float const * derivative,
+ float const * weight,
+ char * user_info,
+ std::size_t const user_info_size)
+ {
+ gradient[point_index] = ... ; // formula calculating summands of the gradient of Chi-square
+ }
+
+ ////////////////////////////// hessian ///////////////////////////////
+ __device__ void ... ( // function name hessian
+ double * hessian,
+ int const point_index,
+ int const parameter_index_i,
+ int const parameter_index_j,
+ float const * data,
+ float const * value,
+ float const * derivative,
+ float const * weight,
+ char * user_info,
+ std::size_t const user_info_size)
+ {
+ *hessian += ... ; // formula calculating summands of the hessian of Chi-square
+ }
+
+This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates the estimator
+values as well as the gradient and hessian values of the estimator. For a concrete example, see lse.cuh_.
+
+3. Include the newly created .cuh file in cuda_kernels.cu_
+
+.. code-block:: cpp
+
+ #include "....cuh" // filename
+
+4. Add an if branch in 3 CUDA global functions in the file cuda_kernels.cu_
+
+ .. code-block:: cuda
+
+ __global__ void cuda_calculate_chi_squares(
+ .
+ .
+ .
+ if (estimator_id == LSE)
+ {
+ calculate_chi_square_lse(
+ shared_chi_square,
+ point_index,
+ current_data,
+ current_value,
+ current_weight,
+ current_state,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+ else if (estimator_id == ...) // estimator ID
+ {
+ ...( // function name Chi-square
+ shared_chi_square,
+ point_index,
+ current_data,
+ current_value,
+ current_weight,
+ current_state,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+
+
+ .. code-block:: cuda
+
+ __global__ void cuda_calculate_gradients(
+ .
+ .
+ .
+ if (estimator_id == LSE)
+ {
+ calculate_gradient_lse(
+ shared_gradient,
+ point_index,
+ derivative_index,
+ current_data,
+ current_value,
+ current_derivative,
+ current_weight,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+ else if (estimator_id == ...) // estimator ID
+ {
+ ...( // function name gradient
+ shared_gradient,
+ point_index,
+ derivative_index,
+ current_data,
+ current_value,
+ current_derivative,
+ current_weight,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+
+ .. code-block:: cuda
+
+ __global__ void cuda_calculate_hessians(
+ .
+ .
+ .
+ if (estimator_id == LSE)
+ {
+ calculate_hessian_lse(
+ &sum,
+ point_index,
+ derivative_index_i + point_index,
+ derivative_index_j + point_index,
+ current_data,
+ current_value,
+ current_derivative,
+ current_weight,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+ else if (estimator_id == ...) // estimator ID
+ {
+ ...( // function name hessian
+ &sum,
+ point_index,
+ derivative_index_i + point_index,
+ derivative_index_j + point_index,
+ current_data,
+ current_value,
+ current_derivative,
+ current_weight,
+ user_info,
+ user_info_size);
+ }
+ .
+ .
+ .
+
+Future releases
+---------------
+
+A disadvantage of the Gpufit library, when compared with established CPU-based curve fitting packages,
+is that in order to add or modify a fit model function or a fit estimator, the library must be recompiled.
+We anticipate that this limitation can be overcome in future releases of the library, by employing
+run-time compilation of the CUDA code.
diff --git a/docs/epilog.txt b/docs/epilog.txt
new file mode 100644
index 0000000..ee243c1
--- /dev/null
+++ b/docs/epilog.txt
@@ -0,0 +1,48 @@
+
+..
+ The content of this file will be appended to every documentation file. Put common substitutions and links here.
+
+.. |GF| replace:: the Gpufit library
+.. |GF_version| replace:: 1.0.0
+
+.. _CUDA: http://developer.nvidia.com/cuda-zone
+.. _CUDA_SELECT_NVCC_ARCH_FLAGS: http://cmake.org/cmake/help/v3.7/module/FindCUDA.html
+
+.. _CMake: http://www.cmake.org
+.. _Boost: http://www.boost.org
+.. _MATLAB: http://www.mathworks.com/products/matlab.html
+.. _Python: http://www.python.org
+
+.. _`Gpufit on Github`: https://github.com/gpufit/Gpufit
+.. _`Gpufit release location`: https://github.com/gpufit/Gpufit/releases
+.. _Gpufit-master.zip: https://github.com/gpufit/Gpufit/archive/master.zip
+
+.. _gpufit.h: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gpufit.h
+.. _interface.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/interface.cpp
+
+.. _gauss_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_1d.cuh
+.. _gauss_2d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d.cuh
+.. _gauss_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_elliptic.cuh
+.. _gauss_2d_rotated.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_rotated.cuh
+.. _cauchy_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cauchy_2d_elliptic.cuh
+.. _linear_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/linear_1d.cuh
+.. _lse.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/lse.cuh
+.. _mle.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/mle.cuh
+.. _cuda_kernels.cu: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cuda_kernels.cu
+
+.. _Tests: https://github.com/gpufit/Gpufit/tree/master/Gpufit/tests
+.. _Examples: https://github.com/gpufit/Gpufit/tree/master/Gpufit/examples
+.. _Simple_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Simple_Example.cpp
+.. _Gauss_Fit_2D_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Gauss_Fit_2D_Example.cpp
+.. _Linear_Regression_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Linear_Regression_Example.cpp
+
+.. _GpufitMex.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/GpufitMex.cpp
+.. _gpufit.m: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/gpufit.m
+
+.. _`Matlab simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/simple.m
+.. _`Matlab Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d.m
+.. _`Matlab Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d_plot.m
+
+.. _`Python simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/simple.py
+.. _`Python Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d.py
+.. _`Python Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d_plot.py
\ No newline at end of file
diff --git a/docs/examples.rst b/docs/examples.rst
new file mode 100644
index 0000000..da54114
--- /dev/null
+++ b/docs/examples.rst
@@ -0,0 +1,394 @@
+========
+Examples
+========
+
+C++ Examples_ are part of the library code base and can be built and run through the project environment. Here they are
+described and important steps are highlighted.
+
+Please note, that additionally, the C++ Tests_ contained in the code base also demonstrate the usage of |GF|. However, a
+detailed description of the tests is not provided.
+
+.. _c-example-simple:
+
+Simple skeleton example
+-----------------------
+
+This example shows the minimal code providing all required parameters and the call to the C interface. It is contained
+in Simple_Example.cpp_ and can be built and executed within the project environment. Please note that this code does
+not do anything other than call gpufit().
+
+In the first section of the code, the model ID is set, space for initial parameters and data values is reserved (in a normal
+application, however, the data array would already exist), the fit tolerance is set, the maximal number of iterations is set,
+the estimator ID is set, and the parameters to fit array is initialized to indicate that all parameters should be fit.
+
+.. code-block:: cpp
+
+ // number of fits, number of points per fit
+ size_t const number_fits = 10;
+ size_t const number_points = 10;
+
+ // model ID and number of parameters (GAUSS_1D has four: amplitude, center, width, offset)
+ int const model_id = GAUSS_1D;
+ size_t const number_parameters = 4;
+
+ // initial parameters
+ std::vector< float > initial_parameters(number_fits * number_parameters);
+
+ // data
+ std::vector< float > data(number_points * number_fits);
+
+ // tolerance
+ float const tolerance = 0.001f;
+
+ // maximal number of iterations
+ int const max_number_iterations = 10;
+
+ // estimator ID
+ int const estimator_id = LSE;
+
+ // parameters to fit (all of them)
+ std::vector< int > parameters_to_fit(number_parameters, 1);
+
+In a next step, sufficient memory is reserved for all four output parameters.
+
+.. code-block:: cpp
+
+ // output parameters
+ std::vector< float > output_parameters(number_fits * number_parameters);
+ std::vector< int > output_states(number_fits);
+ std::vector< float > output_chi_square(number_fits);
+ std::vector< int > output_number_iterations(number_fits);
+
+Finally, there is a call to the C interface of Gpufit (in this example, the optional
+inputs *weights* and *user info* are not used) and a check of the return status.
+If an error occurred, the last error message is obtained and an exception is thrown.
+
+.. code-block:: cpp
+
+ // call to gpufit (C interface)
+ int const status = gpufit
+ (
+ number_fits,
+ number_points,
+ data.data(),
+ 0,
+ model_id,
+ initial_parameters.data(),
+ tolerance,
+ max_number_iterations,
+ parameters_to_fit.data(),
+ estimator_id,
+ 0,
+ 0,
+ output_parameters.data(),
+ output_states.data(),
+ output_chi_square.data(),
+ output_number_iterations.data()
+ );
+
+ // check status
+ if (status != STATUS_OK)
+ {
+ throw std::runtime_error(gpufit_get_last_error());
+ }
+
+This simple example can easily be adapted to real applications by:
+
+- choosing your own model ID
+- choosing your own estimator ID
+- choosing your own fit tolerance and maximal number of iterations
+- filling the data structure with the data values to be fitted
+- filling the initial parameters structure with suitable estimates of the true parameters
+- processing the output data
+
+The following two examples show how |GF| can be used to fit real data.
+
+.. _c-example-2d-gaussian:
+
+Fit 2D Gaussian functions example
+---------------------------------
+
+This example features:
+
+- Multiple fits using a 2D Gaussian function
+- Noisy data and random initial guesses for the fit parameters
+- A Poisson noise adapted maximum likelihood estimator
+
+It is contained in Gauss_Fit_2D_Example.cpp_ and can be built and executed within the project environment. The optional
+inputs to gpufit(), *weights* and *user info*, are not used.
+
+In this example, a 2D Gaussian curve is fit to 10\ :sup:`4` noisy data sets having a size of 20 x 20 points each.
+The model function and the model parameters are described in :ref:`gauss-2d`.
+
+In this example the true parameters used to generate the Gaussian data are set to
+
+.. code-block:: cpp
+
+ // true parameters
+ std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f}; // amplitude, center x/y positions, width, offset
+
+which defines a 2D Gaussian peak centered at the middle of the grid (position 9.5, 9.5), with a width (standard deviation) of 3.0, an amplitude of 10
+and a background of 10.
+
+The guesses for the initial parameters are drawn from the true parameters with a uniformly distributed deviation
+of about 20%. The initial guesses for the center coordinates are chosen with a deviation relative to the width of the Gaussian.
+
+.. code-block:: cpp
+
+ // initial parameters (randomized)
+ std::vector< float > initial_parameters(number_fits * number_parameters);
+ for (size_t i = 0; i < number_fits; i++)
+ {
+ for (size_t j = 0; j < number_parameters; j++)
+ {
+ if (j == 1 || j == 2)
+ {
+ initial_parameters[i * number_parameters + j] = true_parameters[j] + true_parameters[3] * (-0.2f + 0.4f * uniform_dist(rng));
+ }
+ else
+ {
+ initial_parameters[i * number_parameters + j] = true_parameters[j] * (0.8f + 0.4f*uniform_dist(rng));
+ }
+ }
+ }
+
+The 2D grid of x and y values (each ranging from 0 to 19 with an increment of 1) is computed with a double for loop.
+
+.. code-block:: cpp
+
+ // generate x and y values
+ std::vector< float > x(number_points);
+ std::vector< float > y(number_points);
+ for (size_t i = 0; i < size_x; i++)
+ {
+ for (size_t j = 0; j < size_x; j++) {
+ x[i * size_x + j] = static_cast<float>(j);
+ y[i * size_x + j] = static_cast<float>(i);
+ }
+ }
+
+Then a 2D Gaussian peak model function (without noise) is calculated once for the true parameters
+
+.. code-block:: cpp
+
+ void generate_gauss_2d(std::vector<float> &x, std::vector<float> &y, std::vector<float> &g, std::vector<float>::iterator &p)
+ {
+ // generates a Gaussian 2D peak function on a set of x and y values with some parameters p (size 5)
+ // we assume that x.size == y.size == g.size, no checks done
+
+ // given x and y values and parameters p computes a model function g
+ for (size_t i = 0; i < x.size(); i++)
+ {
+ float arg = -((x[i] - p[1]) * (x[i] - p[1]) + (y[i] - p[2]) * (y[i] - p[2])) / (2 * p[3] * p[3]);
+ g[i] = p[0] * exp(arg) + p[4];
+ }
+ }
+
+Stored in variable temp, it is then used in every fit to generate Poisson distributed random numbers.
+
+.. code-block:: cpp
+
+ // generate data with noise
+ std::vector< float > temp(number_points);
+ // compute the model function
+ generate_gauss_2d(x, y, temp, true_parameters.begin());
+
+ std::vector< float > data(number_fits * number_points);
+ for (size_t i = 0; i < number_fits; i++)
+ {
+ // generate Poisson random numbers
+ for (size_t j = 0; j < number_points; j++)
+ {
+ std::poisson_distribution< int > poisson_dist(temp[j]);
+ data[i * number_points + j] = static_cast<float>(poisson_dist(rng));
+ }
+ }
+
+Thus, in this example the data for each fit differ only in the random noise. This, and the
+randomized initial guesses for each fit, result in each fit returning slightly different best-fit parameters.
+
+We set the model and estimator IDs for the fit accordingly.
+
+.. code-block:: cpp
+
+ // estimator ID
+ int const estimator_id = MLE;
+
+ // model ID
+ int const model_id = GAUSS_2D;
+
+And call the gpufit :ref:`c-interface`. Parameters weights, user_info and user_info_size are set to 0, indicating that they
+won't be used during the fits.
+
+.. code-block:: cpp
+
+ // call to gpufit (C interface)
+ int const status = gpufit
+ (
+ number_fits,
+ number_points,
+ data.data(),
+ 0,
+ model_id,
+ initial_parameters.data(),
+ tolerance,
+ max_number_iterations,
+ parameters_to_fit.data(),
+ estimator_id,
+ 0,
+ 0,
+ output_parameters.data(),
+ output_states.data(),
+ output_chi_square.data(),
+ output_number_iterations.data()
+ );
+
+ // check status
+ if (status != STATUS_OK)
+ {
+ throw std::runtime_error(gpufit_get_last_error());
+ }
+
+After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics
+about the fits are displayed.
+
+Output statistics
++++++++++++++++++
+
+A histogram of all possible fit states (see :ref:`api-output-parameters`) is obtained by iterating over the state of each fit.
+
+.. code-block:: cpp
+
+ // get fit states
+ std::vector< int > output_states_histogram(5, 0);
+ for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+ {
+ output_states_histogram[*it]++;
+ }
+
+In the computation of the mean and standard deviation only converged fits are taken into account. Here is an example of computing
+the means of the output parameters iterating over all fits and all parameters.
+
+.. code-block:: cpp
+
+ // compute mean of fitted parameters for converged fits
+ std::vector< float > output_parameters_mean(number_parameters, 0);
+ for (size_t i = 0; i != number_fits; i++)
+ {
+ if (output_states[i] == STATE_CONVERGED)
+ {
+ for (size_t j = 0; j < number_parameters; j++)
+ {
+ output_parameters_mean[j] += output_parameters[i * number_parameters + j];
+ }
+ }
+ }
+ // normalize
+ for (size_t j = 0; j < number_parameters; j++)
+ {
+ output_parameters_mean[j] /= output_states_histogram[0];
+ }
+
+.. _linear-regression-example:
+
+Linear Regression Example
+-------------------------
+
+This example features:
+
+- Multiple fits of a 1D Linear curve
+- Noisy data and random initial guesses for the parameters
+- Unequally spaced x position values given as custom user info
+
+It is contained in Linear_Regression_Example.cpp_ and can be built and executed within the project environment.
+
+In this example, a straight line is fitted to 10\ :sup:`4` noisy data sets. Each data set includes 20 data points.
+The locations of the data points are spaced non-linearly (exponentially). The user information supplies the x positions of the data
+points. The fits are unweighted and the model function and the model parameters are described in :ref:`linear-1d`.
+
+The custom x positions of the linear model are stored in the user_info.
+
+.. code-block:: cpp
+
+ // custom x positions for the data points of every fit, stored in user info
+ std::vector< float > user_info(number_points);
+ for (size_t i = 0; i < number_points; i++)
+ {
+ user_info[i] = static_cast<float>(pow(2, i));
+ }
+
+ // size of user info in bytes
+ size_t const user_info_size = number_points * sizeof(float);
+
+Because only number_points values are specified, this means that the same custom x position values are used for every fit.
+
+The initial parameters for every fit are set to random values uniformly distributed around the true parameter value.
+
+.. code-block:: cpp
+
+ // true parameters
+ std::vector< float > true_parameters { 5, 2 }; // offset, slope
+
+ // initial parameters (randomized)
+ std::vector< float > initial_parameters(number_fits * number_parameters);
+ for (size_t i = 0; i != number_fits; i++)
+ {
+ // random offset
+ initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng));
+ // random slope
+ initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng));
+ }
+
+The data is generated as the value of a linear function and some additive normally distributed noise term.
+
+.. code-block:: cpp
+
+ // generate data
+ std::vector< float > data(number_points * number_fits);
+ for (size_t i = 0; i != data.size(); i++)
+ {
+ size_t j = i / number_points; // the fit
+ size_t k = i % number_points; // the position within a fit
+
+ float x = user_info[k];
+ float y = true_parameters[0] + x * true_parameters[1];
+ data[i] = y + normal_dist(rng);
+ }
+
+We set the model and estimator IDs for the fit accordingly.
+
+.. code-block:: cpp
+
+ // estimator ID
+ int const estimator_id = LSE;
+
+ // model ID
+ int const model_id = LINEAR_1D;
+
+And call the gpufit :ref:`c-interface`. The weights parameter is set to 0, indicating that no weights are used during the fits.
+
+.. code-block:: cpp
+
+ // call to gpufit (C interface)
+ int const status = gpufit
+ (
+ number_fits,
+ number_points,
+ data.data(),
+ 0,
+ model_id,
+ initial_parameters.data(),
+ tolerance,
+ max_number_iterations,
+ parameters_to_fit.data(),
+ estimator_id,
+ user_info_size,
+ reinterpret_cast< char * >( user_info.data() ),
+ output_parameters.data(),
+ output_states.data(),
+ output_chi_square.data(),
+ output_number_iterations.data()
+ );
+
+After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics
+about the fits are displayed (see `Output statistics`_).
diff --git a/docs/fit_estimator_functions.rst b/docs/fit_estimator_functions.rst
new file mode 100644
index 0000000..fcee030
--- /dev/null
+++ b/docs/fit_estimator_functions.rst
@@ -0,0 +1,54 @@
+.. _estimator-functions:
+
+Estimator functions
+-------------------
+
+.. _estimator-lse:
+
+Least squares estimator
++++++++++++++++++++++++
+
+The least squares estimator computes the weighted sum of the squared deviation between the data values and the model at
+the positions of the data points. The ID for this estimator is ``LSE``. It's implemented in lse.cuh_.
+
+Least squares estimation is a common method, and the standard Levenberg-Marquardt algorithm described by Marquardt makes
+use of minimal least squares. The estimator is described as follows.
+
+.. math::
+
+ {\chi^2}(\vec{p}) = \sum_{n=0}^{N-1}{ \left(f_{n}(\vec{p})-z_{n}\right)^2\cdot w_n }
+
+:`n`: The index of the data points (:math:`0,..,N-1`)
+
+:`f_n`: The model function values at data position :math:`n`
+
+:`z_n`: Data values at data position :math:`n`
+
+:`\vec{p}`: Fit model function parameters
+
+:`w_n`: Weight values for data at position :math:`n`
+
+
+.. _estimator-mle:
+
+Maximum likelihood estimator for data subject to Poisson statistics
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+The maximum likelihood estimator (MLE) for Poisson distributed noise is relatively simple to implement. In the case of data with Poisson noise
+it provides a more precise estimate when compared to an LSE estimator. The ID for this estimator is ``MLE``. It's implemented in mle.cuh_.
+
+The estimator is described as follows.
+
+.. math::
+
+ {\chi^2}(\vec{p}) = 2\sum_{n=0}^{N-1}{(f_{n}(\vec{p})-z_{n})}-2\sum_{n=0,z_n\neq0}^{N-1}{z_n \ln \left(\frac{f_{n}(\vec{p})}{z_n}\right)}
+
+:`n`: The index of the data points (:math:`0,..,N-1`)
+
+:`f_n`: The model function values at data position :math:`n`
+
+:`z_n`: Data values at data position :math:`n`
+
+:`\vec{p}`: Actual model function parameters
+
+Note that this estimator does not provide any means to weight the data values. Rather, noise in the data is assumed to be purely Poissonian.
\ No newline at end of file
diff --git a/docs/fit_model_functions.rst b/docs/fit_model_functions.rst
new file mode 100644
index 0000000..620c821
--- /dev/null
+++ b/docs/fit_model_functions.rst
@@ -0,0 +1,193 @@
+.. _fit-model-functions:
+
+Fit Model functions
+-------------------
+
+This section describes the fit model functions which are included with the Gpufit library. The headings are the names
+of the ModelID parameter used in the gpufit()_ call. They are defined in gpufit.h_.
+
+Note that additional model functions may be added as described in the documentation, see :ref:`gpufit-customization`.
+
+.. _linear-1d:
+
+Linear regression
++++++++++++++++++
+
+A 1D linear function defined by two parameters (offset and slope). The user information data may be used to specify the
+X coordinate of each data point. The model ID of this function is ``LINEAR_1D``, and it is implemented in linear_1d.cuh_.
+
+.. math::
+
+ g(x,\vec{p})=p_0+p_1 x
+
+:`x`: (independent variable) *X* coordinate
+
+ The X coordinate values may be specified in the user information data.
+ For details on how to do this, see the linear regression code example, :ref:`linear-regression-example`.
+
+ If no independent variables are provided, the *X* coordinate of the first data value is assumed to be (0.0).
+ In this case, for a fit size of *M* data points, the *X* coordinates of the data are simply the corresponding array
+ indices of the data array, starting from zero (i.e. :math:`0, 1, 2, ...`).
+
+:`p_0`: offset
+
+:`p_1`: slope
+
+
+.. _gauss-1d:
+
+1D Gaussian function
+++++++++++++++++++++
+
+A 1D Gaussian function defined by four parameters. Its model ID is ``GAUSS_1D`` and it is implemented in gauss_1d.cuh_.
+Here, p is the vector of parameters (p0..p3) and the model function g exists for each x coordinate of the input data.
+
+.. math::
+
+ g(x,\vec{p})=p_0 e^{-\left(x-p_1\right)^2/\left(2p_2^2\right)}+p_3
+
+:`x`: (independent variable) *X* coordinate
+
+ No independent variables are passed to this model function.
+ Hence, the *X* coordinate of the first data value is assumed to be (0.0). For a fit size of *M* data points,
+ the *X* coordinates of the data are simply the corresponding array indices of the data array, starting from
+ zero (i.e. :math:`0, 1, 2, ...`).
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate
+
+:`p_2`: width (standard deviation)
+
+:`p_3`: offset
+
+
+.. _gauss-2d:
+
+2D Gaussian function (cylindrical symmetry)
++++++++++++++++++++++++++++++++++++++++++++
+
+A 2D Gaussian function defined by five parameters. Its model ID is ``GAUSS_2D`` and it is implemented in gauss_2d.cuh_.
+Here, p is the vector of parameters (p0..p4) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+ g(x,y,\vec{p})=p_0 e^{-\left(\left(x-p_1\right)^2+\left(y-p_2\right)^2\right)/\left(2p_3^2\right)}+p_4
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+ No independent variables are passed to this model function.
+ Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+ For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding 2D array
+ indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate x
+
+:`p_2`: center coordinate y
+
+:`p_3`: width (standard deviation; equal width in x and y dimensions)
+
+:`p_4`: offset
+
+
+.. _gauss-2d-elliptic:
+
+2D Gaussian function (elliptical)
++++++++++++++++++++++++++++++++++
+
+A 2D elliptical Gaussian function defined by six parameters. Its model ID is ``GAUSS_2D_ELLIPTIC`` and it is implemented
+in gauss_2d_elliptic.cuh_. Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+ g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left(x-p_1\right)^2}{p_3^2}+\frac{\left(y-p_2\right)^2}{p_4^2}\right)}+p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+ No independent variables are passed to this model function.
+ Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+ For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+ 2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate x
+
+:`p_2`: center coordinate y
+
+:`p_3`: width x (standard deviation)
+
+:`p_4`: width y (standard deviation)
+
+:`p_5`: offset
+
+
+.. _gauss-2d-rotated:
+
+2D Gaussian function (elliptical, rotated)
+++++++++++++++++++++++++++++++++++++++++++
+
+A 2D elliptical Gaussian function whose principal axis may be rotated with respect to the X and Y coordinate axes,
+defined by seven parameters. Its model ID is ``GAUSS_2D_ROTATED`` and it is implemented in gauss_2d_rotated.cuh_.
+Here, p is the vector of parameters (p0..p6) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+ g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left((x-p_1)\cos{p_6}-(y-p_2)\sin{p_6}\right)^2}{p_3^2}+\frac{\left((x-p_1)\sin{p_6}+(y-p_2)\cos{p_6}\right)^2}{p_4^2}\right)}+p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+ No independent variables are passed to this model function.
+ Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+ For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+ 2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate x
+
+:`p_2`: center coordinate y
+
+:`p_3`: width x (standard deviation)
+
+:`p_4`: width y (standard deviation)
+
+:`p_5`: offset
+
+:`p_6`: rotation angle [radians]
+
+
+.. _cauchy-2d-elliptic:
+
+2D Cauchy function (elliptical)
++++++++++++++++++++++++++++++++
+
+A 2D elliptical Cauchy function defined by six parameters. Its model ID is ``CAUCHY_2D_ELLIPTIC`` and it is implemented
+in cauchy_2d_elliptic.cuh_. Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y
+coordinate of the input data.
+
+.. math::
+
+ g(x,y,\vec{p})=p_0 \frac{1}{\left(\frac{x-p_1}{p_3}\right)^2+1} \frac{1}{\left(\frac{y-p_2}{p_4}\right)^2+1} + p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+ No independent variables are passed to this model function.
+ Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+ For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+ 2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate x
+
+:`p_2`: center coordinate y
+
+:`p_3`: width x (half width at half maximum)
+
+:`p_4`: width y (half width at half maximum)
+
+:`p_5`: offset
+
diff --git a/docs/gpufit_api.rst b/docs/gpufit_api.rst
new file mode 100644
index 0000000..ce6695d
--- /dev/null
+++ b/docs/gpufit_api.rst
@@ -0,0 +1,377 @@
+.. _api-description:
+
+======================
+Gpufit API description
+======================
+
+The Gpufit source code compiles to a dynamic-link library (DLL), providing a C interface.
+In the sections below, the C interface and its arguments are described in detail.
+
+.. _c-interface:
+
+C Interface
+-----------
+
+The C interface is defined in the Gpufit header file: gpufit.h_.
+
+gpufit()
+++++++++
+
+This is the main fit function. A single call to the *gpufit()* function executes a block of *N* fits.
+The inputs to *gpufit()* are scalars and pointers to arrays, and the outputs are also array pointers.
+
+The inputs to the *gpufit()* function are:
+
+- the number of fits (*N*),
+- the number of data points per fit (each fit has equal size),
+- the fit data,
+- an array of weight values that are used to weight the individual data points in the fit (optional),
+- an ID number which specifies the fit model function,
+- an array of initial parameters for the model functions,
+- a tolerance value which determines when the fit has converged,
+- the maximum number of iterations per fit,
+- an array of flags which allow one or more fit parameters to be held constant,
+- an ID number which specifies the fit estimator (e.g. least squares, etc.),
+- the size of the user info data,
+- the user info data, which may have multiple uses, for example to pass additional parameters to the fit functions,
+ or to include independent variables (e.g. X values) with the fit data.
+
+The outputs of *gpufit()* are:
+
+- the best fit model parameters for each fit,
+- an array of flags indicating, for example, whether each fit converged,
+- the final value of :math:`\chi^2` for each fit,
+- the number of iterations needed for each fit to converge.
+
+The *gpufit()* function call is defined below.
+
+.. code-block:: cpp
+
+ int gpufit
+ (
+ size_t n_fits,
+ size_t n_points,
+ float * data,
+ float * weights,
+ int model_id,
+ float * initial_parameters,
+ float tolerance,
+ int max_n_iterations,
+ int * parameters_to_fit,
+ int estimator_id,
+ size_t user_info_size,
+ char * user_info,
+ float * output_parameters,
+ int * output_states,
+ float * output_chi_squares,
+ int * output_n_iterations
+ ) ;
+
+.. _api-input-parameters:
+
+Description of input parameters
+...............................
+
+:n_fits: Number of fits to be performed
+
+ :type: size_t
+
+:n_points: Number of data points per fit
+
+ Gpufit is designed such that each fit must have the same number of data points per fit.
+
+ :type: size_t
+
+:data: Pointer to data values
+
+ A pointer to the data values. The data must be passed in as a 1D array of floating point values, with the data
+ for each fit concatenated one after another. In the case of multi-dimensional data, the data must be flattened
+ to a 1D array. The number of elements in the array is equal to the product n_fits * n_points.
+
+ :type: float *
+ :length: n_points * n_fits
+
+:weights: Pointer to weights
+
+ The weights array includes unique weighting values for each fit. It is used only by the least squares estimator (LSE).
+ The size of the weights array and its organization is identical to that for the data array.
+ For statistical weighting, this parameter should be set equal to the inverse of the variance of the data
+ (i.e. weights = 1.0 / variance ). The weights array is an optional input.
+
+ :type: float *
+ :length: n_points * n_fits
+ :special: Use a NULL pointer to indicate that no weights are provided. In this case all data values will be weighted equally.
+
+:model_id: Model ID
+
+ Determines the model which is used for all fits in this call. See :ref:`fit-model-functions` for more details.
+
+ As defined in gpufit.h_:
+
+ :0: GAUSS_1D
+ :1: GAUSS_2D
+ :2: GAUSS_2D_ELLIPTIC
+ :3: GAUSS_2D_ROTATED
+ :4: CAUCHY_2D_ELLIPTIC
+ :5: LINEAR_1D
+
+ :type: int
+
+:initial_parameters: Pointer to initial parameter values
+
+ A 1D array containing the initial model parameter values for each fit. If the number of parameters of the fit model
+ is defined by *n_parameters*, then the size of this array is *n_fits * n_parameters*.
+
+ The parameter values for each fit are concatenated one after another. If there are *M* parameters per fit,
+ the parameters array is organized as follows: [(parameter 1), (parameter 2), ..., (parameter M), (parameter 1),
+ (parameter 2), ..., (parameter M), ...].
+
+ :type: float *
+ :length: n_fits * n_parameters
+
+:tolerance: Fit tolerance threshold
+
+ The fit tolerance determines when the fit has converged. After each fit iteration, the change in the absolute value
+ of :math:`\chi^2` is calculated. The fit has converged when one of two conditions is met. First, if the change
+ in the absolute value of :math:`\chi^2` is less than the tolerance value, the fit has converged.
+ Alternatively, if the change in :math:`\chi^2` is less than the product of tolerance and the absolute value of
+ :math:`\chi^2` [tolerance * abs(:math:`\chi^2`)], then the fit has converged.
+
+ Setting a lower value for the tolerance results in more precise values for the fit parameters, but requires more fit
+ iterations to reach convergence.
+
+ A typical value for the tolerance settings is between 1.0E-3 and 1.0E-6.
+
+ :type: float
+
+:max_n_iterations: Maximum number of iterations
+
+ The maximum number of fit iterations permitted. If the fit has not converged after this number of iterations,
+ the fit returns with a status value indicating that the maximum number of iterations was reached.
+
+ :type: int
+
+:parameters_to_fit: Pointer to array indicating which model parameters should be held constant during the fit
+
+ This is an array of ones or zeros, with a length equal to the number of parameters of the fit model function.
+ Each entry in the array is a flag which determines whether or not the corresponding model parameter will be held
+ constant during the fit. To allow a parameter to vary during the fit, set the entry in *parameters_to_fit* equal
+ to one. To hold the value constant, set the entry to zero.
+
+ An array of ones, e.g. [1,1,1,1,1,...] will allow all parameters to vary during the fit.
+
+ :type: int *
+ :length: n_parameters
+
+:estimator_id: Estimator ID
+
+ Determines the fit estimator which is used. See :ref:`estimator-functions` for more details.
+
+ As defined in gpufit.h_:
+
+ :0: LSE
+ :1: MLE
+
+ :type: int
+
+:user_info_size: Size of user information data
+
+ Size of the user information data array, in bytes.
+
+ :type: size_t
+
+:user_info: Pointer to user information data
+
+ This parameter is intended to provide flexibility to the Gpufit interface. The user information data is a generic
+ block of memory which is passed in to the *gpufit()* function, and which is accessible in shared GPU memory by the
+ fit model functions. Possible uses for the user information data are to pass in values for independent variables
+ (e.g. X values) or to supply additional data to the fit model function. For a coded example which makes use of
+ the user information data, see :ref:`linear-regression-example`. The user information data is an optional parameter
+ - if no user information is required this parameter may be set to NULL.
+
+ :type: char *
+ :length: user_info_size
+ :special: Use a NULL pointer to indicate that no user information is available.
+
+.. _api-output-parameters:
+
+Description of output parameters
+................................
+
+:output_parameters: Pointer to array of best-fit model parameters
+
+ For each fit, this array contains the best-fit model parameters. The array is organized identically to the input
+ parameters array.
+
+ :type: float *
+ :length: n_fits * n_parameters
+
+:output_states: Pointer to array of fit result state IDs
+
+ For each fit the result of the fit is indicated by a state ID. The state ID codes are defined below.
+ A state ID of 0 indicates that the fit converged successfully.
+
+ As defined in gpufit.h_:
+
+ :0: The fit converged, tolerance is satisfied, the maximum number of iterations is not exceeded
+ :1: Maximum number of iterations exceeded
+ :2: During the Gauss-Jordan elimination the Hessian matrix is indicated as singular
+ :3: Non-positive curve values have been detected while using MLE (MLE requires only positive curve values)
+ :4: State not read from GPU Memory
+
+ :type: int *
+ :length: n_fits
+
+:output_chi_squares: Pointer to array of :math:`\chi^2` values
+
+ For each fit, this array contains the final :math:`\chi^2` value.
+
+ :type: float *
+ :length: n_fits
+
+:output_n_iterations: Pointer to array of iteration counts
+
+ For each fit, this array contains the number of fit iterations which were performed.
+
+ :type: int *
+ :length: n_fits
+
+:return value: Status code
+
+ The return value of the function call indicates whether an error occurred.
+
+ :0: No error
+ :-1: Error
+
+gpufit_portable_interface()
++++++++++++++++++++++++++++
+
+This function is a simple wrapper around the *gpufit()* function, providing an alternative means of passing the function parameters.
+
+.. code-block:: cpp
+
+ int gpufit_portable_interface(int argc, void *argv[]);
+
+Description of parameters
+.........................
+
+:argc: The length of the argv pointer array
+
+:argv: Array of pointers to *gpufit* parameters, as defined above. For reference, the type of each element of the *argv* array is listed below.
+
+ :argv[0]: Number of fits
+
+ :type: size_t *
+
+ :argv[1]: Number of points per fit
+
+ :type: size_t *
+
+ :argv[2]: Fit data
+
+ :type: float *
+
+ :argv[3]: Fit weights
+
+ :type: float *
+
+ :argv[4]: Fit model ID
+
+ :type: int *
+
+ :argv[5]: Initial parameters
+
+ :type: float *
+
+ :argv[6]: Fit tolerance
+
+ :type: float *
+
+ :argv[7]: Maximum number of iterations
+
+ :type: int *
+
+ :argv[8]: Parameters to fit
+
+ :type: int *
+
+ :argv[9]: Fit estimator ID
+
+ :type: int *
+
+ :argv[10]: User info size
+
+ :type: size_t *
+
+ :argv[11]: User info data
+
+ :type: char *
+
+ :argv[12]: Output parameters
+
+ :type: float *
+
+ :argv[13]: Output states
+
+ :type: int *
+
+ :argv[14]: Output :math:`\chi^2` values
+
+ :type: float *
+
+ :argv[15]: Output number of iterations
+
+ :type: int *
+
+
+:return value: This function simply returns the *gpufit()* return status code.
+
+gpufit_get_last_error()
++++++++++++++++++++++++
+
+A function that returns a string representation of the last error.
+
+.. code-block:: cpp
+
+ char const * gpufit_get_last_error();
+
+:return value: Error message corresponding to the most recent error, or an empty string if no error occurred.
+
+ 'CUDA driver version is insufficient for CUDA runtime version'
+ The graphics driver version installed on the computer is not supported by the CUDA Toolkit version which was used
+ to build Gpufit.dll. Update the graphics driver or re-build Gpufit using a compatible CUDA Toolkit version.
+
+gpufit_cuda_available()
++++++++++++++++++++++++
+
+A function that calls a simple CUDA function to check if CUDA is available.
+
+.. code-block:: cpp
+
+ int gpufit_cuda_available();
+
+:return value: Returns 0 if CUDA is not available (no suitable device found, or driver version insufficient).
+ Use the function *gpufit_get_last_error()* to check the error message. Returns 1 if CUDA is available and CUDA runtime version and driver version are compatible.
+
+gpufit_get_cuda_version()
++++++++++++++++++++++++++
+
+A function that returns the CUDA runtime version in *runtime_version* and the
+installed CUDA driver version in *driver_version*.
+
+.. code-block:: cpp
+
+ int gpufit_get_cuda_version(int * runtime_version, int * driver_version);
+
+:runtime_version: Pointer to the CUDA runtime version number (is 0 if the CUDA runtime version is incompatible with the installed CUDA driver version)
+
+
+:driver_version: Pointer to the CUDA driver version number (is 0 if no CUDA enabled graphics card was detected)
+
+:return value: Returns 0 if an error occurred while collecting the version information. Use the function
+ *gpufit_get_last_error()* to check the error message. Returns 1 if collecting of the version
+ information was successful.
+
+
+
+
diff --git a/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png
new file mode 100644
index 0000000..8617237
Binary files /dev/null and b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png differ
diff --git a/docs/images/GPUfit_PassmarkG3D_relative_performance.png b/docs/images/GPUfit_PassmarkG3D_relative_performance.png
new file mode 100644
index 0000000..8f2e17e
Binary files /dev/null and b/docs/images/GPUfit_PassmarkG3D_relative_performance.png differ
diff --git a/docs/images/algorithm_gpufit_flowchart.png b/docs/images/algorithm_gpufit_flowchart.png
new file mode 100644
index 0000000..b95d7cb
Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.png differ
diff --git a/docs/images/algorithm_gpufit_flowchart.vsdx b/docs/images/algorithm_gpufit_flowchart.vsdx
new file mode 100644
index 0000000..1b6bddb
Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.vsdx differ
diff --git a/docs/images/gpufit_program_flow_skeleton_v2.png b/docs/images/gpufit_program_flow_skeleton_v2.png
new file mode 100644
index 0000000..d454681
Binary files /dev/null and b/docs/images/gpufit_program_flow_skeleton_v2.png differ
diff --git a/docs/images/gpufit_program_flow_v2.png b/docs/images/gpufit_program_flow_v2.png
new file mode 100644
index 0000000..8ead94a
Binary files /dev/null and b/docs/images/gpufit_program_flow_v2.png differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..6f89dc0
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,22 @@
+.. Gpufit documentation master file
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Gpufit Documentation
+====================
+
+.. toctree::
+ :maxdepth: 3
+
+ introduction
+ installation
+ gpufit_api
+ fit_model_functions
+ fit_estimator_functions
+ examples
+ customization
+ bindings
+ appendix
+ license
+
+
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..8af76ba
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,220 @@
+.. _installation-and-testing:
+
+========================
+Installation and Testing
+========================
+
+The Gpufit library can be used in several ways. When using a pre-compiled
+binary version of Gpufit, the Gpufit functions may be accessed directly via
+a dynamic linked library (e.g. Gpufit.dll) or via the external bindings to
+Gpufit (e.g. the Matlab or Python bindings). For more information on the
+Gpufit interface, see :ref:`api-description`, or for details of the external
+bindings see :ref:`external-bindings`.
+
+This section describes how to compile Gpufit, including generating its
+external bindings, from source code. Building from source is necessary when
+a fit model function is added or changed, or if a new fit estimator is required.
+Building the library may also be useful for compiling the code using a
+specific version of the CUDA toolkit, or for a particular CUDA compute
+capability.
+
+Gpufit binary distribution
+++++++++++++++++++++++++++
+
+A binary distribution of the Gpufit library is available for **Windows**.
+Use of this distribution requires only a CUDA-capable graphics card, and an
+updated Nvidia graphics driver. The binary package contains:
+
+- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and
+ the Gpufit header file which contains the function definitions. The Gpufit
+ SDK is intended to be used when calling Gpufit from an external application
+ written in e.g. C code.
+- The performance test application, which serves to test that Gpufit is
+ correctly installed, and to check the performance of the CPU and GPU hardware.
+- Matlab 32 bit and 64 bit bindings, with Matlab examples.
+- Python version 2.x and version 3.x bindings (compiled as wheel files) and
+ Python examples.
+- This manual in PDF format.
+
+To re-build the binary distribution, see the instructions located in
+package/README.md.
+
+Building from source code
++++++++++++++++++++++++++
+
+This section describes how to build Gpufit from source code. Note that as of
+the initial release of Gpufit, the source code has been tested only with the
+Microsoft Visual Studio compiler.
+
+Prerequisites
+-------------
+
+The following tools are required in order to build Gpufit from source.
+
+*Required*
+
+* CMake_ 3.7 or later
+* A C/C++ Compiler
+
+ * Linux: GCC 4.7
+ * Windows: Visual Studio 2013 or 2015
+
+* CUDA_ Toolkit 6.5 or later [#]_
+
+.. [#] Note that it is recommended to use the newest available stable release of the CUDA Toolkit which is compatible
+ with the compiler (e.g. Visual Studio 2015 is required in order to use CUDA Toolkit 8.0). Some older graphics cards
+ may only be supported by CUDA Toolkit version 6.5 or earlier. Also, when using CUDA Toolkit version 6.5, please use
+ the version with support for GTX9xx GPUs, available `here `__.
+
+*Optional*
+
+* Boost_ 1.58 or later (required if you want to build the tests)
+* MATLAB_ if building the MATLAB bindings (minimum version Matlab 2012a)
+* Python_ if building the Python bindings (Python version 2.x or 3.x)
+
+Source code availability
+------------------------
+
+The source code is available in an open repository hosted at Github, at the
+following URL.
+
+.. code-block:: bash
+
+ https://github.com/gpufit/Gpufit.git
+
+To obtain the code, Git may be used to clone the repository, or a current
+snapshot may be downloaded directly from Github as Gpufit-master.zip_.
+
+Compiler configuration via CMake
+--------------------------------
+
+CMake is an open-source tool designed to build, test, and package software.
+It is used to control the software compilation process using compiler
+independent configuration files, and generate native makefiles and workspaces
+that can be used in the compiler environment. In this section we provide a
+simple example of how to use CMake in order to generate the input files for the
+compiler (e.g. the Visual Studio solution file), which can then be used to
+compile Gpufit.
+
+First, identify the directory which contains the Gpufit source code
+(for example, on a Windows computer the Gpufit source code may be stored in
+*C:\\Sources\\Gpufit*). Next, create a build directory outside the
+source code directory (e.g. *C:\\Sources\\Gpufit-build-64*). Finally,
+run cmake to configure and generate the compiler input files. The following
+commands, executed from the command prompt, assume that the cmake executable
+(e.g. *C:\\Program Files\\CMake\\bin\\cmake.exe*) is automatically found
+via the PATH environment variable (if not, the full path to cmake.exe must be
+specified). This example also assumes that the source and build directories
+have been set up as specified above.
+
+.. code-block:: bash
+
+ cd C:\Sources\Gpufit-build-64
+ cmake -G "Visual Studio 12 2013 Win64" C:\Sources\Gpufit
+
+Note that in this example the *-G* flag has been used to specify the
+64-bit version of the Visual Studio 12 compiler. This flag should be changed
+depending on the compiler used, and the desired architecture
+(e.g. 32- or 64-bit). Further details of the CMake command line arguments
+can be found `here <https://cmake.org/cmake/help/latest/manual/cmake.1.html>`__.
+
+There is also a graphical user interface available for CMake, which simplifies
+the configuration and generation steps. For further details, see
+`Running CMake <https://cmake.org/runningcmake/>`_.
+
+Common issues encountered during CMake configuration
+----------------------------------------------------
+
+**Boost NOT found - skipping tests!**
+
+If you want to build the tests and Boost is not found automatically, set the
+CMake variable BOOST_ROOT to the corresponding directory, and configure again.
+
+**Specify CUDA_ARCHITECTURES set**
+
+If you need a specific CUDA architecture, set CUDA_ARCHITECTURES according
+to CUDA_SELECT_NVCC_ARCH_FLAGS_.
+
+**CMake finds lowest installed CUDA version by default**
+
+If there are multiple CUDA toolkits installed on the computer, CMake 3.7.1
+seems to find by default the lowest installed version. Set the desired CUDA
+version manually (e.g. by editing the CUDA_TOOLKIT_ROOT_DIR variable in CMake).
+
+**Specify CUDA version to use**
+
+Set CUDA_BIN_PATH before running CMake, or CUDA_TOOLKIT_ROOT_DIR after the
+first CMake configuration, to the installation folder of the desired
+CUDA version.
+
+**Required CUDA version**
+
+When using Microsoft Visual Studio 2015, the minimum required CUDA Toolkit
+version is 8.0.
+
+**Python launcher**
+
+Set Python_WORKING_DIRECTORY to a valid directory; it will be added to the
+Python path.
+
+**Matlab launcher**
+
+Set Matlab_WORKING_DIRECTORY to a valid directory; it will be added to
+the Matlab path.
+
+Compiling Gpufit on Windows
+---------------------------
+
+After configuring and generating the solution files using CMake, go to the
+desired build directory and open Gpufit.sln using Visual Studio. Select the
+"Debug" or "Release" build options, as appropriate. Select the build target
+"ALL_BUILD", and build this target. If the build process completes
+without errors, the Gpufit binary files will be created in the corresponding
+"Debug" or "Release" folders in the build directory.
+
+The unit tests can be executed by building the target "RUN_TESTS" or by
+starting the created executables in the output directory from
+the command line.
+
+Linux
+-----
+
+Gpufit has not yet been officially tested on a computer running a Linux variant
+with a CUDA capable graphics card. However, satisfying the Prerequisites_ and
+using CMake, we estimate that the library should build in principle and one
+should also be able to run the examples on Linux.
+
+MacOS
+-----
+
+Gpufit has not yet been officially tested on a computer running MacOS with a
+CUDA capable graphics card. However, satisfying the Prerequisites_ and using
+CMake, we estimate that the library should build in principle and one
+should also be able to run the examples on MacOS.
+
+Running the performance test
+++++++++++++++++++++++++++++
+
+The Gpufit performance test is a program which verifies the correct function
+of Gpufit, and tests the fitting speed in comparison with the same algorithm
+executed on the CPU.
+
+If Gpufit was built from source, running the build target
+Gpufit_Cpufit_Performance_Comparison will run the test, which executes the
+fitting process multiple times, varying the number of fits per function call.
+The execution time is measured in each case and the relative speed improvement
+between the GPU and the CPU is calculated. A successful run of the performance
+test also indicates that Gpufit is functioning correctly.
+
+The performance comparison is also included in the Gpufit binary distribution
+as a console application. An example of the program's output is
+shown in :numref:`installation-gpufit-cpufit-performance-comparison`.
+
+.. _installation-gpufit-cpufit-performance-comparison:
+
+.. figure:: /images/GPUFIT_CPUFIT_Performance_Comparison.png
+ :width: 10 cm
+ :align: center
+
+ Output of the GPUFIT vs CPUFIT performance comparison
+
diff --git a/docs/introduction.rst b/docs/introduction.rst
new file mode 100644
index 0000000..2a6fc1f
--- /dev/null
+++ b/docs/introduction.rst
@@ -0,0 +1,87 @@
+============
+Introduction
+============
+
+Gpufit is a GPU-accelerated CUDA implementation of the Levenberg-Marquardt
+algorithm. It was developed to meet the need for a high performance, general-
+purpose nonlinear curve fitting software library which is publicly available
+and open source.
+
+Optimization algorithms are ubiquitous tools employed in many fields of science
+and technology. One such algorithm for numerical, non-linear optimization is the
+Levenberg-Marquardt algorithm (LMA). The LMA combines elements of the method of
+steepest descent and Newton's method, and has become a standard algorithm for
+least-squares fitting.
+
+Although the LMA is, in itself, an efficient optimization algorithm,
+applications requiring many iterations of this procedure may encounter
+limitations due to the sheer number of calculations involved. The time required
+for the convergence of a fit, or a set of fits, can determine an application's
+feasibility, e.g. in the context of real-time data processing and feedback
+systems. Alternatively, in the case of very large datasets, the time required
+to solve a particular optimization problem may prove impractical.
+
+In recent years, advanced graphics processing units (GPUs) and the development
+of general purpose GPU programming have enabled fast and parallelized computing
+by shifting calculations from the CPU to the GPU. The large number of
+independent computing units available on a modern GPU enables the rapid
+execution of many instructions in parallel, with an overall computation power
+far exceeding that of a CPU. Languages such as CUDA C and OpenCL allow GPU-
+based programs to be developed in a manner similar to conventional software, but
+with an inherently parallelized structure. These developments have led to the
+creation of new GPU-accelerated tools, such as Gpufit.
+
+This manual describes how to install and build the Gpufit library and its
+external bindings. Furthermore it details how to extend Gpufit by adding
+custom model functions as well as custom fit estimator functions.
+
+The documentation includes:
+
+- Instructions for building and installing Gpufit
+- A detailed description of the C interface
+- A description of the built-in model functions
+- A description of the built-in goodness-of-fit estimator functions
+- A detailed description of the external bindings to Matlab and Python
+- Usage examples for C, Matlab, and Python
+- Instructions for adding custom model functions or custom estimator functions
+
+The current version of the Gpufit library is |GF_version|
+(`see homepage <https://github.com/gpufit/Gpufit>`_). This manual was compiled
+on |today|.
+
+Hardware requirements
+---------------------
+
+Because the fit algorithm is implemented in CUDA C, a CUDA_-compatible graphics
+card is required to run Gpufit. The minimum supported compute capability is
+2.0. More advanced GPU hardware will result in higher fitting performance.
+
+Software requirements
+---------------------
+
+In addition to a compatible GPU, the graphics card driver installed on the
+host computer must be compatible with the version of the CUDA toolkit which
+was used to compile Gpufit. This may present an issue for older graphics
+cards or for computers running outdated graphics drivers.
+
+At the time of its initial release, Gpufit was compiled with CUDA toolkit
+version 8.0. Therefore, the Nvidia graphics driver installed on the host PC
+must be at least version 367.48 (released July 2016) in order to be compatible
+with the binary files generated in this build.
+
+When compatibility issues arise, there are two possible solutions. The best
+option is to update the graphics driver to a version which is compatible with
+the CUDA toolkit used to build Gpufit. The second option is to re-compile
+Gpufit from source code, using an earlier version of the CUDA toolkit which is
+compatible with the graphics driver in question. However, this solution is
+likely to result in slower performance of the Gpufit code, since older versions
+of the CUDA toolkit are not as efficient.
+
+Note that all CUDA-supported graphics cards should be compatible with
+CUDA toolkit version 6.5. This is the last version of CUDA which supported
+GPUs with compute capability 1.x. In other words, an updated Nvidia graphics
+driver should be available for all CUDA-enabled GPUs which is compatible with
+toolkit version 6.5.
+
+If you are unsure whether your graphics card is CUDA-compatible, a list of
+CUDA-supported GPUs can be found `here <https://developer.nvidia.com/cuda-gpus>`_.
diff --git a/docs/license.rst b/docs/license.rst
new file mode 100644
index 0000000..1223cbc
--- /dev/null
+++ b/docs/license.rst
@@ -0,0 +1,25 @@
+=======================
+Gpufit software license
+=======================
+
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..6f53cb2
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. singlehtml to make a single large HTML file
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. devhelp to make HTML files and a Devhelp project
+ echo. epub to make an epub
+ echo. epub3 to make an epub3
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. text to make text files
+ echo. man to make manual pages
+ echo. texinfo to make Texinfo files
+ echo. gettext to make PO message catalogs
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. xml to make Docutils-native XML files
+ echo. pseudoxml to make pseudoxml-XML files for display purposes
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ echo. coverage to run coverage check of the documentation if enabled
+ echo. dummy to check syntax errors of document sources
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "singlehtml" (
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+if "%1" == "qthelp" (
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\RTDSpielwiese.qhcp
+ echo.To view the help file:
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\RTDSpielwiese.qhc
+ goto end
+)
+
+if "%1" == "devhelp" (
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished.
+ goto end
+)
+
+if "%1" == "epub" (
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
+ goto end
+)
+
+if "%1" == "epub3" (
+ %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+ goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdf" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "latexpdfja" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ cd %BUILDDIR%/latex
+ make all-pdf-ja
+ cd %~dp0
+ echo.
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "text" (
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The text files are in %BUILDDIR%/text.
+ goto end
+)
+
+if "%1" == "man" (
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
+ goto end
+)
+
+if "%1" == "texinfo" (
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+ goto end
+)
+
+if "%1" == "gettext" (
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+if "%1" == "coverage" (
+ %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+ goto end
+)
+
+if "%1" == "xml" (
+ %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The XML files are in %BUILDDIR%/xml.
+ goto end
+)
+
+if "%1" == "pseudoxml" (
+ %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+ goto end
+)
+
+if "%1" == "dummy" (
+ %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
+ if errorlevel 1 exit /b 1
+ echo.
+ echo.Build finished. Dummy builder generates no files.
+ goto end
+)
+
+:end
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..b8c2751
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+# Applications
+
+# add_example( <modules> <name> ): builds <name>.cpp plus Tests/utils.* into executable <name>, linked against the ;-separated list <modules>; no install rule is created.
+function( add_example modules name )
+ set( target ${name} )
+ add_executable( ${target} ${name}.cpp
+ ${PROJECT_SOURCE_DIR}/Tests/utils.h
+ ${PROJECT_SOURCE_DIR}/Tests/utils.cpp
+ )
+ target_include_directories( ${target} PRIVATE ${PROJECT_SOURCE_DIR} ) # include paths are written relative to the project root
+ target_link_libraries( ${target} PRIVATE ${modules} ) # keyword signature: an executable has no consumers, so PRIVATE
+ set_property( TARGET ${target}
+ PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) # executables are written under the project build directory
+ set_property( TARGET ${target} PROPERTY FOLDER GpufitCpufitExamples ) # IDE folder used to group the example targets
+endfunction()
+
+add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Performance_Comparison )
+
+add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Nvidia_Profiler_Test )
diff --git a/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp
new file mode 100644
index 0000000..41f72e2
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp
@@ -0,0 +1,340 @@
+/*
+ * Runs 100k fits on the CPU and 2m fits on the GPU, used with the Nvidia profiler to obtain
+ * running time information on the different CUDA kernels.
+ */
+
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define _USE_MATH_DEFINES
+#include
+
+
+/*
+ Names of parameters for the 2D Gaussian peak model; must stay five floats in this order (generate_initial_parameters uses sizeof(Parameters) / sizeof(float) and indices 0..4)
+*/
+struct Parameters
+{
+ float amplitude; // peak height above the background
+ float center_x; // peak position along x, in grid index units
+ float center_y; // peak position along y, in grid index units
+ float width; // Gaussian width used as exp(-0.5 * ((i - center) / width)^2)
+ float background; // constant offset added to every point
+};
+
+/*
+Prints statistics of one run to std::cout: elapsed seconds, fits/second, the "x precision" from calculate_standard_deviation() and the mean iteration count from calculate_mean(); only the first n_fits fits are read.
+*/
+void print_result(
+ std::string const name,
+ std::vector const & estimated_parameters,
+ std::vector const & test_parameters,
+ std::vector states,
+ std::vector const & n_iterations,
+ std::size_t const n_fits,
+ std::size_t const n_parameters,
+ std::chrono::milliseconds::rep const duration_in_ms)
+{
+ // gather fitted and true x centers; value 1 of each group of n_parameters fitted values is the x center
+ std::vector estimated_x_centers(n_fits);
+ std::vector test_x_centers(n_fits);
+
+ for (std::size_t i = 0; i < n_fits; i++)
+ {
+ estimated_x_centers[i] = estimated_parameters[i*n_parameters + 1];
+ test_x_centers[i] = test_parameters[i].center_x;
+ }
+
+ // NOTE(review): both helpers are defined outside this file and also receive states — presumably to skip failed fits; confirm in Tests/utils.h
+ double const std_dev_x = calculate_standard_deviation(estimated_x_centers, test_x_centers, states);
+
+ double const mean_n_iterations = calculate_mean(n_iterations, states);
+
+ double fits_per_second = static_cast(n_fits) / duration_in_ms * 1000; // duration is in milliseconds, hence * 1000
+
+ // output
+ std::cout << std::fixed;
+
+ std::cout << std::setw(5) << std::endl << "***" << name << "***";
+
+ std::cout << std::setprecision(3);
+ std::cout << std::setw(12) << duration_in_ms / 1000.0 << " s ";
+
+ std::cout << std::setprecision(2);
+ std::cout << std::setw(12) << fits_per_second << " fits/s" << std::endl;
+
+ std::cout << std::setprecision(6);
+ std::cout << "x precision: " << std_dev_x << " px ";
+
+ std::cout << std::setprecision(2);
+ std::cout << "mean iterations: " << mean_n_iterations << std::endl;
+}
+
+/*
+Fills parameters_set (n_parameters consecutive values per fit) with start values: every member of parameters[i] multiplied by its own random factor a + b * u, with u drawn from uniform_dist(0, 1)
+*/
+void generate_initial_parameters(std::vector & parameters_set, std::vector const & parameters)
+{
+ std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+ float const a = 0.9f; // smallest scale factor
+ float const b = 0.2f; // width of the scale factor range, so factors lie between 0.9 and 1.1
+
+ int const n_parameters = sizeof(Parameters) / sizeof(float); // number of float members of Parameters
+ for (std::size_t i = 0; i < parameters_set.size() / n_parameters; i++)
+ {
+ parameters_set[0 + i * n_parameters] = parameters[i].amplitude * (a + b * uniform_dist(rng)); // rng is not declared in the visible part of this file; presumably the shared generator from Tests/utils.h — TODO confirm
+ parameters_set[1 + i * n_parameters] = parameters[i].center_x * (a + b * uniform_dist(rng));
+ parameters_set[2 + i * n_parameters] = parameters[i].center_y * (a + b * uniform_dist(rng));
+ parameters_set[3 + i * n_parameters] = parameters[i].width * (a + b * uniform_dist(rng));
+ parameters_set[4 + i * n_parameters] = parameters[i].background * (a + b * uniform_dist(rng));
+ }
+}
+
+/*
+Fills every element of target with a randomized copy of source: each member is multiplied by its own factor a + b * u, with u drawn from uniform_dist(0, 1)
+*/
+void generate_test_parameters(std::vector & target, Parameters const source)
+{
+ std::size_t const n_fits = target.size(); // one parameter set per fit; target must already have its final size
+
+ std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+ float const a = 0.9f; // smallest scale factor
+ float const b = 0.2f; // width of the scale factor range, so factors lie between 0.9 and 1.1
+
+ for (std::size_t i = 0; i < n_fits; i++)
+ {
+ target[i].amplitude = source.amplitude * (a + b * uniform_dist(rng)); // rng is not declared in the visible part of this file; presumably the shared generator from Tests/utils.h — TODO confirm
+ target[i].center_x = source.center_x * (a + b * uniform_dist(rng));
+ target[i].center_y = source.center_y * (a + b * uniform_dist(rng));
+ target[i].width = source.width * (a + b * uniform_dist(rng));
+ target[i].background = source.background * (a + b * uniform_dist(rng));
+ }
+}
+
+/*
+Adds zero-mean normally distributed noise to every element of vec; its standard deviation is the mean peak amplitude computed below divided by snr
+*/
+void add_gauss_noise(std::vector & vec, Parameters const & parameters, float const snr)
+{
+ float const gauss_fwtm = 4.292f * parameters.width; // full width at tenth maximum, 4.292 ~ 2 * sqrt(2 * ln(10)); only valid for circular gaussian
+ float const fit_area = gauss_fwtm*gauss_fwtm;
+ // 2 * pi * amplitude * width^2 is the volume under the peak (background excluded); dividing by fit_area gives its mean height over that square
+ float const mean_amplitude = 2.f * float(M_PI) * parameters.amplitude * parameters.width * parameters.width / fit_area;
+
+ float const std_dev = mean_amplitude / snr;
+
+ std::normal_distribution distribution(0.0, std_dev);
+
+ for (std::size_t i = 0; i < vec.size(); i++)
+ {
+ vec[i] += distribution(rng); // rng is not declared in the visible part of this file; presumably the shared generator from Tests/utils.h — TODO confirm
+ }
+}
+
+/*
+Writes n_fits noise-free 2D Gaussian peaks into data (n_points values per fit, stored row by row on a square grid of edge floor(sqrt(n_points))) and prints a text progress bar to std::cout
+*/
+void generate_gauss2d(
+ std::size_t const n_fits,
+ std::size_t const n_points,
+ std::vector & data,
+ std::vector const & parameters)
+{
+ std::cout << "generating " << n_fits << " fits ..." << std::endl;
+ for (int i = 0; i < 50; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+ std::size_t progress = 0;
+ std::size_t const size_x = std::size_t(sqrt(n_points)); // integer edge length, truncated once so loop bounds and row stride agree and no index reaches n_points
+ for (std::size_t i = 0; i < n_fits; i++)
+ {
+ float const amplitude = parameters[i].amplitude;
+ float const x00 = parameters[i].center_x;
+ float const y00 = parameters[i].center_y;
+ float const width = parameters[i].width;
+ float const background = parameters[i].background;
+
+ std::size_t const fit_index = i * n_points; // offset of this fit's first data point
+
+ for (std::size_t iy = 0; iy < size_x; iy++)
+ {
+ for (std::size_t ix = 0; ix < size_x; ix++)
+ {
+ std::size_t const point_index = iy * size_x + ix;
+ std::size_t const absolute_index = fit_index + point_index;
+ // the 2D Gaussian is separable: product of a 1D Gaussian in x and one in y
+ float const argx
+ = exp(-0.5f * ((ix - x00) / width) * ((ix - x00) / width));
+ float const argy
+ = exp(-0.5f * ((iy - y00) / width) * ((iy - y00) / width));
+
+ data[absolute_index] = amplitude * argx * argy + background;
+ }
+ }
+ // print one progress mark each time n_fits / 50 further fits have been generated
+ progress += 1;
+ if (progress >= n_fits / 50)
+ {
+ progress = 0;
+ std::cout << "|";
+ }
+ }
+ std::cout << std::endl;
+ for (int i = 0; i < 50; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+}
+
+/*
+Generates n_fits_gpu noisy 2D Gaussian data sets, fits the first n_fits_cpu of them with Cpufit and all of them with Gpufit
+(fixed counts, so the CUDA kernels can be profiled), and prints timing, precision and the relative speed of both runs.
+No weights, Model: Gauss_2D, Estimator: LSE
+*/
+int main(int argc, char * argv[])
+{
+ // check for CUDA availability
+ if (!gpufit_cuda_available())
+ {
+ std::cout << "CUDA not available" << std::endl;
+ return -1;
+ }
+
+ // problem size: number of fits per library and the size_x * size_x grid of each fit
+ std::size_t const n_fits_gpu = 2000000;
+ std::size_t const n_fits_cpu = 100000;
+ std::size_t const size_x = 15;
+ std::size_t const n_points = size_x * size_x;
+
+ // fit parameters constant for every run
+ std::size_t const n_parameters = 5;
+ std::vector parameters_to_fit(n_parameters, 1);
+ float const tolerance = 0.0001f;
+ int const max_n_iterations = 10;
+
+ // true parameters around which the test parameters are randomized; the peak is centered on the grid
+ Parameters true_parameters;
+ true_parameters.amplitude = 500.f;
+ true_parameters.center_x = static_cast(size_x) / 2.f - 0.5f;
+ true_parameters.center_y = static_cast(size_x) / 2.f - 0.5f;
+ true_parameters.width = 2.f;
+ true_parameters.background = 10.f;
+
+ // test parameters
+ std::cout << "generate test parameters" << std::endl;
+ std::vector test_parameters(n_fits_gpu);
+ generate_test_parameters(test_parameters, true_parameters);
+
+ // test data
+ std::vector data(n_fits_gpu * n_points);
+ generate_gauss2d(n_fits_gpu, n_points, data, test_parameters);
+ std::cout << "add noise" << std::endl;
+ add_gauss_noise(data, true_parameters, 10.f); // last argument is the signal-to-noise ratio
+
+ // initial parameter set
+ std::vector initial_parameters(n_parameters * n_fits_gpu);
+ generate_initial_parameters(initial_parameters, test_parameters);
+
+ std::cout << std::endl;
+ std::cout << n_fits_cpu << " fits on the CPU" << std::endl;
+
+ // Cpufit output
+ std::vector cpufit_parameters(n_fits_cpu * n_parameters);
+ std::vector cpufit_states(n_fits_cpu);
+ std::vector cpufit_chi_squares(n_fits_cpu);
+ std::vector cpufit_n_iterations(n_fits_cpu);
+
+ // run Cpufit and measure time; data and initial_parameters hold n_fits_gpu fits, only the first n_fits_cpu are processed
+ std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
+ int const cpu_status
+ = cpufit
+ (
+ n_fits_cpu,
+ n_points,
+ data.data(),
+ 0, // no weights
+ GAUSS_2D,
+ initial_parameters.data(),
+ tolerance,
+ max_n_iterations,
+ parameters_to_fit.data(),
+ LSE,
+ 0, // NOTE(review): this and the next 0 are presumably the unused user info size and pointer — confirm against the cpufit API
+ 0,
+ cpufit_parameters.data(),
+ cpufit_states.data(),
+ cpufit_chi_squares.data(),
+ cpufit_n_iterations.data()
+ );
+ std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count();
+
+ if (cpu_status != 0)
+ {
+ // error in cpufit, should actually not happen
+ std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl;
+ }
+ else
+ {
+ // print
+ print_result("Cpufit", cpufit_parameters, test_parameters, cpufit_states, cpufit_n_iterations, n_fits_cpu, n_parameters, dt_cpufit);
+ }
+
+ std::cout << std::endl;
+ std::cout << n_fits_gpu << " fits on the GPU" << std::endl;
+
+ // Gpufit output parameters
+ std::vector gpufit_parameters(n_fits_gpu * n_parameters);
+ std::vector gpufit_states(n_fits_gpu);
+ std::vector gpufit_chi_squares(n_fits_gpu);
+ std::vector gpufit_n_iterations(n_fits_gpu);
+
+ // run Gpufit and measure time
+ t0 = std::chrono::high_resolution_clock::now();
+ int const gpu_status
+ = gpufit
+ (
+ n_fits_gpu,
+ n_points,
+ data.data(),
+ 0, // no weights
+ GAUSS_2D,
+ initial_parameters.data(),
+ tolerance,
+ max_n_iterations,
+ parameters_to_fit.data(),
+ LSE,
+ 0, // NOTE(review): this and the next 0 are presumably the unused user info size and pointer — confirm against the gpufit API
+ 0,
+ gpufit_parameters.data(),
+ gpufit_states.data(),
+ gpufit_chi_squares.data(),
+ gpufit_n_iterations.data()
+ );
+ std::chrono::milliseconds::rep const dt_gpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count();
+
+ if (gpu_status != 0)
+ {
+ // error in gpufit
+ std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl;
+ }
+ else
+ {
+ // print results
+ print_result("Gpufit", gpufit_parameters, test_parameters, gpufit_states, gpufit_n_iterations, n_fits_gpu, n_parameters, dt_gpufit);
+ }
+ // ratio of the elapsed times, scaled by the different number of fits of the two runs; printed even if one of the fits reported an error
+ std::cout << "\nPERFORMANCE GAIN Gpufit/Cpufit \t" << std::setw(10) << static_cast(dt_cpufit) / dt_gpufit * n_fits_gpu / n_fits_cpu << std::endl;
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/Gpufit_Cpufit_Performance_Comparison.cpp b/examples/Gpufit_Cpufit_Performance_Comparison.cpp
new file mode 100644
index 0000000..b25dd90
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Performance_Comparison.cpp
@@ -0,0 +1,450 @@
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define _USE_MATH_DEFINES
+#include
+
+
+/*
+    Names of parameters for the 2D Gaussian peak model
+*/
+struct Parameters
+{
+ float amplitude;
+ float center_x;
+ float center_y;
+ float width;
+ float background;
+};
+
+/*
+ Randomize parameters, slightly differently
+*/
+void generate_initial_parameters(std::vector & parameters_set, std::vector const & parameters)
+{
+ std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+ float const a = 0.9f;
+ float const b = 0.2f;
+
+ int const n_parameters = sizeof(Parameters) / sizeof(float);
+ for (std::size_t i = 0; i < parameters_set.size() / n_parameters; i++)
+ {
+ parameters_set[0 + i * n_parameters] = parameters[i].amplitude * (a + b * uniform_dist(rng));
+ parameters_set[1 + i * n_parameters] = parameters[i].center_x * (a + b * uniform_dist(rng));
+ parameters_set[2 + i * n_parameters] = parameters[i].center_y * (a + b * uniform_dist(rng));
+ parameters_set[3 + i * n_parameters] = parameters[i].width * (a + b * uniform_dist(rng));
+ parameters_set[4 + i * n_parameters] = parameters[i].background * (a + b * uniform_dist(rng));
+ }
+}
+
+/*
+ Randomize parameters
+*/
+void generate_test_parameters(std::vector & target, Parameters const source)
+{
+ std::size_t const n_fits = target.size();
+
+ std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+ float const a = 0.9f;
+ float const b = 0.2f;
+
+ int const text_width = 30;
+ int const progress_width = 25;
+
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << std::left << "Generating test parameters";
+
+ std::size_t progress = 0;
+
+ for (std::size_t i = 0; i < n_fits; i++)
+ {
+ target[i].amplitude = source.amplitude * (a + b * uniform_dist(rng));
+ target[i].center_x = source.center_x * (a + b * uniform_dist(rng));
+ target[i].center_y = source.center_y * (a + b * uniform_dist(rng));
+ target[i].width = source.width * (a + b * uniform_dist(rng));
+ target[i].background = source.background * (a + b * uniform_dist(rng));
+
+ progress += 1;
+ if (progress >= n_fits / progress_width)
+ {
+ progress = 0;
+ std::cout << "|";
+ }
+ }
+
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+}
+
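+/*
+    Adds zero-mean Gaussian noise to every element of vec. The standard deviation is the mean amplitude over the fit area divided by snr.
+*/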
+void add_gauss_noise(std::vector & vec, Parameters const & parameters, float const snr)
+{
+ float const gauss_fwtm = 4.292f * parameters.width; //only valid for circular gaussian
+ float const fit_area = gauss_fwtm*gauss_fwtm;
+
+ float const mean_amplitude = 2.f * float(M_PI) * parameters.amplitude * parameters.width * parameters.width / fit_area;
+
+ float const std_dev = mean_amplitude / snr;
+
+ std::normal_distribution distribution(0.0, std_dev);
+
+ int const text_width = 30;
+ int const progress_width = 25;
+
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << std::left << "Adding noise";
+
+ std::size_t progress = 0;
+
+ for (std::size_t i = 0; i < vec.size(); i++)
+ {
+ vec[i] += distribution(rng);
+
+ progress += 1;
+ if (progress >= vec.size() / progress_width)
+ {
+ progress = 0;
+ std::cout << "|";
+ }
+ }
+
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+}
+
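+/*
+    Fills data with n_fits noise-free 2D Gaussian peaks (one per entry of parameters), each sampled on a square grid of n_points pixels
+*/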
+void generate_gauss2d(
+ std::size_t const n_fits,
+ std::size_t const n_points,
+ std::vector & data,
+ std::vector const & parameters)
+{
+ int const text_width = 30;
+ int const progress_width = 25;
+
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << std::left << "Generating data";
+
+ std::size_t progress = 0;
+
+ for (std::size_t i = 0; i < n_fits; i++)
+ {
+ float const amplitude = parameters[i].amplitude;
+ float const x00 = parameters[i].center_x;
+ float const y00 = parameters[i].center_y;
+ float const width = parameters[i].width;
+ float const background = parameters[i].background;
+
+ std::size_t const fit_index = i * n_points;
+
+ for (int iy = 0; iy < sqrt(n_points); iy++)
+ {
+ for (int ix = 0; ix < sqrt(n_points); ix++)
+ {
+ std::size_t const point_index = iy * std::size_t(sqrt(n_points)) + ix;
+ std::size_t const absolute_index = fit_index + point_index;
+
+ float const argx
+ = exp(-0.5f * ((ix - x00) / width) * ((ix - x00) / width));
+ float const argy
+ = exp(-0.5f * ((iy - y00) / width) * ((iy - y00) / width));
+
+ data[absolute_index] = amplitude * argx * argy + background;
+ }
+ }
+
+ progress += 1;
+ if (progress >= n_fits / progress_width)
+ {
+ progress = 0;
+ std::cout << "|";
+ }
+ }
+ std::cout << std::endl;
+ std::cout << std::setw(text_width) << " ";
+ for (int i = 0; i < progress_width; i++)
+ std::cout << "-";
+ std::cout << std::endl;
+}
+
+/*
+Runs Gpufit vs. Cpufit for various number of fits and compares the speed
+
+No weights, Model: Gauss_2D, Estimator: LSE
+*/
+int main(int argc, char * argv[])
+{
+ // title
+ std::cout << "----------------------------------------" << std::endl;
+ std::cout << "Performance comparison Gpufit vs. Cpufit" << std::endl;
+ std::cout << "----------------------------------------" << std::endl << std::endl;
+
+ std::cout << "Please note that execution speed test results depend on" << std::endl;
+ std::cout << "the details of the CPU and GPU hardware." << std::endl;
+ std::cout << std::endl;
+
+
+ // check for CUDA availability
+ int cuda_runtime_version = 0;
+ int cuda_driver_version = 0;
+ bool const version_available = gpufit_get_cuda_version(&cuda_runtime_version, &cuda_driver_version) != 0;
+ int const cuda_runtime_major = cuda_runtime_version / 1000;
+ int const cuda_runtime_minor = cuda_runtime_version % 1000 / 10;
+ int const cuda_driver_major = cuda_driver_version / 1000;
+ int const cuda_driver_minor = cuda_driver_version % 1000 / 10;
+
+ bool do_gpufits = false;
+ if (version_available)
+ {
+ std::cout << "CUDA runtime version: ";
+ std::cout << cuda_runtime_major << "." << cuda_runtime_minor << std::endl;
+ std::cout << "CUDA driver version: ";
+ std::cout << cuda_driver_major << "." << cuda_driver_minor << std::endl;
+ std::cout << std::endl;
+
+ bool const cuda_available = cuda_driver_version > 0;
+ if (cuda_available)
+ {
+ bool const version_compatible
+ = cuda_driver_version >= cuda_runtime_version
+ && cuda_runtime_version > 0;
+ if (version_compatible)
+ {
+ do_gpufits = true;
+ }
+ else
+ {
+ std::cout << "The CUDA runtime version is not compatible with the" << std::endl;
+ std::cout << "current graphics driver. Please update the driver, or" << std::endl;
+ std::cout << "re - build Gpufit from source using a compatible version" << std::endl;
+ std::cout << "of the CUDA toolkit." << std::endl;
+ std::cout << std::endl;
+ }
+ }
+ else
+ {
+ std::cout << "No CUDA enabled graphics card detected." << std::endl;
+ std::cout << std::endl;
+ }
+ }
+ else
+ {
+ std::cout << "CUDA error detected. Error string: ";
+ std::cout << gpufit_get_last_error() << std::endl;
+ std::cout << std::endl;
+ }
+ if (!do_gpufits)
+ {
+ std::cout << "Skipping Gpufit computations." << std::endl << std::endl;
+ }
+
+ // all numbers of fits
+ std::vector n_fits_all;
+ if (sizeof(void*) < 8)
+ {
+ n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000};
+ }
+ else
+ {
+ n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 };
+ }
+
+ std::size_t const max_n_fits = n_fits_all.back();
+
+ // fit parameters constant for every run
+ std::size_t const size_x = 5;
+ std::size_t const n_points = size_x * size_x;
+ std::size_t const n_parameters = 5;
+ std::vector parameters_to_fit(n_parameters, 1);
+ float const tolerance = 0.0001f;
+ int const max_n_iterations = 10;
+
+ // initial parameters
+ Parameters true_parameters;
+ true_parameters.amplitude = 500.f;
+ true_parameters.center_x = static_cast(size_x) / 2.f - 0.5f;
+ true_parameters.center_y = static_cast(size_x) / 2.f - 0.5f;
+ true_parameters.width = 1.f;
+ true_parameters.background = 10.f;
+
+ // test parameters
+ std::vector test_parameters(max_n_fits);
+ generate_test_parameters(test_parameters, true_parameters);
+
+ // test data
+ std::vector data(max_n_fits * n_points);
+ generate_gauss2d(max_n_fits, n_points, data, test_parameters);
+ add_gauss_noise(data, true_parameters, 10.f);
+
+ // initial parameter set
+ std::vector initial_parameters(n_parameters * max_n_fits);
+ generate_initial_parameters(initial_parameters, test_parameters);
+
+    // print column identifiers
+ std::cout << std::endl << std::right;
+ std::cout << std::setw(8) << "Number" << std::setw(3) << "|";
+ std::cout << std::setw(13) << "Cpufit speed" << std::setw(3) << "|";
+ std::cout << std::setw(13) << "Gpufit speed" << std::setw(3) << "|";
+ std::cout << std::setw(12) << "Performance";
+ std::cout << std::endl;
+ std::cout << std::setw(8) << "of fits" << std::setw(3) << "|";
+ std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|";
+ std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|";
+ std::cout << std::setw(12) << "gain factor";
+ std::cout << std::endl;
+ std::cout << "-------------------------------------------------------";
+ std::cout << std::endl;
+
+ // loop over number of fits
+ for (std::size_t fit_index = 0; fit_index < n_fits_all.size(); fit_index++)
+ {
+ // number of fits
+ std::size_t n_fits = n_fits_all[fit_index];
+ std::cout << std::setw(8) << n_fits << std::setw(3) << "|";
+
+ // Cpufit output
+ std::vector cpufit_parameters(n_fits * n_parameters);
+ std::vector cpufit_states(n_fits);
+ std::vector cpufit_chi_squares(n_fits);
+ std::vector cpufit_n_iterations(n_fits);
+
+ // run Cpufit and measure time
+ std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
+ int const cpu_status
+ = cpufit
+ (
+ n_fits,
+ n_points,
+ data.data(),
+ 0,
+ GAUSS_2D,
+ initial_parameters.data(),
+ tolerance,
+ max_n_iterations,
+ parameters_to_fit.data(),
+ LSE,
+ 0,
+ 0,
+ cpufit_parameters.data(),
+ cpufit_states.data(),
+ cpufit_chi_squares.data(),
+ cpufit_n_iterations.data()
+ );
+ std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count();
+
+ if (cpu_status != 0)
+ {
+ // error in cpufit, should actually not happen
+ std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl;
+ }
+
+ std::chrono::milliseconds::rep dt_gpufit = 0;
+
+ // if we do not do gpufit, we skip the rest of the loop
+ if (do_gpufits)
+ {
+ // Gpufit output parameters
+ std::vector gpufit_parameters(n_fits * n_parameters);
+ std::vector gpufit_states(n_fits);
+ std::vector gpufit_chi_squares(n_fits);
+ std::vector gpufit_n_iterations(n_fits);
+
+ // run Gpufit and measure time
+ t0 = std::chrono::high_resolution_clock::now();
+ int const gpu_status
+ = gpufit
+ (
+ n_fits,
+ n_points,
+ data.data(),
+ 0,
+ GAUSS_2D,
+ initial_parameters.data(),
+ tolerance,
+ max_n_iterations,
+ parameters_to_fit.data(),
+ LSE,
+ 0,
+ 0,
+ gpufit_parameters.data(),
+ gpufit_states.data(),
+ gpufit_chi_squares.data(),
+ gpufit_n_iterations.data()
+ );
+ dt_gpufit = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - t0).count();
+
+ if (gpu_status != 0)
+ {
+ // error in gpufit
+ std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl;
+ do_gpufits = false;
+ }
+ }
+
+ // print the calculation speed in fits/s
+ std::cout << std::fixed << std::setprecision(0);
+ if (dt_cpufit)
+ {
+ std::cout << std::setw(13) << static_cast(n_fits) / static_cast(dt_cpufit)* 1000.0 << std::setw(3) << "|";
+ }
+ else
+ {
+ std::cout << std::setw(13) << "inf" << std::setw(3) << "|";
+ }
+ if (dt_gpufit)
+ {
+ std::cout << std::setw(13) << static_cast(n_fits) / static_cast(dt_gpufit)* 1000.0 << std::setw(3) << "|";
+ std::cout << std::fixed << std::setprecision(2);
+ std::cout << std::setw(12) << static_cast(dt_cpufit) / static_cast(dt_gpufit);
+ }
+ else if (!do_gpufits)
+ {
+ std::cout << std::setw(13) << "--" << std::setw(3) << "|";
+ std::cout << std::setw(12) << "--";
+ }
+ else
+ {
+ std::cout << std::setw(13) << "inf" << std::setw(3) << "|";
+ std::cout << std::setw(12) << "inf";
+ }
+
+ std::cout << std::endl;
+ }
+ std::cout << std::endl << "Test completed!" << std::endl;
+ std::cout << "Press ENTER to exit" << std::endl;
+ std::getchar();
+
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt
new file mode 100644
index 0000000..92339af
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt
@@ -0,0 +1,106 @@
+Example application for the Gpufit library (https://github.com/gpufit/Gpufit)
+which implements Levenberg Marquardt curve fitting in CUDA.
+
+Requirements
+------------
+
+- A CUDA capable graphics card with a recent Nvidia graphics driver
+ (at least 367.48 / July 2016)
+- Windows
+- >1.5 GB of free RAM
+
+Running
+-------
+
+Start "Gpufit_Cpufit_Performance_Comparison.exe" to see a speed comparison of
+GPU and CPU implementation.
+
+Output
+------
+
+The accurate execution of the performance comparison example shows the version
+number of the installed CUDA driver and the CUDA runtime Gpufit was built with.
+
+EXAMPLE:
+ CUDA runtime version: 8.0
+ CUDA driver version: 9.0
+
+In the next step the successful generation of test data is indicated by three
+full progress bars.
+
+EXAMPLE:
+
+ -------------------------
+ Generating test parameters |||||||||||||||||||||||||
+ -------------------------
+ -------------------------
+ Generating data |||||||||||||||||||||||||
+ -------------------------
+ -------------------------
+ Adding noise |||||||||||||||||||||||||
+ -------------------------
+
+The results of the performance comparison between Gpufit and Cpufit are shown
+in a table. The results demonstrate the performance benefit of Gpufit compared
+to Cpufit executing the fitting process for various numbers of fits in a range
+of 10 - 10000000. The execution speed is expressed in fits per second. If the
+execution time was not measurable, the speed is expressed as infinite.
+
+EXAMPLE:
+
+ Number | Cpufit speed | Gpufit speed | Performance
+ of fits | (fits/s) | (fits/s) | gain factor
+ -------------------------------------------------------
+ 10 | inf | 92 | 0.00
+ 100 | inf | 6667 | 0.00
+ 1000 | 66667 | inf | inf
+ 10000 | 58480 | 666667 | 11.40
+ 100000 | 59916 | 2173913 | 36.28
+ 1000000 | 59898 | 2469136 | 41.22
+ 10000000 | 60957 | 3038590 | 49.85
+
+Troubleshooting
+---------------
+
+MESSAGE:
+
+ CUDA runtime version: 0.0
+ CUDA driver version: 7.5
+
+ The CUDA runtime version is not compatible with the current graphics driver.
+ Please update the driver, or re-build Gpufit from source using a compatible
+ version of the CUDA toolkit.
+
+ Skipping Gpufit computations.
+
+BEHAVIOR:
+
+ The example executes Cpufit skipping Gpufit. Only computation speed of Cpufit
+ is shown in the results table.
+
+SOLUTION:
+
+ A common reason for this error message is an outdated Nvidia graphics driver.
+ In most cases updating the graphics card driver will solve this error. For
+ older graphics cards which are not supported by the CUDA toolkit used for
+ building Gpufit, re-compile Gpufit using an earlier version of the CUDA
+ toolkit which is compatible with the graphics driver.
+
+MESSAGE:
+
+ CUDA runtime version: 0.0
+ CUDA driver version: 0.0
+
+ No CUDA enabled graphics card detected.
+
+ Skipping Gpufit computations.
+
+BEHAVIOR:
+
+ The example executes Cpufit skipping Gpufit. Only computation speed of Cpufit
+ is shown in the results table.
+
+SOLUTION:
+
+    The execution of Gpufit requires a CUDA enabled graphics card.
+    Ensure that the host PC has a CUDA enabled graphics card installed.
\ No newline at end of file
diff --git a/package/README.md b/package/README.md
new file mode 100644
index 0000000..ebf9279
--- /dev/null
+++ b/package/README.md
@@ -0,0 +1,48 @@
+# Creating a binary package
+
+The binary package bundles different build outputs into a single distributable binary package containing the Gpufit dll,
+the performance comparison example, the Matlab bindings and the Python bindings.
+
+## Calling the script
+
+create_package.bat %1 %2 %3
+
+with
+
+- %1 is the BUILD_BASE_PATH (the path containing the various (see below) CMake generated Visual Studio projects)
+
+- %2 is the VERSION (e.g. 1.0.0)
+
+- %3 is the SOURCE_BASE_PATH (the path containing the sources)
+
+The output is a folder (BUILD_BASE_PATH/Gpufit_VERSION_win32_win64_buildDATECODE, where DATECODE is the build date and time) which is also zipped if 7-Zip is available.
+
+## Requirements
+
+Note: The script has no way of checking that the requirements are fulfilled!
+
+See also [Build from sources](http://Gpufit.readthedocs.io/en/latest/installation.html#build-from-sources) for instructions.
+
+CMake
+
+- CUDA_ARCHITECTURE must be set to All (it is by default)
+
+- CUDA toolkit 8.0 is used for all builds (must be installed before)
+
+- Build directory for MSVC14 Win64 is BUILD_BASE_PATH/VC14x64-8.0
+
+- Build directory for MSVC14 Win32 is BUILD_BASE_PATH/VC14x32-8.0
+
+- Matlab and Python must be available
+
+Build
+
+- Configuration RelWithDebInfo is used for all builds!
+
+- With MSVC14 Win64 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example
+
+- With MSVC14 Win32 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example
+
+Documentation
+
+- An up-to-date version of the documentation must exist at SOURCE_BASE_PATH\docs\_build\latex\Gpufit.pdf (must be created before).
\ No newline at end of file
diff --git a/package/create_package.bat b/package/create_package.bat
new file mode 100644
index 0000000..75ba751
--- /dev/null
+++ b/package/create_package.bat
@@ -0,0 +1,170 @@
+@ECHO OFF
+
+REM create package for Gpufit, assumes everything is compiled
+
+if "%~1" == "" (
+    echo specify build base path
+    goto end
+)
+
+if "%~2" == "" (
+    echo specify version
+    goto end
+)
+
+if "%~3" == "" (
+    echo specify source base path
+    goto end
+)
+
+REM date and time from https://stackoverflow.com/a/30343827/1536976
+
+@SETLOCAL ENABLEDELAYEDEXPANSION
+
+@REM Use WMIC to retrieve date and time
+@echo off
+FOR /F "skip=1 tokens=1-6" %%A IN ('WMIC Path Win32_LocalTime Get Day^,Hour^,Minute^,Month^,Second^,Year /Format:table') DO (
+    IF NOT "%%~F"=="" (
+        SET /A SortDate = 10000 * %%F + 100 * %%D + %%A
+        set YEAR=!SortDate:~0,4!
+        set MON=!SortDate:~4,2!
+        set DAY=!SortDate:~6,2!
+        @REM Add 1000000 so as to force a prepended 0 if hours less than 10
+        SET /A SortTime = 1000000 + 10000 * %%B + 100 * %%C + %%E
+        set HOUR=!SortTime:~1,2!
+        set MIN=!SortTime:~3,2!
+        set SEC=!SortTime:~5,2!
+    )
+)
+
+set DATECODE=!YEAR!!MON!!DAY!!HOUR!!MIN!
+echo %DATECODE%
+
+REM define paths; the tilde modifier strips surrounding quotes so that quoted arguments with spaces work
+
+set "BUILD_BASE=%~1"
+set "VERSION=%~2"
+set "SOURCE_BASE=%~3"
+
+set OUTPUT_NAME=Gpufit_%VERSION%_win32_win64_build%DATECODE%
+set ROOT_INSTALL=%BUILD_BASE%\%OUTPUT_NAME%
+set OUTPUT_ZIP=%BUILD_BASE%\%OUTPUT_NAME%.zip
+
+set PERFORMANCE_TEST_INSTALL=%ROOT_INSTALL%\gpufit_performance_test
+set PYTHON_INSTALL=%ROOT_INSTALL%\python
+set x32_MATLAB_INSTALL=%ROOT_INSTALL%\matlab32
+set x64_MATLAB_INSTALL=%ROOT_INSTALL%\matlab64
+set SDK_INSTALL_ROOT=%ROOT_INSTALL%\gpufit_sdk
+
+set x64_BUILD=%BUILD_BASE%\VC14x64-8.0\RelWithDebInfo
+set x64_BUILD_LIB=%BUILD_BASE%\VC14x64-8.0\Gpufit\RelWithDebInfo
+set x32_BUILD=%BUILD_BASE%\VC14x32-8.0\RelWithDebInfo
+set x32_BUILD_LIB=%BUILD_BASE%\VC14x32-8.0\Gpufit\RelWithDebInfo
+
+set x64_PYTHON_BUILD=%x64_BUILD%\pyGpufit\dist
+set x32_PYTHON_BUILD=%x32_BUILD%\pyGpufit\dist
+
+set x64_MATLAB_BUILD=%x64_BUILD%\matlab
+set x32_MATLAB_BUILD=%x32_BUILD%\matlab
+
+set EXAMPLES_SOURCE=%SOURCE_BASE%\examples
+set PYTHON_SOURCE=%SOURCE_BASE%\Gpufit\python
+set MATLAB_SOURCE=%SOURCE_BASE%\Gpufit\matlab
+set SDK_README_SOURCE=%SOURCE_BASE%\package\sdk_readme.txt
+
+set MANUAL_SOURCE=%SOURCE_BASE%\docs\_build\latex\Gpufit.pdf
+set MANUAL_INSTALL=%ROOT_INSTALL%\Gpufit_%VERSION%_Manual.pdf
+
+REM clean up (if necessary)
+
+if exist "%ROOT_INSTALL%" rmdir /s /q "%ROOT_INSTALL%"
+if exist "%OUTPUT_ZIP%" del "%OUTPUT_ZIP%"
+
+REM create root folder
+
+echo create root directory
+mkdir "%ROOT_INSTALL%"
+
+REM copy main readme (is markdown, written as txt) and license
+
+copy "%SOURCE_BASE%\README.md" "%ROOT_INSTALL%\README.txt"
+copy "%SOURCE_BASE%\LICENSE.txt" "%ROOT_INSTALL%"
+
+REM copy manual
+
+if not exist "%MANUAL_SOURCE%" (
+ echo file %MANUAL_SOURCE% required, does not exist
+ goto end
+)
+copy "%MANUAL_SOURCE%" "%MANUAL_INSTALL%"
+
+REM copy performance test
+
+echo collect performance test application
+mkdir "%PERFORMANCE_TEST_INSTALL%"
+copy "%EXAMPLES_SOURCE%\Gpufit_Cpufit_Performance_Comparison_readme.txt" "%PERFORMANCE_TEST_INSTALL%\README.txt"
+
+mkdir "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64"
+
+mkdir "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32"
+
+REM copy Python packages
+
+echo collect python
+mkdir "%PYTHON_INSTALL%"
+copy "%x64_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win_amd64.whl"
+copy "%x32_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win32.whl"
+copy "%PYTHON_SOURCE%\README.txt" "%PYTHON_INSTALL%"
+xcopy "%PYTHON_SOURCE%\examples" "%PYTHON_INSTALL%\examples" /i /q
+
+REM copy Matlab 32 bit
+
+echo collect matlab32
+mkdir "%x32_MATLAB_INSTALL%"
+xcopy "%x32_MATLAB_BUILD%" "%x32_MATLAB_INSTALL%" /q
+xcopy "%MATLAB_SOURCE%\examples" "%x32_MATLAB_INSTALL%\examples" /i /q
+
+REM copy Matlab 64 bit
+
+echo collect matlab64
+mkdir "%x64_MATLAB_INSTALL%"
+xcopy "%x64_MATLAB_BUILD%" "%x64_MATLAB_INSTALL%" /q
+xcopy "%MATLAB_SOURCE%\examples" "%x64_MATLAB_INSTALL%\examples" /i /q
+
+REM copy SDK_INSTALL_ROOT
+
+echo collect SDK
+mkdir "%SDK_INSTALL_ROOT%"
+copy "%SDK_README_SOURCE%" "%SDK_INSTALL_ROOT%\README.txt"
+
+mkdir "%SDK_INSTALL_ROOT%\include"
+copy "%SOURCE_BASE%\Gpufit\gpufit.h" "%SDK_INSTALL_ROOT%\include"
+
+mkdir "%SDK_INSTALL_ROOT%\win32"
+copy "%x32_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win32"
+copy "%x32_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win32"
+
+mkdir "%SDK_INSTALL_ROOT%\win64"
+copy "%x64_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win64"
+copy "%x64_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win64"
+
+REM zip the package folder with 7-Zip if available (expects the default install location below)
+
+set ZIP=C:\Program Files\7-Zip\7z.exe
+
+if not exist "%ZIP%" (
+    echo 7-Zip not installed, zip manually
+    goto end
+) ELSE (
+    echo zip result
+    "%ZIP%" a -y -r -mem=AES256 "%OUTPUT_ZIP%" "%ROOT_INSTALL%" > nul
+)
+
+:end
+PAUSE
\ No newline at end of file
diff --git a/package/sdk_readme.txt b/package/sdk_readme.txt
new file mode 100644
index 0000000..59fc094
--- /dev/null
+++ b/package/sdk_readme.txt
@@ -0,0 +1,10 @@
+Software development kit for the Gpufit library (https://github.com/gpufit/Gpufit)
+which implements Levenberg Marquardt curve fitting in CUDA.
+
+Compiled with the Microsoft Visual Studio 2015 C++ compiler and CUDA toolkit 8.0.
+
+Folder include contains the gpufit.h header file representing the C API.
+
+Folder win32 contains the 32 bit compiled dynamic link library and import library.
+
+Folder win64 contains the 64 bit compiled dynamic link library and import library.
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..c524ac3
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+# Tests
+
+add_boost_test( "Cpufit;Gpufit" Consistency )
diff --git a/tests/Consistency.cpp b/tests/Consistency.cpp
new file mode 100644
index 0000000..feb1032
--- /dev/null
+++ b/tests/Consistency.cpp
@@ -0,0 +1,220 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include
+
+#include
+
+void generate_input_linear_fit_1d(FitInput & i)
+{
+ // number fits, points, parameters
+ i.n_fits = 1;
+ i.n_points = 2;
+ i.n_parameters = 2; // LINEAR_1D has two parameters
+
+ // data and weights
+ i.data = { 0, 1 };
+ i.weights_ = { 1, 1 };
+
+ // model id and estimator id
+ i.model_id = LINEAR_1D;
+ i.estimator_id = LSE;
+
+ // initial parameters and parameters to fit
+ i.initial_parameters = { 0, 0 };
+ i.parameters_to_fit = { 1, 1 };
+
+ // tolerance and max_n_iterations
+ i.tolerance = 0.001f;
+ i.max_n_iterations = 10;
+
+ // user info
+ i.user_info_ = { 0.f, 1.f };
+}
+
+void generate_input_gauss_fit_1d(FitInput & i)
+{
+ // number fits, points, parameters
+ i.n_fits = 1;
+ i.n_points = 5;
+ i.n_parameters = 4; // GAUSS_1D has four parameters
+
+ // data and weights
+ clean_resize(i.data, i.n_fits * i.n_points);
+ std::vector< float > const true_parameters{ { 4.f, 2.f, 0.5f, 1.f } };
+ generate_gauss_1d(i.data, true_parameters);
+ i.weights_.clear(); // no weights
+
+ // model id and estimator id
+ i.model_id = GAUSS_1D;
+ i.estimator_id = LSE;
+
+ // initial parameters and parameters to fit
+ i.initial_parameters = { 2.f, 1.5f, 0.3f, 0.f };
+ i.parameters_to_fit = { 1, 1, 1, 1 };
+
+ // tolerance and max_n_iterations
+ i.tolerance = 0.001f;
+ i.max_n_iterations = 10;
+
+ // user info
+ i.user_info_.clear(); // no user info
+}
+
+void generate_input_gauss_fit_2d(FitInput & i)
+{
+ // number fits, points, parameters
+ i.n_fits = 1;
+ i.n_points = 25;
+ i.n_parameters = 5; // GAUSS_2D has five parameters
+
+ // data and weights
+ clean_resize(i.data, i.n_fits * i.n_points);
+ std::vector< float > const true_parameters{ { 4.f, 1.8f, 2.2f, 0.5f, 1.f } };
+ generate_gauss_2d(i.data, true_parameters);
+ i.weights_.clear(); // no weights
+
+ // model id and estimator id
+ i.model_id = GAUSS_2D;
+ i.estimator_id = LSE;
+
+ // initial parameters and parameters to fit
+ i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.4f, 0.f };
+ i.parameters_to_fit = { 1, 1, 1, 1, 1 };
+
+ // tolerance and max_n_iterations
+ i.tolerance = 0.0001f;
+ i.max_n_iterations = 20;
+
+ // user info
+ i.user_info_.clear(); // no user info
+}
+// Fills i with a single 5x5 GAUSS_2D_ELLIPTIC / LSE fit problem (no weights, no user info) for the consistency test.
+void generate_input_gauss_fit_2d_elliptic(FitInput & i)
+{
+    // number fits, points, parameters
+    i.n_fits = 1;
+    std::size_t const size_x = 5;
+    i.n_points = size_x * size_x;
+    i.n_parameters = 6; // GAUSS_2D_ELLIPTIC has six parameters
+
+    // data and weights
+    clean_resize(i.data, i.n_fits * i.n_points);
+
+    float const center_x = (static_cast<float>(size_x) - 1.f) / 2.f; // center of the pixel grid, used for x and y
+    std::vector< float > const true_parameters{ { 4.f, center_x, center_x, 0.4f, 0.6f, 1.f} };
+    generate_gauss_2d_elliptic(i.data, true_parameters);
+    i.weights_.clear(); // no weights
+
+    // model id and estimator id
+    i.model_id = GAUSS_2D_ELLIPTIC;
+    i.estimator_id = LSE;
+
+    // initial parameters and parameters to fit
+    i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f };
+    i.parameters_to_fit = { 1, 1, 1, 1, 1, 1 }; // one flag per parameter, so the length must equal n_parameters
+
+    // tolerance and max_n_iterations
+    i.tolerance = 0.001f;
+    i.max_n_iterations = 10;
+
+    // user info
+    i.user_info_.clear(); // no user info
+}
+
+void perform_cpufit_gpufit_and_check(void (*func)(FitInput &))
+{
+ // generate the data
+ FitInput i;
+ func(i);
+
+ // sanity checks (we don't want to introduce faulty data)
+ BOOST_CHECK(i.sanity_check());
+
+ // reset output variables
+ FitOutput gpu, cpu;
+ clean_resize(gpu.parameters, i.n_fits * i.n_parameters);
+ clean_resize(gpu.states, i.n_fits);
+ clean_resize(gpu.chi_squares, i.n_fits);
+ clean_resize(gpu.n_iterations, i.n_fits);
+
+ clean_resize(cpu.parameters, i.n_fits * i.n_parameters);
+ clean_resize(cpu.states, i.n_fits);
+ clean_resize(cpu.chi_squares, i.n_fits);
+ clean_resize(cpu.n_iterations, i.n_fits);
+
+
+ // call to cpufit, store output
+ int const cpu_status
+ = cpufit
+ (
+ i.n_fits,
+ i.n_points,
+ i.data.data(),
+ i.weights(),
+ i.model_id,
+ i.initial_parameters.data(),
+ i.tolerance,
+ i.max_n_iterations,
+ i.parameters_to_fit.data(),
+ i.estimator_id,
+ i.user_info_size(),
+ i.user_info(),
+ cpu.parameters.data(),
+ cpu.states.data(),
+ cpu.chi_squares.data(),
+ cpu.n_iterations.data()
+ );
+
+ BOOST_CHECK(cpu_status == 0);
+
+ // call to gpufit, store output
+ int const gpu_status
+ = gpufit
+ (
+ i.n_fits,
+ i.n_points,
+ i.data.data(),
+ i.weights(),
+ i.model_id,
+ i.initial_parameters.data(),
+ i.tolerance,
+ i.max_n_iterations,
+ i.parameters_to_fit.data(),
+ i.estimator_id,
+ i.user_info_size(),
+ i.user_info(),
+ gpu.parameters.data(),
+ gpu.states.data(),
+ gpu.chi_squares.data(),
+ gpu.n_iterations.data()
+ );
+
+ BOOST_CHECK(gpu_status == 0);
+
+ // check both output for equality
+ BOOST_CHECK(cpu.states == gpu.states);
+ BOOST_CHECK(cpu.n_iterations == gpu.n_iterations);
+ BOOST_CHECK(close_or_equal(cpu.parameters, gpu.parameters));
+ BOOST_CHECK(close_or_equal(cpu.chi_squares, gpu.chi_squares));
+
+}
+
+BOOST_AUTO_TEST_CASE( Consistency )
+{
+ BOOST_TEST_MESSAGE( "linear_fit_1d" );
+ perform_cpufit_gpufit_and_check(&generate_input_linear_fit_1d);
+
+ BOOST_TEST_MESSAGE( "gauss_fit_1d" );
+ perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_1d);
+
+ BOOST_TEST_MESSAGE( "gauss_fit_2d" );
+ perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_2d);
+
+ BOOST_TEST_MESSAGE("gauss_fit_2d_elliptic");
+ perform_cpufit_gpufit_and_check(&generate_input_gauss_fit_2d_elliptic);
+
+}
diff --git a/tests/utils.cpp b/tests/utils.cpp
new file mode 100644
index 0000000..16f3970
--- /dev/null
+++ b/tests/utils.cpp
@@ -0,0 +1,60 @@
+#include "utils.h"
+
+// initialize random number generator
+std::mt19937 rng(0);
+
+/*
+ Given a parameter vector p with 4 entries, constructs a 1D Gaussian peak function with x values 0,..,v.size() - 1
+*/
+void generate_gauss_1d(std::vector< float > & v, std::vector< float > const & p)
+{
+ for (std::size_t i = 0; i < v.size(); i++)
+ {
+ float const argx = ((i - p[1]) * (i - p[1])) / (2.f * p[2] * p[2]);
+ float const ex = exp(-argx);
+ v[i] = p[0] * ex + p[3];
+ }
+}
+
+/*
+    Given a parameter vector p with 5 entries (amplitude, center x, center y, width, background), constructs a 2D Gaussian peak function with x, y values 0, .., sqrt(v.size()) - 1
+*/
+void generate_gauss_2d(std::vector< float > & v, std::vector< float > const & p)
+{
+    std::size_t const n = static_cast<std::size_t>(std::sqrt(v.size())); // side length of the square grid
+    if (n * n != v.size())
+    {
+        throw std::runtime_error("v.size() is not a perfect square number");
+    }
+
+    for (std::size_t j = 0; j < n; j++)
+    {
+        float const argy = ((j - p[2]) * (j - p[2])); // squared y distance from the center
+        for (std::size_t i = 0; i < n; i++)
+        {
+            float const argx = ((i - p[1]) * (i - p[1])); // squared x distance from the center
+            float const ex = exp(-(argx + argy) / (2.f * p[3] * p[3]));
+            v[j * n + i] = p[0] * ex + p[4]; // background offset is p[4]; p[3] is the width
+        }
+    }
+}
+/* Given a parameter vector p with 6 entries (amplitude, center x, center y, width x, width y, background), constructs an elliptic 2D Gaussian peak function with x, y values 0, .., sqrt(v.size()) - 1 */
+void generate_gauss_2d_elliptic(std::vector< float > & v, std::vector< float > const & p)
+{
+    std::size_t const n = static_cast<std::size_t>(std::sqrt(v.size())); // side length of the square grid
+    if (n * n != v.size())
+    {
+        throw std::runtime_error("v.size() is not a perfect square number");
+    }
+
+    for (std::size_t j = 0; j < n; j++)
+    {
+        float const argy = ((j - p[2]) * (j - p[2])) / (2.f * p[4] * p[4]); // y term, scaled by width y
+        for (std::size_t i = 0; i < n; i++)
+        {
+            float const argx = ((i - p[1]) * (i - p[1])) / (2.f * p[3] * p[3]); // x term, scaled by width x
+            float const ex = exp(-(argx + argy));
+            v[j * n + i] = p[0] * ex + p[5]; // background offset is p[5]; p[3] is the x width
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..dd0caa7
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,176 @@
+#ifndef TEST_UTILS_H_INCLUDED
+#define TEST_UTILS_H_INCLUDED
+
+#include
+#include
+// Early-out helper: makes the enclosing bool function return false when condition x does not hold.
+#define CHK(x) if (!(x)) return false
+
+extern std::mt19937 rng;
+
+/*
+Just to make sure that the content is erased after the resize.
+*/
+template void clean_resize(std::vector