diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1a7c293
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# IDE settings (.idea) and Python bytecode caches
+**/.idea
+__pycache__
+
+# docs
+/docs/_build
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9590a65
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,159 @@
+# Levenberg Marquardt curve fitting in CUDA 
+# https://github.com/gpufit/Gpufit
+# see also CMake configuration in /docs/installation.rst
+
+# CMake
+
+cmake_minimum_required( VERSION 3.7 )
+set_property( GLOBAL PROPERTY USE_FOLDERS ON ) # lets targets be grouped through their FOLDER property
+
+if( NOT PROJECT_NAME ) # no project() has been declared yet, i.e. this file starts the build
+  project( Gpufit VERSION 1.0.0 )
+  include( CTest ) # provides the BUILD_TESTING option tested further below
+endif()
+
+if( MSVC ) # link runtime statically
+  foreach( type ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} ) # every configuration known to the generator
+    string( TOUPPER ${type} TYPE ) # the flag variables carry the upper-case configuration name as suffix
+    foreach( flags CMAKE_C_FLAGS_${TYPE} CMAKE_CXX_FLAGS_${TYPE} )
+      get_property( help CACHE ${flags} PROPERTY HELPSTRING ) # keep the existing cache description
+      string( REPLACE "/MD" "/MT" ${flags} "${${flags}}" ) # plain substring swap, so /MDd becomes /MTd as well
+      set( ${flags} "${${flags}}" CACHE STRING "${help}" FORCE ) # FORCE overwrites the value already in the cache
+    endforeach()
+  endforeach()
+endif()
+
+function( add_launcher target executable arguments working_directory ) # writes debugger launch settings for <target>
+  if( MSVC12 OR MSVC14 ) # only these Visual Studio versions get a file; otherwise the function does nothing
+    file( WRITE ${CMAKE_CURRENT_BINARY_DIR}/${target}.vcxproj.user
+"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+"<Project ToolsVersion=\"14.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\n"
+"  <PropertyGroup>\n"
+"    <LocalDebuggerCommand>${executable}</LocalDebuggerCommand>\n"
+"    <LocalDebuggerCommandArguments>${arguments}</LocalDebuggerCommandArguments>\n"
+"    <LocalDebuggerWorkingDirectory>${working_directory}</LocalDebuggerWorkingDirectory>\n"
+"  </PropertyGroup>\n"
+"</Project>\n"
+    ) # NOTE(review): the values are inserted without XML escaping - confirm no caller passes & or <
+  endif()
+endfunction()
+
+# Boost
+
+find_package( Boost 1.58.0 ) # optional: without Boost the else-branch switches the tests off
+if( Boost_FOUND )
+  function( add_boost_test modules name ) # builds <name>.cpp as test "<modules joined by _>_Test_<name>"
+    string( REPLACE ";" "_" prefix "${modules}" )
+    set( target ${prefix}_Test_${name} )
+    add_executable( ${target} ${name}.cpp
+      ${PROJECT_SOURCE_DIR}/Tests/utils.h
+      ${PROJECT_SOURCE_DIR}/Tests/utils.cpp
+    )
+    target_include_directories( ${target} PRIVATE ${PROJECT_SOURCE_DIR} )
+    target_link_libraries( ${target} ${modules} Boost::boost ) # <modules> is also the list of libraries linked into the test
+    set_property( TARGET ${target}
+      PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+    set_property( TARGET ${target} PROPERTY FOLDER Tests )
+
+    add_test( NAME ${target}
+      COMMAND ${target} --build_info --log_level=all --report_level=detailed )
+  endfunction()
+else()
+  set( BUILD_TESTING OFF ) # makes the if( BUILD_TESTING ) guard near the end of this file skip the tests
+  message( WARNING "Boost NOT found - skipping tests! (set BOOST_ROOT manually)" )
+endif()
+
+# MATLAB
+
+find_package( Matlab )
+if( Matlab_FOUND )
+  find_program( Matlab_EXECUTABLE matlab
+    PATHS "${Matlab_ROOT_DIR}/bin" PATH_SUFFIXES win32 win64 NO_DEFAULT_PATH ) # search only below the detected MATLAB root
+  function( add_matlab_launcher target ) # extra arguments: directories for the MATLAB path; the last one is also the working directory
+    set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} ) # $(Configuration) is written out literally for Visual Studio to expand
+    list( GET paths -1 working_directory )
+    string( REPLACE ";" "','" paths "${paths}" ) # a;b;c -> a','b','c so it fits inside addpath('...')
+    set( arguments "-r addpath('${paths}');addpath(genpath(pwd))" )
+    add_launcher( ${target} "${Matlab_EXECUTABLE}" "${arguments}" "${working_directory}" )
+  endfunction()
+endif()
+
+# Python
+
+find_package( PythonInterp )
+if( PYTHONINTERP_FOUND )
+  function( add_python_launcher target ) # extra arguments: directories appended to sys.path; the last one is also the working directory
+    set( paths "${CMAKE_BINARY_DIR}/$(Configuration)" ${ARGN} ) # $(Configuration) is written out literally for Visual Studio to expand
+    list( GET paths -1 working_directory )
+    string( REPLACE ";" "')\nsys.path.append('" paths "${paths}" ) # yields one sys.path.append('...') statement per directory
+    set( arguments "-i -c \"import sys\nsys.path.append('${paths}')\"" ) # -i keeps the interpreter interactive after the commands ran
+    add_launcher( ${target} "${PYTHON_EXECUTABLE}" "${arguments}" "${working_directory}" )
+  endfunction()
+endif()
+
+# Cpufit (shared library, configured in Cpufit/CMakeLists.txt)
+
+add_subdirectory( Cpufit )
+
+# Gpufit
+
+add_subdirectory( Gpufit )
+
+# Examples using Gpufit and Cpufit
+
+add_subdirectory( examples )
+
+# Launcher
+#
+# Uses the following variables:
+#
+#   Matlab_WORKING_DIRECTORY (Default: user home directory)
+#   -- Working directory for MATLAB applications using Cpufit and Gpufit.
+#   Python_WORKING_DIRECTORY (Default: user home directory)
+#   -- Working directory for Python applications using Gpufit.
+
+if( WIN32 ) # USERPROFILE is the full home path; HOMEPATH lacks the drive letter
+  file( TO_CMAKE_PATH "$ENV{USERPROFILE}" home )
+else()
+  file( TO_CMAKE_PATH "$ENV{HOME}" home )
+endif()
+
+if( Matlab_FOUND )
+  set( Matlab_WORKING_DIRECTORY "${home}" CACHE PATH "MATLAB working directory" )
+  if( Matlab_WORKING_DIRECTORY ) # an empty value disables the launcher target
+    add_custom_target( RUN_MATLAB )
+    set_property( TARGET RUN_MATLAB PROPERTY FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_MATLAB CpufitMex GpufitMex ) # build both MEX targets before launching
+    # CMAKE_CURRENT_SOURCE_DIR keeps these paths valid when this file is included from a parent build.
+    add_matlab_launcher( RUN_MATLAB
+      "${CMAKE_CURRENT_SOURCE_DIR}/Cpufit/matlab"
+      "${CMAKE_CURRENT_SOURCE_DIR}/Gpufit/matlab"
+      "${Matlab_WORKING_DIRECTORY}" )
+  endif()
+endif()
+
+if( PYTHONINTERP_FOUND )
+  set( Python_WORKING_DIRECTORY "${home}" CACHE PATH "Python working directory" )
+  if( Python_WORKING_DIRECTORY ) # an empty value disables the launcher target
+    add_custom_target( RUN_PYTHON )
+    set_property( TARGET RUN_PYTHON PROPERTY FOLDER CMakePredefinedTargets )
+    add_dependencies( RUN_PYTHON Gpufit ) # build the library before launching
+    # CMAKE_CURRENT_SOURCE_DIR keeps this path valid when this file is included from a parent build.
+    add_python_launcher( RUN_PYTHON
+      "${CMAKE_CURRENT_SOURCE_DIR}/Gpufit/python"
+      "${Python_WORKING_DIRECTORY}" )
+  endif()
+endif()
+
+# Tests
+
+if( BUILD_TESTING ) # option from include( CTest ); set to OFF above when Boost was not found
+  add_subdirectory( tests ) # NOTE(review): add_boost_test() refers to ".../Tests/utils.h" - confirm the directory's case
+endif()
+
+# Package
+
+#set( CPACK_PACKAGE_VERSION ${PROJECT_VERSION} )
+#set( CPACK_GENERATOR ZIP )
+
+#include( CPack )
diff --git a/Cpufit/CMakeLists.txt b/Cpufit/CMakeLists.txt
new file mode 100644
index 0000000..9af1643
--- /dev/null
+++ b/Cpufit/CMakeLists.txt
@@ -0,0 +1,29 @@
+
+# Cpufit
+
+set( CpuHeaders
+	cpufit.h
+	info.h
+	lm_fit.h
+	interface.h
+)
+
+set( CpuSources
+	cpufit.cpp
+	info.cpp
+	lm_fit.cpp
+	lm_fit_cpp.cpp
+	interface.cpp
+	Cpufit.def # module-definition file listing the exported functions
+)
+
+# File names use the on-disk case (cpufit.h / cpufit.cpp) so case-sensitive file systems find them.
+add_library( Cpufit SHARED
+	${CpuHeaders}
+	${CpuSources}
+)
+set_property( TARGET Cpufit PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Cpufit RUNTIME DESTINATION bin )
+
+add_subdirectory( matlab )
diff --git a/Cpufit/Cpufit.def b/Cpufit/Cpufit.def
new file mode 100644
index 0000000..07c1849
--- /dev/null
+++ b/Cpufit/Cpufit.def
@@ -0,0 +1,4 @@
+LIBRARY          "Cpufit"
+EXPORTS       
+    cpufit @1
+    cpufit_get_last_error @2
\ No newline at end of file
diff --git a/Cpufit/README.md b/Cpufit/README.md
new file mode 100644
index 0000000..cee0619
--- /dev/null
+++ b/Cpufit/README.md
@@ -0,0 +1 @@
+# Cpufit
\ No newline at end of file
diff --git a/Cpufit/cpufit.cpp b/Cpufit/cpufit.cpp
new file mode 100644
index 0000000..c8c74cb
--- /dev/null
+++ b/Cpufit/cpufit.cpp
@@ -0,0 +1,76 @@
+#include "cpufit.h"
+#include "interface.h"
+#include <limits>
+#include <stdexcept>
+#include <string>
+std::string last_error ;
+
+// C entry point: fits n_fits data sets of n_points samples each with the selected model and estimator.
+// Returns STATUS_OK on success. On failure it returns STATUS_ERROR and stores a message that
+// cpufit_get_last_error() hands back; no exception leaves this function.
+int cpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+try
+{
+    // FitInterface takes the point count as int, so reject larger values (int replaces MSVC-only __int32).
+    if (n_points > static_cast<size_t>(std::numeric_limits<int>::max()))
+    {
+        throw std::runtime_error("maximum number of data points per fit exceeded");
+    }
+    int const n_points_32 = static_cast<int>(n_points);
+
+    FitInterface fi(
+        data,
+        weights,
+        n_fits,
+        n_points_32,
+        tolerance,
+        max_n_iterations,
+        estimator_id,
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK;
+}
+catch (std::exception const & exception)
+{
+    last_error = exception.what();
+
+    return STATUS_ERROR;
+}
+catch (...)
+{
+    last_error = "Unknown Error";
+
+    return STATUS_ERROR;
+}
+
+char const * cpufit_get_last_error() // message of the most recent failed cpufit() call; empty if none has failed yet
+{
+    return last_error.c_str(); // points into the global string, so it is only valid until last_error is assigned again
+}
diff --git a/Cpufit/cpufit.h b/Cpufit/cpufit.h
new file mode 100644
index 0000000..1575636
--- /dev/null
+++ b/Cpufit/cpufit.h
@@ -0,0 +1,56 @@
+#ifndef CPU_FIT_H_INCLUDED
+#define CPU_FIT_H_INCLUDED
+
+#include <stddef.h> // size_t, so the header does not rely on what was included before it
+
+// fitting model ID (model_id argument), followed by the order of the model parameters
+#define GAUSS_1D 0           // amplitude, center, width, offset
+#define GAUSS_2D 1           // amplitude, center x, center y, width, offset
+#define GAUSS_2D_ELLIPTIC 2  // amplitude, center x, center y, width x, width y, offset
+#define GAUSS_2D_ROTATED 3   // amplitude, center x, center y, width x, width y, offset, rotation angle
+#define CAUCHY_2D_ELLIPTIC 4 // amplitude, center x, center y, width x, width y, offset
+#define LINEAR_1D 5          // offset, slope
+
+// estimator ID (estimator_id argument)
+#define LSE 0
+#define MLE 1
+
+// fit state
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+
+// cpufit return state
+#define STATUS_OK 0
+#define STATUS_ERROR -1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int cpufit(
+    size_t n_fits,               // number of data sets to fit
+    size_t n_points,             // data points per fit; larger than INT_MAX is rejected
+    float * data,                // n_fits * n_points values, one fit after the other
+    float * weights,             // same layout as data; a null pointer is accepted
+    int model_id,                // one of the fitting model IDs above
+    float * initial_parameters,  // all parameters of fit 0, then fit 1, ...
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,     // one flag per model parameter, shared by all fits; 0 marks a parameter as not fitted
+    int estimator_id,            // LSE or MLE
+    size_t user_info_size,       // size of user_info in bytes
+    char * user_info,            // optional model-specific data, may be null
+    float * output_parameters,   // same layout as initial_parameters
+    int * output_states,         // one entry per fit
+    float * output_chi_squares,  // one entry per fit
+    int * output_n_iterations    // one entry per fit
+) ;
+
+char const * cpufit_get_last_error(void) ; // message stored by the last failed cpufit() call
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CPU_FIT_H_INCLUDED
diff --git a/Cpufit/info.cpp b/Cpufit/info.cpp
new file mode 100644
index 0000000..dcd5085
--- /dev/null
+++ b/Cpufit/info.cpp
@@ -0,0 +1,30 @@
+#include "info.h"
+
+// Creates a settings object with every count, identifier and size set to zero.
+Info::Info(void)
+    : n_parameters_(0)
+    , n_parameters_to_fit_(0)
+    , n_fits_(0)
+    , n_points_(0)
+    , max_n_iterations_(0)
+    , model_id_(0)
+    , estimator_id_(0)
+    , user_info_size_(0)
+{}
+
+// Nothing to release: all members are plain integer values.
+Info::~Info(void)
+{}
+
+// Stores in n_parameters_to_fit_ how many of the first n_parameters_ flags are non-zero.
+void Info::set_number_of_parameters_to_fit(int const * parameters_to_fit)
+{
+    int n_enabled = 0;
+
+    for (int index = 0; index < n_parameters_; ++index)
+    {
+        n_enabled += parameters_to_fit[index] ? 1 : 0;
+    }
+
+    n_parameters_to_fit_ = n_enabled;
+}
\ No newline at end of file
diff --git a/Cpufit/info.h b/Cpufit/info.h
new file mode 100644
index 0000000..0faa764
--- /dev/null
+++ b/Cpufit/info.h
@@ -0,0 +1,28 @@
+#ifndef CPUFIT_PARAMETERS_H_INCLUDED
+#define CPUFIT_PARAMETERS_H_INCLUDED
+
+#include <vector>
+
+class Info // bundle of fit settings; all data members are public
+{
+public:
+    Info();
+    virtual ~Info();
+    void set_number_of_parameters_to_fit(int const * parameters_to_fit); // derives n_parameters_to_fit_; reads n_parameters_ flags
+
+private:
+
+public:
+    int n_parameters_;           // number of parameters of the selected model
+    int n_parameters_to_fit_;    // number of parameters whose flag in parameters_to_fit is non-zero
+    std::size_t n_fits_;         // number of fits
+    std::size_t n_points_;       // data points per fit
+    int max_n_iterations_;
+    int model_id_;               // one of the model constants from cpufit.h
+    int estimator_id_;
+    std::size_t user_info_size_; // size of the user info buffer; treated as a byte count (divided by sizeof(float)) by the linear model
+    
+private:
+};
+
+#endif
diff --git a/Cpufit/interface.cpp b/Cpufit/interface.cpp
new file mode 100644
index 0000000..50dc01d
--- /dev/null
+++ b/Cpufit/interface.cpp
@@ -0,0 +1,118 @@
+#include "cpufit.h"
+#include "interface.h"
+
+// Stores the caller's buffers and settings; the parameter count stays 0 until fit() resolves it.
+FitInterface::FitInterface(
+    float const * data, float const * weights,
+    std::size_t n_fits,
+    int n_points,
+    float tolerance,
+    int max_n_iterations,
+    int estimator_id,
+    float const * initial_parameters,
+    int const * parameters_to_fit,
+    char * user_info,
+    std::size_t user_info_size,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations)
+    : n_parameters_(0)
+    , data_(data)
+    , weight_(weights)
+    , n_fits_(n_fits)
+    , n_points_(n_points)
+    , tolerance_(tolerance)
+    , max_n_iterations_(max_n_iterations)
+    , estimator_id_(estimator_id)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , user_info_(user_info)
+    , user_info_size_(user_info_size)
+    , output_parameters_(output_parameters)
+    , output_states_(output_states)
+    , output_chi_squares_(output_chi_squares)
+    , output_n_iterations_(output_n_iterations)
+{}
+
+// Nothing is freed here: the pointer members are only copies of the pointers passed to the constructor.
+FitInterface::~FitInterface() {}
+
+// Rejects empty or unknown problem sizes and sizes whose byte count would overflow std::size_t.
+void FitInterface::check_sizes()
+{
+    // n_parameters_ is 0 for an unrecognized model ID; dividing by it (or by a zero n_points_) is undefined.
+    if (n_points_ <= 0 || n_parameters_ <= 0)
+        throw std::runtime_error("invalid number of data points or unknown model ID");
+
+    std::size_t const maximum_size = std::numeric_limits< std::size_t >::max();
+    if (n_fits_ > maximum_size / n_points_ / sizeof(float))
+        throw std::runtime_error("maximum absolute number of data points exceeded");
+
+    if (n_fits_ > maximum_size / n_parameters_ / sizeof(float))
+        throw std::runtime_error("maximum number of fits and/or parameters exceeded");
+}
+
+// Copies this interface's settings into info, then lets info count the parameters that are fitted.
+void FitInterface::configure_info(Info & info, int const model_id)
+{
+    info.n_parameters_ = n_parameters_;
+    info.n_fits_ = n_fits_;
+    info.n_points_ = n_points_;
+    info.max_n_iterations_ = max_n_iterations_;
+    info.model_id_ = model_id;
+    info.estimator_id_ = estimator_id_;
+    info.user_info_size_ = user_info_size_;
+    info.set_number_of_parameters_to_fit(parameters_to_fit_); // needs n_parameters_, assigned above
+}
+
+// Stores the parameter count of the selected model in n_parameters_. Throws std::runtime_error for an
+// ID that is not a cpufit.h model constant; a silent count of 0 would make check_sizes() divide by zero.
+void FitInterface::set_number_of_parameters(int const model_id)
+{
+    switch (model_id)
+    {
+    case GAUSS_1D:
+        n_parameters_ = 4;
+        break;
+    case GAUSS_2D:
+        n_parameters_ = 5;
+        break;
+    case GAUSS_2D_ELLIPTIC:
+    case CAUCHY_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case GAUSS_2D_ROTATED:
+        n_parameters_ = 7;
+        break;
+    case LINEAR_1D:
+        n_parameters_ = 2;
+        break;
+    default:
+        throw std::runtime_error("unknown model ID");
+    }
+}
+
+// Runs all fits for model_id: resolve the parameter count, validate the sizes, then hand over to LMFit.
+void FitInterface::fit(int const model_id)
+{
+    set_number_of_parameters(model_id);
+    check_sizes();
+
+    Info fit_info;
+    configure_info(fit_info, model_id);
+
+    LMFit solver(
+        data_,
+        weight_,
+        fit_info,
+        initial_parameters_,
+        parameters_to_fit_,
+        user_info_,
+        output_parameters_,
+        output_states_,
+        output_chi_squares_,
+        output_n_iterations_);
+
+    solver.run(tolerance_);
+}
diff --git a/Cpufit/interface.h b/Cpufit/interface.h
new file mode 100644
index 0000000..09bdc11
--- /dev/null
+++ b/Cpufit/interface.h
@@ -0,0 +1,57 @@
+#ifndef CPUFIT_INTERFACE_H_INCLUDED
+#define CPUFIT_INTERFACE_H_INCLUDED
+
+#include "lm_fit.h"
+
+class FitInterface // takes the arguments of one cpufit() call and starts the fits through LMFit
+{
+public:
+    FitInterface(
+        float const * data,
+        float const * weights,
+        std::size_t n_fits,
+        int n_points,
+        float tolerance,
+        int max_n_iterations,
+        int estimator_id,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        std::size_t user_info_size,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations);
+
+    virtual ~FitInterface();
+
+    void fit(int const model_id); // runs all fits with the given model; may throw std::runtime_error from the size check
+
+private:
+    void set_number_of_parameters(int const model_id); // maps the model ID to n_parameters_
+    void check_sizes();                                 // throws if the byte sizes would overflow std::size_t
+    void configure_info(Info & info, int const model_id); // copies the settings into an Info object
+
+public:
+
+private:
+    int n_parameters_; // 0 until set_number_of_parameters() has run
+    float const * const data_;
+    float const * const weight_;
+    std::size_t const n_fits_;
+    int const n_points_;
+    float const tolerance_;
+    int const max_n_iterations_;
+    int const estimator_id_;
+    float const * const initial_parameters_;
+    int const * const parameters_to_fit_;
+    char * const user_info_;
+    std::size_t const user_info_size_;
+
+    float * output_parameters_; // the output_* pointers are passed on to LMFit unchanged
+    int * output_states_;
+    float * output_chi_squares_;
+    int * output_n_iterations_;
+};
+
+#endif
diff --git a/Cpufit/lm_fit.cpp b/Cpufit/lm_fit.cpp
new file mode 100644
index 0000000..e6fa64f
--- /dev/null
+++ b/Cpufit/lm_fit.cpp
@@ -0,0 +1,57 @@
+#include "lm_fit.h"
+#include <stdlib.h>
+#include <math.h>
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include <numeric>
+
+// Keeps the caller's input and output buffers together with a reference to the shared fit settings.
+LMFit::LMFit(
+    float const * const data,
+    float const * const weights,
+    Info const & info,
+    float const * const initial_parameters,
+    int const * const parameters_to_fit,
+    char * const user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations)
+    : data_(data)
+    , weights_(weights)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , user_info_(user_info)
+    , output_parameters_(output_parameters)
+    , output_states_(output_states)
+    , output_chi_squares_(output_chi_squares)
+    , output_n_iterations_(output_n_iterations)
+    , info_(info)
+{}
+
+// Nothing is freed here: the members are pointers and a reference handed in through the constructor.
+LMFit::~LMFit()
+{}
+
+// Fits the data sets one after another, each through its own LMFitCPP instance.
+void LMFit::run(float const tolerance)
+{
+    for (std::size_t fit_index = 0; fit_index < info_.n_fits_; ++fit_index)
+    {
+        std::size_t const first_point = fit_index * info_.n_points_;         // offset into data / weights
+        std::size_t const first_parameter = fit_index * info_.n_parameters_; // offset into the parameter arrays
+        LMFitCPP single_fit(
+            tolerance, fit_index,
+            data_ + first_point,
+            weights_ ? weights_ + first_point : 0,
+            info_,
+            initial_parameters_ + first_parameter,
+            parameters_to_fit_, user_info_,
+            output_parameters_ + first_parameter,
+            output_states_ + fit_index,
+            output_chi_squares_ + fit_index,
+            output_n_iterations_ + fit_index);
+        single_fit.run();
+    }
+}
\ No newline at end of file
diff --git a/Cpufit/lm_fit.h b/Cpufit/lm_fit.h
new file mode 100644
index 0000000..a5fd96d
--- /dev/null
+++ b/Cpufit/lm_fit.h
@@ -0,0 +1,137 @@
+#ifndef CPUFIT_GAUSS_FIT_H_INCLUDED
+#define CPUFIT_GAUSS_FIT_H_INCLUDED
+
+#include "info.h"
+
+class LMFitCPP;
+
+class LMFit // runs every fit of one call by creating an LMFitCPP per data set
+{
+public:
+    LMFit(
+        float const * data,
+        float const * weights,
+        Info const& info,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations);
+
+    virtual ~LMFit();
+
+    void run(float const tolerance); // loops over info_.n_fits_ data sets
+        
+private:
+    float const * const data_;               // all fits, n_points_ values per fit
+    float const * const weights_;            // same layout as data_; may be null
+    float const * const initial_parameters_; // n_parameters_ values per fit
+    int const * const parameters_to_fit_;    // passed to every fit unchanged
+    char * const user_info_;                 // passed to every fit unchanged
+
+    float * output_parameters_;   // n_parameters_ values per fit
+    int * output_states_;         // one entry per fit
+    float * output_chi_squares_;  // one entry per fit
+    int * output_n_iterations_;   // one entry per fit
+
+    Info const & info_; // reference only: the Info object must outlive this LMFit
+};
+
+class LMFitCPP // state and steps of a single fit (one data set)
+{
+public:
+    LMFitCPP(
+        float const tolerance,
+        std::size_t const fit_index,
+        float const * data,
+        float const * weight,
+        Info const & info,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations);
+
+    virtual ~LMFitCPP()
+    {};
+
+    void run();
+
+private:
+	void calc_curve_values();
+    void calc_coefficients();
+
+    void calc_curve_values(std::vector<float>& curve, std::vector<float>& derivatives);
+
+    // One value function and one derivative function per model ID from cpufit.h.
+    void calc_values_gauss2d(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2d(std::vector<float> & derivatives);
+
+    void calc_values_gauss2delliptic(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2delliptic(std::vector<float> & derivatives);
+
+    void calc_values_gauss2drotated(std::vector<float>& gaussian);
+    void calc_derivatives_gauss2drotated(std::vector<float> & derivatives);
+
+    void calc_values_gauss1d(std::vector<float>& gaussian);
+    void calc_derivatives_gauss1d(std::vector<float> & derivatives);
+
+    void calc_values_cauchy2delliptic(std::vector<float>& cauchy);
+    void calc_derivatives_cauchy2delliptic(std::vector<float> & derivatives);
+
+    void calc_values_linear1d(std::vector<float>& line);
+    void calc_derivatives_linear1d(std::vector<float> & derivatives);
+
+    void calculate_hessian(std::vector<float> const & derivatives,
+        std::vector<float> const & curve);
+
+    void calc_gradient(std::vector<float> const & derivatives,
+        std::vector<float> const & curve);
+
+    void calc_chi_square(
+        std::vector<float> const & curve);
+
+    void modify_step_width();
+    void gauss_jordan();
+    void update_parameters();
+
+    bool check_for_convergence();
+    void evaluate_iteration(int const iteration);
+    void prepare_next_iteration();
+
+public:
+
+private:
+
+    std::size_t const fit_index_; // index of this fit within the whole call
+    float const * const data_;
+    float const * const weight_;
+    float const * const initial_parameters_;
+    int const * const parameters_to_fit_;
+
+    bool converged_;     // starts as false
+    float * parameters_; // the constructor points this at output_parameters
+    int * state_;        // the constructor points this at output_states
+    float * chi_square_; // the constructor points this at output_chi_squares
+    int * n_iterations_; // the constructor points this at output_n_iterations
+
+    std::vector<float> prev_parameters_; // sized n_parameters_to_fit_
+    Info const & info_;                  // reference only: the Info object must outlive this fit
+
+    float lambda_;                        // starts at 0.001f
+    std::vector<float> curve_;            // n_points_ entries
+    std::vector<float> derivatives_;      // n_points_ * n_parameters_ entries
+    std::vector<float> hessian_;          // n_parameters_to_fit_ squared entries
+    std::vector<float> modified_hessian_; // n_parameters_to_fit_ squared entries
+    std::vector<float> gradient_;         // n_parameters_to_fit_ entries
+    std::vector<float> delta_;            // n_parameters_to_fit_ entries
+    float prev_chi_square_;               // starts at 0
+    float const tolerance_;
+
+    char * const user_info_;
+};
+
+#endif
\ No newline at end of file
diff --git a/Cpufit/lm_fit_cpp.cpp b/Cpufit/lm_fit_cpp.cpp
new file mode 100644
index 0000000..7eaae9d
--- /dev/null
+++ b/Cpufit/lm_fit_cpp.cpp
@@ -0,0 +1,711 @@
+#include "cpufit.h"
+#include "lm_fit.h"
+
+#include <vector>
+#include <numeric>
+#include <algorithm>
+
+// Binds one fit to its input/output slots and sizes the work buffers from info (listed in declaration order).
+LMFitCPP::LMFitCPP(
+    float const tolerance,
+    std::size_t const fit_index,
+    float const * data,
+    float const * weight,
+    Info const & info,
+    float const * initial_parameters,
+    int const * parameters_to_fit,
+    char * user_info,
+    float * output_parameters,
+    int * output_state,
+    float * output_chi_square,
+    int * output_n_iterations)
+    : fit_index_(fit_index)
+    , data_(data)
+    , weight_(weight)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , converged_(false)
+    , parameters_(output_parameters)
+    , state_(output_state)
+    , chi_square_(output_chi_square)
+    , n_iterations_(output_n_iterations)
+    , prev_parameters_(info.n_parameters_to_fit_)
+    , info_(info)
+    , lambda_(0.001f)
+    , curve_(info.n_points_)
+    , derivatives_(info.n_points_*info.n_parameters_)
+    , hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_)
+    , modified_hessian_(info.n_parameters_to_fit_*info.n_parameters_to_fit_)
+    , gradient_(info.n_parameters_to_fit_)
+    , delta_(info.n_parameters_to_fit_)
+    , prev_chi_square_(0)
+    , tolerance_(tolerance)
+    , user_info_(user_info)
+{}
+
+void LMFitCPP::calc_derivatives_gauss2d(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + y * side + x] is d(model)/d(parameters_[p])
+{
+    std::size_t const  fit_size_x = std::size_t(std::sqrt(info_.n_points_)); // data is treated as a square of this side length
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[3] * parameters_[3]);
+            float const ex = exp(-(argx + argy));
+
+            derivatives[0 * info_.n_points_ + y*fit_size_x + x]
+                = ex; // with respect to the amplitude, parameters_[0]
+            derivatives[1 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]); // center x
+            derivatives[2 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[3] * parameters_[3]); // center y
+            derivatives[3 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0]
+                * ((x - parameters_[1])*(x - parameters_[1])
+                + (y - parameters_[2])*(y - parameters_[2]))*ex)
+                / (parameters_[3] * parameters_[3] * parameters_[3]); // width shared by x and y
+            derivatives[4 * info_.n_points_ + y*fit_size_x + x]
+                = 1; // constant offset, parameters_[4]
+        }
+}
+
+void LMFitCPP::calc_derivatives_gauss2delliptic(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + y * side + x] is d(model)/d(parameters_[p])
+{
+    std::size_t const  fit_size_x = std::size_t(std::sqrt(info_.n_points_)); // data is treated as a square of this side length
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            float const argx = (x - parameters_[1]) * (x - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float const argy = (y - parameters_[2]) * (y - parameters_[2]) / (2 * parameters_[4] * parameters_[4]);
+            float const ex = exp(-(argx +argy));
+
+            derivatives[0 * info_.n_points_ + y*fit_size_x + x]
+                = ex; // with respect to the amplitude, parameters_[0]
+            derivatives[1 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[3] * parameters_[3]); // center x
+            derivatives[2 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (y - parameters_[2])*ex) / (parameters_[4] * parameters_[4]); // center y
+            derivatives[3 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[3] * parameters_[3] * parameters_[3]); // width x
+            derivatives[4 * info_.n_points_ + y*fit_size_x + x]
+                = (parameters_[0] * (y - parameters_[2])*(y - parameters_[2])*ex) / (parameters_[4] * parameters_[4] * parameters_[4]); // width y
+            derivatives[5 * info_.n_points_ + y*fit_size_x + x]
+                = 1; // constant offset, parameters_[5]
+        }
+}
+
+void LMFitCPP::calc_derivatives_gauss2drotated(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + y * side + x] is d(model)/d(parameters_[p])
+{
+    std::size_t const  fit_size_x = std::size_t(std::sqrt(info_.n_points_)); // data is treated as a square of this side length
+
+    float const amplitude = parameters_[0];
+    float const x0 = parameters_[1];
+    float const y0 = parameters_[2];
+    float const sig_x = parameters_[3];
+    float const sig_y = parameters_[4];
+    // parameters_[5] is the constant background: its derivative is 1, so no local is needed for it.
+    float const rot_sin = sin(parameters_[6]);
+    float const rot_cos = cos(parameters_[6]);
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            float const arga = ((x - x0) * rot_cos) - ((y - y0) * rot_sin); // coordinates in the rotated frame
+            float const argb = ((x - x0) * rot_sin) + ((y - y0) * rot_cos);
+            float const ex = exp((-0.5f) * (((arga / sig_x) * (arga / sig_x)) + ((argb / sig_y) * (argb / sig_y))));
+
+            derivatives[0 * info_.n_points_ + y*fit_size_x + x]
+                = ex; // amplitude
+            derivatives[1 * info_.n_points_ + y*fit_size_x + x]
+                = ex * (amplitude * rot_cos * arga / (sig_x*sig_x) + amplitude * rot_sin *argb / (sig_y*sig_y)); // x0
+            derivatives[2 * info_.n_points_ + y*fit_size_x + x]
+                = ex * (-amplitude * rot_sin * arga / (sig_x*sig_x) + amplitude * rot_cos *argb / (sig_y*sig_y)); // y0
+            derivatives[3 * info_.n_points_ + y*fit_size_x + x]
+                = ex * amplitude * arga * arga / (sig_x*sig_x*sig_x); // sig_x
+            derivatives[4 * info_.n_points_ + y*fit_size_x + x]
+                = ex * amplitude * argb * argb / (sig_y*sig_y*sig_y); // sig_y
+            derivatives[5 * info_.n_points_ + y*fit_size_x + x]
+                = 1; // background
+            derivatives[6 * info_.n_points_ + y*fit_size_x + x]
+                = ex * amplitude * arga * argb * (1.0f / (sig_x*sig_x) - 1.0f / (sig_y*sig_y)); // rotation angle
+        }
+}
+
+void LMFitCPP::calc_derivatives_gauss1d(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + x] is d(model)/d(parameters_[p])
+{
+    for (std::size_t x = 0; x < info_.n_points_; x++) // the point index serves as the x coordinate
+    {
+        float argx = ((x - parameters_[1])*(x - parameters_[1])) / (2 * parameters_[2] * parameters_[2]);
+        float ex = exp(-argx);
+
+        derivatives[0 * info_.n_points_ + x] = ex; // amplitude, parameters_[0]
+        derivatives[1 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*ex) / (parameters_[2] * parameters_[2]); // center
+        derivatives[2 * info_.n_points_ + x] = (parameters_[0] * (x - parameters_[1])*(x - parameters_[1])*ex) / (parameters_[2] * parameters_[2] * parameters_[2]); // width
+        derivatives[3 * info_.n_points_ + x] = 1; // constant offset, parameters_[3]
+    }
+}
+
+void LMFitCPP::calc_derivatives_cauchy2delliptic(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + y * side + x] is d(model)/d(parameters_[p])
+{
+    std::size_t const  fit_size_x = std::size_t(std::sqrt(info_.n_points_)); // data is treated as a square of this side length
+
+    for (std::size_t y = 0; y < fit_size_x; y++)
+        for (std::size_t x = 0; x < fit_size_x; x++)
+        {
+            float const argx =
+                ((parameters_[1] - x) / parameters_[3])
+                *((parameters_[1] - x) / parameters_[3]) + 1.f; // 1 + ((x0 - x) / width_x)^2
+            float const argy =
+                ((parameters_[2] - y) / parameters_[4])
+                *((parameters_[2] - y) / parameters_[4]) + 1.f; // 1 + ((y0 - y) / width_y)^2
+
+            derivatives[0 * info_.n_points_ + y*fit_size_x + x]
+                = 1.f / (argx*argy); // amplitude, parameters_[0]
+            derivatives[1 * info_.n_points_ + y*fit_size_x + x] =
+                -2.f * parameters_[0] * (parameters_[1] - x)
+                / (parameters_[3] * parameters_[3] * argx*argx*argy); // center x
+            derivatives[2 * info_.n_points_ + y*fit_size_x + x] =
+                -2.f * parameters_[0] * (parameters_[2] - y)
+                / (parameters_[4] * parameters_[4] * argy*argy*argx); // center y
+            derivatives[3 * info_.n_points_ + y*fit_size_x + x] =
+                2.f * parameters_[0] * (parameters_[1] - x) * (parameters_[1] - x)
+                / (parameters_[3] * parameters_[3] * parameters_[3] * argx*argx*argy); // width x
+            derivatives[4 * info_.n_points_ + y*fit_size_x + x] =
+                2.f * parameters_[0] * (parameters_[2] - y) * (parameters_[2] - y)
+                / (parameters_[4] * parameters_[4] * parameters_[4] * argy*argy*argx); // width y
+            derivatives[5 * info_.n_points_ + y*fit_size_x + x]
+                = 1.f; // constant offset, parameters_[5]
+        }
+}
+
+void LMFitCPP::calc_derivatives_linear1d(
+    std::vector<float> & derivatives) // out: entry [p * n_points_ + point] is d(model)/d(parameters_[p])
+{
+    float * user_info_float = (float*)user_info_; // user info is read as an array of x coordinates
+    float x = 0.f;
+
+    for (std::size_t point_index = 0; point_index < info_.n_points_; point_index++)
+    {
+        if (!user_info_float) // no user info: the point index is the x coordinate
+        {
+            x = float(point_index);
+        }
+        else if (info_.user_info_size_ / sizeof(float) == info_.n_points_) // exactly one set of x values, used for every fit
+        {
+            x = user_info_float[point_index];
+        }
+        else if (info_.user_info_size_ / sizeof(float) > info_.n_points_) // more values: this fit reads its own set
+        {
+            std::size_t const fit_begin = fit_index_ * info_.n_points_;
+            x = user_info_float[fit_begin + point_index];
+        }
+        // NOTE(review): with fewer than n_points_ values no branch runs and x keeps its previous value (0 at first) - confirm intended
+        derivatives[0 * info_.n_points_ + point_index] = 1.f; // with respect to parameters_[0]
+        derivatives[1 * info_.n_points_ + point_index] = x;   // with respect to parameters_[1]
+    }
+}
+
+void LMFitCPP::calc_values_cauchy2delliptic(std::vector<float>& cauchy) // out: model value per pixel, row by row
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_))); // data is treated as a square of this side length
+    int const size_y = size_x;
+
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float const argx =
+                ((parameters_[1] - ix) / parameters_[3])
+                *((parameters_[1] - ix) / parameters_[3]) + 1.f; // 1 + ((x0 - x) / width_x)^2
+            float const argy =
+                ((parameters_[2] - iy) / parameters_[4])
+                *((parameters_[2] - iy) / parameters_[4]) + 1.f; // 1 + ((y0 - y) / width_y)^2
+
+            cauchy[iy*size_x + ix] = parameters_[0] / (argx * argy) + parameters_[5]; // amplitude / (argx * argy) + offset
+        }
+    }
+}
+
+void LMFitCPP::calc_values_gauss2d(std::vector<float>& gaussian) // out: model value per pixel, row by row
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_))); // data is treated as a square of this side length
+    int const size_y = size_x;
+
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]);
+            float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[3] * parameters_[3]); // same width as in x
+            float ex = exp(-(argx +argy));
+
+            gaussian[iy*size_x + ix] = parameters_[0] * ex + parameters_[4]; // amplitude * ex + offset
+        }
+    }
+}
+
+void LMFitCPP::calc_values_gauss2delliptic(std::vector<float>& gaussian) // out: model value per pixel, row by row
+{
+    int const size_x = int(std::sqrt(float(info_.n_points_))); // data is treated as a square of this side length
+    int const size_y = size_x;
+    for (int iy = 0; iy < size_y; iy++)
+    {
+        for (int ix = 0; ix < size_x; ix++)
+        {
+            float argx = (ix - parameters_[1]) * (ix - parameters_[1]) / (2 * parameters_[3] * parameters_[3]); // width x
+            float argy = (iy - parameters_[2]) * (iy - parameters_[2]) / (2 * parameters_[4] * parameters_[4]); // width y
+            float ex = exp(-(argx + argy));
+
+            gaussian[iy*size_x + ix]
+                = parameters_[0] * ex + parameters_[5]; // amplitude * ex + offset
+        }
+    }
+}
+    
+// Fills `gaussian` with a rotated elliptic 2D Gaussian evaluated at the
+// integer pixel coordinates of a square grid, stored row by row.
+//
+// parameters_ is read as: [0] amplitude, [1]/[2] centre x/y,
+// [3]/[4] widths along the two rotated axes, [5] background,
+// [6] rotation angle passed to sin()/cos().
+void LMFitCPP::calc_values_gauss2drotated(std::vector<float>& gaussian)
+{
+    // Edge length of the grid: truncated square root of the number of points.
+    int const edge = int(std::sqrt(float(info_.n_points_)));
+
+    float const amplitude = parameters_[0];
+    float const center_x = parameters_[1];
+    float const center_y = parameters_[2];
+    float const width_a = parameters_[3];
+    float const width_b = parameters_[4];
+    float const background = parameters_[5];
+    float const sine = sin(parameters_[6]);
+    float const cosine = cos(parameters_[6]);
+
+    for (int row = 0; row < edge; row++)
+    {
+        float const dy = row - center_y;
+
+        for (int col = 0; col < edge; col++)
+        {
+            float const dx = col - center_x;
+
+            // Pixel offset from the centre expressed in the rotated frame.
+            float const arga = (dx * cosine) - (dy * sine);
+            float const argb = (dx * sine) + (dy * cosine);
+
+            float const scaled_a = arga / width_a;
+            float const scaled_b = argb / width_b;
+
+            float const ex
+                = exp((-0.5f) * ((scaled_a * scaled_a) + (scaled_b * scaled_b)));
+
+            gaussian[row * edge + col] = amplitude * ex + background;
+        }
+    }
+}
+
+// Fills `gaussian` with a 1D Gaussian evaluated at the integer positions
+// x = 0 .. n_points - 1:
+//   p0 * exp(-(x - p1)^2 / (2 * p2^2)) + p3
+// where p0..p3 are the current entries of parameters_.
+void LMFitCPP::calc_values_gauss1d(std::vector<float>& gaussian)
+{
+    float const denominator = 2 * parameters_[2] * parameters_[2];
+
+    for (std::size_t point = 0; point < info_.n_points_; point++)
+    {
+        float const distance = point - parameters_[1];
+        float const argx = (distance * distance) / denominator;
+        float const ex = exp(-argx);
+
+        gaussian[point] = parameters_[0] * ex + parameters_[3];
+    }
+}
+
+// Fills `line` with the straight line p0 + p1 * x for every data point.
+//
+// The x coordinate of a point is chosen as follows:
+//   - no user info:                         x is the point index;
+//   - user info holds exactly n_points floats: x is read from it directly
+//     (the same coordinates for every fit);
+//   - user info holds more floats:          x is read from the section that
+//     starts at fit_index_ * n_points (per-fit coordinates).
+// If user info is present but holds fewer floats than n_points, none of the
+// branches applies and x keeps the value it had for the previous point.
+void LMFitCPP::calc_values_linear1d(std::vector<float>& line)
+{
+    float * coordinates = (float*)user_info_;
+    float x = 0.f;
+
+    for (std::size_t i = 0; i < info_.n_points_; i++)
+    {
+        if (!coordinates)
+        {
+            x = float(i);
+        }
+        else if (info_.user_info_size_ / sizeof(float) == info_.n_points_)
+        {
+            x = coordinates[i];
+        }
+        else if (info_.user_info_size_ / sizeof(float) > info_.n_points_)
+        {
+            x = coordinates[fit_index_ * info_.n_points_ + i];
+        }
+
+        line[i] = parameters_[0] + parameters_[1] * x;
+    }
+}
+
+// Evaluates the model selected by info_.model_id_: first the model values
+// are written to `curve`, then the partial derivatives to `derivatives`.
+// For a model ID that is not listed here nothing is written.
+void LMFitCPP::calc_curve_values(std::vector<float>& curve, std::vector<float>& derivatives)
+{
+    switch (info_.model_id_)
+    {
+    case GAUSS_1D:
+        calc_values_gauss1d(curve);
+        calc_derivatives_gauss1d(derivatives);
+        break;
+    case GAUSS_2D:
+        calc_values_gauss2d(curve);
+        calc_derivatives_gauss2d(derivatives);
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        calc_values_gauss2delliptic(curve);
+        calc_derivatives_gauss2delliptic(derivatives);
+        break;
+    case GAUSS_2D_ROTATED:
+        calc_values_gauss2drotated(curve);
+        calc_derivatives_gauss2drotated(derivatives);
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        calc_values_cauchy2delliptic(curve);
+        calc_derivatives_cauchy2delliptic(derivatives);
+        break;
+    case LINEAR_1D:
+        calc_values_linear1d(curve);
+        calc_derivatives_linear1d(derivatives);
+        break;
+    default:
+        break;
+    }
+}
+
+// Builds the symmetric matrix hessian_ (n_parameters_to_fit_ squared entries)
+// from the model derivatives, restricted to the parameters flagged in
+// parameters_to_fit_.
+//
+// Per pair of fitted parameters (i, j) the entry is the sum over all points of
+//   LSE: d_i * d_j            (times weight_[point] when weights are given)
+//   MLE: data / curve^2 * d_i * d_j
+// `derivatives` is laid out parameter-major: the derivative with respect to
+// parameter p at point k is derivatives[p * n_points + k].
+// Only the pairs with i <= j are computed; the mirrored entry is copied.
+void LMFitCPP::calculate_hessian(
+    std::vector<float> const & derivatives,
+    std::vector<float> const & curve)
+{
+    int col = 0;    // index of parameter jp among the fitted parameters
+    for (int jp = 0; jp < info_.n_parameters_; jp++)
+    {
+        if (!parameters_to_fit_[jp])
+        {
+            continue;
+        }
+
+        int row = 0;    // index of parameter ip among the fitted parameters
+        for (int ip = 0; ip <= jp; ip++)
+        {
+            if (!parameters_to_fit_[ip])
+            {
+                continue;
+            }
+
+            std::size_t const offset_i = ip*info_.n_points_;
+            std::size_t const offset_j = jp*info_.n_points_;
+
+            // Accumulate in double precision, store as float.
+            double sum = 0.0;
+            for (std::size_t point = 0; point < info_.n_points_; point++)
+            {
+                float const d_i = derivatives[offset_i + point];
+                float const d_j = derivatives[offset_j + point];
+
+                if (info_.estimator_id_ == LSE)
+                {
+                    if (weight_)
+                    {
+                        sum += d_i * d_j * weight_[point];
+                    }
+                    else
+                    {
+                        sum += d_i * d_j;
+                    }
+                }
+                else if (info_.estimator_id_ == MLE)
+                {
+                    sum += data_[point] / (curve[point] * curve[point]) * d_i * d_j;
+                }
+            }
+
+            std::size_t const entry = row * info_.n_parameters_to_fit_ + col;
+            std::size_t const mirrored = col * info_.n_parameters_to_fit_ + row;
+
+            hessian_[entry] = float(sum);
+            if (entry != mirrored)
+            {
+                hessian_[mirrored] = hessian_[entry];
+            }
+
+            row++;
+        }
+
+        col++;
+    }
+}
+
+// Fills gradient_ with one entry per fitted parameter (parameters flagged in
+// parameters_to_fit_), in parameter order.
+//
+// Per fitted parameter the entry is the sum over all points of
+//   LSE: (data - curve) * d   (times weight_[point] when weights are given)
+//   MLE: -d * (1 - data / curve)
+// where d is the derivative of the model with respect to that parameter at
+// the point, read from derivatives[parameter * n_points + point].
+void LMFitCPP::calc_gradient(
+    std::vector<float> const & derivatives,
+    std::vector<float> const & curve)
+{
+    int gradient_index = 0;
+    for (int ip = 0; ip < info_.n_parameters_; ip++)
+    {
+        if (!parameters_to_fit_[ip])
+        {
+            continue;
+        }
+
+        std::size_t const offset = ip*info_.n_points_;
+
+        // Accumulate in double precision, store as float.
+        double sum = 0.0;
+        for (std::size_t point = 0; point < info_.n_points_; point++)
+        {
+            float const deviant = data_[point] - curve[point];
+            float const d = derivatives[offset + point];
+
+            if (info_.estimator_id_ == LSE)
+            {
+                if (weight_)
+                {
+                    sum += deviant * d * weight_[point];
+                }
+                else
+                {
+                    sum += deviant * d;
+                }
+            }
+            else if (info_.estimator_id_ == MLE)
+            {
+                sum += -d * (1 - data_[point] / curve[point]);
+            }
+        }
+
+        gradient_[gradient_index] = float(sum);
+        gradient_index++;
+    }
+}
+
+// Computes the goodness-of-fit value for the model values in `values` and
+// stores it in *chi_square_.
+//
+//   LSE: sum of squared deviations (each times weight_[point] when weights
+//        are given).
+//   MLE: sum of 2 * (deviation - data * log(value / data)); points whose
+//        data value is exactly zero contribute 2 * deviation only.
+//
+// In MLE mode a model value <= 0 aborts the computation: *state_ is set to 3
+// and *chi_square_ is left untouched.
+void LMFitCPP::calc_chi_square(
+    std::vector<float> const & values)
+{
+    // Accumulate in double precision, store as float.
+    double sum = 0.0;
+
+    for (size_t point = 0; point < values.size(); point++)
+    {
+        float const deviant = values[point] - data_[point];
+
+        if (info_.estimator_id_ == LSE)
+        {
+            float const squared = deviant * deviant;
+            if (weight_)
+            {
+                sum += squared * weight_[point];
+            }
+            else
+            {
+                sum += squared;
+            }
+            continue;
+        }
+
+        if (info_.estimator_id_ != MLE)
+        {
+            continue;
+        }
+
+        if (values[point] <= 0.f)
+        {
+            (*state_) = 3;
+            return;
+        }
+
+        if (data_[point] != 0.f)
+        {
+            sum += 2 * (deviant - data_[point] * logf(values[point] / data_[point]));
+        }
+        else
+        {
+            // the logarithmic term is skipped for a zero data value
+            sum += 2 * deviant;
+        }
+    }
+
+    *chi_square_ = float(sum);
+}
+
+// Evaluates the selected model and its derivatives for the current
+// parameters into the member buffers curve_ and derivatives_.
+void LMFitCPP::calc_curve_values()
+{
+    calc_curve_values(curve_, derivatives_);
+}
+    
+// Updates *chi_square_ from curve_ and, when the new value is smaller than
+// prev_chi_square_ (or prev_chi_square_ is still zero), recomputes hessian_
+// and gradient_ from derivatives_ and curve_. Otherwise the previously
+// computed hessian_ and gradient_ are kept.
+void LMFitCPP::calc_coefficients()
+{
+    calc_chi_square(curve_);
+
+    bool const previous_is_zero = prev_chi_square_ == 0;
+    bool const improved = (*chi_square_) < prev_chi_square_;
+
+    if (improved || previous_is_zero)
+    {
+        calculate_hessian(derivatives_, curve_);
+        calc_gradient(derivatives_, curve_);
+    }
+}
+
+// Solves modified_hessian_ * delta = gradient_ by Gauss-Jordan elimination
+// with full pivoting.
+//
+// delta_ is initialised with gradient_ and overwritten in place with the
+// solution; modified_hessian_ is destroyed in the process. Only the solution
+// vector is needed, so the column interchanges are not undone on the matrix.
+//
+// *state_ is set to 2 and the elimination stops when a pivot is exactly zero
+// or when no pivot can be selected at all (every remaining candidate is NaN).
+void LMFitCPP::gauss_jordan()
+{
+    delta_ = gradient_;
+
+    std::vector<float> & alpha = modified_hessian_;
+    std::vector<float> & beta = delta_;
+
+    int const n = info_.n_parameters_to_fit_;
+
+    // ipiv[i] counts how often index i has already been used as a pivot.
+    std::vector<int> ipiv(n, 0);
+
+    for (int kp = 0; kp < n; kp++)
+    {
+        // Search the largest remaining element (by magnitude) as the pivot.
+        // -1 marks "nothing selected"; the indices used to be uninitialised,
+        // which is undefined behaviour when no element passes the comparison.
+        int irow = -1;
+        int icol = -1;
+        float big = 0.0f;
+        for (int jp = 0; jp < n; jp++)
+        {
+            if (ipiv[jp] != 1)
+            {
+                for (int ip = 0; ip < n; ip++)
+                {
+                    if (ipiv[ip] == 0)
+                    {
+                        if (fabs(alpha[jp*n + ip]) >= big)
+                        {
+                            big = fabs(alpha[jp*n + ip]);
+                            irow = jp;
+                            icol = ip;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (icol < 0)
+        {
+            // Comparisons with NaN are always false, so an all-NaN remainder
+            // selects nothing; treat it like a singular matrix.
+            (*state_) = 2;
+            break;
+        }
+        ++(ipiv[icol]);
+
+        // Move the pivot onto the diagonal by swapping rows.
+        if (irow != icol)
+        {
+            for (int ip = 0; ip < n; ip++)
+            {
+                std::swap(alpha[irow*n + ip], alpha[icol*n + ip]);
+            }
+            std::swap(beta[irow], beta[icol]);
+        }
+
+        if (alpha[icol*n + icol] == 0.0)
+        {
+            (*state_) = 2;
+            break;
+        }
+
+        // Normalise the pivot row.
+        float const pivinv = 1.0f / alpha[icol*n + icol];
+        alpha[icol*n + icol] = 1.0;
+        for (int ip = 0; ip < n; ip++)
+        {
+            alpha[icol*n + ip] *= pivinv;
+        }
+        beta[icol] *= pivinv;
+
+        // Eliminate the pivot column from all other rows.
+        for (int jp = 0; jp < n; jp++)
+        {
+            if (jp != icol)
+            {
+                float const dum = alpha[jp*n + icol];
+                alpha[jp*n + icol] = 0.0;
+                for (int ip = 0; ip < n; ip++)
+                {
+                    alpha[jp*n + ip] -= alpha[icol*n + ip] * dum;
+                }
+                beta[jp] -= beta[icol] * dum;
+            }
+        }
+    }
+}
+
+// Applies the step in delta_ to the fitted parameters. For every parameter
+// flagged in parameters_to_fit_ the old value is saved in prev_parameters_
+// and the next entry of delta_ is added; other parameters are left alone.
+void LMFitCPP::update_parameters()
+{
+    int delta_index = 0;
+    for (int i = 0; i < info_.n_parameters_; i++)
+    {
+        if (!parameters_to_fit_[i])
+        {
+            continue;
+        }
+
+        prev_parameters_[i] = parameters_[i];
+        parameters_[i] += delta_[delta_index];
+        delta_index++;
+    }
+}
+
+// Returns true when the change of chi-square against prev_chi_square_ is
+// smaller than the larger of tolerance_ (absolute) and
+// tolerance_ * |chi-square| (relative).
+bool LMFitCPP::check_for_convergence()
+{
+    return std::abs(*chi_square_ - prev_chi_square_)
+        < std::max(tolerance_, tolerance_ * std::abs(*chi_square_));
+}
+
+// Finalises the fit when it has converged or when `iteration` (zero-based)
+// is the last allowed one: stores the number of performed iterations in
+// *n_iterations_ and, if the fit did not converge, sets *state_ to 1.
+// Does nothing otherwise.
+void LMFitCPP::evaluate_iteration(int const iteration)
+{
+    bool const last_iteration = iteration == info_.max_n_iterations_ - 1;
+    if (!converged_ && !last_iteration)
+    {
+        return;
+    }
+
+    (*n_iterations_) = iteration + 1;
+
+    if (!converged_)
+    {
+        (*state_) = 1;
+    }
+}
+
+// Accepts or rejects the last step.
+//
+// Accepted (chi-square decreased): lambda_ is divided by ten and the new
+// chi-square becomes the reference value.
+// Rejected: lambda_ is multiplied by ten, *chi_square_ is reset to the
+// reference value and the fitted parameters are restored from
+// prev_parameters_.
+void LMFitCPP::prepare_next_iteration()
+{
+    bool const improved = (*chi_square_) < prev_chi_square_;
+    if (improved)
+    {
+        lambda_ *= 0.1f;
+        prev_chi_square_ = (*chi_square_);
+        return;
+    }
+
+    lambda_ *= 10.f;
+    (*chi_square_) = prev_chi_square_;
+
+    for (int i = 0; i < info_.n_parameters_; i++)
+    {
+        if (parameters_to_fit_[i])
+        {
+            parameters_[i] = prev_parameters_[i];
+        }
+    }
+}
+
+// Copies hessian_ into modified_hessian_ and scales every diagonal element
+// of the copy by (1 + lambda_).
+void LMFitCPP::modify_step_width()
+{
+    modified_hessian_ = hessian_;
+
+    // The matrix is square, so its dimension is the root of its element count.
+    size_t const dimension = (size_t)(sqrt((float)(hessian_.size())));
+
+    for (size_t i = 0; i < dimension; i++)
+    {
+        size_t const diagonal = i*dimension + i;
+        modified_hessian_[diagonal] *= (1.0f + (lambda_));
+    }
+}
+
+// Runs one complete fit.
+//
+// Starts from initial_parameters_, evaluates the model once to obtain the
+// reference chi-square, and then repeats: solve for a step, apply it,
+// re-evaluate, test for convergence, finalise or accept/reject the step.
+// The loop ends when the fit converged or *state_ became non-zero.
+void LMFitCPP::run()
+{
+    for (int i = 0; i < info_.n_parameters_; i++)
+    {
+        parameters_[i] = initial_parameters_[i];
+    }
+
+    (*state_) = 0;
+    calc_curve_values();
+    calc_coefficients();
+    prev_chi_square_ = (*chi_square_);
+
+    // The loop is skipped entirely if the initial evaluation already failed.
+    int iteration = 0;
+    while ((*state_) == 0)
+    {
+        // propose and apply a step
+        modify_step_width();
+        gauss_jordan();
+        update_parameters();
+
+        // evaluate the model at the new parameters
+        calc_curve_values();
+        calc_coefficients();
+
+        // judge the step
+        converged_ = check_for_convergence();
+        evaluate_iteration(iteration);
+        prepare_next_iteration();
+
+        if (converged_ || (*state_) != 0)
+        {
+            break;
+        }
+
+        iteration++;
+    }
+}
diff --git a/Cpufit/matlab/CMakeLists.txt b/Cpufit/matlab/CMakeLists.txt
new file mode 100644
index 0000000..46276bd
--- /dev/null
+++ b/Cpufit/matlab/CMakeLists.txt
@@ -0,0 +1,62 @@
+
+# MATLAB Cpufit binding
+#
+# Builds the CpufitMex MEX library and copies it, together with the Cpufit
+# library and the cpufit.m wrapper, into the MATLAB package directory.
+# The whole file is skipped when MATLAB is not available.
+
+find_package( Matlab COMPONENTS MX_LIBRARY )
+
+if( NOT Matlab_FOUND )
+  message( STATUS "Matlab and/or MX_Library NOT found - skipping Cpufit Matlab binding!" )
+  return()
+endif()
+
+# Matlab MEX FILE
+
+set( Headers
+  )
+
+set( Sources
+  mex/CpufitMex.cpp
+  )
+
+add_library( CpufitMex SHARED
+  ${Headers}
+  ${Sources}
+  )
+# MATLAB recognises MEX files by their platform specific file extension
+set_property( TARGET CpufitMex
+  PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} )
+set_property( TARGET CpufitMex
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+target_include_directories( CpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
+target_link_libraries( CpufitMex Cpufit ${Matlab_LIBRARIES} )
+
+if( WIN32 )
+  # Export the MEX entry point. The flag is appended to this target only;
+  # overwriting CMAKE_SHARED_LINKER_FLAGS would drop the default and user
+  # supplied linker flags of every shared library in this directory.
+  set_property( TARGET CpufitMex
+    APPEND_STRING PROPERTY LINK_FLAGS " /export:mexFunction" )
+endif()
+
+# add launcher
+
+add_matlab_launcher( CpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" )
+
+# MATLAB Cpufit + Gpufit PACKAGE
+
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" )
+set( package_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/cpufit.m"
+)
+set( binary_cpufit $<TARGET_FILE:Cpufit> )
+set( binary_mex $<TARGET_FILE:CpufitMex> )
+
+# Adds the Cpufit files to the package directory; the dependency on
+# MATLAB_GPUFIT_PACKAGE orders this after that target has been built.
+add_custom_target( MATLAB_CPUFIT_GPUFIT_PACKAGE ALL
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${package_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_cpufit} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_mex} ${build_directory}
+  COMMENT "Adding Cpufit to Matlab package"
+)
+set_property( TARGET MATLAB_CPUFIT_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+add_dependencies( MATLAB_CPUFIT_GPUFIT_PACKAGE MATLAB_GPUFIT_PACKAGE Cpufit CpufitMex )
+
diff --git a/Cpufit/matlab/README.md b/Cpufit/matlab/README.md
new file mode 100644
index 0000000..a2dc84c
--- /dev/null
+++ b/Cpufit/matlab/README.md
@@ -0,0 +1,3 @@
+Matlab binding for Cpufit, the control CPU implementation of
+the [Gpufit library](https://github.com/gpufit/Gpufit) which
+implements Levenberg-Marquardt curve fitting in CUDA.
\ No newline at end of file
diff --git a/Cpufit/matlab/cpufit.m b/Cpufit/matlab/cpufit.m
new file mode 100644
index 0000000..243c654
--- /dev/null
+++ b/Cpufit/matlab/cpufit.m
@@ -0,0 +1,119 @@
+function [parameters, states, chi_squares, n_iterations, time]...
+    = cpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+% Wrapper around the Cpufit mex file.
+%
+% Checks sizes and types of the inputs, fills in defaults for optional
+% arguments that are omitted or given as empty matrix [], calls CpufitMex
+% and measures the time of that call.
+%
+% Required arguments:
+%   data               - single, n_points x n_fits
+%   weights            - single, same size as data, or []
+%   model_id           - passed on to CpufitMex unchanged
+%   initial_parameters - single, n_parameters x n_fits
+% Optional arguments (default):
+%   tolerance          (1e-4)
+%   max_n_iterations   (25)
+%   parameters_to_fit  (all ones)
+%   estimator_id       (EstimatorID.LSE)
+%   user_info          ([])
+%
+% Returns the fitted parameters (n_parameters x n_fits), the states,
+% chi-squares and iteration counts delivered by CpufitMex and the elapsed
+% time in seconds.
+
+%% omitted trailing arguments are treated like empty ones
+
+assert(nargin >= 4, 'Not enough parameters');
+if nargin < 5, tolerance = []; end
+if nargin < 6, max_n_iterations = []; end
+if nargin < 7, parameters_to_fit = []; end
+if nargin < 8, estimator_id = []; end
+if nargin < 9, user_info = []; end
+
+%% size checks
+
+% data must be a matrix with one column per fit
+assert(ndims(data) == 2, 'data is not two-dimensional');
+[n_points, n_fits] = size(data);
+
+% weights (if given) must match data
+if ~isempty(weights)
+    assert(isequal(size(data), size(weights)), 'Dimension mismatch between data and weights')
+end
+
+% initial parameters must be a matrix with one column per fit
+assert(ndims(initial_parameters) == 2, 'initial_parameters is not two-dimensional');
+n_parameters = size(initial_parameters, 1);
+assert(n_fits == size(initial_parameters, 2), 'Dimension mismatch in number of fits between data and initial_parameters');
+
+% parameters_to_fit (if given) needs one entry per parameter
+if ~isempty(parameters_to_fit)
+    assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit');
+end
+
+%% default values
+
+if isempty(tolerance)
+    tolerance = 1e-4;
+end
+
+if isempty(max_n_iterations)
+    max_n_iterations = 25;
+end
+
+if isempty(estimator_id)
+    estimator_id = EstimatorID.LSE;
+end
+
+if isempty(parameters_to_fit)
+    parameters_to_fit = ones(n_parameters, 1, 'int32');
+end
+
+% from here on only weights and user_info may still be empty
+
+%% type checks
+
+% the floating point arrays must already be single
+assert(isa(data, 'single'), 'Type of data is not single');
+if ~isempty(weights)
+    assert(isa(weights, 'single'), 'Type of weights is not single');
+end
+assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single');
+
+% the remaining arguments are converted (a no-op if the type is right)
+parameters_to_fit = int32(parameters_to_fit);
+max_n_iterations = int32(max_n_iterations);
+tolerance = single(tolerance);
+
+% user_info is passed on as is, together with its size in bytes
+user_info_size = 0;
+if ~isempty(user_info)
+    user_info_description = whos('user_info');
+    user_info_size = user_info_description.bytes;
+end
+
+%% run Cpufit taking the time
+tic;
+[parameters, states, chi_squares, n_iterations] ...
+    = CpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size);
+time = toc;
+
+% bring the fitted parameters into the shape (n_parameters, n_fits)
+parameters = reshape(parameters, n_parameters, n_fits);
+
+end
diff --git a/Cpufit/matlab/examples/gauss2d.m b/Cpufit/matlab/examples/gauss2d.m
new file mode 100644
index 0000000..3cb91f0
--- /dev/null
+++ b/Cpufit/matlab/examples/gauss2d.m
@@ -0,0 +1,182 @@
+function gauss2d()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in C/C++
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Runs the two local example routines one after the other; each of them
+% prints its own summary.
+
+% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak
+fit_gauss2d();
+
+% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak
+fit_gauss2d_rotated();
+
+end
+function fit_gauss2d()
+% Fits many noisy copies of one symmetric 2D Gaussian peak with Cpufit
+% (MLE estimator) and prints a summary of the results.
+
+%% problem size
+n_fits = 1e4;
+edge_length = 20;
+n_parameters = 5;
+
+%% input arguments
+
+% parameters used to generate the data
+p_true = single([20, 9.5, 9.5, 3, 10]);
+
+% make the random numbers reproducible
+rng(0);
+
+% start values: the true parameters, randomized per fit
+p_start = repmat(single(p_true'), [1, n_fits]);
+% positions are shifted by up to +-20% of the width
+p_start([2,3], :) = p_start([2,3], :) + p_true(4) * (-0.2 + 0.4 * rand(2, n_fits));
+% all other parameters are scaled by a factor between 0.8 and 1.2
+p_start([1,4,5], :) = p_start([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits));
+
+% pixel coordinates
+axis_values = single(0 : edge_length - 1);
+[x, y] = ndgrid(axis_values, axis_values);
+
+% noise free peak, replicated for every fit, then Poisson noise on top
+peak = gaussian_2d(x, y, p_true);
+data = poissrnd(repmat(peak(:), [1, n_fits]));
+
+% fit settings
+tolerance = 1e-3;
+max_n_iterations = 20;
+estimator_id = EstimatorID.MLE;
+model_id = ModelID.GAUSS_2D;
+
+%% run Cpufit
+[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+    model_id, p_start, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D Gaussian peak', model_id, n_fits, n_parameters, edge_length, time, p_true, parameters, states, chi_squares, n_iterations);
+
+end
+
+function fit_gauss2d_rotated()
+% Fits many noisy copies of one rotated elliptic 2D Gaussian peak with
+% Cpufit (MLE estimator) and prints a summary of the results.
+
+%% problem size
+n_fits = 1e4;
+edge_length = 20;
+n_parameters = 7;
+
+%% input arguments
+
+% parameters used to generate the data
+p_true = single([200, 9.5, 9.5, 3, 4, 10, 0.5]);
+
+% make the random numbers reproducible
+rng(0);
+
+% start values: the true parameters, randomized per fit
+p_start = repmat(single(p_true'), [1, n_fits]);
+% each position is shifted by up to +-20% of the width of its axis
+p_start(2, :) = p_start(2, :) + p_true(4) * (-0.2 + 0.4 * rand(1, n_fits));
+p_start(3, :) = p_start(3, :) + p_true(5) * (-0.2 + 0.4 * rand(1, n_fits));
+% all other parameters are scaled by a factor between 0.8 and 1.2
+p_start([1,4,5,6,7], :) = p_start([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, n_fits));
+
+% pixel coordinates
+axis_values = single(0 : edge_length - 1);
+[x, y] = ndgrid(axis_values, axis_values);
+
+% noise free peak, replicated for every fit, then Poisson noise on top
+peak = gaussian_2d_rotated(x, y, p_true);
+data = poissrnd(repmat(peak(:), [1, n_fits]));
+
+% fit settings
+tolerance = 1e-3;
+max_n_iterations = 20;
+estimator_id = EstimatorID.MLE;
+model_id = ModelID.GAUSS_2D_ROTATED;
+
+%% run Cpufit
+[parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+    model_id, p_start, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D rotated Gaussian peak', model_id, n_fits, n_parameters, edge_length, time, p_true, parameters, states, chi_squares, n_iterations);
+
+end
+
+function g = gaussian_2d(x, y, p)
+% Generates a symmetric 2D Gaussian peak.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x,y - x and y grid position values
+% p   - parameters: p(1) amplitude, p(2) and p(3) center position in x
+%       and y, p(4) width, p(5) offset
+
+% squared distance of every grid position from the center
+squared_distance = (x - p(2)).^2 + (y - p(3)).^2;
+
+g = p(1) * exp(-squared_distance / (2 * p(4)^2)) + p(5);
+
+end
+
+function g = gaussian_2d_rotated(x, y, p)
+% Generates a 2D rotated elliptic Gaussian peak.
+% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak
+%
+% x,y - x and y grid position values
+% p   - parameters: p(1) amplitude, p(2) and p(3) center position in x
+%       and y, p(4) and p(5) widths along the two rotated axes,
+%       p(6) offset, p(7) rotation angle (argument of cos and sin)
+
+% offsets from the center
+dx = x - p(2);
+dy = y - p(3);
+
+% cosine and sine of the rotation angle
+cosine = cos(p(7));
+sine = sin(p(7));
+
+% offsets in the rotated frame, each scaled by the width of its axis
+scaled_a = (dx .* cosine - dy .* sine) / p(4);
+scaled_b = (dx .* sine + dy .* cosine) / p(5);
+
+% Gaussian peak with two axes
+ex = exp(-0.5 .* ((scaled_a .* scaled_a) + (scaled_b .* scaled_b)));
+g = p(1) .* ex + p(6);
+
+end
+
+function display_results(name, model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations)
+% Prints a summary of a batch of fits: general information, the share of
+% fits per final state and, for the converged fits (state 0), mean and
+% standard deviation of every fitted parameter next to its true value.
+
+% only fits with state 0 enter the statistics
+is_converged = states == 0;
+
+%% summary
+fprintf('\nCpufit of %s\n', name);
+
+fprintf('\nmodel ID:        %d\n', model_id);
+fprintf('number of fits:  %d\n', number_fits);
+fprintf('fit size:        %d x %d\n', size_x, size_x);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares(is_converged)));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations(is_converged)));
+fprintf('time:            %6.2f s\n', time);
+
+%% share of fits per state, in percent
+percent_converged = sum(is_converged) / number_fits * 100;
+percent_state_1 = sum(states == 1) / number_fits * 100;
+percent_state_2 = sum(states == 2) / number_fits * 100;
+percent_state_3 = sum(states == 3) / number_fits * 100;
+
+fprintf('\nratio converged         %6.2f %%\n', percent_converged);
+fprintf('ratio max it. exceeded  %6.2f %%\n', percent_state_1);
+fprintf('ratio singular hessian  %6.2f %%\n', percent_state_2);
+fprintf('ratio neg curvature MLE %6.2f %%\n', percent_state_3);
+
+%% statistics of the fitted parameters (one row per parameter)
+fitted = parameters(:, is_converged);
+fitted_mean = mean(fitted, 2);
+fitted_std = std(fitted, [], 2);
+
+fprintf('\nparameters of %s\n', name);
+for k = 1 : number_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', k, true_parameters(k), fitted_mean(k), fitted_std(k));
+end
+
+end
\ No newline at end of file
diff --git a/Cpufit/matlab/examples/gauss2d_plot.m b/Cpufit/matlab/examples/gauss2d_plot.m
new file mode 100644
index 0000000..8d34707
--- /dev/null
+++ b/Cpufit/matlab/examples/gauss2d_plot.m
@@ -0,0 +1,117 @@
+function gauss2d_plot()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in C/C++
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with normally distributed
+% noise, repeated for a different total number of fits each time and
+% plotting the achieved number of fits per second
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+
+%% number of fit points
+size_x = 5;
+n_points = size_x * size_x;
+
+%% set input arguments
+
+% mean true parameters
+mean_true_parameters = single([100, 3, 3, 1, 10]);
+
+% average noise level (standard deviation of the added noise)
+average_noise_level = 10;
+
+% initialize random number generator
+rng(0);
+
+% tolerance
+tolerance = 1e-4;
+
+% max number of iterations
+max_n_iterations = 10;
+
+% model id
+model_id = ModelID.GAUSS_2D;
+
+%% loop over different number of fits
+
+% 20 logarithmically spaced values between 1e2 and 1e6
+n_fits_all = round(logspace(2, 6, 20));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% loop
+speed = zeros(length(n_fits_all), 1);
+for i = 1:length(n_fits_all)
+    n_fits = n_fits_all(i);
+    
+    % vary positions of 2D Gaussians peaks slightly
+    test_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+    
+    % generate data
+    data = gaussians_2d(x, y, test_parameters);
+    data = reshape(data, [n_points, n_fits]);
+    
+    % add noise
+    data = data + average_noise_level * randn(size(data), 'single');
+    
+    % initial parameters (randomized)
+    initial_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    % randomize relative to width for positions
+    initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+    % randomize relative for other parameters
+    initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits));
+    
+    % run Cpufit (estimator and remaining arguments use the cpufit defaults)
+    [parameters, states, chi_squares, n_iterations, time] = cpufit(data, [], ...
+        model_id, initial_parameters, tolerance, max_n_iterations);
+
+    % analyze result
+    converged = states == 0;
+    speed(i) = n_fits / time;
+    
+    % display result
+    fprintf('     iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ...
+        mean(n_iterations(converged)), time, speed(i));
+end
+
+%% plot
+figure();
+semilogx(n_fits_all, speed, 'bo-')
+xlabel('number of fits per function call')
+ylabel('fits per second')
+legend('Cpufit', 'Location', 'NorthWest')
+grid on;
+xlim(n_fits_all([1,end]));
+
+end
+
+function g = gaussians_2d(x, y, p)
+% Generates many 2D Gaussians peaks for a given set of parameters
+%
+% x,y - x and y grid position values
+% p   - one column of parameters per peak: amplitude, center position in
+%       x and y, width, offset
+%
+% Returns a single array of size [size(x), number of peaks]. While
+% running, a text progress bar is printed; at the end it is erased again
+% and replaced by the number of generated fits.
+
+n_fits = size(p, 2);
+msg = sprintf('generating %d fits ', n_fits);
+fprintf(msg);
+
+g = zeros([size(x), n_fits], 'single');
+
+bar_length = 50;        % number of segments of a complete progress bar
+since_last_segment = 0; % peaks generated since the last printed segment
+printed_segments = 0;
+for k = 1 : n_fits
+    
+    % parameters of the current peak
+    p_k = p(:, k);
+    squared_distance = (x - p_k(2)).^2 + (y - p_k(3)).^2;
+    g(:, :, k) = p_k(1) * exp(-squared_distance / (2 * p_k(4)^2)) + p_k(5);
+    
+    since_last_segment = since_last_segment + 1;
+    if since_last_segment >= n_fits / bar_length
+        since_last_segment = 0;
+        fprintf('|');
+        printed_segments = printed_segments + 1;
+    end
+end
+
+% erase the message and the progress bar with backspaces
+fprintf(repmat('\b', [1, length(msg) + printed_segments]));
+fprintf('%7d fits', n_fits);
+
+end
diff --git a/Cpufit/matlab/mex/CpufitMex.cpp b/Cpufit/matlab/mex/CpufitMex.cpp
new file mode 100644
index 0000000..3a10184
--- /dev/null
+++ b/Cpufit/matlab/mex/CpufitMex.cpp
@@ -0,0 +1,145 @@
+#include "Cpufit/cpufit.h"
+
+#include <mex.h>
+
+#include <cstring>
+#include <string>
+
+/*
+	Read a real (non complex) numeric scalar of a specific class from a MATLAB array.
+
+	p:  array to read from
+	v:  receives the value; only written on success
+	id: class id the array must have
+	    (https://www.mathworks.com/help/matlab/apiref/mxclassid.html)
+
+	Returns true if p is numeric, not complex, has exactly one element and is of
+	class id; returns false otherwise. The caller has to pick T so that it matches id.
+*/
+template<class T> inline bool get_scalar(const mxArray *p, T &v, const mxClassID id)
+{
+	bool const is_matching_scalar =
+		mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1 && mxGetClassID(p) == id;
+
+	if (!is_matching_scalar)
+	{
+		return false;
+	}
+
+	v = *static_cast<T *>(mxGetData(p));
+	return true;
+}
+
+/*
+    MEX entry point: unpacks the MATLAB arguments, calls cpufit() and returns the results.
+
+    Expected inputs (prhs), 13 in total:
+         0 data                 5 max_n_iterations (int32)    10 n_parameters
+         1 weights              6 estimator_id                11 user_info
+         2 n_fits               7 initial_parameters          12 user_info_size
+         3 n_points             8 parameters_to_fit
+         4 tolerance (single)   9 model_id
+
+    Outputs (plhs), 4 in total:
+         0 parameters   (single, 1 x n_fits*n_parameters)
+         1 states       (int32,  1 x n_fits)
+         2 chi squares  (single, 1 x n_fits)
+         3 n_iterations (int32,  1 x n_fits)
+
+    The data, weights, initial_parameters, parameters_to_fit and user_info buffers are
+    handed to cpufit() as they are; only tolerance and max_n_iterations are type checked
+    here. NOTE(review): the other types are presumably checked by the calling MATLAB
+    code - confirm.
+
+    Raises a MATLAB error with id "Cpufit:Mex" if the argument count is wrong, a checked
+    scalar has the wrong class, or cpufit() does not return STATUS_OK.
+*/
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs,  mxArray const *prhs[])
+{
+    int const expected_nrhs = 13;
+    int const expected_nlhs = 4;
+
+    // mexErrMsgIdAndTxt formats the message itself and does not return
+    if (nrhs != expected_nrhs)
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "%d input arguments required.", expected_nrhs);
+    }
+    if (nlhs != expected_nlhs)
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "%d output arguments required.", expected_nlhs);
+    }
+
+    // input parameters
+    // mxGetData is used for buffers that are not double; mxGetScalar returns the first
+    // element converted to double whatever the numeric class of the array is
+    float * data = static_cast<float *>(mxGetData(prhs[0]));
+    float * weights = static_cast<float *>(mxGetData(prhs[1]));
+    std::size_t n_fits = (std::size_t)mxGetScalar(prhs[2]);
+    std::size_t n_points = (std::size_t)mxGetScalar(prhs[3]);
+
+    // tolerance
+    float tolerance = 0;
+    if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS))
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "tolerance is not a single");
+    }
+
+    // max_n_iterations
+    int max_n_iterations = 0;
+    if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS))
+    {
+        mexErrMsgIdAndTxt("Cpufit:Mex", "max_n_iteration is not int32");
+    }
+
+    int estimator_id = (int)mxGetScalar(prhs[6]);
+    float * initial_parameters = static_cast<float *>(mxGetData(prhs[7]));
+    int * parameters_to_fit = static_cast<int *>(mxGetData(prhs[8]));
+    int model_id = (int)mxGetScalar(prhs[9]);
+    int n_parameters = (int)mxGetScalar(prhs[10]);
+    int * user_info = static_cast<int *>(mxGetData(prhs[11]));
+    std::size_t user_info_size = (std::size_t)mxGetScalar(prhs[12]);
+
+    // output parameters (the arrays are created here and filled by cpufit)
+    mxArray * mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL);
+    float * output_parameters = static_cast<float *>(mxGetData(mx_parameters));
+    plhs[0] = mx_parameters;
+
+    mxArray * mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_states = static_cast<int *>(mxGetData(mx_states));
+    plhs[1] = mx_states;
+
+    mxArray * mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL);
+    float * output_chi_squares = static_cast<float *>(mxGetData(mx_chi_squares));
+    plhs[2] = mx_chi_squares;
+
+    mxArray * mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_n_iterations = static_cast<int *>(mxGetData(mx_n_iterations));
+    plhs[3] = mx_n_iterations;
+
+    // call to cpufit
+    int const status
+        = cpufit
+        (
+            n_fits,
+            n_points,
+            data,
+            weights,
+            model_id,
+            initial_parameters,
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit,
+            estimator_id,
+            user_info_size,
+            reinterpret_cast< char * >( user_info ),
+            output_parameters,
+            output_states,
+            output_chi_squares,
+            output_n_iterations
+        ) ;
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        std::string const error = cpufit_get_last_error() ;
+        // pass the text as an argument so that a '%' in it is not interpreted as a format
+        mexErrMsgIdAndTxt( "Cpufit:Mex", "%s", error.c_str() ) ;
+    }
+}
diff --git a/Gpufit/CMakeLists.txt b/Gpufit/CMakeLists.txt
new file mode 100644
index 0000000..76da81e
--- /dev/null
+++ b/Gpufit/CMakeLists.txt
@@ -0,0 +1,160 @@
+
+# CUDA
+#
+# Uses the following variables:
+#
+#   CUDA_ARCHITECTURES (Default All)
+#   -- Argument passed to CUDA_SELECT_NVCC_ARCH_FLAGS(...)
+#      resulting in code_generation_flags
+#      (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+#      CUDA_ARCHITECTURES: Auto | Common | All | ARCH_AND_PTX ...
+#      Auto: Detects local machine GPU architecture.
+#      Common: Covers common subset of architectures.
+#      All: Covers all known architectures.
+#      ARCH_AND_PTX: NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+#      NUM: Any number.
+#      Only those pairs are currently accepted by NVCC though:
+#        2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
+#      Examples:
+#        2.1(2.0) results in
+#          -gencode;arch=compute_20,code=sm_21
+#        Kepler+Tesla results in
+#          -gencode;arch=compute_37,code=sm_37
+#        6.2+PTX results in
+#          -gencode;arch=compute_62,code=sm_62;-gencode;arch=compute_62,code=compute_62
+#
+#   CUDA_NVCC_FLAGS (Default ${code_generation_flags})
+#   -- Additional NVCC command line arguments
+#      (see http://cmake.org/cmake/help/v3.7/module/FindCUDA.html).
+#      NOTE that multiple arguments must be semi-colon delimited
+#      (e.g. --compiler-options;-Wall)
+#
+#   Multiple CUDA versions installed, specify which version to use
+#      Set CUDA_BIN_PATH before running CMake or CUDA_TOOLKIT_ROOT_DIR after first configuration
+#      to installation folder of desired CUDA version
+
+# Locate the CUDA toolkit (version 6.5 or newer); configuration stops if it is missing.
+find_package( CUDA 6.5 REQUIRED )
+
+# User-facing cache entry selecting the GPU architectures to generate code for.
+set( CUDA_ARCHITECTURES All CACHE STRING
+  "Auto | Common | All | ... see CUDA_SELECT_NVCC_ARCH_FLAGS(...)" )
+
+if( CUDA_ARCHITECTURES STREQUAL Auto )
+  # Auto: write a small program that prints "major.minor " for every CUDA device
+  # it finds, then compile and run it at configure time. The program returns -1
+  # if the device count cannot be queried or no device is present.
+  set( file ${PROJECT_BINARY_DIR}/detect_cuda_architectures.cpp )
+  file( WRITE ${file} ""
+    "#include <cuda_runtime.h>\n"
+    "#include <cstdio>\n"
+    "int main()\n"
+    "{\n"
+    "  int count = 0;\n"
+    "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+    "  if (count == 0) return -1;\n"
+    "  for (int device = 0; device < count; ++device)\n"
+    "  {\n"
+    "    cudaDeviceProp prop;\n"
+    "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+    "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+    "  }\n"
+    "  return 0;\n"
+    "}\n"
+  )
+  # run_result receives the exit code, architectures the text printed by the program.
+  try_run( run_result compile_result ${PROJECT_BINARY_DIR} ${file}
+    CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}"
+    LINK_LIBRARIES ${CUDA_LIBRARIES}
+    RUN_OUTPUT_VARIABLE architectures
+  )
+  # Only use the detected list if the program ran and returned 0; otherwise
+  # CUDA_ARCHITECTURES keeps the value "Auto" and is passed on unchanged below.
+  if( run_result EQUAL 0 )
+    # 2.1 devices are requested as "2.1(2.0)" (see the examples in the header above).
+    string( REPLACE "2.1" "2.1(2.0)" architectures "${architectures}" )
+    # Replace the detected architectures matched by the regular expressions with a
+    # fixed "<arch>+PTX" entry, depending on the toolkit version.
+    # NOTE(review): presumably these are architectures the respective toolkit version
+    # cannot target directly - confirm the lists (e.g. 3.7 is mapped to 5.2+PTX).
+    if( CUDA_VERSION VERSION_LESS "7.0" )
+      string( REGEX REPLACE "3\\.[27]|5\\.[23]|6\\.[01]" "5.2+PTX" architectures "${architectures}" )
+    elseif( CUDA_VERSION VERSION_LESS "8.0" )
+      string( REGEX REPLACE "5\\.3|6\\.[01]" "5.3+PTX" architectures "${architectures}" )
+    endif()
+    # Normal variable; it hides the cache entry of the same name for the rest of this scope.
+    set( CUDA_ARCHITECTURES "${architectures}" )
+  endif()
+elseif( CUDA_ARCHITECTURES STREQUAL All )
+# All does not include the latest PTX! Therefore an explicit list is built here,
+# extended depending on the toolkit version.
+  set( CUDA_ARCHITECTURES "2.1(2.0)" "3.0" "3.5" "5.0" "5.2" )
+  if( CUDA_VERSION VERSION_GREATER "6.5" )
+    list( APPEND CUDA_ARCHITECTURES "3.2" "3.7" "5.3" )
+  endif()
+  if( CUDA_VERSION VERSION_GREATER "7.5" )
+    list( APPEND CUDA_ARCHITECTURES "6.0" "6.1" )
+  endif()
+  # string( APPEND ) extends the list string itself, so "+PTX" ends up on the
+  # last list entry only.
+  string( APPEND CUDA_ARCHITECTURES "+PTX" )
+endif()
+# Translate the architecture list into nvcc flags and add them to CUDA_NVCC_FLAGS.
+CUDA_SELECT_NVCC_ARCH_FLAGS( code_generation_flags "${CUDA_ARCHITECTURES}" )
+list( APPEND CUDA_NVCC_FLAGS ${code_generation_flags} )
+# Note: only the flags generated here are printed, not the complete CUDA_NVCC_FLAGS.
+message( STATUS "CUDA_NVCC_FLAGS=${code_generation_flags}" )
+
+# Gpufit
+#
+# Shared library target "Gpufit", built from the host (C++) and device (CUDA)
+# sources listed below. The four lists only group the files; all of them are
+# passed to cuda_add_library().
+
+# Host headers
+set( GpuHeaders
+	gpufit.h
+	definitions.h
+	info.h
+	lm_fit.h
+	interface.h
+)
+
+# Host sources and the module-definition file with the exported symbols.
+# The file name must match the file on disk exactly (Gpufit.def), otherwise the
+# source lookup fails on case-sensitive file systems.
+set( GpuSources
+	gpufit.cpp
+	info.cpp
+	lm_fit.cpp
+	lm_fit_cuda.cpp
+	interface.cpp
+	Gpufit.def
+)
+
+# CUDA headers (model functions, estimators, kernels)
+set( GpuCudaHeaders
+	linear_1d.cuh
+	gauss_1d.cuh
+	gauss_2d.cuh
+	gauss_2d_rotated.cuh
+	gauss_2d_elliptic.cuh
+	cauchy_2d_elliptic.cuh
+	lse.cuh
+	mle.cuh
+	cuda_gaussjordan.cuh
+	cuda_kernels.cuh
+	gpu_data.cuh
+)
+
+# CUDA sources
+set( GpuCudaSources
+	lm_fit_cuda.cu
+	cuda_gaussjordan.cu
+	cuda_kernels.cu
+	info.cu
+	gpu_data.cu
+)
+
+# Show the CUDA files in their own folders in IDE project views
+source_group("CUDA Source Files" FILES ${GpuCudaSources})
+source_group("CUDA Header Files" FILES ${GpuCudaHeaders})
+
+cuda_add_library( Gpufit SHARED
+	${GpuHeaders}
+	${GpuSources}
+	${GpuCudaHeaders}
+	${GpuCudaSources}
+)
+
+# Place the runtime artifact of the library directly in the project binary directory
+set_property( TARGET Gpufit
+	PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+#install( TARGETS Gpufit RUNTIME DESTINATION bin )
+
+# Examples
+
+add_subdirectory( examples )
+
+# Tests (BUILD_TESTING is provided by the CTest module)
+
+if( BUILD_TESTING )
+	add_subdirectory( tests )
+endif()
+
+# Bindings
+
+add_subdirectory( matlab )
+add_subdirectory( python )
+
+
diff --git a/Gpufit/Gpufit.def b/Gpufit/Gpufit.def
new file mode 100644
index 0000000..0e3b9db
--- /dev/null
+++ b/Gpufit/Gpufit.def
@@ -0,0 +1,7 @@
+; Module-definition file of the Gpufit library.
+; Lists the functions exported by the library; the number after '@' is the
+; ordinal assigned to each export.
+LIBRARY	      "Gpufit"
+EXPORTS       
+    gpufit @1
+    gpufit_get_last_error @2
+    gpufit_get_cuda_version @3
+    gpufit_cuda_available @4
+    gpufit_portable_interface @5
\ No newline at end of file
diff --git a/Gpufit/cauchy_2d_elliptic.cuh b/Gpufit/cauchy_2d_elliptic.cuh
new file mode 100644
index 0000000..b1c2a4e
--- /dev/null
+++ b/Gpufit/cauchy_2d_elliptic.cuh
@@ -0,0 +1,107 @@
+#ifndef GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED
+#define GPUFIT_CAUCHY2DELLIPTIC_CUH_INCLUDED
+
+/* Description of the calculate_cauchy2delliptic function
+* =======================================================
+*
+* This function calculates the values of two-dimensional elliptic cauchy model
+* functions and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function.  Hence, the 
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0).  For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: amplitude
+*             p[1]: center coordinate x
+*             p[2]: center coordinate y
+*             p[3]: width x (standard deviation)
+*             p[4]: width y (standard deviation)
+*             p[5]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_cauchy2delliptic function
+* ===============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Evaluates the 2D elliptic Cauchy model
+//
+//   f(x, y) = p[0] / (argx * argy) + p[5]
+//   argx = ((p[1] - x) / p[3])^2 + 1,  argy = ((p[2] - y) / p[4])^2 + 1
+//
+// and its six partial derivatives for a single data point of a single fit.
+// The thread index selects the fit and the data point; the (x, y) coordinate of
+// a point is derived from its index, with sqrt(n_points) points per row.
+// n_fits, chunk_index, user_info and user_info_size are not used.
+__device__ void calculate_cauchy2delliptic(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // which fit and which data point does this thread work on
+    int const fits_per_block = blockDim.x / n_points;
+    int const local_fit = threadIdx.x / n_points;
+    int const fit_index = blockIdx.x*fits_per_block + local_fit;
+    int const point_index = threadIdx.x - (local_fit*n_points);
+
+    // position of the data point on the square grid
+    int const row_length = sqrt((float)n_points);
+    int const point_index_y = point_index / row_length;
+    int const point_index_x = point_index - (point_index_y*row_length);
+
+    // parameters of this fit
+    float const * p = parameters + fit_index*n_parameters;
+
+    // distance of the center to the data point along each axis
+    float const dx = p[1] - point_index_x;
+    float const dy = p[2] - point_index_y;
+
+    float const argx = (dx / p[3]) * (dx / p[3]) + 1;
+    float const argy = (dy / p[4]) * (dy / p[4]) + 1;
+
+    // model value
+    values[fit_index*n_points + point_index] = p[0] * 1 / argx * 1 / argy + p[5];
+
+    // partial derivatives; the derivative with respect to parameter k of this
+    // data point is stored at offset k * n_points
+    float * d = derivatives + fit_index * n_points*n_parameters + point_index;
+
+    // amplitude
+    d[0 * n_points] = 1 / (argx*argy);
+    // center x, center y
+    d[1 * n_points] = -2 * p[0] * dx * 1 / (p[3] * p[3] * argx*argx*argy);
+    d[2 * n_points] = -2 * p[0] * dy * 1 / (p[4] * p[4] * argy*argy*argx);
+    // width x, width y
+    d[3 * n_points] = 2 * p[0] * dx * dx
+        / (p[3] * p[3] * p[3] * argx * argx * argy);
+    d[4 * n_points] = 2 * p[0] * dy * dy
+        / (p[4] * p[4] * p[4] * argy * argy * argx);
+    // offset
+    d[5 * n_points] = 1;
+}
+
+#endif
diff --git a/Gpufit/cuda_gaussjordan.cu b/Gpufit/cuda_gaussjordan.cu
new file mode 100644
index 0000000..c6519bc
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cu
@@ -0,0 +1,279 @@
+/* CUDA implementation of Gauss-Jordan elimination algorithm.
+*  
+* Gauss-Jordan elimination method
+* ===============================
+*
+* This function solves a set of linear equations using the Gauss-Jordan elimination method.
+* Considering a set of N equations with N unknowns, this can be written in matrix form as
+* an NxN matrix of coefficients and a Nx1 column vector of right-hand side values.
+*
+* For example, consider the following problem with 3 equations and 3 unknowns (N=3):
+* 
+*   A x + B y + C z = MM
+*   D x + E y + F z = NN
+*   G x + H y + I z = PP
+* 
+* We can write this as follows in matrix form:
+* 
+*   [ A B C ] [ x ] = [ MM ]
+*   [ D E F ] [ y ] = [ NN ] 
+*   [ G H I ] [ z ] = [ PP ]
+* 
+* or, [A]*[X] = [B] where [A] is the matrix of coefficients and [B] is the vector of 
+* right-hand side values.
+*
+* The Gauss Jordan elimination method solves the system of equations in the following
+* manner.  First, we form the augmented matrix (A|B):
+*
+*   [ A B C | MM ] 
+*   [ D E F | NN ] 
+*   [ G H I | PP ] 
+*
+* and then the augmented matrix is manipulated until its left side has the reduced
+* row-echelon form.  That is to say that any individual row may be multiplied
+* by a scalar factor, and any linear combination of rows may be added to another 
+* row.  Finally, two rows may be swapped without affecting the solution.
+* 
+* When the manipulations are complete and the left side of the matrix has the desired
+* form, the right side then corresponds to the solution of the system. 
+*
+*
+* Description of the cuda_gaussjordan function
+* ============================================
+* 
+* This algorithm is designed to perform many solutions of the Gauss Jordan elimination
+* method in parallel.  One limitation of the algorithm implemented here is that for
+* each solution the number of equations and unknowns (N) must be identical.  
+*
+* Parameters:
+* 
+* delta: Output vector of solutions, concatenated together for all input problems.
+*        For a set of M inputs, the size of the vector is MxN.  Upon completion, this
+*        vector contains the results vector X for each solution.
+*
+* beta: Input vector of right hand side values, concatenated together for all input
+*       problems.  For a set of M inputs, the size of the vector is MxN.
+*
+* alpha: Coefficients matrices.  The matrix of coefficients for a single solution is
+*        a vector of NxN, where N is the number of equations.  This array stores the
+*        coefficients for the entire set of M input problems, concatenated end to end,
+*        and hence the total size of the array is MxNxN.
+*
+* skip_calculation: An input vector which allows the calculation to be skipped for
+*                   a particular solution.  For a set of M inputs, the size of this
+*                   vector is M. 
+*
+* singular: An output vector used to report whether a given solution is singular.  For
+*           a set of M inputs, this vector has size M.  Memory needs to be allocated
+*           by the calling function.
+*
+* n_equations: The number of equations and unknowns for a single solution.  This is
+*              equal to the size N.
+*
+* n_equations_pow2: The next highest power of 2 greater than n_equations.
+*
+*
+* Calling the cuda_gaussjordan function
+* =====================================
+*
+* When calling the function, the blocks and threads must be set up correctly, as well
+* as the shared memory space, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_equations + 1;
+*   threads.y = n_equations;
+*   blocks.x = n_solutions;
+*   blocks.y = 1;
+*
+*   int const shared_size = sizeof(float) *
+*       ( (threads.x * threads.y) + n_equations_pow2 + n_equations_pow2 );
+*
+*   int * singular;
+*   CUDA_CHECK_STATUS(cudaMalloc((void**)&singular, n_solutions * sizeof(int)));
+*
+*   cuda_gaussjordan<<< blocks, threads, shared_size >>>(
+*       delta,
+*       beta,
+*       alpha,
+*       skip_calculation,
+*       singular,
+*       n_equations,
+*       n_equations_pow2);
+*
+*/
+
+#include "cuda_gaussjordan.cuh"
+
+// Solves one system of n_equations linear equations per thread block using
+// Gauss-Jordan elimination with pivoting over the columns of each row.
+//
+// delta:            output, solution vectors (n_equations values per solution)
+// beta:             input, right-hand side vectors (n_equations values per solution)
+// alpha:            input, coefficient matrices (n_equations^2 values per solution)
+// skip_calculation: input, a non-zero entry skips the corresponding solution
+// singular:         output, set to 1 if a row without non-zero element is found,
+//                   0 otherwise (not written for skipped solutions)
+// n_equations:      number of equations and unknowns of a single solution
+// n_equations_pow2: size of the padded vectors used for the pivot search
+//
+// Expected launch configuration: blockDim = (n_equations + 1, n_equations),
+// one block per solution, dynamic shared memory for the calculation matrix
+// plus the two pivot search vectors.
+__global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2)
+{
+    extern __shared__ float extern_array[];     //shared memory between threads of a single block,
+    //used for storing the calculation_matrix, the
+    //abs_row vector, and the abs_row_index vector
+
+    // In this routine we will store the augmented matrix (A|B), referred to here
+    // as the calculation matrix in a shared memory space which is visible to all
+    // threads within a block.  Also stored in shared memory are two vectors which
+    // are used to find the largest element in each row (the pivot).  These vectors
+    // are called abs_row and abs_row_index.
+    //
+    // Sizes of data stored in shared memory:
+    //
+    //      calculation_matrix: n_equations * (n_equations+1)
+    //      abs_row:            n_equations_pow2
+    //      abs_row_index:      n_equations_pow2
+    //
+    // Note that each thread represents an element of the augmented matrix, with
+    // the column and row indicated by the x and y index of the thread.  Each
+    // solution is calculated within one block, and the solution index is the
+    // block index x value.
+
+    int const col_index = threadIdx.x;                  //column index in the calculation_matrix
+    int const row_index = threadIdx.y;                  //row index in the calculation_matrix
+    int const solution_index = blockIdx.x;
+
+    int const n_col = blockDim.x;                       //number of columns in calculation matrix (=threads.x)
+    int const n_row = blockDim.y;                       //number of rows in calculation matrix (=threads.y)
+    int const alpha_size = blockDim.y * blockDim.y;     //number of entries in alpha matrix for one solution (NxN)
+
+    //the decision is the same for all threads of the block, so either all
+    //threads or no thread of the block reach the barriers below
+    if (skip_calculation[solution_index])
+        return;
+
+    float * calculation_matrix = extern_array;                          //point to the shared memory
+
+    float * abs_row = extern_array + n_equations * (n_equations + 1);     //abs_row is located after the calculation_matrix
+    //within the shared memory
+
+    int * abs_row_index = (int *)abs_row + n_equations_pow2;            //abs_row_index is located after abs_row
+    //
+    //note that although the shared memory is defined as
+    //float, we are storing data of type int in this
+    //part of the shared memory
+
+    //initialize the singular vector
+    if (col_index == 0 && row_index == 0)
+    {
+        singular[solution_index] = 0;
+    }
+
+    //initialize the last n_equations entries of abs_row and abs_row_index, using
+    //only the threads on the diagonal; entries beyond n_equations are never
+    //overwritten afterwards and act as zero padding in the pivot search
+    if (col_index == row_index)
+    {
+        abs_row[col_index + (n_equations_pow2 - n_equations)] = 0.0f;
+        abs_row_index[col_index + (n_equations_pow2 - n_equations)] = col_index + (n_equations_pow2 - n_equations);
+    }
+
+    //initialize the calculation_matrix (alpha and beta, concatenated, for one solution)
+    if (col_index != n_equations)
+        calculation_matrix[row_index*n_col + col_index] = alpha[solution_index * alpha_size + row_index * n_equations + col_index];
+    else
+        calculation_matrix[row_index*n_col + col_index] = beta[solution_index * n_equations + row_index];
+
+    //wait for thread synchronization
+
+    __syncthreads();
+
+    //start of main outer loop over the rows of the calculation matrix
+
+    for (int current_row = 0; current_row < n_equations; current_row++)
+    {
+
+        // the pivot search works in only one row, skipping the last column
+        bool const in_pivot_search = (row_index == current_row && col_index != n_equations);
+
+        if (in_pivot_search)
+        {
+            //save the absolute values of the current row
+            abs_row[col_index] = abs(calculation_matrix[row_index * n_col + col_index]);
+
+            //save the column indices
+            abs_row_index[col_index] = col_index;
+        }
+
+        __syncthreads();
+
+        //find the largest absolute value in the current row and write its index in abs_row_index[0]
+        //
+        //Each step of this reduction reads entries written by other threads in the
+        //previous step, so a block-wide barrier is required after every step.  The
+        //loop bounds are identical for all threads of the block, hence every thread
+        //reaches every barrier.
+        for (int n = 2; n <= n_equations_pow2; n = n * 2)
+        {
+            if (in_pivot_search && col_index < (n_equations_pow2 / n))
+            {
+                if (abs_row[abs_row_index[col_index]] < abs_row[abs_row_index[col_index + (n_equations_pow2 / n)]])
+                {
+                    abs_row_index[col_index] = abs_row_index[col_index + (n_equations_pow2 / n)];
+                }
+            }
+
+            __syncthreads();
+        }
+
+        //singularity check - if all values in the row are zero, no solution exists
+        if (in_pivot_search)
+        {
+            if (abs_row[abs_row_index[0]] == 0.0f)
+            {
+                singular[solution_index] = 1;
+            }
+        }
+
+        //Read the pivot value before any thread modifies the current row.  The
+        //thread in the pivot column overwrites this element in the division below,
+        //so reading it there would race with that write.
+        float const pivot = calculation_matrix[current_row * n_col + abs_row_index[0]];
+
+        __syncthreads();
+
+        //divide the row by the biggest value in the row
+        if (row_index == current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index]
+                = calculation_matrix[row_index * n_col + col_index] / pivot;
+        }
+
+        __syncthreads();
+
+        //The value of the largest element of the current row was found, and then current
+        //row was divided by this value such that the largest value of the current row
+        //is equal to one.
+        //
+        //Next, the matrix is manipulated to reduce to zero all other entries in the column
+        //in which the largest value was found.   To do this, the values in the current row
+        //are scaled appropriately and subtracted from the other rows of the matrix.
+        //
+        //For each element of the matrix that is not in the current row, calculate the value
+        //to be subtracted and let each thread store this value in the scalar variable p.
+
+        float const p = calculation_matrix[current_row * n_col + col_index] * calculation_matrix[row_index * n_col + abs_row_index[0]];
+        __syncthreads();
+
+        if (row_index != current_row)
+        {
+            calculation_matrix[row_index * n_col + col_index] = calculation_matrix[row_index * n_col + col_index] - p;
+        }
+        __syncthreads();
+
+    }
+
+    //At this point, if the solution exists, the calculation matrix has been reduced to the
+    //identity matrix on the left side, and the solution vector on the right side.  However
+    //we have not swapped rows during the procedure, so the identity matrix is out of order.
+    //
+    //For example, starting with the following augmented matrix as input:
+    //
+    //  [  3  2 -4 |  4 ]
+    //  [  2  3  3 | 15 ]
+    //  [  5 -3  1 | 14 ]
+    //
+    //we will obtain:
+    //
+    //  [  0  0  1 |  2 ]
+    //  [  0  1  0 |  1 ]
+    //  [  1  0  0 |  3 ]
+    //
+    //Which needs to be re-arranged to obtain the correct solution vector.  In the final
+    //step, each thread checks to see if its value equals 1, and if so it assigns the value
+    //in its rightmost column to the appropriate entry in the delta vector.  The solution is
+    //stored in delta upon completion.
+
+    if (col_index != n_equations && calculation_matrix[row_index * n_col + col_index] == 1)
+        delta[n_row * solution_index + col_index] = calculation_matrix[row_index * n_col + n_equations];
+
+    __syncthreads();
+}
diff --git a/Gpufit/cuda_gaussjordan.cuh b/Gpufit/cuda_gaussjordan.cuh
new file mode 100644
index 0000000..2d41cda
--- /dev/null
+++ b/Gpufit/cuda_gaussjordan.cuh
@@ -0,0 +1,15 @@
+#ifndef GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+#define GPUFIT_CUDA_GAUSS_JORDAN_CUH_INCLUDED
+
+#include <cstddef> // std::size_t used in the declaration below
+
+#include <device_launch_parameters.h>
+
+// Declaration of the Gauss-Jordan elimination kernel, defined in cuda_gaussjordan.cu.
+//
+// delta:            solution vectors
+// beta:             right-hand side vectors
+// alpha:            coefficient matrices
+// skip_calculation: per solution flag, non-zero to skip the solution
+// singular:         per solution flag reporting a singular system
+// n_equations:      number of equations and unknowns of a single solution
+// n_equations_pow2: size of the padded vectors used for the pivot search
+extern __global__ void cuda_gaussjordan(
+    float * delta,
+    float const * beta,
+    float const * alpha,
+    int const * skip_calculation,
+    int * singular,
+    std::size_t const n_equations,
+    std::size_t const n_equations_pow2);
+
+#endif
\ No newline at end of file
diff --git a/Gpufit/cuda_kernels.cu b/Gpufit/cuda_kernels.cu
new file mode 100644
index 0000000..2661a7e
--- /dev/null
+++ b/Gpufit/cuda_kernels.cu
@@ -0,0 +1,1081 @@
+#include "gpufit.h"
+#include "cuda_kernels.cuh"
+#include "definitions.h"
+#include "linear_1d.cuh"
+#include "gauss_1d.cuh"
+#include "gauss_2d.cuh"
+#include "gauss_2d_elliptic.cuh"
+#include "gauss_2d_rotated.cuh"
+#include "cauchy_2d_elliptic.cuh"
+#include "lse.cuh"
+#include "mle.cuh"
+
+/* Description of the cuda_calc_curve_values function
+* ===================================================
+*
+* This function calls one of the fitting curve functions depending on the input
+* parameter model_id. The fitting curve function calculates the values of
+* the fitting curves and its partial derivatives with respect to the fitting
+* curve parameters. Multiple fits are calculated in parallel.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*
+* n_fits: The number of fits.
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of curve parameters.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* n_fits_per_block: The number of fits calculated by each threadblock.
+*
+* model_id: The fitting model ID.
+*
+* chunk_index: The chunk index.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calc_curve_values function
+* ===========================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calc_curve_values<<< blocks, threads >>>(
+*       parameters,
+*       n_fits,
+*       n_points,
+*       n_parameters,
+*       finished,
+*       values,
+*       derivatives,
+*       n_fits_per_block,
+*       model_id,
+*       chunk_index,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calc_curve_values(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    int const * finished,
+    float * values,
+    float * derivatives,
+    int const n_fits_per_block,
+    int const model_id,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Map the thread to a (fit, data point) pair: consecutive groups of
+    // n_points threads inside a block belong to consecutive fits.
+    int const local_fit = threadIdx.x / n_points;
+    int const point = threadIdx.x - local_fit * n_points;
+    int const fit = blockIdx.x * n_fits_per_block + local_fit;
+
+    // Leave early for fits that are flagged as finished and for threads that
+    // do not correspond to a data point.
+    if (finished[fit] || point >= n_points)
+        return;
+
+    // Forward to the model function selected by model_id. An unknown ID
+    // calls nothing, so values and derivatives stay untouched.
+    switch (model_id)
+    {
+    case GAUSS_1D:
+        calculate_gauss1d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    case GAUSS_2D:
+        calculate_gauss2d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        calculate_gauss2delliptic(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    case GAUSS_2D_ROTATED:
+        calculate_gauss2drotated(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        calculate_cauchy2delliptic(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    case LINEAR_1D:
+        calculate_linear1d(parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+        break;
+    default:
+        break;
+    }
+}
+
+/* Description of the sum_up_floats function
+* ==========================================
+*
+* This function sums up a vector of float values and stores the result at the
+* first place of the vector.
+*
+* Parameters:
+*
+* shared_array: An input vector of float values. The vector must be stored
+*               on the shared memory of the GPU. The size of this vector must be a
+*               power of two. Use zero padding to extend it to the next highest
+*               power of 2 greater than the number of elements.
+*
+* size: The number of elements in the input vector considering zero padding.
+*
+* Calling the sum_up_floats function
+* ==================================
+*
+* This __device__ function can be only called from a __global__ function or
+* an other __device__ function. When calling the function, the blocks and threads
+* of the __global__ function must be set up correctly, as shown in the following
+* example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = size * vectors_per_block;
+*   blocks.x = n_vectors / vectors_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void sum_up_floats(volatile float* shared_array, int const size)
+{
+    // Position of this thread inside its own group of `size` threads.
+    int const slot = threadIdx.x % size;
+
+    // Make sure every contribution has been written before reducing.
+    __syncthreads();
+
+    // Pairwise reduction: in every pass the lower half of the active range
+    // adds the matching element of the upper half, then the range is halved.
+    // After the last pass the total sits in shared_array[0].
+    for (int half = size >> 1; half != 0; half >>= 1)
+    {
+        if (slot < half)
+        {
+            shared_array[slot] += shared_array[slot + half];
+        }
+        __syncthreads();
+    }
+}
+
+/* Description of the cuda_calculate_chi_squares function
+* ========================================================
+*
+* This function calculates the chi-square values calling a __device__ function.
+* The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* chi_squares: An output vector of concatenated chi-square values.
+*
+* states: An output vector of values which indicate whether the fitting process
+*         was carried out correctly or which problem occurred. In this function
+*         it is only used for MLE. It is set to 3 if a fitting curve value is
+*         negative. This vector includes the states for multiple fits.
+*
+* iteration_falied: An output vector which indicates whether the chi-square values
+*                   calculated by the current iteration decreased compared to the
+*                   previous iteration.
+*
+* prev_chi_squares: An input vector of concatenated chi-square values calculated
+*                   by the previous iteration.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+*         while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_chi_squares function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = power_of_two_n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calculate_chi_squares<<< blocks, threads >>>(
+*       chi_squares,
+*       states,
+*       iteration_falied,
+*       prev_chi_squares,
+*       data,
+*       values,
+*       weight,
+*       n_points,
+*       estimator_id,
+*       finished,
+*       n_fits_per_block,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calculate_chi_squares(
+    float * chi_squares,
+    int * states,
+    int * iteration_falied,
+    float const * prev_chi_squares,
+    float const * data,
+    float const * values,
+    float const * weights,
+    int const n_points,
+    int const estimator_id,
+    int const * finished,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Each fit owns a contiguous range of slots_per_fit threads and the
+    // same number of floats in the dynamically sized shared array.
+    int const slots_per_fit = blockDim.x / n_fits_per_block;
+    int const local_fit = threadIdx.x / slots_per_fit;
+    int const fit = blockIdx.x * n_fits_per_block + local_fit;
+    int const slot = threadIdx.x - local_fit * slots_per_fit;
+
+    if (finished[fit])
+    {
+        return;
+    }
+
+    int const offset = fit * n_points;
+
+    extern __shared__ float extern_array[];
+
+    volatile float * partial_sums = extern_array + local_fit * slots_per_fit;
+
+    if (slot < n_points)
+    {
+        // Threads that map to a data point let the estimator fill their slot.
+        float const * fit_data = data + offset;
+        float const * fit_values = values + offset;
+        float const * fit_weights = weights ? weights + offset : NULL;
+        int * fit_state = states + fit;
+
+        if (estimator_id == LSE)
+        {
+            calculate_chi_square_lse(
+                partial_sums,
+                slot,
+                fit_data,
+                fit_values,
+                fit_weights,
+                fit_state,
+                user_info,
+                user_info_size);
+        }
+        else if (estimator_id == MLE)
+        {
+            calculate_chi_square_mle(
+                partial_sums,
+                slot,
+                fit_data,
+                fit_values,
+                fit_weights,
+                fit_state,
+                user_info,
+                user_info_size);
+        }
+    }
+    else
+    {
+        // Padding slots must not contribute to the sum.
+        partial_sums[slot] = 0.f;
+    }
+
+    sum_up_floats(partial_sums, slots_per_fit);
+
+    float const chi_square = partial_sums[0];
+    chi_squares[fit] = chi_square;
+
+    // A previous chi-square of exactly 0 is treated as "not initialized yet";
+    // otherwise the iteration counts as failed when chi-square did not drop.
+    float const previous = prev_chi_squares[fit];
+    bool const failed = (previous != 0) && (chi_square >= previous);
+    iteration_falied[fit] = failed ? 1 : 0;
+}
+
+/* Description of the cuda_calculate_gradients function
+* ========================================================
+*
+* This function calculates the gradient values of the chi-square function calling
+* a __device__ function. The calculation is performed for multiple fits in parallel.
+*
+* Parameters:
+*
+* gradients: An output vector of concatenated sets of gradient vector values.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* derivatives: An input vector of concatenated sets of model function partial
+*              derivatives.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+*         while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* n_parameters_to_fit: The number of fitting curve parameters, that are not held
+*                      fixed.
+*
+* parameters_to_fit_indices: An input vector of indices of fitting curve parameters,
+*                            that are not held fixed.
+*
+* estimator_id: The estimator ID.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* skip: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_gradients function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = power_of_two_n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_calculate_gradients<<< blocks, threads >>>(
+*       gradients,
+*       data,
+*       values,
+*       derivatives,
+*       weight,
+*       n_points,
+*       n_parameters,
+*       n_parameters_to_fit,
+*       parameters_to_fit_indices,
+*       estimator_id,
+*       finished,
+*       skip,
+*       n_fits_per_block,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calculate_gradients(
+    float * gradients,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * finished,
+    int const * skip,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Each fit owns shared_size consecutive threads of the block and the same
+    // number of floats in the dynamically sized shared array.
+    int const shared_size = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / shared_size;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    int const point_index = threadIdx.x - fit_in_block * shared_size;
+    int const first_point = fit_index * n_points;
+
+    // NOTE(review): threads of finished or skipped fits return here, i.e.
+    // before the barriers inside sum_up_floats are reached by the other
+    // threads of the block.
+    if (finished[fit_index] || skip[fit_index])
+    {
+        return;
+    }
+
+    float const * current_data = &data[first_point];
+    float const * current_weight = weights ? &weights[first_point] : NULL;
+    float const * current_derivative = &derivatives[first_point * n_parameters];
+    float const * current_value = &values[first_point];
+
+    extern __shared__ float extern_array[];
+
+    volatile float * shared_gradient = &extern_array[fit_in_block * shared_size];
+
+    // Padding slots are zeroed once; the reduction only ever adds zeros to
+    // them, so they stay zero for every parameter.
+    if (point_index >= n_points)
+    {
+        shared_gradient[point_index] = 0.f;
+    }
+
+    for (int parameter_index = 0; parameter_index < n_parameters_to_fit; parameter_index++)
+    {
+        if (point_index < n_points)
+        {
+            int const derivative_index  = parameters_to_fit_indices[parameter_index] * n_points + point_index;
+
+            if (estimator_id == LSE)
+            {
+                calculate_gradient_lse(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+            else if (estimator_id == MLE)
+            {
+                calculate_gradient_mle(
+                    shared_gradient,
+                    point_index,
+                    derivative_index,
+                    current_data,
+                    current_value,
+                    current_derivative,
+                    current_weight,
+                    user_info,
+                    user_info_size);
+            }
+        }
+        sum_up_floats(shared_gradient, shared_size);
+
+        // Only the thread that owns slot 0 publishes the sum. There is no
+        // barrier between this read and the next loop pass, in which that
+        // same thread overwrites slot 0 with its contribution for the next
+        // parameter. If every thread read slot 0 here, a late thread could
+        // pick up the next parameter's value and store it as this
+        // parameter's gradient.
+        if (point_index == 0)
+        {
+            gradients[fit_index * n_parameters_to_fit + parameter_index] = shared_gradient[0];
+        }
+    }
+}
+
+/* Description of the cuda_calculate_hessians function
+* ========================================================
+*
+* This function calculates the hessian matrix values of the chi-square function
+* calling a __device__ function. The calculation is performed for multiple fits
+* in parallel.
+*
+* Parameters:
+*
+* hessians: An output vector of concatenated sets of hessian matrix values.
+*
+* data: An input vector of data for multiple fits
+*
+* values: An input vector of concatenated sets of model function values.
+*
+* derivatives: An input vector of concatenated sets of model function partial
+*              derivatives.
+*
+* weight: An input vector of values for weighting chi-square, gradient and hessian,
+*         while using LSE
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* n_parameters_to_fit: The number of fitting curve parameters, that are not held
+*                      fixed.
+*
+* parameters_to_fit_indices: An input vector of indices of fitting curve parameters,
+*                            that are not held fixed.
+*
+* estimator_id: The estimator ID.
+*
+* skip: An input vector which allows the calculation to be skipped for single fits.
+*
+* finished: An input vector which allows the calculation to be skipped for single
+*           fits.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The number of elements in user_info.
+*
+* Calling the cuda_calculate_hessians function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_parameters_to_fit;
+*   threads.y = n_parameters_to_fit;
+*   blocks.x = n_fits;
+*
+*   cuda_calculate_hessians<<< blocks, threads >>>(
+*       hessians,
+*       data,
+*       values,
+*       derivatives,
+*       weight,
+*       n_points,
+*       n_parameters,
+*       n_parameters_to_fit,
+*       parameters_to_fit_indices,
+*       estimator_id,
+*       skip,
+*       finished,
+*       user_info,
+*       user_info_size);
+*
+*/
+
+__global__ void cuda_calculate_hessians(
+    float * hessians,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * skip,
+    int const * finished,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // One block per fit; thread (x, y) computes hessian element (x, y).
+    int const fit = blockIdx.x;
+
+    if (finished[fit] || skip[fit])
+    {
+        return;
+    }
+
+    int const row = threadIdx.x;
+    int const col = threadIdx.y;
+    int const offset = fit * n_points;
+
+    float const * fit_data = data + offset;
+    float const * fit_weights = weights ? weights + offset : NULL;
+    float const * fit_derivatives = derivatives + offset * n_parameters;
+    float const * fit_values = values + offset;
+
+    // Start of the derivative rows that belong to the two fitted parameters.
+    int const row_base = parameters_to_fit_indices[row] * n_points;
+    int const col_base = parameters_to_fit_indices[col] * n_points;
+
+    // Accumulate in double precision; the result is narrowed on store.
+    double accumulated = 0.0;
+    for (int point = 0; point < n_points; ++point)
+    {
+        if (estimator_id == LSE)
+        {
+            calculate_hessian_lse(
+                &accumulated,
+                point,
+                row_base + point,
+                col_base + point,
+                fit_data,
+                fit_values,
+                fit_derivatives,
+                fit_weights,
+                user_info,
+                user_info_size);
+        }
+        else if (estimator_id == MLE)
+        {
+            calculate_hessian_mle(
+                &accumulated,
+                point,
+                row_base + point,
+                col_base + point,
+                fit_data,
+                fit_values,
+                fit_derivatives,
+                fit_weights,
+                user_info,
+                user_info_size);
+        }
+    }
+
+    float * fit_hessian = hessians + fit * n_parameters_to_fit * n_parameters_to_fit;
+    fit_hessian[row * n_parameters_to_fit + col] = static_cast<float>(accumulated);
+}
+
+/* Description of the cuda_modify_step_widths function
+* ====================================================
+*
+* This function modifies the diagonal elements of the hessian matrices by multiplying
+* them by the factor (1 + lambda). This operation controls the step widths of the
+* iteration. If the last iteration failed, before modifying the hessian, the diagonal
+* elements of the hessian are calculated back to represent unmodified values.
+*
+* hessians: An input and output vector of hessian matrices, which are modified by
+*           the lambda values.
+*
+* lambdas: An input vector of values for modifying the hessians.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* iteration_failed: An input vector which indicates whether the previous iteration
+*                   failed.
+*
+* finished: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each thread block.
+*
+* Calling the cuda_modify_step_widths function
+* ============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_parameters_to_fit * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_modify_step_widths<<< blocks, threads >>>(
+*       hessians,
+*       lambdas,
+*       n_parameters,
+*       iteration_failed,
+*       finished,
+*       n_fits_per_block);
+*
+*/
+        
+__global__ void cuda_modify_step_widths(
+    float * hessians,
+    float const * lambdas,
+    unsigned int const n_parameters,
+    int const * iteration_failed,
+    int const * finished,
+    int const n_fits_per_block)
+{
+    // Each thread handles one diagonal element of one fit's hessian.
+    int const slots_per_fit = blockDim.x / n_fits_per_block;
+    int const local_fit = threadIdx.x / slots_per_fit;
+    int const diagonal = threadIdx.x - local_fit * slots_per_fit;
+    int const fit = blockIdx.x * n_fits_per_block + local_fit;
+
+    if (finished[fit])
+    {
+        return;
+    }
+
+    float const lambda = lambdas[fit];
+    float * fit_hessian = hessians + fit * n_parameters * n_parameters;
+    float & element = fit_hessian[diagonal * n_parameters + diagonal];
+
+    // After a failed iteration the element still carries a (1 + lambda / 10)
+    // factor; divide it out before applying the current factor.
+    if (iteration_failed[fit])
+    {
+        element /= 1.0f + lambda / 10.f;
+    }
+
+    element *= 1.0f + lambda;
+}
+
+/* Description of the cuda_update_parameters function
+* ===================================================
+*
+* This function stores the fitting curve parameter values in prev_parameters and
+* updates them after each iteration.
+*
+* Parameters:
+*
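+* parameters: An input and output vector of concatenated sets of model
+*             parameters.
+*
+* prev_parameters: An output vector which receives a copy of the model
+*                  parameters as they were before the update.
+*
+* deltas: An input vector of concatenated delta values, which are added to the
+*         model parameters.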
+*
+* n_parameters_to_fit: The number of fitted curve parameters.
+*
+* parameters_to_fit_indices: The indices of fitted curve parameters.
+*
+* finished: An input vector which allows the calculation to be skipped for single fits.
+*
+* n_fits_per_block: The number of fits calculated by each threadblock.
+*
+* Calling the cuda_update_parameters function
+* ===========================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_parameters * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   cuda_update_parameters<<< blocks, threads >>>(
+*       parameters,
+*       prev_parameters,
+*       deltas,
+*       n_parameters_to_fit,
+*       parameters_to_fit_indices,
+*       finished,
+*       n_fits_per_block);
+*
+*/
+    
+__global__ void cuda_update_parameters(
+    float * parameters,
+    float * prev_parameters,
+    float const * deltas,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const * finished,
+    int const n_fits_per_block)
+{
+    // Each fit owns n_parameters consecutive threads of the block, one per
+    // model parameter.
+    int const n_parameters = blockDim.x / n_fits_per_block;
+    int const fit_in_block = threadIdx.x / n_parameters;
+    int const parameter_index = threadIdx.x - fit_in_block * n_parameters;
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    float * current_parameters = &parameters[fit_index * n_parameters];
+    float * current_prev_parameters = &prev_parameters[fit_index * n_parameters];
+
+    // Back up every parameter, also for finished fits.
+    current_prev_parameters[parameter_index] = current_parameters[parameter_index];
+
+    // The update below writes to the slot parameters_to_fit_indices[k], which
+    // in general was backed up by a different thread of the same fit. Without
+    // a barrier that thread could read the slot after it was already updated
+    // and store the new value as the "previous" one. Every thread of the
+    // block reaches this point, so the barrier is not divergent.
+    __syncthreads();
+
+    if (finished[fit_index])
+    {
+        return;
+    }
+
+    // Only the first n_parameters_to_fit threads of a fit apply a delta.
+    if (parameter_index >= n_parameters_to_fit)
+    {
+        return;
+    }
+
+    float const * current_deltas = &deltas[fit_index * n_parameters_to_fit];
+
+    current_parameters[parameters_to_fit_indices[parameter_index]] += current_deltas[parameter_index];
+}
+
+/* Description of the cuda_update_state_after_gaussjordan function
+* ================================================================
+*
+* This function interprets the singular flag vector of the Gauss Jordan function
+* according to this LM implementation.
+*
+* Parameters:
+*
+* n_fits: The number of fits.
+*
+* singular_checks: An input vector used to report whether a fit is singular.  
+*
+* states: An output vector of values which indicate whether the fitting process
+*         was carried out correctly or which problem occurred. If a hessian
+*         matrix of a fit is singular, it is set to 2.
+*
+* Calling the cuda_update_state_after_gaussjordan function
+* ========================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   int const example_value = 256;
+*
+*   threads.x = min(n_fits, example_value);
+*   blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+*   cuda_update_state_after_gaussjordan<<< blocks, threads >>>(
+*       n_fits,
+*       singular_checks,
+*       states);
+*
+*/
+
+
+__global__ void cuda_update_state_after_gaussjordan(
+    int const n_fits,
+    int const * singular_checks,
+    int * states)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // A singular flag of exactly 1 is translated into the matching fit state;
+    // any other flag value leaves the state as it is.
+    if (fit < n_fits && singular_checks[fit] == 1)
+    {
+        states[fit] = STATE_SINGULAR_HESSIAN;
+    }
+}
+
+/* Description of the cuda_check_for_convergence function
+* =======================================================
+*
+* This function checks after each iteration whether the fits are converged or not.
+* It also checks whether the set maximum number of iterations is reached.
+*
+* Parameters:
+*
+* finished: An input and output vector which allows the calculation to be skipped
+*           for single fits.
+*
+* tolerance: The tolerance value for the convergence set by user.
+*
+* states: An output vector of values which indicate whether the fitting process
+*         was carried out correctly or which problem occurred. If the maximum
+*         number of iterations is reached without converging, it is set to 1. If
+*         the fit converged it keeps its initial value of 0.
+*
+* chi_squares: An input vector of chi-square values for multiple fits. Used for the
+*             convergence check.
+*
+* prev_chi_squares: An input vector of chi-square values for multiple fits calculated
+*                  in the previous iteration. Used for the convergence check.
+*
+* iteration: The value of the current iteration. It is compared to the value
+*            of the maximum number of iteration set by user.
+*
+* max_n_iterations: The maximum number of iterations set by user.
+*
+* n_fits: The number of fits.
+*
+* Calling the cuda_check_for_convergence function
+* ===============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   int const example_value = 256;
+*
+*   threads.x = min(n_fits, example_value);
+*   blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+*   cuda_check_for_convergence<<< blocks, threads >>>(
+*       finished,
+*       tolerance,
+*       states,
+*       chi_squares,
+*       prev_chi_squares,
+*       iteration,
+*       max_n_iterations,
+*       n_fits);
+*
+*/
+
+__global__ void cuda_check_for_convergence(
+    int * finished,
+    float const tolerance,
+    int * states,
+    float const * chi_squares,
+    float const * prev_chi_squares,
+    int const iteration,
+    int const max_n_iterations,
+    int const n_fits)
+{
+    // One thread per fit.
+    int const fit_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Surplus threads of the last block and already finished fits are skipped.
+    if (fit_index >= n_fits || finished[fit_index])
+    {
+        return;
+    }
+
+    // The fit counts as converged when chi-square changed by less than
+    // tolerance * max(1, chi-square).
+    // fabsf and fmaxf with float arguments select the single-precision
+    // routines explicitly; an unqualified abs() would truncate the difference
+    // to an integer wherever only the integer overload is visible.
+    float const chi_square = chi_squares[fit_index];
+    float const change = fabsf(chi_square - prev_chi_squares[fit_index]);
+    bool const fit_found = change < tolerance * fmaxf(1.0f, chi_square);
+
+    // iteration is zero-based, so the last allowed one is max_n_iterations - 1.
+    bool const max_n_iterations_reached = iteration == max_n_iterations - 1;
+
+    if (fit_found)
+    {
+        finished[fit_index] = 1;
+    }
+    else if (max_n_iterations_reached)
+    {
+        // Only the state is changed here; finished is left untouched.
+        states[fit_index] = STATE_MAX_ITERATION;
+    }
+}
+
+/* Description of the cuda_evaluate_iteration function
+* ====================================================
+*
+* This function evaluates the current iteration.
+*   - It marks a fit as finished if a problem occurred.
+*   - It saves the needed number of iterations if a fit finished.
+*   - It checks if all fits finished
+*
+* Parameters:
+*
+* all_finished: An output flag, that indicates whether all fits finished.
+*
+* n_iterations: An output vector of needed iterations for each fit.
+*
+* finished: An input and output  vector which allows the evaluation to be skipped
+*           for single fits
+*
+* iteration: The values of the current iteration.
+*
+* states: An input vector of values which indicate whether the fitting process
+*         was carried out correctly or which problem occurred.
+*
+* n_fits: The number of fits.
+*
+* Calling the cuda_evaluate_iteration function
+* ============================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   int const example_value = 256;
+*
+*   threads.x = min(n_fits, example_value);
+*   blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+*   cuda_evaluate_iteration<<< blocks, threads >>>(
+*       all_finished,
+*       n_iterations,
+*       finished,
+*       iteration,
+*       states,
+*       n_fits)
+*
+*/
+
+__global__ void cuda_evaluate_iteration(
+    int * all_finished,
+    int * n_iterations,
+    int * finished,
+    int const iteration,
+    int const * states,
+    int const n_fits)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+    if (fit >= n_fits)
+    {
+        return;
+    }
+
+    // Any state other than STATE_CONVERGED ends the fit.
+    if (states[fit] != STATE_CONVERGED)
+    {
+        finished[fit] = 1;
+    }
+
+    if (finished[fit])
+    {
+        // Record the iteration count only once: a stored value of 0 means
+        // that nothing has been recorded yet.
+        if (n_iterations[fit] == 0)
+        {
+            n_iterations[fit] = iteration + 1;
+        }
+    }
+    else
+    {
+        // A single unfinished fit is enough to clear the global flag.
+        * all_finished = 0;
+    }
+}
+
+/* Description of the cuda_prepare_next_iteration function
+* ========================================================
+*
+* This function prepares the next iteration. It either updates chi-square values
+* or sets chi-squares and curve parameters to previous values. This function also
+* updates lambda values.
+*
+* Parameters:
+*
+* lambdas: An output vector of values which control the step width by modifying
+*          the diagonal elements of the hessian matrices.
+*
+* chi_squares: An input vector of chi-square values for multiple fits.
+*
+* prev_chi_squares: An input vector of chi-square values for multiple fits calculated
+*                  in the previous iteration.
+*
+* parameters: An output vector of concatenated sets of model parameters.
+*
+* prev_parameters: An input vector of concatenated sets of model parameters
+*                  calculated in the previous iteration.
+*
+* n_fits: The number of fits.
+*
+* n_parameters: The number of fitting curve parameters.
+*
+* Calling the cuda_prepare_next_iteration function
+* ================================================
+*
+* When calling the function, the blocks and threads must be set up correctly,
+* as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   int const example_value = 256;
+*
+*   threads.x = min(n_fits, example_value);
+*   blocks.x = int(ceil(float(n_fits) / float(threads.x)));
+*
+*   cuda_prepare_next_iteration<<< blocks, threads >>>(
+*       lambdas,
+*       chi_squares,
+*       prev_chi_squares,
+*       parameters,
+*       prev_parameters,
+*       n_fits,
+*       n_parameters);
+*
+*/
+
+__global__ void cuda_prepare_next_iteration(
+    float * lambdas,
+    float * chi_squares,
+    float * prev_chi_squares,
+    float * parameters,
+    float const * prev_parameters,
+    int const n_fits,
+    int const n_parameters)
+{
+    // One thread per fit; surplus threads of the last block do nothing.
+    int const fit = blockIdx.x * blockDim.x + threadIdx.x;
+    if (fit >= n_fits)
+    {
+        return;
+    }
+
+    float const current = chi_squares[fit];
+    float const previous = prev_chi_squares[fit];
+
+    if (current < previous)
+    {
+        // Step accepted: shrink lambda and remember the new chi-square.
+        lambdas[fit] *= 0.1f;
+        prev_chi_squares[fit] = current;
+        return;
+    }
+
+    // Step rejected (chi-square did not decrease): grow lambda and roll
+    // chi-square and all parameters back to the previous iteration.
+    lambdas[fit] *= 10.f;
+    chi_squares[fit] = previous;
+
+    float * fit_parameters = parameters + fit * n_parameters;
+    float const * saved_parameters = prev_parameters + fit * n_parameters;
+    for (int i = 0; i < n_parameters; ++i)
+    {
+        fit_parameters[i] = saved_parameters[i];
+    }
+}
diff --git a/Gpufit/cuda_kernels.cuh b/Gpufit/cuda_kernels.cuh
new file mode 100644
index 0000000..6836480
--- /dev/null
+++ b/Gpufit/cuda_kernels.cuh
@@ -0,0 +1,108 @@
+#ifndef GPUFIT_CUDA_KERNELS_CUH_INCLUDED
+#define GPUFIT_CUDA_KERNELS_CUH_INCLUDED
+// Declarations of the Gpufit CUDA kernels (__global__ functions).
+#include <cstddef> // std::size_t, used for the user_info_size parameters
+#include <device_launch_parameters.h>
+
+extern __global__ void cuda_calculate_chi_squares(
+    float * chi_squares,
+    int * states,
+    int * iteration_failed,
+    float const * prev_chi_squares,
+    float const * data,
+    float const * values,
+    float const * weights,
+    int const n_points,
+    int const estimator_id,
+    int const * finished,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size);
+extern __global__ void cuda_calculate_gradients(
+    float * gradients,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * finished,
+    int const * skip,
+    int const n_fits_per_block,
+    char * user_info,
+    std::size_t const user_info_size);
+extern __global__ void cuda_calculate_hessians(
+    float * hessians,
+    float const * data,
+    float const * values,
+    float const * derivatives,
+    float const * weights,
+    int const n_points,
+    int const n_parameters,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const estimator_id,
+    int const * skip,
+    int const * finished,
+    char * user_info,
+    std::size_t const user_info_size);
+extern __global__ void cuda_modify_step_widths(
+    float * hessians,
+    float const * lambdas,
+    unsigned int const n_parameters,
+    int const * iteration_failed,
+    int const * finished,
+    int const n_fits_per_block);
+extern __global__ void cuda_calc_curve_values(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    int const * finished,
+    float * values,
+    float * derivatives,
+    int const n_fits_per_block,
+    int const model_id,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size);
+extern __global__ void cuda_update_parameters(
+    float * parameters,
+    float * prev_parameters,
+    float const * deltas,
+    int const n_parameters_to_fit,
+    int const * parameters_to_fit_indices,
+    int const * finished,
+    int const n_fits_per_block);
+extern __global__ void cuda_check_for_convergence(
+    int * finished,
+    float const tolerance,
+    int * states,
+    float const * chi_squares,
+    float const * prev_chi_squares,
+    int const iteration,
+    int const max_n_iterations,
+    int const n_fits);
+extern __global__ void cuda_evaluate_iteration(
+    int * all_finished,
+    int * n_iterations,
+    int * finished,
+    int const iteration,
+    int const * states,
+    int const n_fits);
+extern __global__ void cuda_prepare_next_iteration(
+    float * lambdas,
+    float * chi_squares,
+    float * prev_chi_squares,
+    float * parameters,
+    float const * prev_parameters,
+    int const n_fits,
+    int const n_parameters);
+extern __global__ void cuda_update_state_after_gaussjordan(
+    int const n_fits,
+    int const * singular_checks,
+    int * states);
+#endif
diff --git a/Gpufit/definitions.h b/Gpufit/definitions.h
new file mode 100644
index 0000000..348220d
--- /dev/null
+++ b/Gpufit/definitions.h
@@ -0,0 +1,12 @@
+#ifndef GPUFIT_DEFINITIONS_H_INCLUDED
+#define GPUFIT_DEFINITIONS_H_INCLUDED
+
+#include <stdexcept>
+// CUDA_CHECK_STATUS( call ): evaluates `call`; a non-zero cudaError_t result is
+// thrown as std::runtime_error( cudaGetErrorString( status ) ). Expands to a bare `if`.
+#define CUDA_CHECK_STATUS( cuda_function_call ) \
+    if (cudaError_t const status = cuda_function_call) \
+    { \
+        throw std::runtime_error( cudaGetErrorString( status ) ) ; \
+    }
+#endif
diff --git a/Gpufit/examples/CMakeLists.txt b/Gpufit/examples/CMakeLists.txt
new file mode 100644
index 0000000..bb4902f
--- /dev/null
+++ b/Gpufit/examples/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+# add_example( module name ): builds executable <name> from <name>.cpp and links <module>.
+function( add_example module name )
+	add_executable( ${name} ${name}.cpp )
+	target_link_libraries( ${name} PRIVATE ${module} )
+	set_target_properties( ${name} PROPERTIES
+		RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" FOLDER GpufitExamples )
+endfunction()
+
+# Examples: each <name> below is built from <name>.cpp and linked against Gpufit
+
+add_example( Gpufit Simple_Example )
+add_example( Gpufit Linear_Regression_Example )
+add_example( Gpufit Gauss_Fit_2D_Example )
diff --git a/Gpufit/examples/Gauss_Fit_2D_Example.cpp b/Gpufit/examples/Gauss_Fit_2D_Example.cpp
new file mode 100644
index 0000000..8e628c7
--- /dev/null
+++ b/Gpufit/examples/Gauss_Fit_2D_Example.cpp
@@ -0,0 +1,260 @@
+#include "../gpufit.h"
+
+#include <vector>
+#include <random>
+#include <iostream>
+#include <chrono>
+#include <numeric>
+#include <math.h>
+
+void generate_gauss_2d(
+    std::vector<float> const & x,
+    std::vector<float> const & y,
+    std::vector<float> & g,
+    std::vector<float> const & p)
+{
+	// Fills g[k] = p[0] * exp(-((x[k]-p[1])^2 + (y[k]-p[2])^2) / (2*p[3]^2)) + p[4] for all k < x.size().
+	// No size checks: y and g need at least x.size() elements, p at least 5.
+	for (size_t k = 0; k != x.size(); ++k)
+	{
+		float const dx = x[k] - p[1];
+		float const dy = y[k] - p[2];
+		float const arg = -(dx * dx + dy * dy) / (2 * p[3] * p[3]);
+		g[k] = p[0] * exp(arg) + p[4];
+	}
+}
+
+void gauss_fit_2d_example()
+{
+	/*
+        This example generates test data in form of 10000 two dimensional Gaussian
+        peaks with the size of 20x20 data points per peak. It is noised by Poisson
+        distributed noise. The initial guesses were randomized, within a specified
+        range of the true value. The GAUSS_2D model is fitted to the test data sets
+        using the MLE estimator.
+
+        The console output shows
+         - the execution time,
+         - the ratio of converged fits including ratios of not converged fits for 
+           different reasons,
+         - the values of the true parameters and the mean values of the fitted
+           parameters including their standard deviation,
+         - the mean chi square value
+         - and the mean number of iterations needed.
+
+		True parameters and noise and number of fits is the same as for the Matlab/Python 2D Gaussian examples.
+		Throws std::runtime_error if gpufit fails, std::out_of_range on an unknown fit state.
+	*/
+
+	// number of fits, fit points and parameters
+	size_t const number_fits = 10000;
+	size_t const size_x = 20;
+	size_t const number_points = size_x * size_x;
+	size_t const number_parameters = 5;
+
+	// true parameters (amplitude, center x position, center y position, width, offset)
+	std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f}; 
+
+	// initialize random number generator
+	std::mt19937 rng;
+	rng.seed(0);
+	std::uniform_real_distribution< float> uniform_dist(0, 1);
+
+	// initial parameters (randomized)
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+	for (size_t i = 0; i < number_fits; i++)
+	{
+		for (size_t j = 0; j < number_parameters; j++)
+		{
+			if (j == 1 || j == 2)
+			{
+				initial_parameters[i * number_parameters + j]
+                    = true_parameters[j] + true_parameters[3] 
+                    * (-0.2f + 0.4f * uniform_dist(rng));
+			}
+			else
+			{
+				initial_parameters[i * number_parameters + j]
+                    = true_parameters[j] * (0.8f + 0.4f * uniform_dist(rng));
+			}
+		}
+	}
+
+	// generate x and y values
+	std::vector< float > x(number_points);
+	std::vector< float > y(number_points);
+	for (size_t i = 0; i < size_x; i++)
+	{
+		for (size_t j = 0; j < size_x; j++) {
+			x[i * size_x + j] = static_cast<float>(j);
+			y[i * size_x + j] = static_cast<float>(i);
+		}
+	}
+
+	// generate test data with Poisson noise
+	std::vector< float > temp(number_points);
+	generate_gauss_2d(x, y, temp, true_parameters);
+
+	std::vector< float > data(number_fits * number_points);
+	for (size_t i = 0; i < number_fits; i++)
+	{
+		for (size_t j = 0; j < number_points; j++)
+		{
+			std::poisson_distribution< int > poisson_dist(temp[j]);
+			data[i * number_points + j] = static_cast<float>(poisson_dist(rng));
+		}
+	}
+
+	// tolerance
+	float const tolerance = 0.001f;
+
+	// maximal number of iterations
+	int const max_number_iterations = 20;
+
+	// estimator ID
+	int const estimator_id = MLE;
+
+	// model ID
+	int const model_id = GAUSS_2D;
+
+	// parameters to fit (all of them)
+	std::vector< int > parameters_to_fit(number_parameters, 1);
+
+	// output parameters
+	std::vector< float > output_parameters(number_fits * number_parameters);
+	std::vector< int > output_states(number_fits);
+	std::vector< float > output_chi_square(number_fits);
+	std::vector< int > output_number_iterations(number_fits);
+
+	// call to gpufit (C interface)
+	std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now();
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            0, // user_info_size: this example passes no user info
+            0, // user_info
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+	std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now();
+
+	// check status
+	if (status != STATUS_OK)
+	{
+		throw std::runtime_error(gpufit_get_last_error());
+	}
+
+	// print execution time
+	std::cout
+        << "execution time "
+        << std::chrono::duration_cast<std::chrono::milliseconds>(time_1 - time_0).count() << " ms\n";
+
+	// histogram of fit states; at() rejects a state outside 0..4 instead of writing out of bounds
+	std::vector< int > output_states_histogram(5, 0);
+	for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+	{
+		output_states_histogram.at(*it)++;
+	}
+
+	std::cout << "ratio converged              " << (float)output_states_histogram[0] / number_fits << "\n";
+	std::cout << "ratio max iteration exceeded " << (float)output_states_histogram[1] / number_fits << "\n";
+	std::cout << "ratio singular hessian       " << (float)output_states_histogram[2] / number_fits << "\n";
+	std::cout << "ratio neg curvature MLE      " << (float)output_states_histogram[3] / number_fits << "\n";
+	std::cout << "ratio gpu not ready          " << (float)output_states_histogram[4] / number_fits << "\n";
+
+	// compute mean of fitted parameters for converged fits
+	std::vector< float > output_parameters_mean(number_parameters, 0);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			for (size_t j = 0; j < number_parameters; j++)
+			{
+				output_parameters_mean[j] += output_parameters[i * number_parameters + j];
+			}
+		}
+	}
+	// normalize
+	for (size_t j = 0; j < number_parameters; j++)
+	{
+		output_parameters_mean[j] /= output_states_histogram[0];
+	}
+	
+	// compute std of fitted parameters for converged fits
+	std::vector< float > output_parameters_std(number_parameters, 0);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			for (size_t j = 0; j < number_parameters; j++)
+			{
+				output_parameters_std[j]
+                    += (output_parameters[i * number_parameters + j] - output_parameters_mean[j])
+                    *  (output_parameters[i * number_parameters + j] - output_parameters_mean[j]);
+			}
+		}
+	}
+	// normalize and take square root
+	for (size_t j = 0; j < number_parameters; j++)
+	{
+		output_parameters_std[j] = sqrt(output_parameters_std[j] / output_states_histogram[0]);
+	}
+
+	// print true value, fitted mean and std for every parameter
+	for (size_t j = 0; j < number_parameters; j++)
+	{
+		std::cout
+            << "parameter "     << j
+            << " true "         << true_parameters[j]
+            << " fitted mean "  << output_parameters_mean[j]
+            << " std "          << output_parameters_std[j] << "\n";
+	}
+
+	// compute mean chi-square for those converged
+	float  output_chi_square_mean = 0;
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			output_chi_square_mean += output_chi_square[i];
+		}
+	}
+	output_chi_square_mean /= static_cast<float>(output_states_histogram[0]);
+	std::cout << "mean chi square " << output_chi_square_mean << "\n";
+
+	// compute mean number of iterations for those converged
+	float  output_number_iterations_mean = 0;
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			output_number_iterations_mean += static_cast<float>(output_number_iterations[i]);
+		}
+	}
+	// normalize
+	output_number_iterations_mean /= static_cast<float>(output_states_histogram[0]);
+	std::cout << "mean number of iterations " << output_number_iterations_mean << "\n";
+
+}
+
+int main(int argc, char *argv[])
+{
+	// Run the 2D Gaussian demo, then keep the console open until ENTER is pressed.
+	gauss_fit_2d_example();
+	std::cout << std::endl;
+	std::cout << "Example completed!" << std::endl;
+	std::cout << "Press ENTER to exit" << std::endl;
+	std::getchar();
+	return 0;
+}
diff --git a/Gpufit/examples/Linear_Regression_Example.cpp b/Gpufit/examples/Linear_Regression_Example.cpp
new file mode 100644
index 0000000..e70e05d
--- /dev/null
+++ b/Gpufit/examples/Linear_Regression_Example.cpp
@@ -0,0 +1,207 @@
+#include "../gpufit.h"
+
+#include <vector>
+#include <random>
+#include <iostream>
+#include <math.h>
+
+void linear_regression_example()
+{
+    /*
+    This example generates test data in form of 10000 one dimensional linear
+    curves with the size of 20 data points per curve. It is noised by normal
+    distributed noise. The initial guesses were randomized, within a specified
+    range of the true value. The LINEAR_1D model is fitted to the test data sets
+    using the LSE estimator. The optional parameter user_info is used to pass 
+    custom x positions of the data sets. The same x position values are used for
+    every fit.
+
+    The console output shows
+    - the ratio of converged fits including ratios of not converged fits for
+      different reasons,
+    - the values of the true parameters and the mean values of the fitted
+      parameters including their standard deviation,
+    - the mean chi square value
+    - and the mean number of iterations needed.
+    Throws std::runtime_error if gpufit fails, std::out_of_range on an unknown fit state.
+    */
+
+	// number of fits, fit points and parameters
+	size_t const number_fits = 10000;
+	size_t const number_points = 20;
+	size_t const number_parameters = 2;
+
+	// custom x positions for the data points of every fit, stored in user info
+	std::vector< float > user_info(number_points);
+	for (size_t i = 0; i < number_points; i++)
+	{
+		user_info[i] = static_cast<float>(pow(2, i));
+	}
+
+	// size of user info in bytes
+	size_t const user_info_size = number_points * sizeof(float); 
+
+	// initialize random number generator
+	std::mt19937 rng;
+	rng.seed(0);
+	std::uniform_real_distribution< float > uniform_dist(0, 1);
+	std::normal_distribution< float > normal_dist(0, 1);
+
+	// true parameters
+	std::vector< float > true_parameters { 5, 2 }; // offset, slope
+
+	// initial parameters (randomized)
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		// random offset
+		initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng));
+		// random slope (scattered around the true slope, not around the offset)
+		initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng));
+	}
+
+	// generate data
+	std::vector< float > data(number_points * number_fits);
+	for (size_t i = 0; i != data.size(); i++)
+	{
+		// data is stored fit after fit; only the position inside the fit is needed
+		size_t k = i % number_points; // the position within a fit
+
+		float x = user_info[k];
+		float y = true_parameters[0] + x * true_parameters[1];
+		data[i] = y + normal_dist(rng);
+	}
+
+	// tolerance
+	float const tolerance = 0.001f;
+
+	// maximal number of iterations
+	int const max_number_iterations = 20;
+
+	// estimator ID
+	int const estimator_id = LSE;
+
+	// model ID
+	int const model_id = LINEAR_1D;
+
+	// parameters to fit (all of them)
+	std::vector< int > parameters_to_fit(number_parameters, 1);
+
+	// output parameters
+	std::vector< float > output_parameters(number_fits * number_parameters);
+	std::vector< int > output_states(number_fits);
+	std::vector< float > output_chi_square(number_fits);
+	std::vector< int > output_number_iterations(number_fits);
+
+	// call to gpufit (C interface)
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            user_info_size,
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+	// check status
+	if (status != STATUS_OK)
+	{
+		throw std::runtime_error(gpufit_get_last_error());
+	}
+
+	// histogram of fit states; at() rejects a state outside 0..4 instead of writing out of bounds
+	std::vector< int > output_states_histogram(5, 0);
+	for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+	{
+		output_states_histogram.at(*it)++;
+	}
+
+	std::cout << "ratio converged              " << (float) output_states_histogram[0] / number_fits << "\n";
+	std::cout << "ratio max iteration exceeded " << (float) output_states_histogram[1] / number_fits << "\n";
+	std::cout << "ratio singular hessian       " << (float) output_states_histogram[2] / number_fits << "\n";
+	std::cout << "ratio neg curvature MLE      " << (float) output_states_histogram[3] / number_fits << "\n";
+	std::cout << "ratio gpu not ready          " << (float) output_states_histogram[4] / number_fits << "\n";
+
+	// compute mean fitted parameters for converged fits
+	std::vector< float > output_parameters_mean(number_parameters, 0);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			// add offset
+			output_parameters_mean[0] += output_parameters[i * number_parameters + 0];
+			// add slope
+			output_parameters_mean[1] += output_parameters[i * number_parameters + 1];
+		}
+	}
+	output_parameters_mean[0] /= output_states_histogram[0];
+	output_parameters_mean[1] /= output_states_histogram[0];
+
+	// compute std of fitted parameters for converged fits
+	std::vector< float > output_parameters_std(number_parameters, 0);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			// add squared deviation for offset
+			output_parameters_std[0] += (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]) * (output_parameters[i * number_parameters + 0] - output_parameters_mean[0]);
+			// add squared deviation for slope
+			output_parameters_std[1] += (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]) * (output_parameters[i * number_parameters + 1] - output_parameters_mean[1]);
+		}
+	}
+	// divide and take square root
+	output_parameters_std[0] = sqrt(output_parameters_std[0] / output_states_histogram[0]);
+	output_parameters_std[1] = sqrt(output_parameters_std[1] / output_states_histogram[0]);
+
+	// print mean and std
+	std::cout << "offset  true " << true_parameters[0] << " mean " << output_parameters_mean[0] << " std " << output_parameters_std[0] << "\n";
+	std::cout << "slope   true " << true_parameters[1] << " mean " << output_parameters_mean[1] << " std " << output_parameters_std[1] << "\n";
+
+	// compute mean chi-square for those converged
+	float  output_chi_square_mean = 0;
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			output_chi_square_mean += output_chi_square[i];
+		}
+	}
+	output_chi_square_mean /= static_cast<float>(output_states_histogram[0]);
+	std::cout << "mean chi square " << output_chi_square_mean << "\n";
+
+	// compute mean number of iterations for those converged
+	float  output_number_iterations_mean = 0;
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			output_number_iterations_mean += static_cast<float>(output_number_iterations[i]);
+		}
+	}
+	// normalize
+	output_number_iterations_mean /= static_cast<float>(output_states_histogram[0]);
+	std::cout << "mean number of iterations " << output_number_iterations_mean << "\n";
+}
+
+
+int main(int argc, char *argv[])
+{
+	// Run the linear regression demo, then keep the console open until ENTER is pressed.
+	linear_regression_example();
+	std::cout << std::endl;
+	std::cout << "Example completed!" << std::endl;
+	std::cout << "Press ENTER to exit" << std::endl;
+	std::getchar();
+	return 0;
+}
diff --git a/Gpufit/examples/Simple_Example.cpp b/Gpufit/examples/Simple_Example.cpp
new file mode 100644
index 0000000..6d8ea91
--- /dev/null
+++ b/Gpufit/examples/Simple_Example.cpp
@@ -0,0 +1,94 @@
+#include "../gpufit.h"
+#include <iostream>
+#include <vector>
+
+void simple_example()
+{
+	/*
+		Simple example demonstrating a minimal call of all needed parameters to
+        the C interface. It can be built and executed, but in this example
+        gpufit doesn't do anything useful and it doesn't yield meaningful
+        output. No test data is generated. The values of the input data vector
+        and the initial fit parameters vector are set to 0.
+
+        This example can be divided into three parts:
+            - definition of input and output parameters
+            - call to gpufit
+            - status check (throws std::runtime_error if gpufit fails)
+	*/
+
+    /*************** definition of input and output parameters  ***************/
+
+	// number of fits, number of points per fit
+	size_t const number_fits = 10;
+	size_t const number_points = 10;
+
+	// model ID and number of parameters
+	int const model_id = GAUSS_1D;
+	size_t const number_parameters = 4;
+
+	// initial parameters (value-initialized, i.e. all zero)
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+
+	// data (value-initialized, i.e. all zero)
+	std::vector< float > data(number_points * number_fits);
+
+	// tolerance
+	float const tolerance = 0.001f;
+
+	// maximal number of iterations
+	int const max_number_iterations = 10;
+
+	// estimator ID
+	int const estimator_id = LSE;
+
+	// parameters to fit (all of them)
+	std::vector< int > parameters_to_fit(number_parameters, 1);
+
+	// output parameters
+	std::vector< float > output_parameters(number_fits * number_parameters);
+	std::vector< int > output_states(number_fits);
+	std::vector< float > output_chi_square(number_fits);
+	std::vector< int > output_number_iterations(number_fits);
+
+    /***************************** call to gpufit  ****************************/
+
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            0, // user_info_size: this example passes no user info
+            0, // user_info
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+    /****************************** status check  *****************************/
+
+	// any status other than STATUS_OK is reported by throwing the last gpufit error message
+	if (status != STATUS_OK)
+	{
+		throw std::runtime_error(gpufit_get_last_error());
+	}
+}
+
+
+int main(int argc, char *argv[])
+{
+	// Run the minimal demo, then keep the console open until ENTER is pressed.
+	simple_example();
+	std::cout << std::endl;
+	std::cout << "Example completed!" << std::endl;
+	std::cout << "Press ENTER to exit" << std::endl;
+	std::getchar();
+	return 0;
+}
diff --git a/Gpufit/gauss_1d.cuh b/Gpufit/gauss_1d.cuh
new file mode 100644
index 0000000..5fefc55
--- /dev/null
+++ b/Gpufit/gauss_1d.cuh
@@ -0,0 +1,91 @@
+#ifndef GPUFIT_GAUSS1D_CUH_INCLUDED
+#define GPUFIT_GAUSS1D_CUH_INCLUDED
+
+/* Description of the calculate_gauss1d function
+* ==============================================
+*
+* This function calculates the values of one-dimensional gauss model functions
+* and their partial derivatives with respect to the model parameters. 
+*
+* No independent variables are passed to this model function.  Hence, the 
+* (X) coordinate of the first data value is assumed to be (0.0).  For
+* a fit size of M data points, the (X) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: amplitude
+*             p[1]: center coordinate
+*             p[2]: width (standard deviation)
+*             p[3]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The size of user_info in bytes. (not used)
+*
+* Calling the calculate_gauss1d function
+* ======================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss1d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Locate this thread's fit and its data point inside that fit.
+    int const fits_per_block = blockDim.x / n_points;
+    int const local_fit = threadIdx.x / n_points;
+    int const fit_index = blockIdx.x * fits_per_block + local_fit;
+    int const point_index = threadIdx.x - (local_fit * n_points);
+
+    float const * p = parameters + fit_index * n_parameters;
+    float const amplitude = p[0], width = p[2], offset = p[3];
+    float const dx = point_index - p[1];  // distance of the point from the center
+    // Model value: amplitude * exp(-dx^2 / (2 * width^2)) + offset
+    float const argx = dx * dx / (2 * width * width);
+    float const ex = exp(-argx);
+    values[fit_index * n_points + point_index] = amplitude * ex + offset;
+
+    // Partial derivatives, stored as one run of n_points values per parameter.
+    float * const d = derivatives + (fit_index * n_points * n_parameters + point_index);
+    d[0 * n_points] = ex;                                                  // d/d amplitude
+    d[1 * n_points] = amplitude * ex * dx / (width * width);               // d/d center
+    d[2 * n_points] = amplitude * ex * dx * dx / (width * width * width);  // d/d width
+    d[3 * n_points] = 1.f;                                                 // d/d offset
+}
+
+#endif
diff --git a/Gpufit/gauss_2d.cuh b/Gpufit/gauss_2d.cuh
new file mode 100644
index 0000000..0448cfa
--- /dev/null
+++ b/Gpufit/gauss_2d.cuh
@@ -0,0 +1,97 @@
+#ifndef GPUFIT_GAUSS2D_CUH_INCLUDED
+#define GPUFIT_GAUSS2D_CUH_INCLUDED
+
+/* Description of the calculate_gauss2d function
+* ==============================================
+*
+* This function calculates the values of two-dimensional gauss model functions
+* and their partial derivatives with respect to the model parameters. 
+*
+* No independent variables are passed to this model function.  Hence, the 
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0).  For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: amplitude
+*             p[1]: center coordinate x
+*             p[2]: center coordinate y
+*             p[3]: width (standard deviation; equal width in x and y dimensions)
+*             p[4]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The size of user_info in bytes. (not used)
+*
+* Calling the calculate_gauss2d function
+* ======================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_gauss2d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // Locate this thread's fit and its data point inside that fit.
+    int const fits_per_block = blockDim.x / n_points;
+    int const local_fit = threadIdx.x / n_points;
+    int const fit_index = blockIdx.x * fits_per_block + local_fit;
+    int const point_index = threadIdx.x - local_fit * n_points;
+    // Grid coordinates: the side length is taken as sqrt(n_points), x runs fastest.
+    int const n_points_x = sqrt((float)n_points);
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - point_index_y * n_points_x;
+
+    float const * p = parameters + fit_index * n_parameters;
+    float const amplitude = p[0], width = p[3], offset = p[4];
+    float const dx = point_index_x - p[1], dy = point_index_y - p[2];
+
+    // Model value: amplitude * exp(-(dx^2 + dy^2) / (2 * width^2)) + offset
+    float const argx = dx * dx / (2 * width * width);
+    float const argy = dy * dy / (2 * width * width);
+    float const ex = exp(-(argx + argy));
+    values[fit_index * n_points + point_index] = amplitude * ex + offset;
+    // Partial derivatives, stored as one run of n_points values per parameter.
+    float * const d = derivatives + (fit_index * n_points * n_parameters + point_index);
+    d[0 * n_points] = ex;                                                               // d/d amplitude
+    d[1 * n_points] = amplitude * ex * dx / (width * width);                            // d/d center x
+    d[2 * n_points] = amplitude * ex * dy / (width * width);                            // d/d center y
+    d[3 * n_points] = ex * amplitude * (dx * dx + dy * dy) / (width * width * width);   // d/d width
+    d[4 * n_points] = 1;                                                                // d/d offset
+}
+
+#endif
diff --git a/Gpufit/gauss_2d_elliptic.cuh b/Gpufit/gauss_2d_elliptic.cuh
new file mode 100644
index 0000000..5417667
--- /dev/null
+++ b/Gpufit/gauss_2d_elliptic.cuh
@@ -0,0 +1,100 @@
+#ifndef GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED
+#define GPUFIT_GAUSS2DELLIPTIC_CUH_INCLUDED
+
+/* Description of the calculate_gauss2delliptic function
+* ======================================================
+*
+* This function calculates the values of two-dimensional elliptic gauss model
+* functions and their partial derivatives with respect to the model parameters.
+*
+* No independent variables are passed to this model function.  Hence, the 
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0).  For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: amplitude
+*             p[1]: center coordinate x
+*             p[2]: center coordinate y
+*             p[3]: width x (standard deviation)
+*             p[4]: width y (standard deviation)
+*             p[5]: offset
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss2delliptic function
+* ==============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Evaluates the elliptic (axis-aligned) 2D Gaussian model and its six partial
+// derivatives for the single (fit, data point) pair handled by the calling
+// thread.
+//
+// Each block holds blockDim.x / n_points fits; each fit owns n_points
+// consecutive threads. The row length of the data grid is derived as
+// sqrt(n_points), so the code presumably expects a square grid -- TODO
+// confirm with callers.
+//
+// n_fits, chunk_index, user_info and user_info_size are not read here.
+__device__ void calculate_gauss2delliptic(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // which fit / which data point does this thread work on?
+    int const fits_per_block = blockDim.x / n_points;
+    int const local_fit = threadIdx.x / n_points;
+    int const fit_index = blockIdx.x * fits_per_block + local_fit;
+    int const point_index = threadIdx.x - (local_fit * n_points);
+
+    // grid coordinates of the data point (array indices, starting at zero)
+    int const side_length = sqrt((float)n_points);
+    int const point_index_y = point_index / side_length;
+    int const point_index_x = point_index - point_index_y * side_length;
+
+    // p[0] amplitude, p[1] center x, p[2] center y,
+    // p[3] width x, p[4] width y, p[5] offset
+    float const * p = parameters + fit_index * n_parameters;
+
+    float const dx = point_index_x - p[1];
+    float const dy = point_index_y - p[2];
+
+    // model value
+    float const argx = dx * dx / (2 * p[3] * p[3]);
+    float const argy = dy * dy / (2 * p[4] * p[4]);
+    float const ex = exp(-(argx + argy));
+    values[fit_index * n_points + point_index] = p[0] * ex + p[5];
+
+    // derivatives: one plane of n_points entries per parameter
+    float * d = derivatives + fit_index * n_points * n_parameters + point_index;
+
+    d[0] = ex;
+    d[1 * n_points] = p[0] * ex * dx / (p[3] * p[3]);
+    d[2 * n_points] = p[0] * ex * dy / (p[4] * p[4]);
+    d[3 * n_points] = p[0] * ex * dx * dx / (p[3] * p[3] * p[3]);
+    d[4 * n_points] = p[0] * ex * dy * dy / (p[4] * p[4] * p[4]);
+    d[5 * n_points] = 1;
+}
+
+#endif
diff --git a/Gpufit/gauss_2d_rotated.cuh b/Gpufit/gauss_2d_rotated.cuh
new file mode 100644
index 0000000..09d042f
--- /dev/null
+++ b/Gpufit/gauss_2d_rotated.cuh
@@ -0,0 +1,106 @@
+#ifndef GPUFIT_GAUSS2DROTATED_CUH_INCLUDED
+#define GPUFIT_GAUSS2DROTATED_CUH_INCLUDED
+
+/* Description of the calculate_gauss2drotated function
+* =====================================================
+*
+* This function calculates the values of two-dimensional elliptic gauss model
+* functions including a rotation parameter and their partial derivatives with
+* respect to the model parameters. 
+*
+* No independent variables are passed to this model function.  Hence, the 
+* (X, Y) coordinate of the first data value is assumed to be (0.0, 0.0).  For
+* a fit size of M x N data points, the (X, Y) coordinates of the data are
+* simply the corresponding array index values of the data array, starting from
+* zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: amplitude
+*             p[1]: center coordinate x
+*             p[2]: center coordinate y
+*             p[3]: width x (standard deviation)
+*             p[4]: width y (standard deviation)
+*             p[5]: offset
+*             p[6]: rotation angle [radians]
+*
+* n_fits: The number of fits. (not used)
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. (not used)
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gauss2drotated function
+* =============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+// Evaluates the rotated elliptic 2D Gaussian model and its seven partial
+// derivatives for the single (fit, data point) pair handled by the calling
+// thread.
+//
+// Each block holds blockDim.x / n_points fits; each fit owns n_points
+// consecutive threads. The row length of the data grid is derived as
+// sqrt(n_points), so the code presumably expects a square grid -- TODO
+// confirm with callers.
+//
+// All arithmetic is kept in single precision: the literals are written with
+// an 'f' suffix and expf() is used, matching the cosf()/sinf() calls, so that
+// no sub-expression is silently promoted to double on the device.
+//
+// n_fits, chunk_index, user_info and user_info_size are not read here.
+__device__ void calculate_gauss2drotated(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_points_x = sqrt((float)n_points);
+    int const n_fits_per_block = blockDim.x / n_points;
+    int const fit_in_block = threadIdx.x / n_points;
+    int const point_index = threadIdx.x - (fit_in_block*n_points);
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+
+    // grid coordinates of the data point (array indices, starting at zero)
+    int const point_index_y = point_index / n_points_x;
+    int const point_index_x = point_index - point_index_y * n_points_x;
+
+    float* current_value = &values[fit_index * n_points];
+
+    // p[0] amplitude, p[1] center x, p[2] center y, p[3] width x,
+    // p[4] width y, p[5] offset, p[6] rotation angle
+    float const * p = &parameters[fit_index * n_parameters];
+
+    float const cosp6 = cosf(p[6]);
+    float const sinp6 = sinf(p[6]);
+
+    // coordinates of the point relative to the center, in the rotated frame
+    float const arga = (point_index_x - p[1]) * cosp6 - (point_index_y - p[2]) * sinp6;
+    float const argb = (point_index_x - p[1]) * sinp6 + (point_index_y - p[2]) * cosp6;
+    float const ex = expf(-0.5f * (((arga / p[3]) * (arga / p[3])) + ((argb / p[4]) * (argb / p[4]))));
+    current_value[point_index] = p[0] * ex + p[5];
+
+    // derivatives: one plane of n_points entries per parameter
+
+    float * current_derivative = &derivatives[fit_index * n_points * n_parameters + point_index];
+
+    current_derivative[0] = ex;
+    current_derivative[1 * n_points] = (((p[0] * cosp6 * arga) / (p[3] * p[3])) + ((p[0] * sinp6 * argb) / (p[4] * p[4]))) * ex;
+    current_derivative[2 * n_points] = (((-p[0] * sinp6 * arga) / (p[3] * p[3])) + ((p[0] * cosp6 * argb) / (p[4] * p[4]))) * ex;
+    current_derivative[3 * n_points] = p[0] * arga * arga / (p[3] * p[3] * p[3]) * ex;
+    current_derivative[4 * n_points] = p[0] * argb * argb / (p[4] * p[4] * p[4]) * ex;
+    current_derivative[5 * n_points] = 1;
+    current_derivative[6 * n_points] = p[0] * arga * argb * (1.0f / (p[3] * p[3]) - 1.0f / (p[4] * p[4])) * ex;
+}
+
+#endif
diff --git a/Gpufit/gpu_data.cu b/Gpufit/gpu_data.cu
new file mode 100644
index 0000000..afbca05
--- /dev/null
+++ b/Gpufit/gpu_data.cu
@@ -0,0 +1,175 @@
+#include "gpu_data.cuh"
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+// Allocates every device buffer needed for one chunk of fits. All buffer
+// sizes are derived from the limits stored in 'info' (max_chunk_size_,
+// n_points_, n_parameters_, n_parameters_to_fit_, user_info_size_).
+//
+// 'info' is stored by reference, so it must outlive this object.
+// Device_Array throws std::runtime_error when an allocation fails.
+GPUData::GPUData(Info const & info) :
+    chunk_size_(0),
+    info_(info),
+
+    // give the public chunk index a defined value until init() sets it
+    chunk_index_(0),
+
+    data_( info_.max_chunk_size_*info_.n_points_ ),
+    // the weights buffer is only given a size when weights are in use
+    weights_( info_.use_weights_ ? info_.n_points_ * info_.max_chunk_size_ : 0 ),
+    parameters_( info_.max_chunk_size_*info_.n_parameters_ ),
+    prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ),
+    parameters_to_fit_indices_( info_.n_parameters_to_fit_ ),
+    user_info_( info_.user_info_size_ ),
+
+    chi_squares_( info_.max_chunk_size_ ),
+    prev_chi_squares_( info_.max_chunk_size_ ),
+    gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ ),
+    hessians_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_ ),
+    deltas_(info_.max_chunk_size_ * info_.n_parameters_to_fit_),
+
+    values_( info_.max_chunk_size_ * info_.n_points_ ),
+    derivatives_( info_.max_chunk_size_ * info_.n_points_ * info_.n_parameters_ ),
+
+    lambdas_( info_.max_chunk_size_ ),
+    states_( info_.max_chunk_size_ ),
+    finished_( info_.max_chunk_size_ ),
+    iteration_falied_(info_.max_chunk_size_),
+    all_finished_( 1 ),
+    n_iterations_( info_.max_chunk_size_ )
+{
+
+}
+
+// Records the size of the chunk about to be processed and clears the first
+// chunk_size entries (times the per-fit element count) of every device
+// buffer to zero. The weights buffer is only touched when weights are used.
+void GPUData::reset(int const chunk_size)
+{
+    chunk_size_ = chunk_size;
+
+    // element counts shared by several buffers
+    int const n_fit_parameters = info_.n_parameters_to_fit_;
+    int const n_data_values = chunk_size_ * info_.n_points_;
+    int const n_model_parameters = chunk_size_ * info_.n_parameters_;
+    int const n_fitted_values = chunk_size_ * n_fit_parameters;
+
+    // input buffers
+    set(data_, 0.f, n_data_values);
+    if (info_.use_weights_)
+    {
+        set(weights_, 0.f, n_data_values);
+    }
+    set(parameters_, 0.f, n_model_parameters);
+    set(prev_parameters_, 0.f, n_model_parameters);
+    set(parameters_to_fit_indices_, 0, n_fit_parameters);
+
+    // per-fit optimisation quantities
+    set(chi_squares_, 0.f, chunk_size_);
+    set(prev_chi_squares_, 0.f, chunk_size_);
+    set(gradients_, 0.f, n_fitted_values);
+    set(hessians_, 0.f, n_fitted_values * n_fit_parameters);
+    set(deltas_, 0.f, n_fitted_values);
+
+    // model output
+    set(values_, 0.f, n_data_values);
+    set(derivatives_, 0.f, n_data_values * info_.n_parameters_);
+
+    // bookkeeping
+    set(lambdas_, 0.f, chunk_size_);
+    set(states_, 0, chunk_size_);
+    set(finished_, 0, chunk_size_);
+    set(iteration_falied_, 0, chunk_size_);
+    set(all_finished_, 0, 1);
+    set(n_iterations_, 0, chunk_size_);
+}
+
+// Uploads the host-side input of chunk 'chunk_index' to the device.
+//
+// The host arrays hold all fits back to back; the chunk starts at fit
+// chunk_index * max_chunk_size_ and chunk_size_ fits (as set by reset())
+// are copied. 'weights' is only read when info_.use_weights_ is set.
+// Finally every lambda of the chunk is set to 0.001.
+void GPUData::init
+(
+    int const chunk_index,
+    float const * const data,
+    float const * const weights,
+    float const * const initial_parameters,
+    std::vector<int> const & parameters_to_fit_indices)
+{
+    chunk_index_ = chunk_index;
+
+    // index of the first fit of this chunk within the host arrays
+    std::size_t const first_fit = chunk_index_ * info_.max_chunk_size_;
+    int const n_data_values = chunk_size_ * info_.n_points_;
+
+    write(data_, &data[first_fit * info_.n_points_], n_data_values);
+
+    if (info_.use_weights_)
+    {
+        write(weights_, &weights[first_fit * info_.n_points_], n_data_values);
+    }
+
+    write(
+        parameters_,
+        &initial_parameters[first_fit * info_.n_parameters_],
+        chunk_size_ * info_.n_parameters_);
+
+    write(parameters_to_fit_indices_, parameters_to_fit_indices);
+
+    set(lambdas_, 0.001f, chunk_size_);
+}
+
+// Copies info_.user_info_size_ bytes of user information to the device.
+// Nothing is copied when the size is zero.
+void GPUData::init_user_info(char const * const user_info)
+{
+    if (info_.user_info_size_ == 0)
+    {
+        return;
+    }
+
+    write(user_info_, user_info, info_.user_info_size_);
+}
+
+// Copies one int from device memory at 'src' and stores in '*dst' whether
+// that int equals 1.
+void GPUData::read(bool * dst, int const * src)
+{
+    int device_flag = 0;
+    CUDA_CHECK_STATUS(cudaMemcpy(&device_flag, src, sizeof(int), cudaMemcpyDeviceToHost));
+    *dst = (device_flag == 1);
+}
+
+// Host-to-device copy of 'count' floats.
+void GPUData::write(float* dst, float const * src, int const count)
+{
+    std::size_t const n_bytes = count * sizeof(float);
+    CUDA_CHECK_STATUS(cudaMemcpy(dst, src, n_bytes, cudaMemcpyHostToDevice));
+}
+
+// Host-to-device copy of all elements of an int vector.
+void GPUData::write(int* dst, std::vector<int> const & src)
+{
+    std::size_t const n_bytes = src.size() * sizeof(int);
+    CUDA_CHECK_STATUS(cudaMemcpy(dst, src.data(), n_bytes, cudaMemcpyHostToDevice));
+}
+
+// Host-to-device copy of 'count' chars.
+void GPUData::write(char* dst, char const * src, std::size_t const count)
+{
+    std::size_t const n_bytes = count * sizeof(char);
+    CUDA_CHECK_STATUS(cudaMemcpy(dst, src, n_bytes, cudaMemcpyHostToDevice));
+}
+
+// Device-to-device copy of 'count' floats from 'src' to 'dst'.
+void GPUData::copy(float * dst, float const * src, std::size_t const count)
+{
+    std::size_t const n_bytes = count * sizeof(float);
+    CUDA_CHECK_STATUS(cudaMemcpy(dst, src, n_bytes, cudaMemcpyDeviceToDevice));
+}
+
+// Kernel: writes 'value' to dst[i] for the global thread index i, provided
+// i < count. Threads beyond 'count' do nothing.
+__global__ void set_kernel(int* dst, int const value, int const count)
+{
+    int const i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < count)
+    {
+        dst[i] = value;
+    }
+}
+
+// Fills the first 'count' ints of the device array 'arr' with 'value',
+// using 256 threads per block and count / 256 + 1 blocks.
+void GPUData::set(int* arr, int const value, int const count)
+{
+    int const threads_per_block = 256;
+    int const n_blocks = (count / threads_per_block) + 1;
+
+    dim3 const block_shape(threads_per_block, 1, 1);
+    dim3 const grid_shape(n_blocks, 1, 1);
+
+    set_kernel<<< grid_shape, block_shape >>>(arr, value, count);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Writes 'value' to the single int at device address 'arr', using a launch
+// of one block with one thread.
+void GPUData::set(int* arr, int const value)
+{
+    dim3 const block_shape(1, 1, 1);
+    dim3 const grid_shape(1, 1, 1);
+
+    set_kernel<<< grid_shape, block_shape >>>(arr, value, 1);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+// Kernel: writes 'value' to dst[i] for the global thread index i, provided
+// i < count. Threads beyond 'count' do nothing.
+__global__ void set_kernel(float* dst, float const value, std::size_t const count)
+{
+    std::size_t const i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < count)
+    {
+        dst[i] = value;
+    }
+}
+
+// Fills the first 'count' floats of the device array 'arr' with 'value',
+// using 256 threads per block and count / 256 + 1 blocks.
+void GPUData::set(float* arr, float const value, int const count)
+{
+    int const threads_per_block = 256;
+    int const n_blocks = (count / threads_per_block) + 1;
+
+    dim3 const block_shape(threads_per_block, 1, 1);
+    dim3 const grid_shape(n_blocks, 1, 1);
+
+    set_kernel<<< grid_shape, block_shape >>>(arr, value, count);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
diff --git a/Gpufit/gpu_data.cuh b/Gpufit/gpu_data.cuh
new file mode 100644
index 0000000..b35f09d
--- /dev/null
+++ b/Gpufit/gpu_data.cuh
@@ -0,0 +1,122 @@
+#ifndef GPUFIT_GPU_DATA_CUH_INCLUDED
+#define GPUFIT_GPU_DATA_CUH_INCLUDED
+
+#include "info.h"
+
+#include <cuda_runtime.h>
+
+#include <stdexcept>
+#include <vector>
+#include <limits>
+
+// Owning wrapper around a block of device memory holding 'size' elements of
+// 'Type'. The memory is obtained with cudaMalloc in the constructor and
+// released with cudaFree in the destructor.
+//
+// NOTE: no copy operations are declared, so a compiler-generated copy would
+// share the same device pointer and free it twice. Do not copy instances.
+template< typename Type >
+struct Device_Array
+{
+    // Allocates size * sizeof( Type ) bytes on the device.
+    // Throws std::runtime_error if the byte count would overflow std::size_t
+    // or if cudaMalloc reports an error.
+    explicit Device_Array( std::size_t const size )
+        // start from a null pointer so that the destructor never hands an
+        // indeterminate value to cudaFree, whatever cudaMalloc does with the
+        // output pointer for a zero-size request
+        : data_( 0 )
+    {
+        std::size_t const maximum_size = std::numeric_limits< std::size_t >::max() ;
+        std::size_t const type_size = sizeof( Type ) ;
+
+        if (size > maximum_size / type_size)
+        {
+            throw std::runtime_error( "maximum array size exceeded" ) ;
+        }
+
+        cudaError_t const status = cudaMalloc( & data_, size * type_size ) ;
+        if (status != cudaSuccess)
+        {
+            throw std::runtime_error( cudaGetErrorString( status ) ) ;
+        }
+    }
+
+    ~Device_Array() { cudaFree( data_ ) ; }
+
+    // Implicit access to the raw device pointer.
+    operator Type * () { return static_cast< Type * >( data_ ) ; }
+    operator Type const * () const { return static_cast< Type * >( data_ ) ; }
+
+    // Copies the first 'size' elements from the device to the host buffer
+    // 'to' and returns the position one past the last element written.
+    // Throws std::runtime_error if cudaMemcpy reports an error.
+    Type * copy( std::size_t const size, Type * const to ) const
+    {
+        /// \todo check size parameter
+
+        std::size_t const type_size = sizeof( Type ) ;
+        cudaError_t const status
+            = cudaMemcpy( to, data_, size * type_size, cudaMemcpyDeviceToHost ) ;
+        if (status != cudaSuccess)
+        {
+            throw std::runtime_error( cudaGetErrorString( status ) ) ;
+        }
+
+        return to + size ;
+    }
+
+private:
+    void * data_ ;
+} ;
+
+// Holds all device-side buffers used while fitting one chunk of data and
+// provides the helpers that fill, upload and copy them.
+//
+// The buffers are public Device_Array members; their sizes are fixed in the
+// constructor from the limits stored in the Info object.
+class GPUData
+{
+public:
+    // Allocates all device buffers. 'info' is kept by reference.
+    GPUData(Info const & info);
+
+    // Sets the current chunk size and zero-fills the buffers for that chunk.
+    void reset(int const chunk_size);
+    // Uploads data, optional weights, initial parameters and the indices of
+    // the parameters to fit for the given chunk.
+    void init
+    (
+        int const chunk_index,
+        float const * data,
+        float const * weights,
+        float const * initial_parameters,
+        std::vector<int> const & parameters_to_fit_indices
+    ) ;
+    // Uploads the user information block (if its size is non-zero).
+    void init_user_info(char const * user_info);
+
+    // Reads one int from the device and reports whether it equals 1.
+    void read(bool * dst, int const * src);
+    // Writes 'value' to a single int on the device.
+    void set(int* arr, int const value);
+    // Device-to-device copy of 'count' floats.
+    void copy(float * dst, float const * src, std::size_t const count);
+
+private:
+    // Fill the first 'count' elements of a device array with 'value'.
+	void set(float* arr, float const value, int const count);
+	void set(int* arr, int const value, int const count);
+    // Host-to-device copies.
+    void write(float* dst, float const * src, int const count);
+    void write(int* dst, std::vector<int> const & src);
+    void write(char* dst, char const * src, std::size_t const count);
+
+private:
+    int chunk_size_;        // number of fits in the current chunk, set by reset()
+    Info const & info_;     // fit configuration; must outlive this object
+
+public:
+    int chunk_index_;       // index of the current chunk, set by init()
+
+    // input buffers
+    Device_Array< float > data_;
+    Device_Array< float > weights_;
+    Device_Array< float > parameters_;
+    Device_Array< float > prev_parameters_;
+    Device_Array< int > parameters_to_fit_indices_;
+    Device_Array< char > user_info_;
+
+    // per-fit optimisation quantities
+    Device_Array< float > chi_squares_;
+    Device_Array< float > prev_chi_squares_;
+    Device_Array< float > gradients_;
+    Device_Array< float > hessians_;
+    Device_Array< float > deltas_;
+
+
+    // model output
+    Device_Array< float > values_;
+    Device_Array< float > derivatives_;
+
+    // bookkeeping
+    Device_Array< float > lambdas_;
+    Device_Array< int > states_;
+    Device_Array< int > finished_;
+    Device_Array< int > iteration_falied_;
+    Device_Array< int > all_finished_;    // single element
+    Device_Array< int > n_iterations_;
+};
+
+#endif
diff --git a/Gpufit/gpufit.cpp b/Gpufit/gpufit.cpp
new file mode 100644
index 0000000..e7f2d31
--- /dev/null
+++ b/Gpufit/gpufit.cpp
@@ -0,0 +1,130 @@
+#include "gpufit.h"
+#include "interface.h"
+
+#include <string>
+
+std::string last_error ;
+
+// C entry point of the library: runs the fit described by the arguments and
+// writes the results to the output arrays.
+//
+// Returns STATUS_OK on success. Any exception raised during the fit is
+// caught here; its message is stored for gpufit_get_last_error() and
+// STATUS_ERROR is returned.
+//
+// n_points must fit into a (32 bit) int, because that is the type the
+// fitting code uses for the number of points; larger values are rejected
+// with an error.
+int gpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+try
+{
+    // Use plain 'int' rather than the compiler-specific '__int32' type;
+    // interface.h asserts that 'int' is 32 bits wide.
+    if (n_points > static_cast<size_t>(std::numeric_limits<int>::max()))
+    {
+        throw std::runtime_error("maximum number of data points per fit exceeded");
+    }
+    int const n_points_32 = static_cast<int>(n_points);
+
+    FitInterface fi(
+        data,
+        weights,
+        n_fits,
+        n_points_32,
+        tolerance,
+        max_n_iterations,
+        estimator_id,
+        initial_parameters,
+        parameters_to_fit,
+        user_info,
+        user_info_size,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+
+    fi.fit(model_id);
+
+    return STATUS_OK ;
+}
+catch( std::exception & exception )
+{
+    last_error = exception.what() ;
+
+    return STATUS_ERROR ;
+}
+catch( ... )
+{
+    last_error = "unknown error" ;
+
+    return STATUS_ERROR;
+}
+
+char const * gpufit_get_last_error()
+{
+    return last_error.c_str() ;
+}
+
+// Returns 1 if querying the CUDA device count succeeds, otherwise stores
+// the error message for gpufit_get_last_error() and returns 0.
+int gpufit_cuda_available()
+{
+    int available = 1;
+
+    try
+    {
+        getDeviceCount();
+    }
+    catch (std::exception & exception)
+    {
+        last_error = exception.what();
+        available = 0;
+    }
+
+    return available;
+}
+
+// Queries the CUDA runtime and driver versions.
+//
+// Returns 1 when both queries succeed. Otherwise the error description is
+// stored for gpufit_get_last_error() and 0 is returned.
+//
+// The CUDA version functions report failure through their return value and
+// do not throw, so the status codes are checked explicitly.
+int gpufit_get_cuda_version(int * runtime_version, int * driver_version)
+{
+    try
+    {
+        cudaError_t status = cudaRuntimeGetVersion(runtime_version);
+        if (status == cudaSuccess)
+        {
+            status = cudaDriverGetVersion(driver_version);
+        }
+
+        if (status != cudaSuccess)
+        {
+            last_error = cudaGetErrorString(status);
+
+            return 0;
+        }
+
+        return 1;
+    }
+    catch (std::exception & exception)
+    {
+        // e.g. a failed allocation while storing the error message
+        last_error = exception.what();
+
+        return 0;
+    }
+}
+
+// Alternative entry point taking all gpufit() arguments as an array of
+// untyped pointers, in the same order as the gpufit() parameter list.
+// Scalar arguments are passed as pointers to their values; array arguments
+// are passed directly. 'argc' is not inspected.
+int gpufit_portable_interface(int argc, void *argv[])
+{
+    // scalar arguments (passed by address)
+    size_t const n_fits = *static_cast<size_t *>(argv[0]);
+    size_t const n_points = *static_cast<size_t *>(argv[1]);
+    int const model_id = *static_cast<int *>(argv[4]);
+    float const tolerance = *static_cast<float *>(argv[6]);
+    int const max_n_iterations = *static_cast<int *>(argv[7]);
+    int const estimator_id = *static_cast<int *>(argv[9]);
+    size_t const user_info_size = *static_cast<size_t *>(argv[10]);
+
+    // array arguments
+    float * const data = static_cast<float *>(argv[2]);
+    float * const weights = static_cast<float *>(argv[3]);
+    float * const initial_parameters = static_cast<float *>(argv[5]);
+    int * const parameters_to_fit = static_cast<int *>(argv[8]);
+    char * const user_info = static_cast<char *>(argv[11]);
+    float * const output_parameters = static_cast<float *>(argv[12]);
+    int * const output_states = static_cast<int *>(argv[13]);
+    float * const output_chi_squares = static_cast<float *>(argv[14]);
+    int * const output_n_iterations = static_cast<int *>(argv[15]);
+
+    return gpufit(
+        n_fits,
+        n_points,
+        data,
+        weights,
+        model_id,
+        initial_parameters,
+        tolerance,
+        max_n_iterations,
+        parameters_to_fit,
+        estimator_id,
+        user_info_size,
+        user_info,
+        output_parameters,
+        output_states,
+        output_chi_squares,
+        output_n_iterations);
+}
\ No newline at end of file
diff --git a/Gpufit/gpufit.h b/Gpufit/gpufit.h
new file mode 100644
index 0000000..985e6d7
--- /dev/null
+++ b/Gpufit/gpufit.h
@@ -0,0 +1,63 @@
+#ifndef GPU_FIT_H_INCLUDED
+#define GPU_FIT_H_INCLUDED
+
+// fitting model ID
+#define GAUSS_1D 0
+#define GAUSS_2D 1
+#define GAUSS_2D_ELLIPTIC 2
+#define GAUSS_2D_ROTATED 3
+#define CAUCHY_2D_ELLIPTIC 4
+#define LINEAR_1D 5
+
+// estimator ID
+#define LSE 0
+#define MLE 1
+
+// fit state
+#define STATE_CONVERGED 0
+#define STATE_MAX_ITERATION 1
+#define STATE_SINGULAR_HESSIAN 2
+#define STATE_NEG_CURVATURE_MLE 3
+#define STATE_GPU_NOT_READY 4
+
+// gpufit return state
+#define STATUS_OK 0
+#define STATUS_ERROR -1
+
+// The functions below have C linkage so that they can be called from C and
+// from other languages.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Runs the fits. Returns STATUS_OK on success and STATUS_ERROR on failure;
+// after a failure the message is available from gpufit_get_last_error().
+//
+//   model_id          one of the fitting model IDs defined above
+//   estimator_id      presumably one of the estimator IDs defined above
+//   weights           may be a null pointer, in which case no weights are used
+//   parameters_to_fit one entry per model parameter; a zero entry excludes
+//                     that parameter from the fit
+//   user_info         block of user_info_size bytes passed on to the model
+//   output_*          result arrays written by the fit
+int gpufit
+(
+    size_t n_fits,
+    size_t n_points,
+    float * data,
+    float * weights,
+    int model_id,
+    float * initial_parameters,
+    float tolerance,
+    int max_n_iterations,
+    int * parameters_to_fit,
+    int estimator_id,
+    size_t user_info_size,
+    char * user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+) ;
+
+// Returns the message of the most recently recorded error.
+char const * gpufit_get_last_error() ;
+
+// Returns 1 if the CUDA device count could be queried, 0 otherwise.
+int gpufit_cuda_available();
+
+// Stores the CUDA runtime and driver versions in the given locations.
+// Returns 1 on success and 0 on failure (note: not STATUS_OK/STATUS_ERROR).
+int gpufit_get_cuda_version(int * runtime_version, int * driver_version);
+
+// Calls gpufit() with the arguments taken from 'argv', which holds one
+// pointer per gpufit() parameter, in parameter order.
+int gpufit_portable_interface(int argc, void *argv[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // GPU_FIT_H_INCLUDED
diff --git a/Gpufit/info.cpp b/Gpufit/info.cpp
new file mode 100644
index 0000000..e2fecca
--- /dev/null
+++ b/Gpufit/info.cpp
@@ -0,0 +1,124 @@
+#include "info.h"
+#include <algorithm>
+
+// Creates an empty configuration: every count, size and ID is zero and
+// weights are disabled. The initialisers are listed in the order in which
+// the members are declared in info.h, which is the order in which they are
+// actually initialised.
+Info::Info() :
+    n_parameters_(0),
+    n_parameters_to_fit_(0),
+    n_points_(0),
+    power_of_two_n_points_(0),
+    n_fits_(0),
+    user_info_size_(0),
+    max_n_iterations_(0),
+    max_chunk_size_(0),
+    n_fits_per_block_(0),
+    model_id_(0),
+    estimator_id_(0),
+    // previously left uninitialised
+    use_weights_(false),
+    max_threads_(0),
+    max_blocks_(0),
+    available_gpu_memory_(0)
+{
+}
+
+Info::~Info(void)
+{
+}
+
+// Sets n_parameters_to_fit_ to n_parameters_ minus the number of zero
+// entries among the first n_parameters_ elements of 'parameters_to_fit'.
+void Info::set_number_of_parameters_to_fit(int const * const parameters_to_fit)
+{
+    int remaining = n_parameters_;
+
+    for (int index = 0; index < n_parameters_; ++index)
+    {
+        if (parameters_to_fit[index] == 0)
+        {
+            --remaining;
+        }
+    }
+
+    n_parameters_to_fit_ = remaining;
+}
+
+// Chooses how many fits share one thread block. The candidates 4, 2 and 1
+// are tried in that order; the first one that divides 'current_chunk_size'
+// and needs fewer than max_threads_ / 4 threads is taken. If neither 4 nor
+// 2 qualifies, 1 is used.
+void Info::set_fits_per_block(std::size_t const current_chunk_size)
+{
+    for (n_fits_per_block_ = 4; n_fits_per_block_ > 1; n_fits_per_block_ /= 2)
+    {
+        bool const divides_chunk = current_chunk_size % n_fits_per_block_ == 0;
+        bool const fits_in_block = n_fits_per_block_ * n_points_ < max_threads_ / 4;
+
+        if (divides_chunk && fits_in_block)
+        {
+            break;
+        }
+    }
+}
+
+// Determines max_chunk_size_, the largest number of fits processed in one
+// chunk.
+//
+// The chunk size is limited by
+//   - the device memory budget (available_gpu_memory_) divided by the
+//     memory needed per fit,
+//   - the maximum number of blocks (max_blocks_),
+//   - the requirement that the largest per-chunk element/byte count still
+//     fits into std::size_t,
+//   - the total number of fits (n_fits_).
+// Before the last limit is applied the value is rounded down to a multiple
+// of a power of ten.
+//
+// Throws std::runtime_error when the memory budget is too small for even a
+// single fit.
+//
+// All size arithmetic is carried out in std::size_t so that large point or
+// parameter counts cannot overflow an int. The counts are assumed to be
+// non-negative.
+void Info::set_max_chunk_size()
+{
+    std::size_t const n_points = n_points_;
+    std::size_t const n_parameters = n_parameters_;
+    std::size_t const n_parameters_to_fit = n_parameters_to_fit_;
+
+    std::size_t one_fit_memory
+        = sizeof(float)
+        *(2 * n_points
+        + 2 * n_parameters
+        + 2 * n_parameters_to_fit
+        + 1 * n_parameters_to_fit*n_parameters_to_fit
+        + 1 * n_points*n_parameters
+        + 4)
+        + sizeof(int)
+        * 3;
+
+    if (use_weights_)
+        one_fit_memory += sizeof(float) * n_points;
+
+    std::size_t tmp_chunk_size = available_gpu_memory_ / one_fit_memory;
+
+    if (tmp_chunk_size == 0)
+    {
+        throw std::runtime_error("not enough free GPU memory available");
+    }
+
+    // parenthesised so that a 'min' macro cannot interfere
+    tmp_chunk_size = (std::min)(tmp_chunk_size, max_blocks_);
+
+    std::size_t highest_factor = 1;
+
+    if (n_parameters_to_fit_)
+    {
+        highest_factor
+            = n_points
+            * n_parameters_to_fit
+            * n_parameters_to_fit
+            * sizeof(float);
+    }
+    else
+    {
+        highest_factor = n_points * n_parameters;
+    }
+
+    std::size_t const highest_size_t_value
+        = std::numeric_limits< std::size_t >::max();
+
+    // a zero factor (e.g. no data points) imposes no limit and must not be
+    // used as a divisor
+    if (highest_factor > 0
+        && tmp_chunk_size > highest_size_t_value / highest_factor)
+    {
+        tmp_chunk_size = highest_size_t_value / highest_factor;
+    }
+
+    max_chunk_size_ = tmp_chunk_size;
+
+    // find the power of ten used for rounding down
+    std::size_t i = 1;
+    std::size_t const divisor = 10;
+    while (tmp_chunk_size > divisor)
+    {
+        i *= divisor;
+        tmp_chunk_size /= divisor;
+    }
+    max_chunk_size_ = max_chunk_size_ / i * i;
+    max_chunk_size_ = (std::min)(max_chunk_size_, n_fits_);
+}
+
+
+// Finishes the configuration: stores the smallest power of two that is not
+// smaller than n_points_, queries the GPU properties and derives the
+// maximum chunk size.
+void Info::configure()
+{
+    for (power_of_two_n_points_ = 1;
+         power_of_two_n_points_ < n_points_;
+         power_of_two_n_points_ *= 2)
+    {
+    }
+
+    get_gpu_properties();
+    set_max_chunk_size();
+}
diff --git a/Gpufit/info.cu b/Gpufit/info.cu
new file mode 100644
index 0000000..60568f8
--- /dev/null
+++ b/Gpufit/info.cu
@@ -0,0 +1,31 @@
+#include "info.h"
+#include <cuda_runtime.h>
+
+// Reads the limits of CUDA device 0 (threads per block, grid size in x) and
+// sets the memory budget to 10% of the currently free device memory minus
+// the size of the user information block.
+//
+// Throws std::runtime_error if the budget does not exceed the user
+// information size, or if a CUDA call fails.
+void Info::get_gpu_properties()
+{
+    cudaDeviceProp device_properties;
+    CUDA_CHECK_STATUS(cudaGetDeviceProperties(&device_properties, 0));
+    max_threads_ = device_properties.maxThreadsPerBlock;
+    max_blocks_ = device_properties.maxGridSize[0];
+
+    std::size_t free_memory;
+    std::size_t total_memory;
+    CUDA_CHECK_STATUS(cudaMemGetInfo(&free_memory, &total_memory));
+    available_gpu_memory_ = std::size_t(double(free_memory) * 0.1);
+
+    if (available_gpu_memory_ <= user_info_size_)
+    {
+        throw std::runtime_error("maximum user info size exceeded");
+    }
+
+    available_gpu_memory_ -= user_info_size_;
+}
+
+// Returns the number of CUDA devices reported by cudaGetDeviceCount.
+// A failing query is reported through CUDA_CHECK_STATUS.
+int getDeviceCount()
+{
+    int n_devices;
+    CUDA_CHECK_STATUS(cudaGetDeviceCount(&n_devices));
+    return n_devices;
+}
\ No newline at end of file
diff --git a/Gpufit/info.h b/Gpufit/info.h
new file mode 100644
index 0000000..3f17623
--- /dev/null
+++ b/Gpufit/info.h
@@ -0,0 +1,48 @@
+#ifndef GPUFIT_PARAMETERS_H_INCLUDED
+#define GPUFIT_PARAMETERS_H_INCLUDED
+
+#include "definitions.h"
+#include <vector>
+
+
+// Collects the configuration of a fit run together with the GPU limits that
+// determine how the fits are split into chunks and thread blocks.
+class Info
+{
+public:
+    Info();
+    virtual ~Info();
+
+    // Chooses n_fits_per_block_ (4, 2 or 1) for a chunk of the given size.
+    void set_fits_per_block(std::size_t const n_fits);
+    // Derives n_parameters_to_fit_ from the flags in 'parameters_to_fit';
+    // n_parameters_ must be set beforehand.
+    void set_number_of_parameters_to_fit(int const * parameters_to_fit);
+    // Computes power_of_two_n_points_, queries the GPU and sets
+    // max_chunk_size_.
+    void configure();
+
+private:
+    void get_gpu_properties();
+    void set_max_chunk_size();
+
+public:
+    int n_parameters_;          // number of model parameters
+    int n_parameters_to_fit_;   // number of parameters not excluded from the fit
+
+	int n_points_;              // number of data points per fit
+    int power_of_two_n_points_; // smallest power of two >= n_points_
+
+    std::size_t n_fits_;        // total number of fits
+
+    std::size_t user_info_size_; // size of the user information block
+
+    int max_n_iterations_;
+	std::size_t max_chunk_size_; // maximum number of fits per chunk
+    int n_fits_per_block_;      // fits sharing one thread block
+    int model_id_;
+    int estimator_id_;
+    bool use_weights_;          // true when a weights array was supplied
+
+private:
+    int max_threads_;                   // maxThreadsPerBlock of device 0
+    std::size_t max_blocks_;            // maxGridSize[0] of device 0
+    std::size_t available_gpu_memory_;  // device memory budget for the fit buffers
+};
+
+int getDeviceCount();
+
+#endif
diff --git a/Gpufit/interface.cpp b/Gpufit/interface.cpp
new file mode 100644
index 0000000..e8ddac3
--- /dev/null
+++ b/Gpufit/interface.cpp
@@ -0,0 +1,123 @@
+#include "gpufit.h"
+#include "interface.h"
+
+// Stores the pointers and settings of a fit request. Nothing is validated
+// or copied here; the number of model parameters starts at zero and is set
+// later from the model ID.
+//
+// The initialisers follow the order in which the members are declared.
+FitInterface::FitInterface
+(
+    float const * data,
+    float const * weights,
+    std::size_t n_fits,
+    int n_points,
+    float tolerance,
+    int max_n_iterations,
+    int estimator_id,
+    float const * initial_parameters,
+    int * parameters_to_fit,
+    char * user_info,
+    std::size_t user_info_size,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+)
+    // input
+    : data_(data)
+    , weights_(weights)
+    , initial_parameters_(initial_parameters)
+    , parameters_to_fit_(parameters_to_fit)
+    , user_info_(user_info)
+    , n_parameters_(0)
+    , n_fits_(n_fits)
+    , n_points_(n_points)
+    , tolerance_(tolerance)
+    , max_n_iterations_(max_n_iterations)
+    , estimator_id_(estimator_id)
+    , user_info_size_(user_info_size)
+    // output
+    , output_parameters_(output_parameters)
+    , output_states_(output_states)
+    , output_chi_squares_(output_chi_squares)
+    , output_n_iterations_(output_n_iterations)
+{
+}
+
+FitInterface::~FitInterface()
+{
+}
+
+// Verifies that the total sizes of the data and parameter arrays (in bytes)
+// can be represented in std::size_t.
+//
+// Throws std::runtime_error if a limit is exceeded, or if the number of
+// points or the number of model parameters is not positive. The latter
+// checks also keep the divisions below from dividing by zero, which would
+// otherwise happen e.g. when no parameter count was set for the model.
+void FitInterface::check_sizes()
+{
+    if (n_points_ <= 0)
+    {
+        throw std::runtime_error("number of data points per fit must be greater than zero");
+    }
+
+    if (n_parameters_ <= 0)
+    {
+        throw std::runtime_error("number of model parameters must be greater than zero");
+    }
+
+    std::size_t maximum_size = std::numeric_limits< std::size_t >::max();
+    
+    if (n_fits_ > maximum_size / n_points_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum absolute number of data points exceeded");
+    }
+    
+    if (n_fits_ > maximum_size / n_parameters_ / sizeof(float))
+    {
+        throw std::runtime_error("maximum number of fits and/or parameters exceeded");
+    }
+}
+
+// Sets n_parameters_ to the number of parameters of the given model.
+//
+// Throws std::runtime_error for a model ID that is not known here, instead
+// of silently leaving the parameter count unchanged (zero after
+// construction), which the subsequent size checks cannot handle.
+void FitInterface::set_number_of_parameters(int const model_id)
+{
+    switch (model_id)
+    {
+    case GAUSS_1D:
+        n_parameters_ = 4;
+        break;
+    case GAUSS_2D:
+        n_parameters_ = 5;
+        break;
+    case GAUSS_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case GAUSS_2D_ROTATED:
+        n_parameters_ = 7;
+        break;
+    case CAUCHY_2D_ELLIPTIC:
+        n_parameters_ = 6;
+        break;
+    case LINEAR_1D:
+        n_parameters_ = 2;
+        break;
+    default:
+        throw std::runtime_error("unknown model ID");
+    }
+}
+
+// Transfers the settings of this request into 'info' and lets it derive
+// the dependent values (number of parameters to fit, chunk size, ...).
+// Weights are marked as used when a non-null weights pointer was supplied.
+void FitInterface::configure_info(Info & info, int const model_id)
+{
+    // problem dimensions
+    info.n_fits_ = n_fits_;
+    info.n_points_ = n_points_;
+    info.n_parameters_ = n_parameters_;
+    info.user_info_size_ = user_info_size_;
+
+    // fit settings
+    info.model_id_ = model_id;
+    info.estimator_id_ = estimator_id_;
+    info.max_n_iterations_ = max_n_iterations_;
+    info.use_weights_ = (weights_ != 0);
+
+    // derived values; these rely on the fields assigned above
+    info.set_number_of_parameters_to_fit(parameters_to_fit_);
+    info.configure();
+}
+
+// Runs the complete fit for the given model: determines the number of
+// model parameters, checks the array sizes, builds the Info configuration
+// and hands everything to LMFit, which is run with the stored tolerance.
+void FitInterface::fit(int const model_id)
+{
+    // the size checks need the parameter count of the model
+    set_number_of_parameters(model_id);
+    check_sizes();
+
+    Info info;
+    configure_info(info, model_id);
+
+    LMFit lmfit(
+        data_,
+        weights_,
+        info,
+        initial_parameters_,
+        parameters_to_fit_,
+        user_info_,
+        output_parameters_,
+        output_states_,
+        output_chi_squares_,
+        output_n_iterations_);
+
+    lmfit.run(tolerance_);
+}
diff --git a/Gpufit/interface.h b/Gpufit/interface.h
new file mode 100644
index 0000000..27814aa
--- /dev/null
+++ b/Gpufit/interface.h
@@ -0,0 +1,63 @@
+#ifndef GPUFIT_INTERFACE_H_INCLUDED
+#define GPUFIT_INTERFACE_H_INCLUDED
+
+#include "lm_fit.h"
+
+static_assert( sizeof( int ) == 4, "32 bit 'int' type required" ) ;
+
+// Front end of a fit request: keeps the caller's input/output pointers and
+// settings, validates the sizes, builds the Info configuration and starts
+// the Levenberg-Marquardt fit.
+class FitInterface
+{
+public:
+    // Stores the arguments; no data is copied and nothing is validated yet.
+    FitInterface
+    (
+        float const * data,
+        float const * weights,
+        std::size_t n_fits,
+        int n_points,
+        float tolerance,
+        int max_n_iterations,
+        int estimator_id,
+        float const * initial_parameters,
+        int * parameters_to_fit,
+        char * user_info,
+        std::size_t user_info_size,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations
+    ) ;
+    
+    virtual ~FitInterface();
+    // Performs the fit with the given model; errors are reported by
+    // exceptions.
+    void fit(int const model_id);
+
+private:
+    // Sets n_parameters_ according to the model ID.
+    void set_number_of_parameters(int const model_id);
+    // Throws if the array sizes cannot be represented in std::size_t.
+    void check_sizes();
+    // Copies the settings into 'info' and lets it compute derived values.
+    void configure_info(Info & info, int const model_id);
+
+public:
+
+private:
+    //input
+    float const * const data_ ;
+    float const * const weights_;               // null when no weights are used
+    float const * const initial_parameters_;
+    int const * const parameters_to_fit_;       // zero entries exclude a parameter
+    char * const user_info_;
+    int n_parameters_;                          // set by set_number_of_parameters()
+
+    std::size_t const n_fits_;
+    int const n_points_;
+    float const  tolerance_;
+    int const max_n_iterations_;
+    int const estimator_id_;
+    std::size_t const user_info_size_;
+
+    //output
+    float * output_parameters_;
+    int * output_states_;
+    float * output_chi_squares_;
+    int * output_n_iterations_;
+};
+
+#endif
diff --git a/Gpufit/linear_1d.cuh b/Gpufit/linear_1d.cuh
new file mode 100644
index 0000000..0b6a5c8
--- /dev/null
+++ b/Gpufit/linear_1d.cuh
@@ -0,0 +1,103 @@
+#ifndef GPUFIT_LINEAR1D_CUH_INCLUDED
+#define GPUFIT_LINEAR1D_CUH_INCLUDED
+
+/* Description of the calculate_linear1d function
+* ===================================================
+*
+* This function calculates the values of one-dimensional linear model functions
+* and their partial derivatives with respect to the model parameters. 
+*
+* This function makes use of the user information data to pass in the 
+* independent variables (X values) corresponding to the data.  
+*
+* Note that if no user information is provided, the (X) coordinate of the 
+* first data value is assumed to be (0.0).  In this case, for a fit size of 
+* M data points, the (X) coordinates of the data are simply the corresponding 
+* array index values of the data array, starting from zero.
+*
+* Parameters:
+*
+* parameters: An input vector of concatenated sets of model parameters.
+*             p[0]: offset
+*             p[1]: slope
+*
+* n_fits: The number of fits.
+*
+* n_points: The number of data points per fit.
+*
+* n_parameters: The number of model parameters.
+*
+* values: An output vector of concatenated sets of model function values.
+*
+* derivatives: An output vector of concatenated sets of model function partial
+*              derivatives.
+*
+* chunk_index: The chunk index. Used for indexing of user_info.
+*
+* user_info: An input vector containing user information.
+*
+* user_info_size: The size of user_info in bytes.
+*
+* Calling the calculate_linear1d function
+* =======================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function. When calling the function, the blocks and threads of the __global__
+* function must be set up correctly, as shown in the following example code.
+*
+*   dim3  threads(1, 1, 1);
+*   dim3  blocks(1, 1, 1);
+*
+*   threads.x = n_points * n_fits_per_block;
+*   blocks.x = n_fits / n_fits_per_block;
+*
+*   global_function<<< blocks,threads >>>(parameter1, ...);
+*
+*/
+
+__device__ void calculate_linear1d(
+    float const * parameters,
+    int const n_fits,
+    int const n_points,
+    int const n_parameters,
+    float * values,
+    float * derivatives,
+    int const chunk_index,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    int const n_fits_per_block = blockDim.x / n_points;             // fits handled by this block
+    int const fit_in_block = threadIdx.x / n_points;                // fit of this thread within the block
+    int const point_index = threadIdx.x - (fit_in_block*n_points);  // data point of this thread within its fit
+    int const fit_index = blockIdx.x * n_fits_per_block + fit_in_block;
+    // select the X coordinate of this data point; user_info is read as an array of floats
+    float * user_info_float = (float*) user_info;
+    float x = 0.0f; // stays 0 if user_info holds fewer than n_points floats
+    if (!user_info_float)
+    {
+        x = point_index; // no user info: X is the index of the point within the fit
+    }
+    else if (user_info_size / sizeof(float) == n_points)
+    {
+        x = user_info_float[point_index]; // exactly n_points values: the same X values for every fit
+    }
+    else if (user_info_size / sizeof(float) > n_points)
+    {
+        int const chunk_begin = chunk_index * n_fits * n_points; // offset of the current chunk
+        int const fit_begin = fit_index * n_points;              // offset of this fit within the chunk
+        x = user_info_float[chunk_begin + fit_begin + point_index];
+    }
+    // model value of this point: p[0] + p[1] * x (offset + slope * x)
+    float* current_value = &values[fit_index*n_points];
+    float const * current_parameters = &parameters[fit_index * n_parameters];
+
+    current_value[point_index] = current_parameters[0] + current_parameters[1] * x;
+
+    // derivatives: stored per fit as n_parameters consecutive runs of n_points values
+
+    float * current_derivative = &derivatives[fit_index * n_parameters * n_points + point_index];
+    current_derivative[0] = 1.f;         // derivative with respect to p[0] (offset)
+    current_derivative[1 * n_points] = x; // derivative with respect to p[1] (slope)
+}
+
+#endif
diff --git a/Gpufit/lm_fit.cpp b/Gpufit/lm_fit.cpp
new file mode 100644
index 0000000..19a658f
--- /dev/null
+++ b/Gpufit/lm_fit.cpp
@@ -0,0 +1,92 @@
+#include "lm_fit.h"
+#include <algorithm>
+
+LMFit::LMFit
+(
+    float const * const data,
+    float const * const weights,
+    Info & info,
+    float const * const initial_parameters,
+    int const * const parameters_to_fit,
+    char * const user_info,
+    float * output_parameters,
+    int * output_states,
+    float * output_chi_squares,
+    int * output_n_iterations
+) : // initializers follow the member declaration order of lm_fit.h, which is the order they run in
+    data_( data ),
+    weights_( weights ),
+    initial_parameters_( initial_parameters ),
+    parameters_to_fit_( parameters_to_fit ),
+    user_info_( user_info ),
+    output_parameters_( output_parameters ),
+    output_states_( output_states ),
+    output_chi_squares_( output_chi_squares ),
+    output_n_iterations_( output_n_iterations ),
+    ichunk_(0),                   // index of the chunk processed next, advanced in run()
+    chunk_size_(0),               // number of fits in the current chunk, set in run()
+    n_fits_left_(info.n_fits_),   // fits that still have to be processed
+    info_(info),
+    parameters_to_fit_indices_(0) // starts empty, filled by set_parameters_to_fit_indices()
+{}
+
+LMFit::~LMFit()
+{}
+
+void LMFit::set_parameters_to_fit_indices()
+{
+    // collect the index of every model parameter whose fit flag is non-zero
+    for (int index = 0; index < info_.n_parameters_; ++index)
+    {
+        if (parameters_to_fit_[index] != 0)
+        {
+            parameters_to_fit_indices_.push_back(index);
+        }
+    }
+}
+
+void LMFit::get_results(GPUData const & gpu_data, int const n_fits)
+{   // Copies the results of n_fits fits to the output arrays and stores the pointers returned by copy().
+    output_parameters_ // NOTE(review): copy() presumably returns the advanced write position - confirm
+        = gpu_data.parameters_.copy( n_fits*info_.n_parameters_, output_parameters_ ) ;
+    output_states_ = gpu_data.states_.copy( n_fits, output_states_ ) ;
+    output_chi_squares_ = gpu_data.chi_squares_.copy( n_fits, output_chi_squares_ ) ;
+    output_n_iterations_ = gpu_data.n_iterations_.copy( n_fits, output_n_iterations_ ) ;
+}
+
+void LMFit::run(float const tolerance)
+{
+    set_parameters_to_fit_indices();
+    // the GPUData object is built once from info_ and reused (reset/init) for every chunk
+    GPUData gpu_data(info_);
+    gpu_data.init_user_info(user_info_);
+
+    // process the fits in chunks of at most info_.max_chunk_size_ fits
+    while (n_fits_left_ > 0)
+    {
+        chunk_size_ = int((std::min)(n_fits_left_, info_.max_chunk_size_));
+        // NOTE(review): (std::min) is parenthesized, presumably to avoid a min() macro - confirm
+        info_.set_fits_per_block(chunk_size_);
+
+        gpu_data.reset(chunk_size_);
+        gpu_data.init(
+            ichunk_,
+            data_,
+            weights_,
+            initial_parameters_,
+            parameters_to_fit_indices_);
+        // run the fit iterations for the fits of this chunk
+        LMFitCUDA lmfit_cuda(
+            tolerance,
+            info_,
+            gpu_data,
+            chunk_size_);
+
+        lmfit_cuda.run();
+        // copy the results of this chunk to the output arrays
+        get_results(gpu_data, chunk_size_);
+
+        n_fits_left_ -= chunk_size_;
+        ichunk_++;
+    }
+}
diff --git a/Gpufit/lm_fit.h b/Gpufit/lm_fit.h
new file mode 100644
index 0000000..6ee3b86
--- /dev/null
+++ b/Gpufit/lm_fit.h
@@ -0,0 +1,88 @@
+#ifndef GPUFIT_LM_FIT_H_INCLUDED
+#define GPUFIT_LM_FIT_H_INCLUDED
+
+#include "definitions.h"
+#include "info.h"
+#include "gpu_data.cuh"
+
+class LMFitCUDA;
+
+class LMFit
+{
+public:
+    LMFit
+    (
+        float const * data,
+        float const * weights,
+        Info & info,
+        float const * initial_parameters,
+        int const * parameters_to_fit,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations
+    ) ;
+
+    virtual ~LMFit();
+
+    void run(float const tolerance);
+
+private:
+    void set_parameters_to_fit_indices();
+    void get_results(GPUData const & gpu_data, int const n_fits);
+
+    float const * const data_ ;
+    float const * const weights_ ;
+    float const * const initial_parameters_ ;
+    int const * const parameters_to_fit_;
+    char const * const user_info_;
+
+    float * output_parameters_ ;
+    int * output_states_ ;
+    float * output_chi_squares_ ;
+    int * output_n_iterations_ ;
+
+    int ichunk_;
+    int chunk_size_;
+    std::size_t n_fits_left_;
+
+    Info & info_;
+
+    std::vector<int> parameters_to_fit_indices_;
+};
+
+class LMFitCUDA
+{
+public:
+    // Stores the tolerance, the fit configuration, the GPU data and the
+    // number of fits of the chunk to be processed.
+    LMFitCUDA(
+        float const tolerance,
+        Info const & info,
+        GPUData & gpu_data,
+        int const n_fits);
+
+    virtual ~LMFitCUDA();
+
+    // Runs fit iterations until all_finished_ is set.
+    void run();
+
+private:
+    // Steps of run(); implemented in lm_fit_cuda.cu.
+    void solve_equation_system();
+    void calc_curve_values();
+    void calc_chi_squares();
+    void calc_gradients();
+    void calc_hessians();
+    void evaluate_iteration(int const iteration);
+
+    // Data members; their order is the initialization order.
+    Info const & info_;
+    GPUData & gpu_data_;
+    int const n_fits_;
+    bool all_finished_;  // host-side flag, read back from the GPU in evaluate_iteration()
+    float tolerance_;
+};
+
+#endif
diff --git a/Gpufit/lm_fit_cuda.cpp b/Gpufit/lm_fit_cuda.cpp
new file mode 100644
index 0000000..94799a0
--- /dev/null
+++ b/Gpufit/lm_fit_cuda.cpp
@@ -0,0 +1,57 @@
+#include "lm_fit.h"
+
+LMFitCUDA::LMFitCUDA(
+    float const tolerance,
+    Info const & info,
+    GPUData & gpu_data,
+    int const n_fits
+    ) :
+    info_(info),
+    gpu_data_(gpu_data),
+    n_fits_(n_fits),
+    all_finished_(false),
+    tolerance_(tolerance)
+{
+}
+
+LMFitCUDA::~LMFitCUDA()
+{
+}
+
+void LMFitCUDA::run()
+{
+    // evaluate the model once for the initial parameters
+    calc_curve_values();
+    calc_chi_squares();
+    calc_gradients();
+    calc_hessians();
+
+    // keep the initial chi-squares as the previous chi-squares of the
+    // first iteration (copy() presumably takes the destination first)
+    gpu_data_.copy(
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.chi_squares_,
+        n_fits_);
+
+    // iterate until evaluate_iteration() reports that every fit has finished
+    int iteration = 0;
+    while (!all_finished_)
+    {
+        // modify the step widths, run the Gauss-Jordan elimination and
+        // update the fitting parameters
+        solve_equation_system();
+
+        // re-evaluate the curve values and derivatives, then the
+        // chi-squares, gradients and hessians
+        calc_curve_values();
+        calc_chi_squares();
+        calc_gradients();
+        calc_hessians();
+
+        // check for convergence, flag finished fits, store the iteration
+        // counts and prepare chi-squares, parameters and lambdas for the
+        // next iteration
+        evaluate_iteration(iteration);
+        ++iteration;
+    }
+}
\ No newline at end of file
diff --git a/Gpufit/lm_fit_cuda.cu b/Gpufit/lm_fit_cuda.cu
new file mode 100644
index 0000000..8d74fb9
--- /dev/null
+++ b/Gpufit/lm_fit_cuda.cu
@@ -0,0 +1,253 @@
+#include "lm_fit.h"
+#include <algorithm>
+#include "cuda_kernels.cuh"
+#include "cuda_gaussjordan.cuh"
+
+void LMFitCUDA::solve_equation_system() // step widths -> Gauss Jordan -> fit states -> parameter update
+{
+    dim3  threads(1, 1, 1);
+    dim3  blocks(1, 1, 1);
+    // modify the step widths: one thread per parameter to fit
+    threads.x = info_.n_parameters_to_fit_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+    cuda_modify_step_widths<<< blocks, threads >>>(
+        gpu_data_.hessians_,
+        gpu_data_.lambdas_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    int n_parameters_pow2 = 1; // becomes the smallest power of two >= n_parameters_to_fit_
+
+    while (n_parameters_pow2 < info_.n_parameters_to_fit_)
+    {
+        n_parameters_pow2 *= 2;
+    }
+
+    //set up to run the Gauss Jordan elimination: one block per fit
+    int const n_equations = info_.n_parameters_to_fit_;
+    int const n_solutions = n_fits_;
+
+    threads.x = n_equations + 1;
+    threads.y = n_equations;
+    blocks.x = n_solutions;
+    blocks.y = 1;
+
+    //set the size of the shared memory area for each block
+    int const shared_size
+        = sizeof(float) * ((threads.x * threads.y)
+        + n_parameters_pow2 + n_parameters_pow2);
+
+    //set up the singular_test vector (one int per fit, released below)
+    int * singular_tests;
+    CUDA_CHECK_STATUS(cudaMalloc((void**)&singular_tests, n_fits_ * sizeof(int)));
+
+    //run the Gauss Jordan elimination
+    cuda_gaussjordan<<< blocks, threads, shared_size >>>(
+        gpu_data_.deltas_,
+        gpu_data_.gradients_,
+        gpu_data_.hessians_,
+        gpu_data_.finished_,
+        singular_tests,
+        info_.n_parameters_to_fit_,
+        n_parameters_pow2);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    //set up to update the fit states with the Gauss Jordan results: one thread per fit
+    threads.x = std::min(n_fits_, 256);
+    threads.y = 1;
+    blocks.x = (n_fits_ + threads.x - 1) / threads.x; // integer ceiling division; float math rounds n_fits_ > 2^24
+    blocks.y = 1;
+
+    //update the fit states (gpu_data_.states_)
+    cuda_update_state_after_gaussjordan<<< blocks, threads >>>(
+        n_fits_,
+        singular_tests,
+        gpu_data_.states_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+
+    CUDA_CHECK_STATUS(cudaFree(singular_tests));
+    // update the fitting parameters with the calculated deltas
+    threads.x = info_.n_parameters_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+    cuda_update_parameters<<< blocks, threads >>>(
+        gpu_data_.parameters_,
+        gpu_data_.prev_parameters_,
+        gpu_data_.deltas_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void LMFitCUDA::calc_curve_values()
+{
+    // one thread per data point, n_fits_per_block_ fits per block
+    dim3 threads(1, 1, 1);
+    threads.x = info_.n_points_ * info_.n_fits_per_block_;
+
+    dim3 blocks(1, 1, 1);
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+
+    // calculate the fitting curve values and their derivatives
+    cuda_calc_curve_values<<< blocks, threads >>>(
+        gpu_data_.parameters_,
+        n_fits_,
+        info_.n_points_,
+        info_.n_parameters_,
+        gpu_data_.finished_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        info_.n_fits_per_block_,
+        info_.model_id_,
+        gpu_data_.chunk_index_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void LMFitCUDA::calc_chi_squares()
+{
+    dim3  threads(1, 1, 1);
+    dim3  blocks(1, 1, 1);
+
+    int const shared_size
+        = sizeof(float)
+        * info_.power_of_two_n_points_
+        * info_.n_fits_per_block_;
+
+    threads.x = info_.power_of_two_n_points_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+
+    cuda_calculate_chi_squares <<< blocks, threads, shared_size >>>(
+        gpu_data_.chi_squares_,
+        gpu_data_.states_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.estimator_id_,
+        gpu_data_.finished_,
+        info_.n_fits_per_block_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void LMFitCUDA::calc_gradients()
+{
+    dim3  threads(1, 1, 1);
+    dim3  blocks(1, 1, 1);
+
+    int const shared_size
+        = sizeof(float)
+        * info_.power_of_two_n_points_
+        * info_.n_fits_per_block_;
+
+    threads.x = info_.power_of_two_n_points_*info_.n_fits_per_block_;
+    threads.y = 1;
+    blocks.x = n_fits_ / info_.n_fits_per_block_;
+    blocks.y = 1;
+
+    cuda_calculate_gradients <<< blocks, threads, shared_size >>>(
+        gpu_data_.gradients_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.n_parameters_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        info_.estimator_id_,
+        gpu_data_.finished_,
+        gpu_data_.iteration_falied_,
+        info_.n_fits_per_block_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void LMFitCUDA::calc_hessians()
+{
+    dim3  threads(1, 1, 1);
+    dim3  blocks(1, 1, 1);
+
+    threads.x = info_.n_parameters_to_fit_;
+    threads.y = info_.n_parameters_to_fit_;
+    blocks.x = n_fits_;
+    blocks.y = 1;
+
+    cuda_calculate_hessians <<< blocks, threads >>>(
+        gpu_data_.hessians_,
+        gpu_data_.data_,
+        gpu_data_.values_,
+        gpu_data_.derivatives_,
+        gpu_data_.weights_,
+        info_.n_points_,
+        info_.n_parameters_,
+        info_.n_parameters_to_fit_,
+        gpu_data_.parameters_to_fit_indices_,
+        info_.estimator_id_,
+        gpu_data_.iteration_falied_,
+        gpu_data_.finished_,
+        gpu_data_.user_info_,
+        info_.user_info_size_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
+
+void LMFitCUDA::evaluate_iteration(int const iteration) // convergence check, bookkeeping, next-iteration setup
+{
+    dim3  threads(1, 1, 1);
+    dim3  blocks(1, 1, 1);
+    // one thread per fit, at most 256 threads per block; all three kernels share this configuration
+    threads.x = std::min(n_fits_, 256);
+    threads.y = 1;
+    blocks.x = (n_fits_ + threads.x - 1) / threads.x; // integer ceiling division; float math rounds n_fits_ > 2^24
+    blocks.y = 1;
+
+    cuda_check_for_convergence<<< blocks, threads >>>(
+        gpu_data_.finished_,
+        tolerance_,
+        gpu_data_.states_,
+        gpu_data_.chi_squares_,
+        gpu_data_.prev_chi_squares_,
+        iteration,
+        info_.max_n_iterations_,
+        n_fits_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+    // start from "all finished"; the next kernel presumably clears the flag for unfinished fits - confirm
+    gpu_data_.set(gpu_data_.all_finished_, 1);
+
+    cuda_evaluate_iteration<<< blocks, threads >>>(
+        gpu_data_.all_finished_,
+        gpu_data_.n_iterations_,
+        gpu_data_.finished_,
+        iteration,
+        gpu_data_.states_,
+        n_fits_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+    // read the flag back into the member tested by the loop condition of run()
+    gpu_data_.read(&all_finished_, gpu_data_.all_finished_);
+
+    cuda_prepare_next_iteration<<< blocks, threads >>>(
+        gpu_data_.lambdas_,
+        gpu_data_.chi_squares_,
+        gpu_data_.prev_chi_squares_,
+        gpu_data_.parameters_,
+        gpu_data_.prev_parameters_,
+        n_fits_,
+        info_.n_parameters_);
+    CUDA_CHECK_STATUS(cudaGetLastError());
+}
diff --git a/Gpufit/lse.cuh b/Gpufit/lse.cuh
new file mode 100644
index 0000000..e615b01
--- /dev/null
+++ b/Gpufit/lse.cuh
@@ -0,0 +1,186 @@
+#ifndef GPUFIT_LSE_CUH_INCLUDED
+#define GPUFIT_LSE_CUH_INCLUDED
+
+/* Description of the calculate_chi_square_lse function
+* =====================================================
+*
+* This function calculates the chi-square values for the weighted LSE estimator.
+*
+* Parameters:
+*
+* chi_square: An output vector of chi-square values for each data point.
+*
+* point_index: The data point index.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* weight: An input vector of values for weighting the chi-square values.
+*
+* state: A pointer to a value which indicates whether the fitting
+*        process was carried out correctly or which problem occurred.
+*        In this function it is not used. It can be used in functions calculating
+*        other estimators than the LSE, such as MLE. It is passed into this function
+*        to provide the same interface for all estimator functions.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_chi_square_lse function
+* =============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_chi_square_lse(
+    volatile float * chi_square,
+    int const point_index,
+    float const * data,
+    float const * value,
+    float const * weight,
+    int * state,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const deviation = value[point_index] - data[point_index];
+
+    if (weight)
+    {
+        chi_square[point_index] = deviation * deviation * weight[point_index];
+    }
+    else
+    {
+        chi_square[point_index] = deviation * deviation;
+    }
+}
+
+/* Description of the calculate_hessian_lse function
+* ==================================================
+*
+* This function calculates the hessian matrix values of the weighted LSE equation.
+* The calculation is performed based on previously calculated fitting curve derivative
+* values.
+*
+* Parameters:
+*
+* hessian: An output vector of values of the hessian matrix for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index_i: Index of the hessian column.
+*
+* parameter_index_j: Index of the hessian row.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+*             curve for each data point.
+*
+* weight: An input vector of values for weighting the hessian matrix values.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_hessian_lse function
+* ==========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_hessian_lse(
+    double * hessian,
+    int const point_index,
+    int const parameter_index_i,
+    int const parameter_index_j,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    if (weight)
+    {
+        *hessian
+            += derivative[parameter_index_i] * derivative[parameter_index_j]
+            * weight[point_index];
+    }
+    else
+    {
+        *hessian
+            += derivative[parameter_index_i] * derivative[parameter_index_j];
+    }
+}
+
+/* Description of the calculate_gradient_lse function
+* ===================================================
+*
+* This function calculates the gradient values of the weighted LSE equation
+* based on previously calculated fitting curve derivative values.
+*
+* Parameters:
+*
+* gradient: An output vector of values of the gradient vector for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index: The index of the fitting curve parameter for which the
+*                  gradient value is calculated. It selects the entry of the
+*                  derivative vector that is used.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+*             curve for each data point.
+*
+* weight: An input vector of values for weighting gradient values.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gradient_lse function
+* ===========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_gradient_lse(
+    volatile float * gradient,
+    int const point_index,
+    int const parameter_index,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const deviation = data[point_index] - value[point_index];
+
+    if (weight)
+    {
+        gradient[point_index]
+            = derivative[parameter_index] * deviation * weight[point_index];
+    }
+    else
+    {
+        gradient[point_index]
+            = derivative[parameter_index] * deviation;
+    }
+}
+
+#endif
diff --git a/Gpufit/matlab/CMakeLists.txt b/Gpufit/matlab/CMakeLists.txt
new file mode 100644
index 0000000..b0c5dc8
--- /dev/null
+++ b/Gpufit/matlab/CMakeLists.txt
@@ -0,0 +1,69 @@
+
+# MATLAB Gpufit binding
+
+find_package( Matlab COMPONENTS MX_LIBRARY )
+
+if( NOT Matlab_FOUND )
+  message( STATUS "Matlab and/or MX_Library NOT found - skipping Gpufit Matlab binding!" )
+  return()
+endif()
+
+# MATLAB MEX FILE
+
+set( Headers
+  )
+
+set( Sources
+  mex/GpufitMex.cpp
+  )
+
+add_library( GpufitMex SHARED
+  ${Headers}
+  ${Sources}
+  )
+
+set_property( TARGET GpufitMex
+  PROPERTY SUFFIX .${Matlab_MEX_EXTENSION} )
+
+set_property( TARGET GpufitMex
+  PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+
+target_include_directories( GpufitMex PRIVATE ${Matlab_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR} )
+
+target_link_libraries( GpufitMex Gpufit ${Matlab_LIBRARIES} )
+
+if( WIN32 ) # export the MEX entry point; append to this target only, keeping the default and user linker flags
+  set_property( TARGET GpufitMex APPEND_STRING PROPERTY LINK_FLAGS " /export:mexFunction" )
+endif()
+
+add_matlab_launcher( GpufitMex "${CMAKE_CURRENT_SOURCE_DIR}" )
+
+# MATLAB Gpufit PACKAGE
+
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/matlab" )
+set( package_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/EstimatorID.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/gpufit.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/ModelID.m"
+  "${CMAKE_CURRENT_SOURCE_DIR}/README.txt"
+)
+set( binary_gpufit $<TARGET_FILE:Gpufit> )
+set( binary_mex $<TARGET_FILE:GpufitMex> )
+
+add_custom_target( MATLAB_GPUFIT_PACKAGE
+  COMMAND ${CMAKE_COMMAND} -E
+    remove_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${package_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_gpufit} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary_mex} ${build_directory}	
+  COMMENT "Creating Gpufit Matlab package"
+)
+set_property( TARGET MATLAB_GPUFIT_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+add_dependencies( MATLAB_GPUFIT_PACKAGE Gpufit GpufitMex)
+
+# add launcher
diff --git a/Gpufit/matlab/EstimatorID.m b/Gpufit/matlab/EstimatorID.m
new file mode 100644
index 0000000..a853ffa
--- /dev/null
+++ b/Gpufit/matlab/EstimatorID.m
@@ -0,0 +1,6 @@
+classdef EstimatorID
+    properties (Constant = true)
+        LSE = 0
+        MLE = 1
+    end
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/ModelID.m b/Gpufit/matlab/ModelID.m
new file mode 100644
index 0000000..174c703
--- /dev/null
+++ b/Gpufit/matlab/ModelID.m
@@ -0,0 +1,10 @@
+classdef ModelID
+    properties (Constant = true)
+        GAUSS_1D = 0
+        GAUSS_2D = 1
+        GAUSS_2D_ELLIPTIC = 2
+        GAUSS_2D_ROTATED = 3
+        CAUCHY_2D_ELLIPTIC = 4
+        LINEAR_1D = 5
+    end
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/README.txt b/Gpufit/matlab/README.txt
new file mode 100644
index 0000000..02ddfd2
--- /dev/null
+++ b/Gpufit/matlab/README.txt
@@ -0,0 +1,19 @@
+Matlab binding for the [Gpufit library](https://github.com/gpufit/Gpufit) which implements Levenberg Marquardt curve fitting in CUDA
+
+Requirements
+
+- A CUDA capable graphics card with a recent Nvidia graphics driver (at least 367.48 / July 2016)
+- Windows
+- Matlab 32/64bit
+
+Installation
+
+An installation is not necessary. However, this path must be part of the Matlab path. Use `addpath` if necessary.
+
+Examples
+
+See the examples folder. The examples are fully functional only with Matlab R2014a or newer.
+
+Troubleshooting
+
+A common reason for the error message 'CUDA driver version is insufficient for CUDA runtime version' is an outdated Nvidia graphics driver.
\ No newline at end of file
diff --git a/Gpufit/matlab/examples/gauss2d.m b/Gpufit/matlab/examples/gauss2d.m
new file mode 100644
index 0000000..bf478a4
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d.m
@@ -0,0 +1,182 @@
+function gauss2d()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+
+% perform some 2D Gaussian peak fits with a symmetrical Gaussian peak
+fit_gauss2d();
+
+% perform some 2D Gaussian peak fits with an asymmetrical, rotated Gaussian peak
+fit_gauss2d_rotated();
+
+end
+function fit_gauss2d()
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 5;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([20, 9.5, 9.5, 3, 10]);
+
+% initialize random number generator
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits));
+% randomize relative for other parameters
+initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+data = gaussian_2d(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D;
+
+%% run Gpufit
+[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ...
+    model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+end
+
+function fit_gauss2d_rotated()
+
+%% number of fits and fit points
+number_fits = 1e4;
+size_x = 20;
+number_parameters = 7;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([200, 9.5, 9.5, 3, 4, 10, 0.5]);
+
+% initialize random number generator
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+initial_parameters(2, :) = initial_parameters(2, :) + true_parameters(4) * (-0.2 + 0.4 * rand(1, number_fits));
+initial_parameters(3, :) = initial_parameters(3, :) + true_parameters(5) * (-0.2 + 0.4 * rand(1, number_fits));
+% randomize relative for other parameters
+initial_parameters([1,4,5,6,7], :) = initial_parameters([1,4,5,6,7], :) .* (0.8 + 0.4 * rand(5, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+data = gaussian_2d_rotated(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-3;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D_ROTATED;
+
+%% run Gpufit
+[parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ...
+    model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+%% displaying results
+display_results('2D rotated Gaussian peak', model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations);
+
+
+end
+
+function g = gaussian_2d(x, y, p)
+% Generates a 2D Gaussian peak.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x,y - x and y grid position values
+% p - parameters (amplitude, x,y center position, width, offset)
+
+g = p(1) * exp(-((x - p(2)).^2 + (y - p(3)).^2) / (2 * p(4)^2)) + p(5);
+
+end
+
+function g = gaussian_2d_rotated(x, y, p)
+% Generates a 2D rotated elliptic Gaussian peak.
+% http://gpufit.readthedocs.io/en/latest/api.html#d-rotated-elliptic-gaussian-peak
+%
+% x,y - x and y grid position values
+% p - parameters (amplitude, x,y center position, widths along the two rotated axes, offset, rotation angle)
+
+% cosine and sine of the rotation angle p(7)
+cp = cos(p(7));
+sp = sin(p(7));
+
+% coordinates relative to the center p(2),p(3), rotated by the angle; widths are p(4) and p(5)
+arga = (x - p(2)) .* cp - (y - p(3)) .* sp;
+argb = (x - p(2)) .* sp + (y - p(3)) .* cp;
+ex = exp(-0.5 .* (((arga / p(4)) .* (arga / p(4))) + ((argb / p(5)) .* (argb / p(5)))));
+g = p(1) .* ex + p(6);
+
+end
+
+function display_results(name, model_id, number_fits, number_parameters, size_x, time, true_parameters, parameters, states, chi_squares, n_iterations)
+
+%% displaying results
+converged = states == 0;
+fprintf('\nGpufit of %s\n', name);
+
+% print summary
+fprintf('\nmodel ID:        %d\n', model_id);
+fprintf('number of fits:  %d\n', number_fits);
+fprintf('fit size:        %d x %d\n', size_x, size_x);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares(converged)));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations(converged)));
+fprintf('time:            %6.2f s\n', time);
+
+% get fit states
+number_converged = sum(converged);
+fprintf('\nratio converged         %6.2f %%\n', number_converged / number_fits * 100);
+fprintf('ratio max it. exceeded  %6.2f %%\n', sum(states == 1) / number_fits * 100);
+fprintf('ratio singular hessian  %6.2f %%\n', sum(states == 2) / number_fits * 100);
+fprintf('ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100);
+
+% mean and std of fitted parameters
+converged_parameters = parameters(:, converged);
+converged_parameters_mean = mean(converged_parameters, 2);
+converged_parameters_std  = std(converged_parameters, [], 2);
+fprintf('\nparameters of %s\n', name);
+for i = 1 : number_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i));
+end
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/examples/gauss2d_comparison.m b/Gpufit/matlab/examples/gauss2d_comparison.m
new file mode 100644
index 0000000..39dc68b
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d_comparison.m
@@ -0,0 +1,206 @@
+function gauss2d_comparison()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+% compared to a generic Matlab implementation using fminunc and supplying
+% the gradient by the user (uses quasi-newton as algorithm)
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+%
+% Both runs get the same data, start values, tolerance and iteration limit.
+% A summary of each run is printed to the command window.
+
+%% number of fits and fit points
+number_fits = 1e3;
+size_x = 20; % the fit region is a size_x x size_x grid
+number_parameters = 5;
+
+%% set input arguments
+
+% true parameters
+true_parameters = single([10, 9.5, 9.5, 3, 10]);
+
+% initialize random number generator
+% (fixed seed, all random numbers below are drawn in a fixed order)
+rng(0);
+
+% initial parameters (randomized)
+initial_parameters = repmat(single(true_parameters'), [1, number_fits]);
+% randomize relative to width for positions
+% (shift by a uniform random value within +/- 0.2 times the true width)
+initial_parameters([2,3], :) = initial_parameters([2,3], :) + true_parameters(4) * (-0.2 + 0.4 * rand(2, number_fits));
+% randomize relative for other parameters
+% (scale by a uniform random factor between 0.8 and 1.2)
+initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, number_fits));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% generate data with Poisson noise
+% (the noise-free peak is replicated into one column per fit, then every
+% column receives its own Poisson noise)
+data = gaussian_2d(x, y, true_parameters);
+data = repmat(data(:), [1, number_fits]);
+data = poissrnd(data);
+
+% tolerance
+tolerance = 1e-4;
+
+% maximum number of iterations
+max_n_iterations = 20;
+
+% estimator id
+estimator_id = EstimatorID.MLE;
+
+% model ID
+model_id = ModelID.GAUSS_2D; % Gaussian peak in 2D
+
+%% run Gpufit
+fprintf('run Gpufit\n');
+% weights, parameters_to_fit and user_info are passed as empty matrices
+[gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time] = gpufit(data, [], ...
+    model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+% display results
+display_results('Gpufit', gf_parameters, gf_states, gf_chi_squares, gf_n_iterations, time, true_parameters);
+
+%% run Matlab
+
+% convert data and initial_parameters to double (otherwise causes an error
+% in fminunc)
+data = double(data);
+initial_parameters = double(initial_parameters);
+% grid positions as row vectors, in the same element order as the rows of data
+xi = double(x(:)');
+yi = double(y(:)');
+
+% set fit options
+% (same iteration limit and tolerance as the Gpufit run, gradient supplied
+% by the objective function)
+options = optimoptions(@fminunc,'Display', 'off', 'MaxIter', max_n_iterations, 'Algorithm', 'quasi-newton', 'TolFun', tolerance, 'GradObj', 'on', 'DerivativeCheck', 'off', 'Diagnostics', 'off');
+
+% initialize output arrays
+m_parameters = zeros(number_parameters, number_fits);
+m_states = zeros(1, number_fits);
+m_chi_squares = zeros(1, number_fits);
+m_n_iterations = zeros(1, number_fits);
+
+% loop over each fit
+fprintf('\n')
+progress = 0;
+L = 50; % length of progressbar
+tic;
+for i = 1 : number_fits
+    
+    % get data and initial_parameters
+    d = data(:, i)';
+    p0 = initial_parameters(:, i);
+    
+    % define minimizer function (give grid and data as implicit parameters)
+    fun = @(p) minimizer(p, xi, yi, d);
+    
+    % call to fminunc
+    [p, fval, exitflag, output] = fminunc(fun, p0, options);
+    
+    % copy to output
+    m_parameters(:, i) = p;
+    m_chi_squares(i) = fval;
+    % exit flag shifted by one, so an fminunc exit flag of 1 is stored as 0
+    % NOTE(review): display_results ignores the states argument
+    m_states(i) = exitflag - 1;
+    m_n_iterations(i) = output.iterations;
+    
+    % print one '|' each time number_fits / L further fits are done
+    progress = progress + 1;
+    if progress >= number_fits / L
+        progress = 0;
+        fprintf('|');
+    end
+end
+time = toc;
+% erase the progress bar again by printing L backspaces
+fprintf(repmat('\b', [1, L]));
+
+% display results
+display_results('Matlab (one CPU kernel)', m_parameters, m_states, m_chi_squares, m_n_iterations, time, true_parameters);
+
+end
+
+function [f, g] = minimizer(p, xi, yi, d)
+% Objective function for the optimizer: evaluates the 2D Gaussian model for
+% the current parameters and feeds it into the Poisson likelihood function.
+% The gradient g is only computed if the caller asks for a second output.
+%
+% p - current parameters
+% xi, yi - grid positions
+% d - current data
+
+% value only: the model derivatives are not needed
+if nargout <= 1
+    f = poisson_likelihood(gaussian_2d(xi, yi, p), [], d);
+    return;
+end
+
+% value and gradient
+[model, model_gradient] = gaussian_2d_with_gradient(xi, yi, p);
+[f, g] = poisson_likelihood(model, model_gradient, d);
+
+end
+
+function [f, g] = poisson_likelihood(m, mg, d)
+% Value and (optionally) gradient of the Poisson likelihood function.
+%
+% m  - model values
+% mg - derivatives of the model, one row per parameter (only read if the
+%      gradient is requested)
+% d  - data values
+%
+% f - 2 * (sum(m - d) - sum(d .* log(m ./ d))), where the logarithmic term
+%     only runs over points with d > 0
+% g - derivative of f with respect to each parameter (column vector)
+
+% the logarithmic term is restricted to points with positive data
+positive = d > 0;
+log_term = d(positive) .* log(m(positive) ./ d(positive));
+f = 2 * (sum(m - d) - sum(log_term));
+
+if nargout < 2 % gradient not required
+    return;
+end
+
+% per-point factor; the model is bounded from below by 1e-6 in the division
+factor = 2 * (1 - d ./ max(m, 1e-6));
+
+% weight each row of model derivatives and sum over the points
+g = sum(repmat(factor, [size(mg, 1), 1]) .* mg, 2);
+
+end
+
+
+function display_results(name, parameters, ~, chi_squares, n_iterations, time, true_parameters)
+% Prints a summary of one fit run to the command window.
+%
+% name            - label printed as heading
+% parameters      - fitted parameters, indexed as (parameter, fit)
+% (third input)   - ignored
+% chi_squares     - final chi-square per fit
+% n_iterations    - number of iterations per fit
+% time            - elapsed time in seconds
+% true_parameters - reference values printed next to the statistics
+
+fprintf('*%s*\n', name);
+
+% both counts are taken from the shape of the parameters array
+number_fits = size(parameters, 2);
+number_parameters = size(parameters, 1);
+
+% general summary (averages run over all fits)
+fprintf('\nnumber of fits: %d\n', number_fits);
+fprintf('mean chi-square: %6.2f\n', mean(chi_squares));
+fprintf('mean iterations: %6.2f\n', mean(n_iterations));
+fprintf('time:            %6.2f s\n', time);
+fprintf('fits per second: %.0f\n', number_fits / time);
+
+% per-parameter statistics over all fits
+fitted_mean = mean(parameters, 2);
+fitted_std  = std(parameters, [], 2);
+fprintf('\nparameters of 2D Gaussian peak\n');
+for k = 1 : number_parameters
+    fprintf('p%d true %6.2f mean %6.2f std %6.2f\n', k, true_parameters(k), fitted_mean(k), fitted_std(k));
+end
+
+end
+
+function f = gaussian_2d(x, y, p)
+% Evaluates a symmetric 2D Gaussian peak on the given grid positions.
+% http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+%
+% x,y - x and y grid position values
+% p - parameters (amplitude, x,y center position, width, offset)
+
+% squared distance of every grid position from the peak center
+dx = x - p(2);
+dy = y - p(3);
+squared_distance = dx.^2 + dy.^2;
+
+f = p(1) * exp(-squared_distance / (2 * p(4)^2)) + p(5);
+
+end
+
+function [f, g] = gaussian_2d_with_gradient(x, y, p)
+% Evaluates a 2D Gaussian peak together with its partial derivatives with
+% respect to the five parameters.
+%
+% x,y - x and y grid position values
+% p - parameters (amplitude, x,y center position, width, offset)
+%
+% f - model values
+% g - partial derivatives stacked vertically in parameter order, i.e. one
+%     row per parameter when x and y are row vectors
+
+% shared terms
+width_squared = p(4)^2;
+offset_x = x - p(2);
+offset_y = y - p(3);
+scaled_distance = (offset_x.^2 + offset_y.^2) / width_squared;
+peak_shape = exp(-0.5 * scaled_distance);
+peak = p(1) * peak_shape;
+
+% model value
+f = peak + p(5);
+
+% derivatives with respect to amplitude, center x, center y, width, offset
+g = vertcat( ...
+    peak_shape, ...
+    peak .* offset_x / width_squared, ...
+    peak .* offset_y / width_squared, ...
+    peak .* scaled_distance / p(4), ...
+    ones(size(x)));
+
+end
diff --git a/Gpufit/matlab/examples/gauss2d_plot.m b/Gpufit/matlab/examples/gauss2d_plot.m
new file mode 100644
index 0000000..cef6adc
--- /dev/null
+++ b/Gpufit/matlab/examples/gauss2d_plot.m
@@ -0,0 +1,117 @@
+function gauss2d_plot()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Multiple fits of a 2D Gaussian peak function with normally distributed
+% noise, repeated for a different total number of fits each time and
+% plotting the resulting speed (fits per second) against the number of fits
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+
+%% number of fit points
+size_x = 5;
+n_points = size_x * size_x;
+
+%% set input arguments
+
+% mean true parameters
+mean_true_parameters = single([100, 3, 3, 1, 10]);
+
+% average noise level
+% (standard deviation of the normally distributed noise added to the data)
+average_noise_level = 10;
+
+% initialize random number generator
+% (fixed seed, all random numbers below are drawn in a fixed order)
+rng(0);
+
+% tolerance
+tolerance = 1e-4;
+
+% max number of iterations
+max_n_iterations = 10;
+
+% model id
+model_id = ModelID.GAUSS_2D;
+
+%% loop over different number of fits
+% 20 logarithmically spaced values between 1e2 and 1e6
+n_fits_all = round(logspace(2, 6, 20));
+
+% generate x and y values
+g = single(0 : size_x - 1);
+[x, y] = ndgrid(g, g);
+
+% loop
+speed = zeros(length(n_fits_all), 1);
+for i = 1:length(n_fits_all)
+    n_fits = n_fits_all(i);
+    
+    % vary positions of 2D Gaussians peaks slightly
+    % (shift by a uniform random value within +/- 0.2 times the width)
+    test_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    test_parameters([2,3], :) = test_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+    
+    % generate data
+    % (one column of n_points values per fit)
+    data = gaussians_2d(x, y, test_parameters);
+    data = reshape(data, [n_points, n_fits]);
+    
+    % add noise
+    data = data + average_noise_level * randn(size(data), 'single');
+    
+    % initial parameters (randomized)
+    initial_parameters = repmat(mean_true_parameters', [1, n_fits]);
+    % randomize relative to width for positions
+    initial_parameters([2,3], :) = initial_parameters([2,3], :) + mean_true_parameters(4) * (-0.2 + 0.4 * rand(2, n_fits));
+    % randomize relative for other parameters
+    % (scale by a uniform random factor between 0.8 and 1.2)
+    initial_parameters([1,4,5], :) = initial_parameters([1,4,5], :) .* (0.8 + 0.4 * rand(3, n_fits));
+    
+    % run Gpufit
+    % (no weights; remaining optional arguments are left at their defaults)
+    [parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], ...
+        model_id, initial_parameters, tolerance, max_n_iterations);
+
+    % analyze result
+    converged = states == 0;
+    speed(i) = n_fits / time;
+    % NOTE(review): precision_x0 is computed but not used anywhere below
+    precision_x0 = std(parameters(2, converged) - test_parameters(2, converged));
+    
+    % display result
+    % (appended to the status text printed by gaussians_2d)
+    fprintf('     iterations: %.2f | time: %.3f s | speed: %8.0f fits/s\n', ...
+        mean(n_iterations(converged)), time, speed(i));
+end
+
+%% plot
+figure();
+semilogx(n_fits_all, speed, 'bo-')
+xlabel('number of fits per function call')
+ylabel('fits per second')
+legend('Gpufit', 'Location', 'NorthWest')
+grid on;
+xlim(n_fits_all([1,end]));
+
+end
+
+function g = gaussians_2d(x, y, p)
+% Evaluates one 2D Gaussian peak per column of p on the grid (x, y) and
+% stacks the results along the last dimension of the single precision
+% output g.
+%
+% While running, a status message followed by a progress bar is printed.
+% Afterwards both are erased with backspaces and replaced by a short text
+% containing the number of fits.
+%
+% x, y - grid position values
+% p    - parameters, one column per peak
+%        (amplitude, x,y center position, width, offset)
+
+n_fits = size(p, 2);
+
+% status message
+msg = sprintf('generating %d fits ', n_fits);
+fprintf(msg);
+
+g = zeros([size(x), n_fits], 'single');
+
+bar_length = 50;                    % length of progressbar
+fits_per_bar = n_fits / bar_length; % number of fits represented by one '|'
+fits_since_bar = 0;
+bars_printed = 0;
+
+for k = 1 : n_fits
+    
+    % parameters of the current peak
+    amplitude = p(1, k);
+    center_x = p(2, k);
+    center_y = p(3, k);
+    width = p(4, k);
+    offset = p(5, k);
+    
+    g(:, :, k) = amplitude * exp(-((x - center_x).^2 + (y - center_y).^2) / (2 * width^2)) + offset;
+    
+    % advance the progress bar
+    fits_since_bar = fits_since_bar + 1;
+    if fits_since_bar >= fits_per_bar
+        fits_since_bar = 0;
+        fprintf('|');
+        bars_printed = bars_printed + 1;
+    end
+end
+
+% erase message and progress bar, then print the final text
+fprintf(repmat('\b', [1, length(msg) + bars_printed]));
+fprintf('%7d fits', n_fits);
+
+end
diff --git a/Gpufit/matlab/examples/simple.m b/Gpufit/matlab/examples/simple.m
new file mode 100644
index 0000000..27487d1
--- /dev/null
+++ b/Gpufit/matlab/examples/simple.m
@@ -0,0 +1,26 @@
+function simple()
+% Example of the Matlab binding of the Gpufit library implementing
+% Levenberg Marquardt curve fitting in CUDA
+% https://github.com/gpufit/Gpufit
+%
+% Minimal call of the Matlab interface: only the four mandatory arguments
+% (data, weights, model ID, initial parameters) are passed, the weights as
+% empty matrix.
+% http://gpufit.readthedocs.io/en/latest/bindings.html#matlab
+
+% problem size: number of fits and number of points per fit
+number_fits = 10;
+number_points = 10;
+
+% model ID and number of parameters allocated per fit
+model_id = ModelID.GAUSS_1D;
+number_parameter = 4;
+
+% data and initial parameters: single precision arrays filled with zeros,
+% one column per fit
+data = zeros([number_points, number_fits], 'single');
+initial_parameters = zeros([number_parameter, number_fits], 'single');
+
+% run Gpufit (all five outputs are requested)
+[parameters, states, chi_squares, number_iterations, execution_time] = gpufit(data, [], model_id, initial_parameters);
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/gpufit.m b/Gpufit/matlab/gpufit.m
new file mode 100644
index 0000000..2e3beae
--- /dev/null
+++ b/Gpufit/matlab/gpufit.m
@@ -0,0 +1,119 @@
+function [parameters, states, chi_squares, n_iterations, time]...
+    = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+% Wrapper around the Gpufit mex file.
+%
+% Checks sizes and types of the arguments, fills in default values, calls
+% GpufitMex and measures the time of that call.
+%
+% Mandatory arguments
+%   data               - single, size (number of points, number of fits)
+%   weights            - single, same size as data, or [] for no weights
+%   model_id           - ID of the model to fit
+%   initial_parameters - single, size (number of parameters, number of fits)
+%
+% Optional arguments (may be omitted from the right or given as empty
+% matrix [] to get the default value)
+%   tolerance          - default 1e-4, cast to single
+%   max_n_iterations   - default 25, cast to int32
+%   parameters_to_fit  - default all ones (number of parameters x 1), cast
+%                        to int32
+%   estimator_id       - default EstimatorID.LSE
+%   user_info          - default []; passed on together with its size in
+%                        bytes
+%
+% Outputs
+%   parameters         - fitted parameters, size (number of parameters,
+%                        number of fits)
+%   states, chi_squares, n_iterations - as returned by GpufitMex
+%   time               - duration of the GpufitMex call in seconds
+
+%% size checks
+
+% number of input parameter (variable), missing trailing arguments are
+% treated like empty matrices
+assert(nargin >= 4, 'Not enough parameters');
+if nargin < 5
+    tolerance = [];
+end
+if nargin < 6
+    max_n_iterations = [];
+end
+if nargin < 7
+    parameters_to_fit = [];
+end
+if nargin < 8
+    estimator_id = [];
+end
+if nargin < 9
+    user_info = [];
+end
+
+% data is 2D and read number of points and fits
+data_size = size(data);
+assert(length(data_size) == 2, 'data is not two-dimensional');
+n_points = data_size(1);
+n_fits = data_size(2);
+
+% consistency with weights (if given)
+if ~isempty(weights)
+    assert(isequal(data_size, size(weights)), 'Dimension mismatch between data and weights')
+end
+
+% initial parameters is 2D and read number of parameters
+initial_parameters_size = size(initial_parameters);
+assert(length(initial_parameters_size) == 2, 'initial_parameters is not two-dimensional');
+n_parameters = initial_parameters_size(1);
+assert(n_fits == initial_parameters_size(2), 'Dimension mismatch in number of fits between data and initial_parameters');
+
+% consistency with parameters_to_fit (if given)
+if ~isempty(parameters_to_fit)
+    assert(size(parameters_to_fit, 1) == n_parameters, 'Dimension mismatch in number of parameters between initial_parameters and parameters_to_fit');
+end
+
+%% default values
+
+% tolerance
+if isempty(tolerance)
+    tolerance = 1e-4;
+end
+
+% max_n_iterations
+if isempty(max_n_iterations)
+    max_n_iterations = 25;
+end
+
+% estimator_id
+if isempty(estimator_id)
+    estimator_id = EstimatorID.LSE;
+end
+
+% parameters_to_fit
+if isempty(parameters_to_fit)
+    parameters_to_fit = ones(n_parameters, 1, 'int32');
+end
+
+% now only weights and user_info could be not given (empty matrix)
+
+%% type checks
+
+% data, weights (if given), initial_parameters are all single
+assert(isa(data, 'single'), 'Type of data is not single');
+if ~isempty(weights)
+    assert(isa(weights, 'single'), 'Type of weights is not single');
+end
+assert(isa(initial_parameters, 'single'), 'Type of initial_parameters is not single');
+
+% parameters_to_fit is int32 (cast to int32 if incorrect type)
+if ~isa(parameters_to_fit, 'int32')
+    parameters_to_fit = int32(parameters_to_fit);
+end
+
+% max_n_iterations must be int32 (cast if incorrect type)
+if ~isa(max_n_iterations, 'int32')
+    max_n_iterations = int32(max_n_iterations);
+end
+
+% tolerance must be single (cast if incorrect type)
+if ~isa(tolerance, 'single')
+    tolerance = single(tolerance);
+end
+
+% model_id and estimator_id are handed over as plain doubles (cast if
+% incorrect type), so the mex file never receives an ID stored in another
+% numeric class
+if ~isa(model_id, 'double')
+    model_id = double(model_id);
+end
+if ~isa(estimator_id, 'double')
+    estimator_id = double(estimator_id);
+end
+
+% we don't check type of user_info, but we extract the size in bytes of it
+if ~isempty(user_info)
+    user_info_info = whos('user_info');
+    user_info_size = user_info_info.bytes;
+else
+    user_info_size = 0;
+end
+
+
+%% run Gpufit taking the time
+tic;
+[parameters, states, chi_squares, n_iterations] ...
+    = GpufitMex(data, weights, n_fits, n_points, tolerance, max_n_iterations, estimator_id, initial_parameters, parameters_to_fit, model_id, n_parameters, user_info, user_info_size);
+
+time = toc;
+
+% reshape the output parameters array to have dimensions
+% (n_parameters,n_fits)
+parameters = reshape(parameters,n_parameters,n_fits);
+
+end
diff --git a/Gpufit/matlab/mex/GpufitMex.cpp b/Gpufit/matlab/mex/GpufitMex.cpp
new file mode 100644
index 0000000..071ed7c
--- /dev/null
+++ b/Gpufit/matlab/mex/GpufitMex.cpp
@@ -0,0 +1,150 @@
+#include "Gpufit/gpufit.h"
+
+#include <mex.h>
+
+#include <cstring>
+#include <string>
+
+/*
+	Reads a real (non complex) numeric scalar of a given class from a Matlab array.
+
+	p  - array to read from
+	v  - receives the value, left untouched if the check fails
+	id - class ID the array must have (T has to be the matching C++ type)
+
+	Returns true if p is numeric, not complex, has exactly one element and
+	is of class id, false otherwise.
+	https://www.mathworks.com/help/matlab/apiref/mxclassid.html
+*/
+template<class T> inline bool get_scalar(const mxArray *p, T &v, const mxClassID id)
+{
+	bool const is_real_scalar = mxIsNumeric(p) && !mxIsComplex(p) && mxGetNumberOfElements(p) == 1;
+
+	// the class is only inspected once p is known to be a real scalar
+	if (!is_real_scalar || mxGetClassID(p) != id)
+	{
+		return false;
+	}
+
+	v = *static_cast<T *>(mxGetData(p));
+	return true;
+}
+
+/*
+    Entry point of the mex file: unpacks the Matlab arguments, allocates the
+    output arrays, calls gpufit and raises a Matlab error if the call fails.
+
+    Expected inputs (prhs), in this order: data, weights, n_fits, n_points,
+    tolerance, max_n_iterations, estimator_id, initial_parameters,
+    parameters_to_fit, model_id, n_parameters, user_info, user_info_size.
+
+    Outputs (plhs): parameters (1 x n_fits*n_parameters, single),
+    states (1 x n_fits, int32), chi_squares (1 x n_fits, single),
+    n_iterations (1 x n_fits, int32).
+
+    Apart from tolerance and max_n_iterations the class of the inputs is not
+    checked here.
+*/
+void mexFunction(
+    int          nlhs,
+    mxArray      *plhs[],
+    int          nrhs,
+    mxArray const *prhs[])
+{
+    // expects a certain number of input (nrhs) and output (nlhs) arguments
+    int const expected_nrhs = 13;
+    int const expected_nlhs = 4;
+
+    // mexErrMsgIdAndTxt takes a printf style format and does not return
+    if (nrhs != expected_nrhs)
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "%d input arguments required.", expected_nrhs);
+    }
+    if (nlhs != expected_nlhs)
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "%d output arguments required.", expected_nlhs);
+    }
+
+    // input parameters
+    // arrays are accessed through mxGetData because they are not of class
+    // double; for an empty array (weights not given) the pointer is NULL
+    float * data = static_cast<float *>(mxGetData(prhs[0]));
+    float * weights = static_cast<float *>(mxGetData(prhs[1]));
+
+    // mxGetScalar converts the first element of any numeric class to double
+    std::size_t const n_fits = static_cast<std::size_t>(mxGetScalar(prhs[2]));
+    std::size_t const n_points = static_cast<std::size_t>(mxGetScalar(prhs[3]));
+
+    // tolerance
+    float tolerance = 0;
+    if (!get_scalar(prhs[4], tolerance, mxSINGLE_CLASS))
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "tolerance is not a single");
+    }
+
+    // max_n_iterations
+    int max_n_iterations = 0;
+    if (!get_scalar(prhs[5], max_n_iterations, mxINT32_CLASS))
+    {
+        mexErrMsgIdAndTxt("Gpufit:Mex", "max_n_iteration is not int32");
+    }
+
+    int const estimator_id = static_cast<int>(mxGetScalar(prhs[6]));
+    float * initial_parameters = static_cast<float *>(mxGetData(prhs[7]));
+    int * parameters_to_fit = static_cast<int *>(mxGetData(prhs[8]));
+    int const model_id = static_cast<int>(mxGetScalar(prhs[9]));
+    int const n_parameters = static_cast<int>(mxGetScalar(prhs[10]));
+
+    // user_info is passed on as raw bytes together with its size
+    char * user_info = static_cast<char *>(mxGetData(prhs[11]));
+    std::size_t const user_info_size = static_cast<std::size_t>(mxGetScalar(prhs[12]));
+
+    // output parameters
+    mxArray * mx_parameters = mxCreateNumericMatrix(1, n_fits*n_parameters, mxSINGLE_CLASS, mxREAL);
+    float * output_parameters = static_cast<float *>(mxGetData(mx_parameters));
+    plhs[0] = mx_parameters;
+
+    mxArray * mx_states = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_states = static_cast<int *>(mxGetData(mx_states));
+    plhs[1] = mx_states;
+
+    mxArray * mx_chi_squares = mxCreateNumericMatrix(1, n_fits, mxSINGLE_CLASS, mxREAL);
+    float * output_chi_squares = static_cast<float *>(mxGetData(mx_chi_squares));
+    plhs[2] = mx_chi_squares;
+
+    mxArray * mx_n_iterations = mxCreateNumericMatrix(1, n_fits, mxINT32_CLASS, mxREAL);
+    int * output_n_iterations = static_cast<int *>(mxGetData(mx_n_iterations));
+    plhs[3] = mx_n_iterations;
+
+    // call to gpufit
+    int const status
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data,
+                weights,
+                model_id,
+                initial_parameters,
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit,
+                estimator_id,
+                user_info_size,
+                user_info,
+                output_parameters,
+                output_states,
+                output_chi_squares,
+                output_n_iterations
+            ) ;
+
+    // check status
+    if (status != STATUS_OK)
+    {
+        // the message is passed as argument of a "%s" format, so a '%' in
+        // the error text is printed literally
+        std::string const error = gpufit_get_last_error() ;
+        mexErrMsgIdAndTxt( "Gpufit:Mex", "%s", error.c_str() ) ;
+    }
+}
diff --git a/Gpufit/matlab/tests/gauss_fit_1d_test.m b/Gpufit/matlab/tests/gauss_fit_1d_test.m
new file mode 100644
index 0000000..412c72e
--- /dev/null
+++ b/Gpufit/matlab/tests/gauss_fit_1d_test.m
@@ -0,0 +1,35 @@
+% Equivalent/similar to tests/Gauss_Fit_1D.cpp
+% Fits a single noise-free 1D Gaussian and checks the result against the
+% parameters the data was generated with.
+
+% problem size and reference parameters
+n_fits = 1;
+n_points = 5;
+n_parameters = 4;
+true_parameters = single([4; 2; 0.5; 1]);
+
+% data: the model sampled at x = 0 .. n_points - 1, one column per fit
+x = single((0 : n_points - 1)');
+data = zeros(n_points, n_fits, 'single');
+data(:, 1) = gaussian_1d(true_parameters, x);
+
+% model
+model_id = ModelID.GAUSS_1D;
+
+% start values
+initial_parameters = zeros(n_parameters, n_fits, 'single');
+initial_parameters(:, 1) = [2, 1.5, 0.3, 0];
+
+% call to gpufit (no weights, all optional arguments at their defaults)
+[parameters, states, chi_squares, n_iterations] = gpufit(data, [], model_id, initial_parameters);
+
+%% Test results
+assert(states == 0);
+assert(n_iterations < 10);
+assert(chi_squares < 1e-6);
+assert(all(abs(parameters - true_parameters) < 1e-6));
+
+function y = gaussian_1d(p, x)
+% Evaluates a 1D Gaussian peak at the positions x.
+%
+% p - parameters (amplitude, center position, width, offset)
+% x - positions
+
+distance = x - p(2);
+y = p(1) * exp(-distance.^2 ./ (2 * p(3).^2)) + p(4);
+
+end
\ No newline at end of file
diff --git a/Gpufit/matlab/tests/run_tests.m b/Gpufit/matlab/tests/run_tests.m
new file mode 100644
index 0000000..80da345
--- /dev/null
+++ b/Gpufit/matlab/tests/run_tests.m
@@ -0,0 +1,8 @@
+function run_tests()
+% Creates a test suite with testsuite() (called without arguments), runs
+% it and displays the result.
+% See also: http://www.mathworks.com/help/matlab/script-based-unit-tests.html
+
+disp(run(testsuite()));
+end
\ No newline at end of file
diff --git a/Gpufit/mle.cuh b/Gpufit/mle.cuh
new file mode 100644
index 0000000..32a45a0
--- /dev/null
+++ b/Gpufit/mle.cuh
@@ -0,0 +1,179 @@
+#ifndef GPUFIT_MLE_CUH_INCLUDED
+#define GPUFIT_MLE_CUH_INCLUDED
+
+#include <math.h>
+
+/* Description of the calculate_chi_square_mle function
+* =====================================================
+*
+* This function calculates the chi-square values for the MLE estimator.
+*
+* Parameters:
+*
+* chi_square: An output vector of chi-square values for each data point.
+*
+* point_index: The data point index.
+*
+* data: An input vector of data.
+*
+* value: An input vector of fitting curve values.
+*
+* weight: An input vector of values for weighting chi-square values. It is not used
+*         in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* state: A pointer to a value which indicates whether the fitting process was carried
+*        out correctly or which problem occurred. It is set to 3 if a fitting curve
+*        value is negative.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_chi_square_mle function
+* =============================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_chi_square_mle(
+    volatile float * chi_square,
+    int const point_index,
+    float const * data,
+    float const * value,
+    float const * weight,
+    int * state,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const model = value[point_index];
+    float const measured = data[point_index];
+
+    // a negative fitting curve value is flagged with state 3; the chi-square
+    // value is written below nevertheless
+    if (model < 0)
+    {
+        *state = 3;
+    }
+
+    float const deviation = model - measured;
+
+    // the logarithmic term is left out for a data value of zero
+    chi_square[point_index] = (measured != 0)
+        ? 2 * (deviation - measured * logf(model / measured))
+        : 2 * deviation;
+}
+
+/* Description of the calculate_hessian_mle function
+* ==================================================
+*
+* This function calculates the hessian matrix values of the MLE equation. The
+* calculation is performed based on previously calculated derivative values.
+* 
+* Parameters:
+*
+* hessian: An output vector of values of the hessian matrix for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index_i: Index of the hessian column.
+*
+* parameter_index_j: Index of the hessian row.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivatives: An input vector of partial derivative values of the fitting
+*              curve for each data point.
+*
+* weight: An input vector of values for weighting hessian matrix values. It is not
+*         used in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_hessian_mle function
+* ==========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_hessian_mle(
+    double * hessian,
+    int const point_index,
+    int const parameter_index_i,
+    int const parameter_index_j,
+    float const * data,
+    float const * value,
+    float const * derivatives,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    float const model = value[point_index];
+
+    // contribution of this data point: data / value^2 times the product of
+    // the two partial derivatives, evaluated in single precision
+    float const contribution
+        = data[point_index]
+        / (model * model)
+        * derivatives[parameter_index_i] * derivatives[parameter_index_j];
+
+    // added to the value already stored in *hessian
+    *hessian += contribution;
+}
+
+/* Description of the calculate_gradient_mle function
+* ===================================================
+*
+* This function calculates the gradient values of the MLE equation based
+* on previously calculated derivative values.
+*
+* Parameters:
+*
+* gradient: An output vector of values of the gradient vector for each data point.
+*
+* point_index: The data point index.
+*
+* parameter_index: The parameter index.
+*
+* data: An input vector of data values.
+*
+* value: An input vector of fitting curve values.
+*
+* derivative: An input vector of partial derivative values of the fitting
+*             curve for each data point.
+*
+* weight: An input vector of values for weighting gradient vector values. It is not
+*         used in this function. It can be used in functions calculating other estimators
+*         than the MLE, such as LSE.
+*
+* user_info: An input vector containing user information. (not used)
+*
+* user_info_size: The number of elements in user_info. (not used)
+*
+* Calling the calculate_gradient_mle function
+* ===========================================
+*
+* This __device__ function can only be called from a __global__ function or another
+* __device__ function.
+*
+*/
+
+__device__ void calculate_gradient_mle(
+    volatile float * gradient,
+    int const point_index,
+    int const parameter_index,
+    float const * data,
+    float const * value,
+    float const * derivative,
+    float const * weight,
+    char * user_info,
+    std::size_t const user_info_size)
+{
+    // relative mismatch between data and fitting curve at this data point
+    float const mismatch = 1 - data[point_index] / value[point_index];
+
+    // negated partial derivative scaled by the mismatch
+    gradient[point_index] = -derivative[parameter_index] * mismatch;
+}
+
+#endif
diff --git a/Gpufit/python/CMakeLists.txt b/Gpufit/python/CMakeLists.txt
new file mode 100644
index 0000000..1ed2b3c
--- /dev/null
+++ b/Gpufit/python/CMakeLists.txt
@@ -0,0 +1,53 @@
+
+# Python
+
+# Python package
+#
+# The PYTHON_PACKAGE target assembles the files of the Python package in
+# ${build_directory}: the setup files at the top level, the module files
+# and the Gpufit binary in the pygpufit subdirectory.
+
+# directory in which the package is assembled (per configuration)
+set( build_directory "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/pyGpufit" )
+# files copied to the top level of the package
+set( setup_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/README.txt"
+  "${CMAKE_CURRENT_SOURCE_DIR}/setup.py"
+  "${CMAKE_CURRENT_SOURCE_DIR}/setup.cfg"
+)
+# files copied into the pygpufit module directory
+set( module_directory "${build_directory}/pygpufit" )
+set( module_files
+  "${CMAKE_CURRENT_SOURCE_DIR}/pygpufit/__init__.py"
+  "${CMAKE_CURRENT_SOURCE_DIR}/pygpufit/gpufit.py"
+)
+# file built by the Gpufit target, resolved at generate time
+set( binary $<TARGET_FILE:Gpufit> )
+
+# The commands run in the listed order each time the target is built: the
+# package directory is removed first and then filled again from scratch.
+add_custom_target( PYTHON_PACKAGE
+  COMMAND ${CMAKE_COMMAND} -E
+    remove_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${setup_files} ${build_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    make_directory ${module_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${module_files} ${module_directory}
+  COMMAND ${CMAKE_COMMAND} -E
+    copy_if_different ${binary} ${module_directory}
+)
+set_property( TARGET PYTHON_PACKAGE PROPERTY FOLDER CMakePredefinedTargets )
+# the Gpufit binary has to be built before it can be copied
+add_dependencies( PYTHON_PACKAGE Gpufit )
+
+# Without a Python interpreter only the package directory is prepared.
+# NOTE(review): PYTHONINTERP_FOUND and PYTHON_EXECUTABLE are not set in this
+# file - presumably by a find_package( PythonInterp ) call elsewhere, confirm.
+if( NOT PYTHONINTERP_FOUND )
+  message( STATUS "Python NOT found - skipping creation of Python wheel!" )
+  return()
+endif()
+
+# Python wheel (output name is incorrect, requires platform tag, see packaging)
+#
+# Part of the default build (ALL): runs "setup.py clean --all" followed by
+# "setup.py bdist_wheel" inside the package directory.
+
+add_custom_target( PYTHON_WHEEL ALL
+  COMMAND ${CMAKE_COMMAND} -E
+    chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py clean --all
+  COMMAND ${CMAKE_COMMAND} -E
+    chdir ${build_directory} "${PYTHON_EXECUTABLE}" setup.py bdist_wheel
+  COMMENT "Preparing Python Wheel"
+)
+set_property( TARGET PYTHON_WHEEL PROPERTY FOLDER CMakePredefinedTargets )
+# the wheel is created from the assembled package directory
+add_dependencies( PYTHON_WHEEL PYTHON_PACKAGE )
+
+# add launcher to Python package
diff --git a/Gpufit/python/README.txt b/Gpufit/python/README.txt
new file mode 100644
index 0000000..2e58557
--- /dev/null
+++ b/Gpufit/python/README.txt
@@ -0,0 +1,27 @@
+Python binding for the [Gpufit library](https://github.com/gpufit/Gpufit) which implements Levenberg Marquardt curve fitting in CUDA
+
+Requirements
+
+- A CUDA-capable graphics card with a recent Nvidia graphics driver (at least 367.48 / July 2016)
+- Windows
+- Python 2 or 3 with NumPy
+
+Installation
+
+Currently the wheel file has to be installed locally.
+
+If NumPy is not yet installed, install it using pip from the command line
+
+pip install numpy
+
+Then install pyGpufit from the local folder via:
+
+pip install --no-index --find-links=LocalPathToWheelFile pyGpufit
+
+Examples
+
+See examples folder.
+
+Troubleshooting
+
+A common reason for the error message 'CUDA driver version is insufficient for CUDA runtime version' is an outdated Nvidia graphics driver.
\ No newline at end of file
diff --git a/Gpufit/python/examples/gauss2d.py b/Gpufit/python/examples/gauss2d.py
new file mode 100644
index 0000000..435c4de
--- /dev/null
+++ b/Gpufit/python/examples/gauss2d.py
@@ -0,0 +1,112 @@
+"""
+    Example of the Python binding of the Gpufit library which implements
+    Levenberg Marquardt curve fitting in CUDA
+    https://github.com/gpufit/Gpufit
+
+    Multiple fits of a 2D Gaussian peak function with Poisson distributed noise
+    http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+    This example additionally requires numpy.
+"""
+
+import numpy as np
+import pygpufit.gpufit as gf
+
+def generate_gauss_2d(p, xi, yi):
+    """
+    Generates a 2D Gaussian peak.
+    http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+
+    :param p: Parameters (amplitude, x,y center position, width, offset)
+    :param xi: x positions
+    :param yi: y positions
+    :return: The Gaussian 2D peak.
+    """
+
+    arg = -(np.square(xi - p[1]) + np.square(yi - p[2])) / (2*p[3]*p[3])
+    y = p[0] * np.exp(arg) + p[4]
+
+    return y
+
+if __name__ == '__main__':
+
+    if not gf.cuda_available():
+        raise RuntimeError(gf.get_last_error())
+
+    # number of fits and fit points
+    number_fits = 10000
+    size_x = 12
+    number_points = size_x * size_x
+    number_parameters = 5
+
+    # set input arguments
+
+    # true parameters
+    true_parameters = np.array((10, 5.5, 5.5, 3, 10), dtype=np.float32)
+
+    # initialize random number generator
+    np.random.seed(0)
+
+    # initial parameters (relative randomized, positions relative to width)
+    initial_parameters = np.tile(true_parameters, (number_fits, 1))
+    initial_parameters[:, (1,2)] += true_parameters[3] * (-0.2 + 0.4 * np.random.rand(number_fits, 2))
+    initial_parameters[:, (0, 3, 4)] *= 0.8 + 0.4 * np.random.rand(number_fits, 3)
+
+    # generate x and y values
+    g = np.arange(size_x)
+    yi, xi = np.meshgrid(g, g, indexing='ij')
+    xi = xi.astype(np.float32)
+    yi = yi.astype(np.float32)
+
+    # generate data
+    data = generate_gauss_2d(true_parameters, xi, yi)
+    data = np.reshape(data, (1, number_points))
+    data = np.tile(data, (number_fits, 1))
+
+    # add Poisson noise
+    data = np.random.poisson(data)
+    data = data.astype(np.float32)
+
+    # tolerance
+    tolerance = 0.0001
+
+    # maximum number of iterations
+    max_number_iterations = 20
+
+    # estimator ID
+    estimator_id = gf.EstimatorID.MLE
+
+    # model ID
+    model_id = gf.ModelID.GAUSS_2D
+
+    # run Gpufit
+    parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, \
+                                                        tolerance, max_number_iterations, None, estimator_id, None)
+
+    # print fit results
+    converged = states == 0
+    print('*Gpufit*')
+
+    # print summary
+    print('\nmodel ID:        {}'.format(model_id))
+    print('number of fits:  {}'.format(number_fits))
+    print('fit size:        {} x {}'.format(size_x, size_x))
+    print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged])))
+    print('iterations:      {:.2f}'.format(np.mean(number_iterations[converged])))
+    print('time:            {:.2f} s'.format(execution_time))
+
+    # get fit states
+    number_converged = np.sum(converged)
+    print('\nratio converged         {:6.2f} %'.format(number_converged / number_fits * 100))
+    print('ratio max it. exceeded  {:6.2f} %'.format(np.sum(states == 1) / number_fits * 100))
+    print('ratio singular hessian  {:6.2f} %'.format(np.sum(states == 2) / number_fits * 100))
+    print('ratio neg curvature MLE {:6.2f} %'.format(np.sum(states == 3) / number_fits * 100))
+
+    # mean, std of fitted parameters
+    converged_parameters = parameters[converged, :]
+    converged_parameters_mean = np.mean(converged_parameters, axis=0)
+    converged_parameters_std = np.std(converged_parameters, axis=0)
+    print('\nparameters of 2D Gaussian peak')
+    for i in range(number_parameters):
+        print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i]))
+
diff --git a/Gpufit/python/examples/gauss2d_plot.py b/Gpufit/python/examples/gauss2d_plot.py
new file mode 100644
index 0000000..d7feb8e
--- /dev/null
+++ b/Gpufit/python/examples/gauss2d_plot.py
@@ -0,0 +1,114 @@
+"""
+    Example of the Python binding of the Gpufit library which implements
+    Levenberg Marquardt curve fitting in CUDA
+    https://github.com/gpufit/Gpufit
+
+    Multiple fits of a 2D Gaussian peak function with normally distributed (Gaussian) noise
+    repeated for a different total number of fits each time and plotting the results
+    http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+    This example additionally requires numpy (http://www.numpy.org/) and matplotlib (http://matplotlib.org/).
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pygpufit.gpufit as gf
+
+def gaussians_2d(x, y, p):
+    """
+    Generates one 2D Gaussian peak per row of p on the grid given by x and y.
+    :param x: 2D array of x coordinates
+    :param y: 2D array of y coordinates, same shape as x
+    :param p: 2D array, one row per peak: amplitude, x center, y center, width, offset
+    :return: np.float32 array of shape (number of peaks, x.shape[0], x.shape[1])
+    """
+    n_fits = p.shape[0]
+    # a separate output name keeps the y coordinate argument usable in the loop
+    peaks = np.zeros((n_fits, x.shape[0], x.shape[1]), dtype=np.float32)
+    for i in range(n_fits):
+        pi = p[i, :]
+        arg = -(np.square(x - pi[1]) + np.square(y - pi[2])) / (2 * pi[3] * pi[3])
+        peaks[i, :, :] = pi[0] * np.exp(arg) + pi[4]
+    return peaks
+
+if __name__ == '__main__':
+
+    print('\n')
+
+    # number of fit points
+    size_x = 5
+    number_points = size_x * size_x
+
+    # set input arguments
+
+    # true parameters
+    mean_true_parameters = np.array((100, 2, 2, 1, 10), dtype=np.float32)
+
+    # average noise level
+    average_noise_level = 10
+
+    # initialize random number generator
+    np.random.seed(0)
+
+    # tolerance
+    tolerance = 0.0001
+
+    # maximum number of iterations
+    max_number_iterations = 10
+
+    # model ID
+    model_id = gf.ModelID.GAUSS_2D
+
+    # numbers of fits to benchmark, logarithmically spaced between 1e2 and 1e6
+    # (the builtin int replaces the np.int alias, which recent NumPy releases no longer provide)
+    n_fits_all = np.around(np.logspace(2, 6, 20)).astype(int)
+
+    # generate x and y values
+    g = np.arange(size_x)
+    yi, xi = np.meshgrid(g, g, indexing='ij')
+    xi = xi.astype(np.float32)
+    yi = yi.astype(np.float32)
+
+    # loop
+    speed = np.zeros(n_fits_all.size)
+    for i in range(n_fits_all.size):
+        n_fits = n_fits_all[i]
+
+        # vary positions of 2D Gaussian peaks slightly
+        test_parameters = np.tile(mean_true_parameters, (n_fits, 1))
+        test_parameters[:, (1,2)] += mean_true_parameters[3] * (-0.2 + 0.4 * np.random.rand(n_fits, 2))
+
+        # generate data
+        data = gaussians_2d(xi, yi, test_parameters)
+        data = np.reshape(data, (n_fits, number_points))
+
+        # add noise
+        data += np.random.normal(scale=average_noise_level, size=data.shape)
+
+        # initial parameters (randomized relative (to width for position))
+        initial_parameters = np.tile(mean_true_parameters, (n_fits, 1))
+        initial_parameters[:, (1,2)] += mean_true_parameters[3] * (-0.2 + 0.4 * np.random.rand(n_fits, 2))
+        initial_parameters[:, (0,3,4)] *= 0.8 + 0.4 * np.random.rand(n_fits, 3)
+
+        # run Gpufit
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations)
+
+        # analyze result (precision_x0 is computed for inspection only, it is not printed)
+        converged = states == 0
+        speed[i] = n_fits / execution_time
+        precision_x0 = np.std(parameters[converged, 1] - test_parameters[converged, 1], axis=0, dtype=np.float64)
+
+        # display result
+        print('{:7} fits     iterations: {:6.2f} | time: {:6.3f} s | speed: {:8.0f} fits/s'\
+              .format(n_fits, np.mean(number_iterations[converged]), execution_time, speed[i]))
+
+    # plot (inside the __main__ guard, because n_fits_all and speed only exist when run as a script)
+    plt.semilogx(n_fits_all, speed, 'bo-')
+    plt.grid(True)
+    plt.xlabel('number of fits per function call')
+    plt.ylabel('fits per second')
+    plt.legend(['Gpufit'], loc='upper left')
+    ax = plt.gca()
+    ax.set_xlim(n_fits_all[0], n_fits_all[-1])
+
+    plt.show()
\ No newline at end of file
diff --git a/Gpufit/python/examples/simple.py b/Gpufit/python/examples/simple.py
new file mode 100644
index 0000000..5184001
--- /dev/null
+++ b/Gpufit/python/examples/simple.py
@@ -0,0 +1,30 @@
+"""
+    Example of the Python binding of the Gpufit library which implements
+    Levenberg Marquardt curve fitting in CUDA
+    https://github.com/gpufit/Gpufit
+
+    Simple example demonstrating a minimal call of all needed parameters for the Python interface
+    http://gpufit.readthedocs.io/en/latest/bindings.html#python
+"""
+
+import numpy as np
+import pygpufit.gpufit as gf
+
+if __name__ == '__main__':
+
+    # number of fits, number of points per fit
+    number_fits = 10
+    number_points = 10
+
+    # model ID and number of model parameters (GAUSS_1D takes four parameters)
+    model_id = gf.ModelID.GAUSS_1D
+    number_parameter = 4
+
+    # initial parameters (one row per fit; the row length must match the model's parameter count)
+    initial_parameters = np.zeros((number_fits, number_parameter), dtype=np.float32)
+
+    # data
+    data = np.zeros((number_fits, number_points), dtype=np.float32)
+
+    # run Gpufit
+    parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters)
\ No newline at end of file
diff --git a/Gpufit/python/pygpufit/__init__.py b/Gpufit/python/pygpufit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Gpufit/python/pygpufit/gpufit.py b/Gpufit/python/pygpufit/gpufit.py
new file mode 100644
index 0000000..22a889f
--- /dev/null
+++ b/Gpufit/python/pygpufit/gpufit.py
@@ -0,0 +1,201 @@
+"""
+    Python binding for Gpufit, a Levenberg Marquardt curve fitting library written in CUDA
+    See https://github.com/gpufit/Gpufit, http://gpufit.readthedocs.io/en/latest/bindings.html#python
+
+    The binding is based on ctypes.
+    See https://docs.python.org/3.5/library/ctypes.html, http://www.scipy-lectures.org/advanced/interfacing_with_c/interfacing_with_c.html
+"""
+
+import os
+import time
+from ctypes import cdll, POINTER, c_int, c_float, c_char, c_char_p, c_size_t
+import numpy as np
+
+# locate Gpufit.dll next to this module and load it (the library is loaded right here, when the module is imported)
+package_dir = os.path.dirname(os.path.realpath(__file__))
+lib_path = os.path.join(package_dir, 'Gpufit.dll') # this will only work on Windows
+lib = cdll.LoadLibrary(lib_path)
+
+# gpufit function in the dll; fit() indexes into argtypes by position, so the order of this list matters
+gpufit_func = lib.gpufit
+gpufit_func.restype = c_int
+gpufit_func.argtypes = [c_size_t, c_size_t, POINTER(c_float), POINTER(c_float), c_int, POINTER(c_float), c_float, c_int, POINTER(c_int), c_int, c_size_t, POINTER(c_char), POINTER(c_float), POINTER(c_int), POINTER(c_float), POINTER(c_int)]
+
+# gpufit_get_last_error function in the dll (result is converted as a C string)
+error_func = lib.gpufit_get_last_error
+error_func.restype = c_char_p
+error_func.argtypes = None
+
+# gpufit_cuda_available function in the dll (result is converted as a C int)
+cuda_available_func = lib.gpufit_cuda_available
+cuda_available_func.restype = c_int
+cuda_available_func.argtypes = None
+
+
+class ModelID():
+    """Integer IDs of the fit models; one of them is passed as the model_id argument of fit()."""
+    GAUSS_1D = 0
+    GAUSS_2D = 1
+    GAUSS_2D_ELLIPTIC = 2
+    GAUSS_2D_ROTATED = 3
+    CAUCHY_2D_ELLIPTIC = 4
+    LINEAR_1D = 5
+
+
+class EstimatorID():
+    """Integer IDs of the estimators; passed as the estimator_id argument of fit(), which defaults to LSE."""
+    LSE = 0
+    MLE = 1
+
+
+def fit(data, weights, model_id, initial_parameters, tolerance=None, max_number_iterations=None, \
+           parameters_to_fit=None, estimator_id=None, user_info=None):
+    """
+    Calls the C interface fit function in the library.
+    (see also http://gpufit.readthedocs.io/en/latest/bindings.html#python)
+
+    All 2D NumPy arrays must be in row-major order (standard in NumPy), i.e. array.flags.C_CONTIGUOUS must be True
+    (see also https://docs.scipy.org/doc/numpy/reference/arrays.ndarray.html#internal-memory-layout-of-an-ndarray)
+
+    :param data: The data - 2D NumPy array of dimension [number_fits, number_points] and data type np.float32
+    :param weights: The weights - 2D NumPy array of the same dimension and data type as parameter data or None (no weights available)
+    :param model_id: The model ID
+    :param initial_parameters: Initial values for parameters - NumPy array of dimension [number_fits, number_parameters] and data type np.float32
+    :param tolerance: The fit tolerance or None (will use default value)
+    :param max_number_iterations: The maximal number of iterations or None (will use default value)
+    :param parameters_to_fit: Which parameters to fit - NumPy array of length number_parameters and type np.int32 or None (will fit all parameters)
+    :param estimator_id: The Estimator ID or None (will use default values)
+    :param user_info: User info - NumPy array of type np.char or None (no user info available)
+    :return: parameters, states, chi_squares, number_iterations, execution_time
+    :raises RuntimeError: if an input fails the layout, size or type checks, or if the library returns a non-zero status
+    """
+
+    # check all 2D NumPy arrays for row-major memory layout (otherwise interpretation of order of dimensions fails)
+    if not data.flags.c_contiguous:
+        raise RuntimeError('Memory layout of data array mismatch.')
+
+    if weights is not None and not weights.flags.c_contiguous:
+        raise RuntimeError('Memory layout of weights array mismatch.')
+
+    if not initial_parameters.flags.c_contiguous:
+        raise RuntimeError('Memory layout of initial_parameters array mismatch.')
+
+    # size check: data is 2D and read number of points and fits
+    if data.ndim != 2:
+        raise RuntimeError('data is not two-dimensional')
+    number_points = data.shape[1]
+    number_fits = data.shape[0]
+
+    # size check: consistency with weights (if given)
+    if weights is not None and data.shape != weights.shape:
+        raise RuntimeError('dimension mismatch between data and weights')
+        # the unequal operator checks, type, length and content (https://docs.python.org/3.7/reference/expressions.html#value-comparisons)
+
+    # size check: initial parameters is 2D and read number of parameters
+    if initial_parameters.ndim != 2:
+        raise RuntimeError('initial_parameters is not two-dimensional')
+    number_parameters = initial_parameters.shape[1]
+    if initial_parameters.shape[0] != number_fits:
+        raise RuntimeError('dimension mismatch in number of fits between data and initial_parameters')
+
+    # size check: consistency with parameters_to_fit (if given)
+    if parameters_to_fit is not None and parameters_to_fit.shape[0] != number_parameters:
+        raise RuntimeError('dimension mismatch in number of parameters between initial_parameters and parameters_to_fit')
+
+    # default value: tolerance (only None selects the default, an explicit 0 is passed on unchanged)
+    if tolerance is None:
+        tolerance = 1e-4
+
+    # default value: max_number_iterations (only None selects the default, an explicit 0 is passed on unchanged)
+    if max_number_iterations is None:
+        max_number_iterations = 25
+
+    # default value: estimator ID
+    if estimator_id is None:
+        estimator_id = EstimatorID.LSE
+
+    # default value: parameters_to_fit
+    if parameters_to_fit is None:
+        parameters_to_fit = np.ones(number_parameters, dtype=np.int32)
+
+    # now only weights and user_info could be not given
+    # type check: data, weights (if given), initial_parameters are all np.float32
+    if data.dtype != np.float32:
+        raise RuntimeError('type of data is not np.float32')
+    if weights is not None and weights.dtype != np.float32:
+        raise RuntimeError('type of weights is not np.float32')
+    if initial_parameters.dtype != np.float32:
+        raise RuntimeError('type of initial_parameters is not np.float32')
+
+    # type check: parameters_to_fit is np.int32
+    if parameters_to_fit.dtype != np.int32:
+        raise RuntimeError('type of parameters_to_fit is not np.int32')
+
+    # we don't check type of user_info, but we extract the size in bytes of it
+    if user_info is not None:
+        user_info_size = user_info.nbytes
+    else:
+        user_info_size = 0
+
+    # pre-allocate output variables
+    parameters = np.zeros((number_fits, number_parameters), dtype=np.float32)
+    states = np.zeros(number_fits, dtype=np.int32)
+    chi_squares = np.zeros(number_fits, dtype=np.float32)
+    number_iterations = np.zeros(number_fits, dtype=np.int32)
+
+    # conversion to ctypes types for optional C interface parameters using NULL pointer (None) as default argument
+    if weights is not None:
+        weights_p = weights.ctypes.data_as(gpufit_func.argtypes[3])
+    else:
+        weights_p = None
+    if user_info is not None:
+        user_info_p = user_info.ctypes.data_as(gpufit_func.argtypes[11])
+    else:
+        user_info_p = None
+
+    # call into the library (measure time); time.clock is only the fallback for interpreters without perf_counter
+    timer = getattr(time, 'perf_counter', None) or time.clock
+    t0 = timer()
+    status = gpufit_func(
+        gpufit_func.argtypes[0](number_fits), \
+        gpufit_func.argtypes[1](number_points), \
+        data.ctypes.data_as(gpufit_func.argtypes[2]), \
+        weights_p, \
+        gpufit_func.argtypes[4](model_id), \
+        initial_parameters.ctypes.data_as(gpufit_func.argtypes[5]), \
+        gpufit_func.argtypes[6](tolerance), \
+        gpufit_func.argtypes[7](max_number_iterations), \
+        parameters_to_fit.ctypes.data_as(gpufit_func.argtypes[8]), \
+        gpufit_func.argtypes[9](estimator_id), \
+        gpufit_func.argtypes[10](user_info_size), \
+        user_info_p, \
+        parameters.ctypes.data_as(gpufit_func.argtypes[12]), \
+        states.ctypes.data_as(gpufit_func.argtypes[13]), \
+        chi_squares.ctypes.data_as(gpufit_func.argtypes[14]), \
+        number_iterations.ctypes.data_as(gpufit_func.argtypes[15]))
+    t1 = timer()
+
+    # check status
+    if status != 0:
+        # get error from last error and raise runtime error
+        error_message = error_func()
+        raise RuntimeError('status = {}, message = {}'.format(status, error_message))
+
+    # return output values
+    return parameters, states, chi_squares, number_iterations, t1 - t0
+
+
+def get_last_error():
+    """
+    :return: whatever the library's gpufit_get_last_error function returns (the last error message)
+    """
+    last_error = error_func()
+    return last_error
+
+
+def cuda_available():
+    """
+    :return: True if CUDA is available (the library call returns non-zero), False otherwise
+    """
+    flag = cuda_available_func()
+    return flag != 0
diff --git a/Gpufit/python/requirements.txt b/Gpufit/python/requirements.txt
new file mode 100644
index 0000000..b316bf2
--- /dev/null
+++ b/Gpufit/python/requirements.txt
@@ -0,0 +1 @@
+NumPy>=1.8
\ No newline at end of file
diff --git a/Gpufit/python/setup.cfg b/Gpufit/python/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/Gpufit/python/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/Gpufit/python/setup.py b/Gpufit/python/setup.py
new file mode 100644
index 0000000..c2e2b83
--- /dev/null
+++ b/Gpufit/python/setup.py
@@ -0,0 +1,40 @@
+"""
+    setup script for pyGpufit
+
+    TODO get version, get meaningful email
+"""
+
+from setuptools import setup, find_packages
+import os
+from io import open # to have encoding as parameter of open on Python >=2.6
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+CLASSIFIERS = ['Development Status :: 5 - Production/Stable',
+               'Intended Audience :: End Users/Desktop',
+               'Operating System :: Microsoft :: Windows',
+               'Topic :: Scientific/Engineering',
+               'Topic :: Software Development :: Libraries']
+
+def get_long_description():
+    """Return the text of the README.txt file located next to this script."""
+    readme_path = os.path.join(HERE, 'README.txt')
+    with open(readme_path, encoding='utf-8') as readme:
+        text = readme.read()
+    return text
+
+if __name__ == "__main__":  # run the packaging only when executed as a script
+    setup(name='pyGpufit',
+        version='1.0.0',
+        description='Levenberg Marquardt curve fitting in CUDA',
+        long_description=get_long_description(),
+        url='https://github.com/gpufit/Gpufit',
+        author='M. Bates, A. Przybylski, B. Thiel, and J. Keller-Findeisen',
+        author_email='a@b.c',
+        license='',
+        classifiers=CLASSIFIERS,  # the list defined near the top of this script
+        keywords='Levenberg Marquardt, curve fitting, CUDA',
+        packages=find_packages(where=HERE),
+        package_data={'pygpufit': ['*.dll']},
+        install_requires=['NumPy>=1.8'],  # same lower bound as requirements.txt
+        zip_safe=False)
\ No newline at end of file
diff --git a/Gpufit/python/tests/run_tests.py b/Gpufit/python/tests/run_tests.py
new file mode 100644
index 0000000..5395da2
--- /dev/null
+++ b/Gpufit/python/tests/run_tests.py
@@ -0,0 +1,19 @@
+"""
+Discovers all tests and runs them. Assumes that the initial working directory is the tests directory.
+"""
+
+import sys
+import unittest
+
+if __name__ == '__main__':
+
+    # discover every test module below the current working directory
+    loader = unittest.defaultTestLoader
+    tests = loader.discover('.')
+
+    # run the discovered tests with plain text output
+    runner = unittest.TextTestRunner()
+    results = runner.run(tests)
+
+    # exit code: number of failed plus errored tests, so that crashing tests are not reported as success
+    sys.exit(len(results.failures) + len(results.errors))
\ No newline at end of file
diff --git a/Gpufit/python/tests/test_gaussian_fit_1d.py b/Gpufit/python/tests/test_gaussian_fit_1d.py
new file mode 100644
index 0000000..a2f2bd7
--- /dev/null
+++ b/Gpufit/python/tests/test_gaussian_fit_1d.py
@@ -0,0 +1,76 @@
+"""
+    Equivalent to https://github.com/gpufit/Gpufit/blob/master/Gpufit/tests/Gauss_Fit_1D.cpp
+"""
+
+import unittest
+import numpy as np
+import pygpufit.gpufit as gf
+
+def generate_gauss_1d(parameters, x):
+    """
+    Evaluates a 1D Gaussian peak with constant offset at the positions x.
+
+    :param parameters: Sequence (a, x0, s, b): amplitude, center, width, offset
+    :param x: Positions at which the curve is evaluated
+    :return: a * exp(-(x - x0)^2 / (2 * s^2)) + b for every entry of x
+    """
+
+    # first four entries, in the documented order
+    amplitude, center, width, offset = parameters[0], parameters[1], parameters[2], parameters[3]
+
+    # exponent of the Gaussian first, then scale and shift
+    exponent = -np.square(x - center) / (2 * width**2)
+    curve = amplitude * np.exp(exponent) + offset
+
+    return curve
+
+class Test(unittest.TestCase):
+    """Single noise-free GAUSS_1D fit that must recover the true parameters."""
+    def test_gaussian_fit_1d(self):
+        # constants
+        n_fits = 1
+        n_points = 5
+        n_parameter = 4  # model will be GAUSS_1D
+
+        # true parameters
+        true_parameters = np.array((4, 2, 0.5, 1), dtype=np.float32)
+
+        # generate data
+        data = np.empty((n_fits, n_points), dtype=np.float32)
+        x = np.arange(n_points, dtype=np.float32)
+        data[0, :] = generate_gauss_1d(true_parameters, x)
+
+        # tolerance
+        tolerance = 0.001
+
+        # max_n_iterations
+        max_n_iterations = 10
+
+        # model id
+        model_id = gf.ModelID.GAUSS_1D
+
+        # initial parameters
+        initial_parameters = np.empty((n_fits, n_parameter), dtype=np.float32)
+        initial_parameters[0, :] = (2, 1.5, 0.3, 0)
+
+        # call to gpufit
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id,
+                                                                                    initial_parameters, tolerance, \
+                                                                                    max_n_iterations, None, None, None)
+
+        # print results
+        for i in range(n_parameter):
+            print(' p{} true {} fit {}'.format(i, true_parameters[i], parameters[0, i]))
+        print('fit state : {}'.format(states))
+        print('chi square: {}'.format(chi_squares))
+        print('iterations: {}'.format(number_iterations))
+        print('time: {} s'.format(execution_time))
+        # unittest assertions (instead of bare assert) stay active under python -O and report the failing check
+        self.assertTrue(np.all(chi_squares < 1e-6))
+        self.assertTrue(np.all(states == 0))
+        self.assertTrue(np.all(number_iterations <= max_n_iterations))
+        for i in range(n_parameter):
+            self.assertLess(abs(true_parameters[i] - parameters[0, i]), 1e-6)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/Gpufit/python/tests/test_linear_regression.py b/Gpufit/python/tests/test_linear_regression.py
new file mode 100644
index 0000000..ad05ff4
--- /dev/null
+++ b/Gpufit/python/tests/test_linear_regression.py
@@ -0,0 +1,60 @@
+"""
+    Equivalent to https://github.com/gpufit/Gpufit/blob/master/Gpufit/tests/Linear_Fit_1D.cpp
+"""
+
+import unittest
+import numpy as np
+import pygpufit.gpufit as gf
+
+class Test(unittest.TestCase):
+    """Single LINEAR_1D fit (with user info) that must recover the true parameters."""
+    def test_linear_regression(self):
+        # constants
+        n_fits = 1
+        n_points = 2
+        n_parameter = 2
+
+        # true parameters
+        true_parameters = np.array((0, 1), dtype=np.float32)
+
+        # data values
+        data = np.empty((n_fits, n_points), dtype=np.float32)
+        data[0, :] = (0, 1)
+
+        # max number iterations
+        max_number_iterations = 10
+
+        # initial parameters
+        initial_parameters = np.empty((n_fits, n_parameter), dtype=np.float32)
+        initial_parameters[0, :] = (0, 0)
+
+        # model id
+        model_id = gf.ModelID.LINEAR_1D
+
+        # tolerance
+        tolerance = 0.001
+
+        # user info
+        user_info = np.array((0, 1), dtype=np.float32)
+
+        # call to gpufit (the iteration limit is passed explicitly, because it is asserted against below)
+        parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id,
+                                                                                    initial_parameters, tolerance, \
+                                                                                    max_number_iterations, None, None, user_info)
+
+        # print results
+        for i in range(n_parameter):
+            print(' p{} true {} fit {}'.format(i, true_parameters[i], parameters[0, i]))
+        print('fit state : {}'.format(states))
+        print('chi square: {}'.format(chi_squares))
+        print('iterations: {}'.format(number_iterations))
+        print('time: {} s'.format(execution_time))
+        # unittest assertions (instead of bare assert) stay active under python -O and report the failing check
+        self.assertTrue(np.all(chi_squares < 1e-6))
+        self.assertTrue(np.all(states == 0))
+        self.assertTrue(np.all(number_iterations <= max_number_iterations))
+        for i in range(n_parameter):
+            self.assertLess(abs(true_parameters[i] - parameters[0, i]), 1e-6)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/Gpufit/tests/CMakeLists.txt b/Gpufit/tests/CMakeLists.txt
new file mode 100644
index 0000000..a53ba34
--- /dev/null
+++ b/Gpufit/tests/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Tests
+
+add_boost_test( Gpufit Error_Handling )
+add_boost_test( Gpufit Linear_Fit_1D )
+add_boost_test( Gpufit Gauss_Fit_1D )
+add_boost_test( Gpufit Gauss_Fit_2D )
+add_boost_test( Gpufit Gauss_Fit_2D_Elliptic )
+add_boost_test( Gpufit Gauss_Fit_2D_Rotated )
+add_boost_test( Gpufit Cauchy_Fit_2D_Elliptic )
diff --git a/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp b/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp
new file mode 100644
index 0000000..461c726
--- /dev/null
+++ b/Gpufit/tests/Cauchy_Fit_2D_Elliptic.cpp
@@ -0,0 +1,73 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills values (a square image stored row by row) with a centered elliptic 2D Cauchy peak.
+template<std::size_t SIZE>
+void generate_cauchy_2d_elliptic(std::array< float, SIZE>& values)
+{
+    // edge length of the image; entries beyond edge * edge are left untouched
+    int const edge = int(std::sqrt(SIZE));
+
+    // peak description: amplitude, center, widths along x and y, offset
+    float const amplitude = 4;
+    float const center_x = (float(edge) - 1.f) / 2.f;
+    float const center_y = (float(edge) - 1.f) / 2.f;
+    float const width_x = 0.4f;
+    float const width_y = 0.6f;
+    float const offset = 1.f;
+
+    for (int index = 0; index < edge * edge; index++)
+    {
+        int const column = index % edge;
+        int const row = index / edge;
+        float const argx = ((center_x - column) / width_x) *((center_x - column) / width_x) + 1.f;
+        float const argy = ((center_y - row) / width_y) *((center_y - row) / width_y) + 1.f;
+        values[index] = amplitude / argx / argy + offset;
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Cauchy_Fit_2D_Elliptic ) // one CAUCHY_2D_ELLIPTIC fit on generated, noise-free data
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 25 } ;
+    std::array< float, n_points > data{};
+    generate_cauchy_2d_elliptic(data);
+    std::array< float, n_points > weights{};
+    std::fill(weights.begin(), weights.end(), 1.f); // every data point gets weight 1
+    std::array< float, 6 > initial_parameters{ { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f } };
+    float tolerance{ 0.001f };
+    int max_n_iterations{ 100 };
+    std::array< int, 6 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1 } }; // all six flags set
+    std::array< float, 6 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+    // single call with weights, the LSE estimator and no user info
+    int const status
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data.data(),
+                weights.data(),
+                CAUCHY_2D_ELLIPTIC,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0, // user info size: no user info
+                0, // user info pointer: null
+                output_parameters.data(),
+                &output_states,
+                &output_chi_square,
+                &output_n_iterations
+            ) ;
+    // only the return status is verified here, the fitted parameters are not compared
+    BOOST_CHECK( status == 0 ) ;
+}
diff --git a/Gpufit/tests/Error_Handling.cpp b/Gpufit/tests/Error_Handling.cpp
new file mode 100644
index 0000000..c35a078
--- /dev/null
+++ b/Gpufit/tests/Error_Handling.cpp
@@ -0,0 +1,51 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+BOOST_AUTO_TEST_CASE( Error_Handling ) // a failing gpufit call must return -1 and expose its error message
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 2 } ;
+    std::array< float, n_points > data{ { 0, 1 } } ;
+    std::array< float, n_points > weights{ { 1, 1 } } ;
+    std::array< float, 2 > initial_parameters{ { 0, 0 } } ;
+    float tolerance{ 0.001f } ;
+    int max_n_iterations{ 10 } ;
+    std::array< int, 2 > parameters_to_fit{ { 0, 0 } } ; // NOTE(review): all flags zero, presumably what provokes the expected error - confirm
+    std::array< int, 2 > user_info{ { 0, 1 } } ;
+    std::array< float, 2 > output_parameters ;
+    int output_states ;
+    float output_chi_square ;
+    int output_n_iterations ;
+    // single LINEAR_1D call with weights and user info
+    int const status
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data.data(),
+                weights.data(),
+                LINEAR_1D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                n_points * sizeof( int ), // size of user_info in bytes
+                reinterpret_cast< char * >( user_info.data() ), // user info is handed over as raw bytes
+                output_parameters.data(),
+                & output_states,
+                & output_chi_square,
+                & output_n_iterations
+            ) ;
+    // the call is expected to fail
+    BOOST_CHECK( status == - 1 ) ;
+    // the message of the failed call is read back from the library
+    std::string const error = gpufit_get_last_error() ;
+    // the exact message text is part of the expectation
+    BOOST_CHECK( error == "invalid configuration argument" ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_1D.cpp b/Gpufit/tests/Gauss_Fit_1D.cpp
new file mode 100644
index 0000000..81a8c64
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_1D.cpp
@@ -0,0 +1,87 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Writes a * exp(-(i - x0)^2 / (2 s^2)) + b into values[i], with (a, x0, s, b) read from parameters.
+template<std::size_t n_points>
+void generate_gauss_1d(
+    std::array< float, n_points >& values,
+    std::array< float, 4 > const & parameters )
+{
+    float const amplitude = parameters[ 0 ];
+    float const center = parameters[ 1 ];
+    float const width = parameters[ 2 ];
+    float const offset = parameters[ 3 ];
+    for ( int position = 0; position < n_points; ++position )
+    {
+        float const argx = ( ( position - center )*( position - center ) ) / ( 2.f * width * width );
+        float const ex = exp( -argx );
+        values[ position ] = amplitude * ex + offset;
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_1D )
+{
+	/*
+		Performs a single fit using the GAUSS_1D model.
+		- Doesn't use user_info or weights.
+		- No noise is added.
+		- Checks fitted parameters equalling the true parameters.
+	*/
+
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 5 } ;
+
+    std::array< float, 4 > const true_parameters{ { 4.f, 2.f, 0.5f, 1.f } };
+
+    std::array< float, n_points > data{};
+    generate_gauss_1d( data, true_parameters );
+
+    std::array< float, 4 > initial_parameters{ { 2.f, 1.5f, 0.3f, 0.f } };
+
+    float tolerance{ 0.001f };
+
+    int max_n_iterations{ 10 };
+
+    std::array< int, 4 > parameters_to_fit{ { 1, 1, 1, 1 } };
+
+    std::array< float, 4 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+
+    int const status
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data.data(),
+                0,
+                GAUSS_1D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0,
+                0,
+                output_parameters.data(),
+                &output_states,
+                &output_chi_square,
+                &output_n_iterations
+            ) ;
+
+    BOOST_CHECK( status == 0 ) ;
+    BOOST_CHECK( output_states == 0 );
+    BOOST_CHECK( output_chi_square < 1e-6f );
+    BOOST_CHECK( output_n_iterations <= max_n_iterations );
+    // std::fabs has a float overload; std::fabsf is not declared in namespace std by every standard library
+    BOOST_CHECK( std::fabs(output_parameters[ 0 ] - true_parameters[ 0 ] ) < 1e-6f );
+    BOOST_CHECK( std::fabs(output_parameters[ 1 ] - true_parameters[ 1 ] ) < 1e-6f );
+    BOOST_CHECK( std::fabs(output_parameters[ 2 ] - true_parameters[ 2 ] ) < 1e-6f );
+    BOOST_CHECK( std::fabs(output_parameters[ 3 ] - true_parameters[ 3 ] ) < 1e-6f );
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D.cpp b/Gpufit/tests/Gauss_Fit_2D.cpp
new file mode 100644
index 0000000..0222933
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D.cpp
@@ -0,0 +1,96 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills values (a square image stored row by row) with a centered symmetric 2D Gaussian peak.
+template<std::size_t SIZE>
+void generate_gauss_2d(std::array< float , SIZE>& values)
+{
+    // edge length of the image; entries beyond edge * edge are left untouched
+    int const edge = int(std::sqrt(SIZE));
+
+    // peak description: amplitude, center, width, offset
+    float const amplitude = 4.f;
+    float const center_x = (float(edge) - 1.f) / 2.f;
+    float const center_y = (float(edge) - 1.f) / 2.f;
+    float const width = 0.5f;
+    float const offset = 1.f;
+
+    for (int index = 0; index < edge * edge; index++)
+    {
+        int const column = index % edge;
+        int const row = index / edge;
+        float const argx = ((column - center_x)*(column - center_x)) / (2.f * width * width);
+        float const argy = ((row - center_y)*(row - center_y)) / (2.f * width * width);
+        float const ex = exp(-argx) * exp(-argy);
+        values[index] = amplitude * ex + offset;
+    }
+}
+
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D ) // fits the same generated GAUSS_2D data twice: without and with weights
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 25 } ;
+    std::array< float, n_points > data{};
+    generate_gauss_2d(data);
+    std::array< float, n_points > weights{};
+    std::fill(weights.begin(), weights.end(), 1.f); // every data point gets weight 1
+    std::array< float, 5 > initial_parameters{ { 2.f, 1.8f, 2.2f, 0.4f, 0.f } };
+    float tolerance{ 0.001f };
+    int max_n_iterations{ 10 };
+    std::array< int, 5 > parameters_to_fit{ { 1, 1, 1, 1, 1 } }; // all five flags set
+    std::array< float, 5 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+    // first call: null pointer instead of weights, LSE estimator, no user info
+    int const status
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data.data(),
+                0, // no weights
+                GAUSS_2D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0, // user info size: no user info
+                0, // user info pointer: null
+                output_parameters.data(),
+                &output_states,
+                &output_chi_square,
+                &output_n_iterations
+            ) ;
+    // only the return status is verified
+    BOOST_CHECK( status == 0 ) ;
+    // second call: identical except that the unit weights are passed; the output variables are reused
+    int const status_with_weights
+            = gpufit
+            (
+                n_fits,
+                n_points,
+                data.data(),
+                weights.data(),
+                GAUSS_2D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0, // user info size: no user info
+                0, // user info pointer: null
+                output_parameters.data(),
+                &output_states,
+                &output_chi_square,
+                &output_n_iterations
+            ) ;
+    // again only the return status is verified
+    BOOST_CHECK( status_with_weights == 0 ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp
new file mode 100644
index 0000000..072169c
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D_Elliptic.cpp
@@ -0,0 +1,74 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills `values` (row-major, square grid) with a noise-free elliptic 2D Gaussian plus offset.
+template<std::size_t SIZE>
+void generate_gauss_2d_elliptic(std::array< float, SIZE>& values)
+{
+    int const n_cols = int(std::sqrt(SIZE));  // grid edge length
+    int const n_rows = n_cols;
+
+    float const amplitude = 4, offset = 1.f;
+    float const center_x = (float(n_cols) - 1.f) / 2.f;
+    float const center_y = (float(n_rows) - 1.f) / 2.f;
+    float const sigma_x = 0.4f, sigma_y = 0.6f;
+
+    for (int row = 0; row < n_rows; row++)
+    {
+        for (int col = 0; col < n_cols; col++)
+        {
+            float const dx = col - center_x;
+            float const dy = row - center_y;
+            float const argx = (dx * dx) / (2.f * sigma_x * sigma_x);
+            float const argy = (dy * dy) / (2.f * sigma_y * sigma_y);
+            float const ex = exp(-argx) * exp(-argy);
+            values[row * n_cols + col] = amplitude * ex + offset;
+        }
+    }
+}
+
+// Smoke test: one elliptic 2D Gaussian fit (LSE, unit weights) must return status 0.
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Elliptic )
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 25 } ;  // 5 x 5 grid
+
+    // synthetic input and unit weights
+    std::array< float, n_points > data{};
+    generate_gauss_2d_elliptic(data);
+    std::array< float, n_points > weights{};
+    weights.fill(1.f);
+
+    // start values; all six model parameters are fitted
+    std::array< float, 6 > initial_parameters{ { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f } };
+    std::array< int, 6 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1 } };
+    float tolerance{ 0.001f };
+    int max_n_iterations{ 10 };
+
+    // outputs for the single fit
+    std::array< float, 6 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+
+    int const status = gpufit(
+        n_fits, n_points,
+        data.data(), weights.data(),
+        GAUSS_2D_ELLIPTIC,
+        initial_parameters.data(),
+        tolerance, max_n_iterations,
+        parameters_to_fit.data(),
+        LSE,
+        0, 0,               // no user info
+        output_parameters.data(),
+        &output_states,
+        &output_chi_square,
+        &output_n_iterations);
+
+    BOOST_CHECK( status == 0 ) ;
+}
diff --git a/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp
new file mode 100644
index 0000000..55cd682
--- /dev/null
+++ b/Gpufit/tests/Gauss_Fit_2D_Rotated.cpp
@@ -0,0 +1,77 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#define PI 3.1415926535897f
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+// Fills `values` (row-major, square grid) with a noise-free rotated elliptic 2D Gaussian plus offset.
+template<std::size_t SIZE>
+void generate_gauss_2d_rotated(std::array< float, SIZE>& values)
+{
+    int const n_cols = int(std::sqrt(SIZE));  // grid edge length
+    int const n_rows = n_cols;
+
+    float const amplitude = 10.f, offset = 1.f;
+    float const center_x = (float(n_cols) - 1.f) / 2.f;
+    float const center_y = (float(n_rows) - 1.f) / 2.f;
+    float const sigma_x = 0.4f, sigma_y = 0.5f;
+    float const rotation = PI / 16.f;
+    float const cos_r = cosf(rotation), sin_r = sinf(rotation);
+
+    for (int row = 0; row < n_rows; row++)
+    {
+        for (int col = 0; col < n_cols; col++)
+        {
+            // coordinates relative to the center, rotated by `rotation`
+            float const u = ((col - center_x) * cos_r) - ((row - center_y) * sin_r);
+            float const v = ((col - center_x) * sin_r) + ((row - center_y) * cos_r);
+            float const ex = exp((-0.5f) * (((u / sigma_x) * (u / sigma_x)) + ((v / sigma_y) * (v / sigma_y))));
+            values[row * n_cols + col] = amplitude * ex + offset;
+        }
+    }
+}
+
+// Smoke test: one rotated 2D Gaussian fit (LSE, unit weights) must return status 0.
+BOOST_AUTO_TEST_CASE( Gauss_Fit_2D_Rotated )
+{
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 64 } ;  // 8 x 8 grid
+
+    // synthetic input and unit weights
+    std::array< float, n_points > data{};
+    generate_gauss_2d_rotated(data);
+    std::array< float, n_points > weights{};
+    weights.fill(1.f);
+
+    // start values; all seven model parameters are fitted
+    std::array< float, 7 > initial_parameters{ { 8.f, 3.4f, 3.6f, 0.4f, 0.5f, 2.f, 0.f } };
+    std::array< int, 7 > parameters_to_fit{ { 1, 1, 1, 1, 1, 1, 1 } };
+    float tolerance{ 0.001f };
+    int max_n_iterations{ 10 };
+
+    // outputs for the single fit
+    std::array< float, 7 > output_parameters;
+    int output_states;
+    float output_chi_square;
+    int output_n_iterations;
+
+    int const status = gpufit(
+        n_fits, n_points,
+        data.data(), weights.data(),
+        GAUSS_2D_ROTATED,
+        initial_parameters.data(),
+        tolerance, max_n_iterations,
+        parameters_to_fit.data(),
+        LSE,
+        0, 0,               // no user info
+        output_parameters.data(),
+        &output_states,
+        &output_chi_square,
+        &output_n_iterations);
+
+    BOOST_CHECK( status == 0 ) ;
+}
diff --git a/Gpufit/tests/Linear_Fit_1D.cpp b/Gpufit/tests/Linear_Fit_1D.cpp
new file mode 100644
index 0000000..abd7c81
--- /dev/null
+++ b/Gpufit/tests/Linear_Fit_1D.cpp
@@ -0,0 +1,101 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Gpufit/gpufit.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <array>
+
+BOOST_AUTO_TEST_CASE( Linear_Fit_1D )
+{
+    /*
+        Performs a single fit using the Linear Fit (LINEAR_1D) model.
+        - Uses user info
+        - Uses trivial weights.
+        - No noise is added.
+        - Checks that the fitted parameters equal the true parameters,
+          once with the LSE estimator and once with the MLE estimator.
+    */
+
+    std::size_t const n_fits{ 1 } ;
+    std::size_t const n_points{ 2 } ;
+
+    std::array< float, 2 > const true_parameters{ { 1, 1 } } ;
+
+    std::array< float, n_points > data{ { 1, 2 } } ;
+    std::array< float, n_points > weights{ { 1, 1 } } ;
+
+    std::array< float, 2 > initial_parameters{ { 1, 0 } } ;
+    std::array< int, 2 > parameters_to_fit{ { 1, 1 } } ;
+
+    float tolerance{ 0.001f } ;
+    int max_n_iterations{ 10 } ;
+
+    // handed to gpufit as raw bytes; presumably the x coordinates of the data points
+    std::array< float, n_points > user_info{ { 0.f, 1.f } } ;
+
+    std::array< float, 2 > output_parameters ;
+    int output_states ;
+    float output_chi_squares ;
+    int output_n_iterations ;
+
+    // test with LSE
+    int status = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            LINEAR_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            LSE,
+            n_points * sizeof( float ),
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            & output_states,
+            & output_chi_squares,
+            & output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == 0 ) ;
+    BOOST_CHECK( output_states == 0 ) ;
+    BOOST_CHECK( output_n_iterations <= max_n_iterations ) ;
+    BOOST_CHECK( output_chi_squares < 1e-6f ) ;
+
+    // std::fabs has a float overload; std::fabsf is not declared by every standard library
+    BOOST_CHECK( std::fabs( output_parameters[0] - true_parameters[0] ) < 1e-6f ) ;
+    BOOST_CHECK( std::fabs( output_parameters[1] - true_parameters[1] ) < 1e-6f ) ;
+
+    // test with MLE
+    status = gpufit
+        (
+            n_fits,
+            n_points,
+            data.data(),
+            weights.data(),
+            LINEAR_1D,
+            initial_parameters.data(),
+            tolerance,
+            max_n_iterations,
+            parameters_to_fit.data(),
+            MLE,
+            n_points * sizeof( float ),
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            & output_states,
+            & output_chi_squares,
+            & output_n_iterations
+        ) ;
+
+    BOOST_CHECK( status == 0 ) ;
+    BOOST_CHECK( output_states == 0 ) ;
+    BOOST_CHECK( output_n_iterations <= max_n_iterations ) ;
+    BOOST_CHECK( output_chi_squares < 1e-6f ) ;
+
+    // note: the second parameter is only checked to 1e-4 in the MLE run
+    BOOST_CHECK( std::fabs( output_parameters[0] - true_parameters[0] ) < 1e-6f ) ;
+    BOOST_CHECK( std::fabs( output_parameters[1] - true_parameters[1] ) < 1e-4f ) ;
+}
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..6fe98c3
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..498877e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# Gpufit
+
+Levenberg Marquardt curve fitting in CUDA.
+
+Homepage: [github.com/gpufit/Gpufit](https://github.com/gpufit/Gpufit)
+
+## Quick start instructions
+
+To verify that Gpufit is working correctly on the host computer, go to the folder gpufit_performance_test of the binary package and run Gpufit_Cpufit_Performance_Comparison.exe.  Further details of the test executable can be found in the documentation package.
+
+## Binary distribution
+
+The latest Gpufit binary release, supporting Windows 32-bit and 64-bit machines, can be found on the [release page](https://github.com/gpufit/Gpufit/releases).
+
+## Documentation
+
+[![Documentation Status](https://readthedocs.org/projects/gpufit/badge/?version=latest)](http://gpufit.readthedocs.io/en/latest/?badge=latest)
+
+Documentation for the Gpufit library may be found online ([latest documentation](http://gpufit.readthedocs.io/en/latest/?badge=latest)), and also
+as a PDF file in the binary distribution of Gpufit.
+
+## Building Gpufit from source code
+
+Instructions for building Gpufit are found in the documentation: [Building from source code](https://github.com/gpufit/Gpufit/blob/master/docs/installation.rst).
+
+## Using the Gpufit binary distribution
+
+Instructions for using the binary distribution may be found in the documentation.  The binary package contains:
+
+- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and 
+  the Gpufit header file which contains the function definitions.  The Gpufit
+  SDK is intended to be used when calling Gpufit from an external application
+  written in e.g. C code.
+- Gpufit Performance test: A simple console application comparing the execution speed of curve fitting on the GPU and CPU.  This program also serves as a test to ensure the correct functioning of Gpufit.
+- Matlab 32 bit and 64 bit bindings, with Matlab examples.
+- Python version 2.x and version 3.x bindings (compiled as wheel files) and
+  Python examples.
+- The Gpufit manual in PDF format
+
+## License
+
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/_static/style.css b/docs/_static/style.css
new file mode 100644
index 0000000..6c92e05
--- /dev/null
+++ b/docs/_static/style.css
@@ -0,0 +1,15 @@
+.wy-nav-content {  /* presumably the theme's content container - confirm against the theme in use */
+    max-width: 1100px !important;  /* !important so this wins over competing max-width rules */
+}
+
+@media screen and (max-width: 767px) {  /* viewports up to 767px wide */
+    .wy-table-responsive table td {
+        white-space: nowrap;  /* keep table cell text on a single line */
+    }
+}
+
+@media screen and (min-width: 768px) {  /* viewports 768px and wider */
+    .wy-table-responsive table td {
+        white-space: normal;  /* let table cell text wrap */
+    }
+}
\ No newline at end of file
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 0000000..b0a4480
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,4 @@
+{% extends "!layout.html" %}
+{% block extrahead %}{# fills the extrahead block with a link to the custom stylesheet _static/style.css #}
+    <link href="{{ pathto("_static/style.css", True) }}" rel="stylesheet" type="text/css">
+{% endblock %}
\ No newline at end of file
diff --git a/docs/appendix.rst b/docs/appendix.rst
new file mode 100644
index 0000000..103df3e
--- /dev/null
+++ b/docs/appendix.rst
@@ -0,0 +1,31 @@
+========
+Appendix
+========
+
+Levenberg-Marquardt algorithm
+-----------------------------
+
+A flowchart of the implementation of the Levenberg-Marquardt algorithm is given in :numref:`appendix-gpufit-flowchart`.
+
+.. _appendix-gpufit-flowchart:
+
+.. figure:: /images/gpufit_program_flow_skeleton_v2.png
+   :width: 14 cm
+   :align: center
+
+   Levenberg-Marquardt algorithm flow as implemented in |GF|.
+   
+   
+Performance comparison to other GPU benchmarks
+----------------------------------------------
+
+We used the bundled application to measure the fitting speed (fits per second, for 10 million fits) on CUDA capable
+graphics cards of various architectures (on different computers with different versions of graphics drivers) and
+compared the results to the Passmark G3D scores of these cards. By and large, the two seem to correlate, i.e. a high
+Passmark G3D score also goes along with a high Gpufit fitting speed.
+
+.. figure:: /images/Gpufit_PassmarkG3D_relative_performance.png
+   :width: 14 cm
+   :align: center
+
+   Performance of Gpufit vs Passmark G3D
\ No newline at end of file
diff --git a/docs/bindings.rst b/docs/bindings.rst
new file mode 100644
index 0000000..ff3d914
--- /dev/null
+++ b/docs/bindings.rst
@@ -0,0 +1,413 @@
+.. _external-bindings:
+
+=================
+External bindings
+=================
+
+This section describes the Gpufit bindings to other programming languages.  The bindings (e.g. to Python or Matlab) aim to
+emulate the :ref:`c-interface` as closely as possible.
+
+Most high level languages feature multidimensional numerical arrays.  In the bindings implemented for Matlab and Python, 
+we adopt the convention that the input data should be organized as a 2D array, with one dimension corresponding to the
+number of data points per fit, and the other corresponding to the number of fits. Internally, in memory, these arrays should
+always be ordered such that the data values for each fit are kept together. In Matlab, for example, this means storing the
+data in an array with dimensions [number_points_per_fit, number_fits]. In this manner, the data in memory is ordered in the
+same way that is expected by the Gpufit C interface, and there is no need to copy or otherwise re-organize the data
+before passing it to the GPU. The same convention is used for the weights, the initial model parameters, and the output parameters.
+
+Unlike the C interface, the external bindings do not require the number of fits and the number of data points per fit to be 
+specified explicitly. Instead, these numbers are inferred from the dimensions of the 2D input arrays.
+
+Optional parameters with default values
+---------------------------------------
+
+The external bindings make some input parameters optional.  The optional parameters are shown here.
+
+:tolerance:
+    default value 1e-4
+:max_n_iterations:
+    default value 25 iterations
+:estimator_id:
+    the default estimator is LSE as defined in gpufit.h_
+:parameters_to_fit:
+    by default all parameters are fit
+
+For instructions on how to specify these parameters explicitly, see the sections below.
+	
+Python
+------
+
+The Gpufit binding for Python is a project named pyGpufit. This project contains a Python package named pygpufit, which
+contains a module gpufit, and this module implements a method called fit.  Calling this method is equivalent to
+calling the C interface function *gpufit()* of |GF|. The package expects the input data to be
+stored as a NumPy array.  NumPy follows row-major order by default.
+
+Installation
+++++++++++++
+
+Wheel files for Python 2.X and 3.X on Windows 32/64 bit are included in the binary package. NumPy is required.
+
+Install the wheel file with:
+
+.. code-block:: bash
+
+    pip install --no-index --find-links=LocalPathToWheelFile pyGpufit
+
+Python Interface
+++++++++++++++++
+
+Optional parameters are passed in as None. The numbers of points, fits and parameters are deduced from the dimensions of
+the input data and initial parameters arrays.
+
+The signature of the fit method is
+
+.. code-block:: python
+
+    def fit(data, weights, model_id:ModelID, initial_parameters, tolerance:float=None, max_number_iterations:int=None, parameters_to_fit=None, estimator_id:EstimatorID=None, user_info=None):
+
+*Input parameters*
+
+:data: Data
+    2D NumPy array of shape (number_fits, number_points) and data type np.float32
+:weights: Weights
+    2D NumPy array of shape (number_fits, number_points) and data type np.float32 (same as data)
+
+    :special: None indicates that no weights are available
+:tolerance: Fit tolerance
+
+    :type: float
+    :special: If None, the default value will be used.
+:max_number_iterations: Maximal number of iterations
+
+    :type: int
+    :special: If None, the default value will be used.
+:estimator_id: estimator ID
+
+    :type: EstimatorID which is an Enum in the same module and defined analogously to gpufit.h_.
+    :special: If None, the default value is used.
+:model_id: model ID
+
+    :type: ModelID which is an Enum in the same module and defined analogously to gpufit.h_.
+:initial_parameters: Initial parameters
+    2D NumPy array of shape (number_fits, number_parameter)
+
+    :array data type: np.float32
+:parameters_to_fit: parameters to fit
+    1D NumPy array of length number_parameter
+    A zero indicates that this parameter should not be fitted, everything else means it should be fitted.
+
+    :array data type: np.int32
+    :special: If None, the default value is used.
+:user_info: user info
+    1D NumPy array of arbitrary type. The length in bytes is deduced automatically.
+
+    :special: If None, no user_info is assumed.
+
+*Output parameters*
+
+:parameters: Fitted parameters for each fit
+    2D NumPy array of shape (number_fits, number_parameter) and data type np.float32
+:states: Fit result states for each fit
+    1D NumPy array of length number_fits of data type np.int32
+    As defined in gpufit.h_.
+:chi_squares: :math:`\chi^2` values for each fit
+    1D NumPy array of length number_fits of data type np.float32
+:n_iterations: Number of iterations done for each fit
+    1D NumPy array of length number_fits of data type np.int32
+:time: Execution time of call to fit
+    In seconds.
+
+Errors are raised if checks on parameters fail or if the execution of fit failed.
+
+Python Examples
++++++++++++++++
+
+2D Gaussian peak example
+........................
+
+An example can be found at `Python Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`.
+
+The essential imports are:
+
+.. code-block:: python
+
+    import numpy as np
+    import pygpufit.gpufit as gf
+
+The true parameters describing an example 2D Gaussian peak function are:
+
+.. code-block:: python
+
+    # true parameters
+    true_parameters = np.array((10, 5.5, 5.5, 3, 10), dtype=np.float32)
+
+A 2D grid of x and y positions can conveniently be generated using the np.meshgrid function:
+
+.. code-block:: python
+
+    # generate x and y values
+    g = np.arange(size_x)
+    yi, xi = np.meshgrid(g, g, indexing='ij')
+    xi = xi.astype(np.float32)
+    yi = yi.astype(np.float32)
+
+Using these positions and the true parameter values a model function can be calculated as
+
+.. code-block:: python
+
+    def generate_gauss_2d(p, xi, yi):
+        """
+        Generates a 2D Gaussian peak.
+        http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+
+        :param p: Parameters (amplitude, x,y center position, width, offset)
+        :param xi: x positions
+        :param yi: y positions
+        :return: The Gaussian 2D peak.
+        """
+
+        arg = -(np.square(xi - p[1]) + np.square(yi - p[2])) / (2*p[3]*p[3])
+        y = p[0] * np.exp(arg) + p[4]
+
+        return y
+
+The model function can be repeated and noise can be added using the np.tile and np.random.poisson functions.
+
+.. code-block:: python
+
+    # generate data
+    data = generate_gauss_2d(true_parameters, xi, yi)
+    data = np.reshape(data, (1, number_points))
+    data = np.tile(data, (number_fits, 1))
+
+    # add Poisson noise
+    data = np.random.poisson(data)
+    data = data.astype(np.float32)
+
+The model and estimator IDs can be set as
+
+.. code-block:: python
+
+    # estimator ID
+    estimator_id = gf.EstimatorID.MLE
+
+    # model ID
+    model_id = gf.ModelID.GAUSS_2D
+
+When all input parameters are set we can call the C interface of Gpufit.
+
+.. code-block:: python
+
+    # run Gpufit
+    parameters, states, chi_squares, number_iterations, execution_time = gf.fit(data, None, model_id, initial_parameters, tolerance, max_number_iterations, None, estimator_id, None)
+
+And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the
+fitted parameters are limited to those fits that converged.
+
+.. code-block:: python
+
+    # print fit results
+
+    # get fit states
+    converged = states == 0
+    number_converged = np.sum(converged)
+    print('ratio converged         {:6.2f} %'.format(number_converged / number_fits * 100))
+    print('ratio max it. exceeded  {:6.2f} %'.format(np.sum(states == 1) / number_fits * 100))
+    print('ratio singular hessian  {:6.2f} %'.format(np.sum(states == 2) / number_fits * 100))
+    print('ratio neg curvature MLE {:6.2f} %'.format(np.sum(states == 3) / number_fits * 100))
+    print('ratio gpu not read      {:6.2f} %'.format(np.sum(states == 4) / number_fits * 100))
+
+    # mean, std of fitted parameters
+    converged_parameters = parameters[converged, :]
+    converged_parameters_mean = np.mean(converged_parameters, axis=0)
+    converged_parameters_std = np.std(converged_parameters, axis=0)
+
+    for i in range(number_parameters):
+        print('p{} true {:6.2f} mean {:6.2f} std {:6.2f}'.format(i, true_parameters[i], converged_parameters_mean[i], converged_parameters_std[i]))
+
+    # print summary
+    print('model ID: {}'.format(model_id))
+    print('number of fits: {}'.format(number_fits))
+    print('fit size: {} x {}'.format(size_x, size_x))
+    print('mean chi_square: {:.2f}'.format(np.mean(chi_squares[converged])))
+    print('iterations: {:.2f}'.format(np.mean(number_iterations[converged])))
+    print('time: {:.2f} s'.format(execution_time))
+
+	
+Matlab
+------
+
+The Matlab binding for Gpufit is a Matlab script (gpufit.m_).  This script checks the input data, sets default parameters, and 
+calls the C interface of |GF|, via a compiled .mex file.
+
+Please note, that before using the Matlab binding, the path to gpufit.m_ must be added to the Matlab path.
+
+If other GPU-based computations are to be performed with Matlab in the same session, please use the Matlab GPU computing 
+functionality first (for example with a call to gpuDevice or gpuArray) before calling the Gpufit Matlab binding. If this is not
+done, Matlab will throw an error (Error using gpuArray An unexpected error occurred during CUDA execution. 
+The CUDA error was: cannot set while device is active in this process).
+
+Matlab Interface
+++++++++++++++++
+
+Optional parameters are passed in as empty matrices (``[]``). The numbers of points, fits and parameters are deduced from the dimensions of
+the input data and initial parameters matrices.
+
+The signature of the gpufit function is
+
+.. code-block:: matlab
+
+    function [parameters, states, chi_squares, n_iterations, time] = gpufit(data, weights, model_id, initial_parameters, tolerance, max_n_iterations, parameters_to_fit, estimator_id, user_info)
+
+*Input parameters*
+
+:data: Data
+    2D matrix of size [number_points, number_fits] and data type single
+:weights: Weights
+    2D matrix of size [number_points, number_fits] and data type single (same as data)
+
+    :special: An empty matrix ([]) indicates that no weights are available
+:tolerance: Fit tolerance
+
+    :type: single
+    :special: If empty ([]), the default value will be used.
+:max_n_iterations: Maximal number of iterations
+    Will be converted to int32 if necessary
+
+    :special: If empty ([]), the default value will be used.
+:estimator_id: estimator ID
+
+    :type: EstimatorID which is defined in EstimatorID.m analogously to gpufit.h_.
+    :special: If empty ([]), the default value is used.
+:model_id: model ID
+
+    :type: ModelID which is defined in ModelID.m analogously to gpufit.h_.
+:initial_parameters: Initial parameters
+    2D matrix of size: [number_parameter, number_fits]
+
+    :type: single
+:parameters_to_fit: parameters to fit
+    vector of length number_parameter, will be converted to int32 if necessary
+    A zero indicates that this parameter should not be fitted, everything else means it should be fitted.
+
+    :special: If empty ([]), the default value is used.
+:user_info: user info
+    vector of arbitrary type. The length in bytes is deduced automatically.
+
+*Output parameters*
+
+:parameters: Fitted parameters for each fit
+    2D matrix of size: [number_parameter, number_fits] of data type single
+:states: Fit result states for each fit
+    vector of length number_fits of data type int32
+    As defined in gpufit.h_.
+:chi_squares: :math:`\chi^2` values for each fit
+    vector of length number_fits of data type single
+:n_iterations: Number of iterations done for each fit
+    vector of length number_fits of data type int32
+:time: Execution time of call to gpufit
+    In seconds.
+
+Errors are raised if checks on parameters fail or if the execution of gpufit fails.
+
+Matlab Examples
++++++++++++++++
+
+Simple example
+..............
+
+The simplest example is the `Matlab simple example`_. It is equivalent to :ref:`c-example-simple` and additionally
+relies on default values for optional arguments.
+
+2D Gaussian peak example
+........................
+
+An example can be found at `Matlab Gauss2D example`_. It is equivalent to :ref:`c-example-2d-gaussian`.
+
+The true parameters describing an example 2D Gaussian peak function are:
+
+.. code-block:: matlab
+
+    % true parameters
+    true_parameters = single([10, 5.5, 5.5, 3, 10]);
+
+A 2D grid of x and y positions can conveniently be generated using the ndgrid function:
+
+.. code-block:: matlab
+
+    % generate x and y values
+    g = single(0 : size_x - 1);
+    [x, y] = ndgrid(g, g);
+
+Using these positions and the true parameter values a model function can be calculated as
+
+.. code-block:: matlab
+
+    function g = gaussian_2d(x, y, p)
+    % Generates a 2D Gaussian peak.
+    % http://gpufit.readthedocs.io/en/latest/api.html#gauss-2d
+    %
+    % x,y - x and y grid position values
+    % p - parameters (amplitude, x,y center position, width, offset)
+
+    g = p(1) * exp(-((x - p(2)).^2 + (y - p(3)).^2) / (2 * p(4)^2)) + p(5);
+
+    end
+
+The model function can be repeated and noise can be added using the repmat and poissrnd functions.
+
+.. code-block:: matlab
+
+    % generate data with Poisson noise
+    data = gaussian_2d(x, y, true_parameters);
+    data = repmat(data(:), [1, number_fits]);
+    data = poissrnd(data);
+
+The model and estimator IDs can be set as
+
+.. code-block:: matlab
+
+    % estimator id
+    estimator_id = EstimatorID.MLE;
+
+    % model ID
+    model_id = ModelID.GAUSS_2D;
+
+When all input parameters are set we can call the C interface of |GF|.
+
+.. code-block:: matlab
+
+    %% run Gpufit
+    [parameters, states, chi_squares, n_iterations, time] = gpufit(data, [], model_id, initial_parameters, tolerance, max_n_iterations, [], estimator_id, []);
+
+And finally statistics about the results of the fits can be displayed where the mean and standard deviation of the
+fitted parameters are limited to those fits that converged.
+
+.. code-block:: matlab
+
+    %% displaying results
+
+    % get fit states
+    converged = states == 0;
+    number_converged = sum(converged);
+    fprintf(' ratio converged         %6.2f %%\n', number_converged / number_fits * 100);
+    fprintf(' ratio max it. exceeded  %6.2f %%\n', sum(states == 1) / number_fits * 100);
+    fprintf(' ratio singular hessian  %6.2f %%\n', sum(states == 2) / number_fits * 100);
+    fprintf(' ratio neg curvature MLE %6.2f %%\n', sum(states == 3) / number_fits * 100);
+    fprintf(' ratio gpu not read      %6.2f %%\n', sum(states == 4) / number_fits * 100);
+
+    % mean and std of fitted parameters
+    converged_parameters = parameters(:, converged);
+    converged_parameters_mean = mean(converged_parameters, 2);
+    converged_parameters_std  = std(converged_parameters, [], 2);
+    for i = 1 : number_parameters
+        fprintf(' p%d true %6.2f mean %6.2f std %6.2f\n', i, true_parameters(i), converged_parameters_mean(i), converged_parameters_std(i));
+    end
+
+    % print summary
+    fprintf('model ID: %d\n', model_id);
+    fprintf('number of fits: %d\n', number_fits);
+    fprintf('fit size: %d x %d\n', size_x, size_x);
+    fprintf('mean chi-square: %6.2f\n', mean(chi_squares(converged)));
+    fprintf('iterations: %6.2f\n', mean(n_iterations(converged)));
+    fprintf('time: %6.2f s\n', time);
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..fe55fe3
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,457 @@
+# -*- coding: utf-8 -*-
+import sphinx_rtd_theme
+#
+# Gpufit documentation build configuration file, created by
+# sphinx-quickstart on Tue Oct 04 12:39:10 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.4'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.mathjax',
+    'sphinx.ext.todo'	
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Gpufit: An open-source toolkit for GPU-accelerated curve fitting'
+copyright = 'All rights reserved.'
+author = 'Adrian Przybylski, Björn Thiel, Jan Keller-Findeisen, Bernd Stock, and Mark Bates'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'1.0'
+# The full version, including alpha/beta/rc tags.
+release = u'1.0.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# read epilog.txt; its content is passed to Sphinx as rst_epilog
+with open('epilog.txt') as f:
+    rst_epilog = f.read()
+
+# default highlight language is cpp
+highlight_language = 'cpp'
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+numfig = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+#html_theme_options = {
+#    'collapse_navigation': False,
+#    'display_version': False,
+#    'navigation_depth': 3,
+#}
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = u'RTD Spielwiese v1'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Gpufit'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+
+# make code smaller in latex output
+# see also: http://stackoverflow.com/questions/9899283/how-do-you-change-the-code-example-font-size-in-latex-pdf-output-with-sphinx
+from sphinx.highlighting import PygmentsBridge
+from pygments.formatters.latex import LatexFormatter
+
+# Pygments LaTeX formatter variant used to make code listings smaller in the LaTeX output.
+class CustomLatexFormatter(LatexFormatter):
+    def __init__(self, **options):
+        # Let the stock formatter process all options first ...
+        super(CustomLatexFormatter, self).__init__(**options)
+        # ... then replace the verbatim options so listings are formatted with \footnotesize.
+        self.verboptions = r"formatcom=\footnotesize"
+
+# Install the custom class as the LaTeX formatter attribute of Sphinx's PygmentsBridge.
+PygmentsBridge.latex_formatter = CustomLatexFormatter
+
+latex_elements = {
+     # The paper size ('letterpaper' or 'a4paper').
+     #
+     'papersize': 'a4paper,oneside',
+
+     # The font size ('10pt', '11pt' or '12pt').
+     #
+     # 'pointsize': '10pt',
+
+     # Additional stuff for the LaTeX preamble.
+     #
+     # 'preamble': '',
+
+     # Latex figure (float) alignment
+     #
+     # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'Gpufit.tex', 'Gpufit Documentation',
+     'Gpufit', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+latex_show_pagerefs = True
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = 'footnote'
+latex_show_urls = 'no'
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'gpufit', 'Gpufit Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Gpufit', 'Gpufit Documentation',
+     author, 'Gpufit', 'Levenberg Marquardt curve fitting in CUDA',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
+
+
+# -- Options for Epub output ----------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+epub_author = author
+epub_publisher = author
+epub_copyright = copyright
+
+# The basename for the epub file. It defaults to the project name.
+# epub_basename = project
+
+# The HTML theme for the epub output. Since the default themes are not
+# optimized for small screen space, using the same theme for HTML and epub
+# output is usually not wise. This defaults to 'epub', a theme designed to save
+# visual space.
+#
+# epub_theme = 'epub'
+
+# The language of the text. It defaults to the language option
+# or 'en' if the language is not set.
+#
+# epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+# epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A tuple containing the cover image and cover page html template filenames.
+#
+# epub_cover = ()
+
+# A sequence of (type, uri, title) tuples for the guide element of content.opf.
+#
+# epub_guide = ()
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#
+# epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# The depth of the table of contents in toc.ncx.
+#
+# epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#
+# epub_tocdup = True
+
+# Choose between 'default' and 'includehidden'.
+#
+# epub_tocscope = 'default'
+
+# Fix unsupported image types using the Pillow.
+#
+# epub_fix_images = False
+
+# Scale large images.
+#
+# epub_max_image_width = 0
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# epub_show_urls = 'inline'
+
+# If false, no index is generated.
+#
+# epub_use_index = True
diff --git a/docs/customization.rst b/docs/customization.rst
new file mode 100644
index 0000000..4fcfec5
--- /dev/null
+++ b/docs/customization.rst
@@ -0,0 +1,299 @@
+.. _gpufit-customization:
+
+=============
+Customization
+=============
+
+This section explains how to add custom fit model functions and custom fit estimators within |GF|.
+Functions calculating the estimator and model values are defined in CUDA header files using the CUDA C syntax.
+For each function and estimator there exists a separate file. Therefore, to add an additional model or estimator a new
+CUDA header file containing the new model or estimator function must be created and included in the library.
+
+Please note, that in order to add a model function or estimator, it is necessary to rebuild the Gpufit library 
+from source.  In future releases of Gpufit, it may be possible to include new fit functions or estimators at runtime.
+
+
+Add a new fit model function
+----------------------------
+
+To add a new fit model, the model function itself as well as analytic expressions for its partial derivatives 
+must be known.  A function calculating the values of the model as well as a function calculating the 
+values of the partial derivatives of the model, with respect to the model parameters and possible grid 
+coordinates, must be implemented.
+
+Additionally, a new model ID must be defined and included in the list of available model IDs, and the number 
+of model parameters must be specified as well.
+
+Detailed step by step instructions for adding a model function are given below.
+
+1.	Define an additional model ID in file gpufit.h_
+2.  Implement a CUDA device function within a newly created .cuh file according to the following template.
+
+.. code-block:: cuda
+
+    __device__ void ... (                                       // function name
+        float const * parameters,
+        int const n_fits,
+        int const n_points,
+        int const n_parameters,
+        float * values,
+        float * derivatives,
+        int const chunk_index,
+        char * user_info,
+        std::size_t const user_info_size)
+    {
+        ///////////////////////////// indices /////////////////////////////
+        int const n_fits_per_block = blockDim.x / n_points;
+        int const fit_in_block = threadIdx.x / n_points;
+        int const point_index = threadIdx.x - (fit_in_block*n_points);
+        int const fit_index = blockIdx.x*n_fits_per_block + fit_in_block;
+
+        ///////////////////////////// values //////////////////////////////
+        float* current_value = &values[fit_index*n_points];
+        float const * current_parameters = &parameters[fit_index*n_parameters];
+
+        current_value[point_index] = ... ;                      // formula calculating fit model values
+
+        /////////////////////////// derivatives ///////////////////////////
+        float * current_derivative = &derivatives[fit_index * n_points*n_parameters];
+
+        current_derivative[0 * n_points + point_index] = ... ;  // formula calculating derivative values with respect to parameters[0]
+        current_derivative[1 * n_points + point_index] = ... ;  // formula calculating derivative values with respect to parameters[1]
+        .
+        .
+        .
+    }
+
+This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates model
+function values and partial derivative values of the model function for a particular set of parameters. See for example linear_1d.cuh_.
+
+3.	Include the newly created .cuh file in cuda_kernels.cu_
+4.	Add an if branch in the CUDA global function ``cuda_calc_curve()`` in file cuda_kernels.cu_ to allow calling the added model function
+
+.. code-block:: cpp
+
+    if (model_id == GAUSS_1D)
+        calculate_gauss1d
+            (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+            .
+            .
+            .
+    else if (model_id == ...)       // model ID
+        ...                         // function name
+            (parameters, n_fits, n_points, n_parameters, values, derivatives, chunk_index, user_info, user_info_size);
+
+Compare model_id with the model ID defined for the new model and call the function that calculates the values of your model.
+
+5.	Add a switch case in function set_number_of_parameters in file interface.cpp_
+
+.. code-block:: cpp
+
+    switch (model_id)
+    {
+        case GAUSS_1D:
+            n_parameters_ = 4;
+            break;
+            .
+            .
+            .
+        case ... :                  // model ID
+            n_parameters_ = ... ;   // number of model parameters
+            break;
+        default:
+            break;
+    }
+
+Add a new fit estimator
+------------------------
+
+To extend |GF| by additional estimators, three CUDA device functions must be defined and integrated.  The sections requiring modification are 
+the functions which calculate the estimator function values, and its gradient and hessian values. Also, a new estimator ID must be defined.
+Detailed step by step instructions for adding an additional estimator are given below.
+
+1. Define an additional estimator ID in gpufit.h_
+2. Implement three functions within a newly created .cuh file calculating :math:`\chi^2` values and
+   its gradient and hessian according to the following template.
+
+.. code-block:: cuda
+
+    ///////////////////////////// Chi-square /////////////////////////////
+    __device__ void ... (           // function name Chi-square
+        volatile float * chi_square,
+        int const point_index,
+        float const * data,
+        float const * value,
+        float const * weight,
+        int * state,
+        char * user_info,
+        std::size_t const user_info_size)
+    {
+        chi_square[point_index] = ... ;            // formula calculating Chi-square summands
+    }
+
+    ////////////////////////////// gradient //////////////////////////////
+    __device__ void ... (           // function name gradient
+        volatile float * gradient,
+        int const point_index,
+        int const parameter_index,
+        float const * data,
+        float const * value,
+        float const * derivative,
+        float const * weight,
+        char * user_info,
+        std::size_t const user_info_size)
+    {
+        gradient[point_index] = ... ;            // formula calculating summands of the gradient of Chi-square
+    }
+
+    ////////////////////////////// hessian ///////////////////////////////
+    __device__ void ... (           // function name hessian
+        double * hessian,
+        int const point_index,
+        int const parameter_index_i,
+        int const parameter_index_j,
+        float const * data,
+        float const * value,
+        float const * derivative,
+        float const * weight,
+        char * user_info,
+        std::size_t const user_info_size)
+    {
+        *hessian += ... ;            // formula calculating summands of the hessian of Chi-square
+    }
+
+This code can be used as a pattern, where the placeholders ". . ." must be replaced by user code which calculates the
+estimator values as well as the gradient and hessian values of the estimator. For a concrete example, see lse.cuh_.
+
+3. Include the newly created .cuh file in cuda_kernels.cu_
+
+.. code-block:: cpp
+
+    #include "....cuh"              // filename
+
+4. Add an if branch in 3 CUDA global functions in the file cuda_kernels.cu_
+
+    .. code-block:: cuda
+
+        __global__ void cuda_calculate_chi_squares(
+        .
+        .
+        .
+        if (estimator_id == LSE)
+        {
+            calculate_chi_square_lse(
+                shared_chi_square,
+                point_index,
+                current_data,
+                current_value,
+                current_weight,
+                current_state,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+        else if (estimator_id == ...)   // estimator ID
+        {
+            ...(                        // function name Chi-square
+                shared_chi_square,
+                point_index,
+                current_data,
+                current_value,
+                current_weight,
+                current_state,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+
+
+    .. code-block:: cuda
+
+        __global__ void cuda_calculate_gradients(
+        .
+        .
+        .
+        if (estimator_id == LSE)
+        {
+            calculate_gradient_lse(
+                shared_gradient,
+                point_index,
+                derivative_index,
+                current_data,
+                current_value,
+                current_derivative,
+                current_weight,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+        else if (estimator_id == ...)   // estimator ID
+        {
+            ...(                        // function name gradient
+                shared_gradient,
+                point_index,
+                derivative_index,
+                current_data,
+                current_value,
+                current_derivative,
+                current_weight,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+
+    .. code-block:: cuda
+
+        __global__ void cuda_calculate_hessians(
+        .
+        .
+        .
+        if (estimator_id == LSE)
+        {
+            calculate_hessian_lse(
+                &sum,
+                point_index,
+                derivative_index_i + point_index,
+                derivative_index_j + point_index,
+                current_data,
+                current_value,
+                current_derivative,
+                current_weight,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+        else if (estimator_id == ...)   // estimator ID
+        {
+            ...(                        // function name hessian
+                &sum,
+                point_index,
+                derivative_index_i + point_index,
+                derivative_index_j + point_index,
+                current_data,
+                current_value,
+                current_derivative,
+                current_weight,
+                user_info,
+                user_info_size);
+        }
+        .
+        .
+        .
+		
+Future releases
+---------------
+
+A disadvantage of the Gpufit library, when compared with established CPU-based curve fitting packages, 
+is that in order to add or modify a fit model function or a fit estimator, the library must be recompiled.  
+We anticipate that this limitation can be overcome in future releases of the library, by employing 
+run-time compilation of the CUDA code.
diff --git a/docs/epilog.txt b/docs/epilog.txt
new file mode 100644
index 0000000..ee243c1
--- /dev/null
+++ b/docs/epilog.txt
@@ -0,0 +1,48 @@
+
+..
+   The content of this file will be appended to every documentation file. Put common substitutions and links here.
+
+.. |GF| replace:: the Gpufit library
+.. |GF_version| replace:: 1.0.0
+
+.. _CUDA: http://developer.nvidia.com/cuda-zone
+.. _CUDA_SELECT_NVCC_ARCH_FLAGS: http://cmake.org/cmake/help/v3.7/module/FindCUDA.html
+
+.. _CMake: http://www.cmake.org
+.. _Boost: http://www.boost.org
+.. _MATLAB: http://www.mathworks.com/products/matlab.html
+.. _Python: http://www.python.org
+
+.. _`Gpufit on Github`: https://github.com/gpufit/Gpufit
+.. _`Gpufit release location`: https://github.com/gpufit/Gpufit/releases
+.. _Gpufit-master.zip: https://github.com/gpufit/Gpufit/archive/master.zip
+
+.. _gpufit.h: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gpufit.h
+.. _interface.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/interface.cpp
+
+.. _gauss_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_1d.cuh
+.. _gauss_2d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d.cuh
+.. _gauss_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_elliptic.cuh
+.. _gauss_2d_rotated.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/gauss_2d_rotated.cuh
+.. _cauchy_2d_elliptic.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cauchy_2d_elliptic.cuh
+.. _linear_1d.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/linear_1d.cuh
+.. _lse.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/lse.cuh
+.. _mle.cuh: https://github.com/gpufit/Gpufit/blob/master/Gpufit/mle.cuh
+.. _cuda_kernels.cu: https://github.com/gpufit/Gpufit/blob/master/Gpufit/cuda_kernels.cu
+
+.. _Tests: https://github.com/gpufit/Gpufit/tree/master/Gpufit/tests
+.. _Examples: https://github.com/gpufit/Gpufit/tree/master/Gpufit/examples
+.. _Simple_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Simple_Example.cpp
+.. _Gauss_Fit_2D_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Gauss_Fit_2D_Example.cpp
+.. _Linear_Regression_Example.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/examples/Linear_Regression_Example.cpp
+
+.. _GpufitMex.cpp: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/GpufitMex.cpp
+.. _gpufit.m: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/gpufit.m
+
+.. _`Matlab simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/simple.m
+.. _`Matlab Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d.m
+.. _`Matlab Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/matlab/examples/gauss2d_plot.m
+
+.. _`Python simple example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/simple.py
+.. _`Python Gauss2D example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d.py
+.. _`Python Gauss2D plot example`: https://github.com/gpufit/Gpufit/blob/master/Gpufit/bindings/python/examples/gauss2d_plot.py
\ No newline at end of file
diff --git a/docs/examples.rst b/docs/examples.rst
new file mode 100644
index 0000000..da54114
--- /dev/null
+++ b/docs/examples.rst
@@ -0,0 +1,394 @@
+========
+Examples
+========
+
+C++ Examples_ are part of the library code base and can be built and run through the project environment. Here they are
+described and important steps are highlighted.
+
+Please note that the C++ Tests_ contained in the code base additionally demonstrate the usage of |GF|. However, a
+detailed description of the tests is not provided.
+
+.. _c-example-simple:
+
+Simple skeleton example
+-----------------------
+
+This example shows the minimal code providing all required parameters and the call to the C interface. It is contained
+in Simple_Example.cpp_ and can be built and executed within the project environment. Please note that this code does
+not do anything other than call gpufit().
+
+In the first section of the code, the model ID is set, space for initial parameters and data values is reserved (in a normal
+application, however, the data array would already exist), the fit tolerance is set, the maximal number of iterations is set, 
+the estimator ID is set, and the parameters to fit array is initialized to indicate that all parameters should be fit.
+
+.. code-block:: cpp
+
+	// number of fits, number of points per fit
+	size_t const number_fits = 10;
+	size_t const number_points = 10;
+
+	// model ID and number of parameter
+	int const model_id = GAUSS_1D;
+	size_t const number_parameters = 5;
+
+	// initial parameters
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+
+	// data
+	std::vector< float > data(number_points * number_fits);
+
+	// tolerance
+	float const tolerance = 0.001f;
+
+	// maximal number of iterations
+	int const max_number_iterations = 10;
+
+	// estimator ID
+	int const estimator_id = LSE;
+
+	// parameters to fit (all of them)
+	std::vector< int > parameters_to_fit(number_parameters, 1);
+
+In a next step, sufficient memory is reserved for all four output parameters.
+
+.. code-block:: cpp
+
+	// output parameters
+	std::vector< float > output_parameters(number_fits * number_parameters);
+	std::vector< int > output_states(number_fits);
+	std::vector< float > output_chi_square(number_fits);
+	std::vector< int > output_number_iterations(number_fits);
+
+Finally, there is a call to the C interface of Gpufit (in this example, the optional 
+inputs *weights* and *user info* are not used) and a check of the return status.
+If an error occurred, the last error message is obtained and an exception is thrown.
+
+.. code-block:: cpp
+
+	// call to gpufit (C interface)
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            0,
+            0,
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+	// check status
+	if (status != STATUS_OK)
+	{
+		throw std::runtime_error(gpufit_get_last_error());
+	}
+
+This simple example can easily be adapted to real applications by:
+
+- choosing your own model ID
+- choosing your own estimator ID
+- choosing your own fit tolerance and maximal number of iterations
+- filling the data structure with the data values to be fitted
+- filling the initial parameters structure with suitable estimates of the true parameters
+- processing the output data
+
+The following two examples show how |GF| can be used to fit real data.
+
+.. _c-example-2d-gaussian:
+
+Fit 2D Gaussian functions example
+---------------------------------
+
+This example features:
+
+- Multiple fits using a 2D Gaussian function
+- Noisy data and random initial guesses for the fit parameters
+- A Poisson noise adapted maximum likelihood estimator
+
+It is contained in Gauss_Fit_2D_Example.cpp_ and can be built and executed within the project environment.  The optional 
+inputs to gpufit(), *weights* and *user info*, are not used.
+
+In this example, a 2D Gaussian curve is fit to 10\ :sup:`4` noisy data sets having a size of 20 x 20 points each.
+The model function and the model parameters are described in :ref:`gauss-2d`.
+
+In this example the true parameters used to generate the Gaussian data are set to
+
+.. code-block:: cpp
+
+    // true parameters
+	std::vector< float > true_parameters{ 10.f, 9.5f, 9.5f, 3.f, 10.f}; // amplitude, center x/y positions, width, offset
+
+which defines a 2D Gaussian peak centered at the middle of the grid (position 9.5, 9.5), with a width (standard deviation) of 3.0, an amplitude of 10
+and a background of 10.
+
+The guesses for the initial parameters are drawn from the true parameters with a uniformly distributed deviation
+of about 20%. The initial guesses for the center coordinates are chosen with a deviation relative to the width of the Gaussian.
+
+.. code-block:: cpp
+
+	// initial parameters (randomized)
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+	for (size_t i = 0; i < number_fits; i++)
+	{
+		for (size_t j = 0; j < number_parameters; j++)
+		{
+			if (j == 1 || j == 2)
+			{
+				initial_parameters[i * number_parameters + j] = true_parameters[j] + true_parameters[3]  * (-0.2f + 0.4f * uniform_dist(rng));
+			}
+			else
+			{
+				initial_parameters[i * number_parameters + j] = true_parameters[j] * (0.8f + 0.4f*uniform_dist(rng));
+			}
+		}
+	}
+
+The 2D grid of x and y values (each ranging from 0 to 19 with an increment of 1) is computed with a double for loop.
+
+.. code-block:: cpp
+
+	// generate x and y values
+	std::vector< float > x(number_points);
+	std::vector< float > y(number_points);
+	for (size_t i = 0; i < size_x; i++)
+	{
+		for (size_t j = 0; j < size_x; j++) {
+			x[i * size_x + j] = static_cast<float>(j);
+			y[i * size_x + j] = static_cast<float>(i);
+		}
+	}
+
+Then a 2D Gaussian peak model function (without noise) is calculated once for the true parameters
+
+.. code-block:: cpp
+
+    void generate_gauss_2d(std::vector<float> &x, std::vector<float> &y, std::vector<float> &g, std::vector<float>::iterator &p)
+    {
+        // generates a Gaussian 2D peak function on a set of x and y values with some paramters p (size 5)
+        // we assume that x.size == y.size == g.size, no checks done
+
+        // given x and y values and parameters p computes a model function g
+        for (size_t i = 0; i < x.size(); i++)
+        {
+            float arg = -((x[i] - p[1]) * (x[i] - p[1]) + (y[i] - p[2]) * (y[i] - p[2])) / (2 * p[3] * p[3]);
+            g[i] = p[0] * exp(arg) + p[4];
+        }
+    }
+
+Stored in variable temp, it is then used in every fit to generate Poisson distributed random numbers.
+
+.. code-block:: cpp
+
+	// generate data with noise
+	std::vector< float > temp(number_points);
+	// compute the model function
+	generate_gauss_2d(x, y, temp, true_parameters.begin());
+
+	std::vector< float > data(number_fits * number_points);
+	for (size_t i = 0; i < number_fits; i++)
+	{
+		// generate Poisson random numbers
+		for (size_t j = 0; j < number_points; j++)
+		{
+			std::poisson_distribution< int > poisson_dist(temp[j]);
+			data[i * number_points + j] = static_cast<float>(poisson_dist(rng));
+		}
+	}
+
+Thus, in this example the data sets for the individual fits differ only in the random noise.  This, and the
+randomized initial guesses for each fit, result in each fit returning slightly different best-fit parameters.
+
+We set the model and estimator IDs for the fit accordingly.
+
+.. code-block:: cpp
+
+	// estimator ID
+	int const estimator_id = MLE;
+
+	// model ID
+	int const model_id = GAUSS_2D;
+
+And call the gpufit :ref:`c-interface`. Parameters weights, user_info and user_info_size are set to 0, indicating that they
+won't be used during the fits.
+
+.. code-block:: cpp
+
+	// call to gpufit (C interface)
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            0,
+            0,
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+	// check status
+	if (status != STATUS_OK)
+	{
+		throw std::runtime_error(gpufit_get_last_error());
+	}
+
+After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics
+about the fits are displayed.
+
+Output statistics
++++++++++++++++++
+
+A histogram of all possible fit states (see :ref:`api-output-parameters`) is obtained by iterating over the state of each fit.
+
+.. code-block:: cpp
+
+	// get fit states
+	std::vector< int > output_states_histogram(5, 0);
+	for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it)
+	{
+		output_states_histogram[*it]++;
+	}
+
+In the computation of the mean and standard deviation only converged fits are taken into account. Here is an example of computing
+the means of the output parameters iterating over all fits and all parameters.
+
+.. code-block:: cpp
+
+	// compute mean of fitted parameters for converged fits
+	std::vector< float > output_parameters_mean(number_parameters, 0);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		if (output_states[i] == STATE_CONVERGED)
+		{
+			for (size_t j = 0; j < number_parameters; j++)
+			{
+				output_parameters_mean[j] += output_parameters[i * number_parameters + j];
+			}
+		}
+	}
+	// normalize
+	for (size_t j = 0; j < number_parameters; j++)
+	{
+		output_parameters_mean[j] /= output_states_histogram[0];
+	}
+
+.. _linear-regression-example:
+	
+Linear Regression Example
+-------------------------
+
+This example features:
+
+- Multiple fits of a 1D Linear curve
+- Noisy data and random initial guesses for the parameters
+- Unequally spaced x position values given as custom user info
+
+It is contained in Linear_Regression_Example.cpp_ and can be built and executed within the project environment.
+
+In this example, a straight line is fitted to 10\ :sup:`4` noisy data sets. Each data set includes 20 data points.
+The locations of the data points are spaced non-linearly (exponentially). The user information data specifies the x positions
+of the data points. The fits are unweighted and the model function and the model parameters are described in :ref:`linear-1d`.
+
+The custom x positions of the linear model are stored in the user_info.
+
+.. code-block:: cpp
+
+	// custom x positions for the data points of every fit, stored in user info
+	std::vector< float > user_info(number_points);
+	for (size_t i = 0; i < number_points; i++)
+	{
+		user_info[i] = static_cast<float>(pow(2, i));
+	}
+
+	// size of user info in bytes
+	size_t const user_info_size = number_points * sizeof(float);
+
+Because only number_points values are specified, this means that the same custom x position values are used for every fit.
+
+The initial parameters for every fit are set to random values uniformly distributed around the true parameter value.
+
+.. code-block:: cpp
+
+	// true parameters
+	std::vector< float > true_parameters { 5, 2 }; // offset, slope
+
+	// initial parameters (randomized)
+	std::vector< float > initial_parameters(number_fits * number_parameters);
+	for (size_t i = 0; i != number_fits; i++)
+	{
+		// random offset
+		initial_parameters[i * number_parameters + 0] = true_parameters[0] * (0.8f + 0.4f * uniform_dist(rng));
+		// random slope
+		initial_parameters[i * number_parameters + 1] = true_parameters[1] * (0.8f + 0.4f * uniform_dist(rng));
+	}
+
+The data is generated as the value of a linear function and some additive normally distributed noise term.
+
+.. code-block:: cpp
+
+	// generate data
+	std::vector< float > data(number_points * number_fits);
+	for (size_t i = 0; i != data.size(); i++)
+	{
+		size_t j = i / number_points; // the fit
+		size_t k = i % number_points; // the position within a fit
+
+		float x = user_info[k];
+		float y = true_parameters[0] + x * true_parameters[1];
+		data[i] = y + normal_dist(rng);
+	}
+
+We set the model and estimator IDs for the fit accordingly.
+
+.. code-block:: cpp
+
+	// estimator ID
+	int const estimator_id = LSE;
+
+	// model ID
+	int const model_id = LINEAR_1D;
+
+And call the gpufit :ref:`c-interface`. The weights parameter is set to 0, indicating that no weights are used during the fits.
+
+.. code-block:: cpp
+
+	// call to gpufit (C interface)
+	int const status = gpufit
+        (
+            number_fits,
+            number_points,
+            data.data(),
+            0,
+            model_id,
+            initial_parameters.data(),
+            tolerance,
+            max_number_iterations,
+            parameters_to_fit.data(),
+            estimator_id,
+            user_info_size,
+            reinterpret_cast< char * >( user_info.data() ),
+            output_parameters.data(),
+            output_states.data(),
+            output_chi_square.data(),
+            output_number_iterations.data()
+        );
+
+After the fits have been executed and the return value is checked to ensure that no error occurred, some statistics
+about the fits are displayed (see `Output statistics`_).
diff --git a/docs/fit_estimator_functions.rst b/docs/fit_estimator_functions.rst
new file mode 100644
index 0000000..fcee030
--- /dev/null
+++ b/docs/fit_estimator_functions.rst
@@ -0,0 +1,54 @@
+.. _estimator-functions:
+
+Estimator functions
+-------------------
+
+.. _estimator-lse:
+
+Least squares estimator
++++++++++++++++++++++++
+
+The least squares estimator computes the weighted sum of the squared deviation between the data values and the model at
+the positions of the data points.  The ID for this estimator is ``LSE``. It's implemented in lse.cuh_.
+
+Least squares estimation is a common method, and the standard Levenberg-Marquardt algorithm described by Marquardt
+is based on least squares minimization. The estimator is described as follows.
+
+.. math::
+
+    {\chi^2}(\vec{p}) = \sum_{n=0}^{N-1}{ \left(f_{n}(\vec{p})-z_{n}\right)^2\cdot w_n }
+
+:`n`: The index of the data points (:math:`0,..,N-1`)
+
+:`f_n`: The model function values at data position :math:`n`
+
+:`z_n`: Data values at data position :math:`n`
+
+:`\vec{p}`: Fit model function parameters
+
+:`w_n`: Weight values for data at position :math:`n`
+
+	
+.. _estimator-mle:
+
+Maximum likelihood estimator for data subject to Poisson statistics
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+The maximum likelihood estimator (MLE) for Poisson distributed noise is relatively simple to implement. In the case of data with Poisson noise
+it provides a more precise estimate when compared to an LSE estimator. The ID for this estimator is ``MLE``. It's implemented in mle.cuh_.
+
+The estimator is described as follows.
+
+.. math::
+
+    {\chi^2}(\vec{p}) = 2\sum_{n=0}^{N-1}{(f_{n}(\vec{p})-z_{n})}-2\sum_{n=0,z_n\neq0}^{N-1}{z_n \ln \left(\frac{f_{n}(\vec{p})}{z_n}\right)}
+
+:`n`: The index of the data points (:math:`0,..,N-1`)
+
+:`f_n`: The model function values at data position :math:`n`
+
+:`z_n`: Data values at data position :math:`n`
+
+:`\vec{p}`: Actual model function parameters
+	
+Note that this estimator does not provide any means to weight the data values.  Rather, noise in the data is assumed to be purely Poissonian.
\ No newline at end of file
diff --git a/docs/fit_model_functions.rst b/docs/fit_model_functions.rst
new file mode 100644
index 0000000..620c821
--- /dev/null
+++ b/docs/fit_model_functions.rst
@@ -0,0 +1,193 @@
+.. _fit-model-functions:
+
+Fit Model functions
+-------------------
+
+This section describes the fit model functions which are included with the Gpufit library. The headings are the names
+of the ModelID parameter used in the gpufit()_ call.  They are defined in gpufit.h_.
+
+Note that additional model functions may be added as described in the documentation, see :ref:`gpufit-customization`.
+
+.. _linear-1d:
+
+Linear regression
++++++++++++++++++
+
+A 1D linear function defined by two parameters (offset and slope).  The user information data may be used to specify the
+X coordinate of each data point.  The model ID of this function is ``LINEAR_1D``, and it is implemented in linear_1d.cuh_.
+
+.. math::
+
+    g(x,\vec{p})=p_0+p_1 x
+
+:`x`: (independent variable) *X* coordinate
+
+    The X coordinate values may be specified in the user information data.
+    For details on how to do this, see the linear regression code example, :ref:`linear-regression-example`.
+
+    If no independent variables are provided, the *X* coordinate of the first data value is assumed to be (0.0).
+    In this case, for a fit size of *M* data points, the *X* coordinates of the data are simply the corresponding array
+    indices of the data array, starting from zero (i.e. :math:`0, 1, 2, ...`).
+
+:`p_0`: offset
+
+:`p_1`: slope
+
+
+.. _gauss-1d:
+
+1D Gaussian function
+++++++++++++++++++++
+
+A 1D Gaussian function defined by four parameters.  Its model ID is ``GAUSS_1D`` and it is implemented in gauss_1d.cuh_.
+Here, p is the vector of parameters (p0..p3) and the model function g exists for each x coordinate of the input data.
+
+.. math::
+
+    g(x,\vec{p})=p_0 e^{-\left(x-p_1\right)^2/\left(2p_2^2\right)}+p_3
+
+:`x`: (independent variable) *X* coordinate
+
+    No independent variables are passed to this model function.
+    Hence, the *X* coordinate of the first data value is assumed to be (0.0). For a fit size of *M* data points,
+    the *X* coordinates of the data are simply the corresponding array indices of the data array, starting from
+    zero (i.e. :math:`0, 1, 2, ...`).
+
+:`p_0`: amplitude
+
+:`p_1`: center coordinate
+
+:`p_2`: width (standard deviation)
+
+:`p_3`: offset
+
+	
+.. _gauss-2d:
+
+2D Gaussian function (cylindrical symmetry)
++++++++++++++++++++++++++++++++++++++++++++
+
+A 2D Gaussian function defined by five parameters. Its model ID is ``GAUSS_2D`` and it is implemented in gauss_2d.cuh_.
+Here, p is the vector of parameters (p0..p4) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+    g(x,y,\vec{p})=p_0 e^{-\left(\left(x-p_1\right)^2+\left(y-p_2\right)^2\right)/\left(2p_3^2\right)}+p_4
+
+:`x,y`: (independent variables) *X,Y* coordinates
+	
+    No independent variables are passed to this model function.
+    Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+    For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding 2D array
+    indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+	
+:`p_1`: center coordinate x
+	
+:`p_2`: center coordinate y
+	
+:`p_3`: width (standard deviation; equal width in x and y dimensions)
+	
+:`p_4`: offset
+
+
+.. _gauss-2d-elliptic:
+
+2D Gaussian function (elliptical)
++++++++++++++++++++++++++++++++++
+
+A 2D elliptical Gaussian function defined by six parameters. Its model ID is ``GAUSS_2D_ELLIPTIC`` and it is implemented
+in gauss_2d_elliptic.cuh_.  Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+    g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left(x-p_1\right)^2}{p_3^2}+\frac{\left(y-p_2\right)^2}{p_4^2}\right)}+p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+    No independent variables are passed to this model function.
+    Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+    For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+    2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+	
+:`p_1`: center coordinate x
+	
+:`p_2`: center coordinate y
+	
+:`p_3`: width x (standard deviation)
+	
+:`p_4`: width y (standard deviation)
+	
+:`p_5`: offset
+
+
+.. _gauss-2d-rotated:
+
+2D Gaussian function (elliptical, rotated)
+++++++++++++++++++++++++++++++++++++++++++
+
+A 2D elliptical Gaussian function whose principal axis may be rotated with respect to the X and Y coordinate axes,
+defined by seven parameters. Its model ID is ``GAUSS_2D_ROTATED`` and it is implemented in gauss_2d_rotated.cuh_.
+Here, p is the vector of parameters (p0..p6) and the model function g exists for each x,y coordinate of the input data.
+
+.. math::
+
+    g(x,y,\vec{p})=p_0 e^{-\frac{1}{2}\left(\frac{\left((x-p_1)\cos{p_6}-(y-p_2)\sin{p_6}\right)^2}{p_3^2}+\frac{\left((x-p_1)\sin{p_6}+(y-p_2)\cos{p_6}\right)^2}{p_4^2}\right)}+p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+    No independent variables are passed to this model function.
+    Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+    For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+    2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+	
+:`p_1`: center coordinate x
+	
+:`p_2`: center coordinate y
+	
+:`p_3`: width x (standard deviation)
+	
+:`p_4`: width y (standard deviation)
+	
+:`p_5`: offset
+
+:`p_6`: rotation angle [radians]
+
+
+.. _cauchy-2d-elliptic:
+
+2D Cauchy function (elliptical)
++++++++++++++++++++++++++++++++
+
+A 2D elliptical Cauchy function defined by six parameters. Its model ID is ``CAUCHY_2D_ELLIPTIC`` and it is implemented
+in cauchy_2d_elliptic.cuh_. Here, p is the vector of parameters (p0..p5) and the model function g exists for each x,y
+coordinate of the input data.
+
+.. math::
+
+    g(x,y,\vec{p})=p_0 \frac{1}{\left(\frac{x-p_1}{p_3}\right)^2+1} \frac{1}{\left(\frac{y-p_2}{p_4}\right)^2+1} + p_5
+
+:`x,y`: (independent variables) *X,Y* coordinates
+
+    No independent variables are passed to this model function.
+    Hence, the *(X,Y)* coordinates of the first data value are assumed to be (:math:`0.0, 0.0`).
+    For a fit size of *M x N* data points, the *(X,Y)* coordinates of the data are simply the corresponding
+    2D array indices of the data array, starting from zero.
+
+:`p_0`: amplitude
+	
+:`p_1`: center coordinate x
+	
+:`p_2`: center coordinate y
+	
+:`p_3`: width x (half width at half maximum)
+	
+:`p_4`: width y (half width at half maximum)
+	
+:`p_5`: offset
+
diff --git a/docs/gpufit_api.rst b/docs/gpufit_api.rst
new file mode 100644
index 0000000..ce6695d
--- /dev/null
+++ b/docs/gpufit_api.rst
@@ -0,0 +1,377 @@
+.. _api-description:
+
+======================
+Gpufit API description
+======================
+
+The Gpufit source code compiles to a dynamic-link library (DLL), providing a C interface.  
+In the sections below, the C interface and its arguments are described in detail.
+
+.. _c-interface:
+
+C Interface
+-----------
+
+The C interface is defined in the Gpufit header file: gpufit.h_.
+
+gpufit()
+++++++++
+
+This is the main fit function.  A single call to the *gpufit()* function executes a block of *N* fits.  
+The inputs to *gpufit()* are scalars and pointers to arrays, and the outputs are also array pointers.
+
+The inputs to the *gpufit()* function are:
+
+- the number of fits (*N*),
+- the number of data points per fit (each fit has equal size),
+- the fit data,
+- an array of weight values that are used to weight the individual data points in the fit (optional),
+- an ID number which specifies the fit model function,
+- an array of initial parameters for the model functions,
+- a tolerance value which determines when the fit has converged,
+- the maximum number of iterations per fit,
+- an array of flags which allow one or more fit parameters to be held constant,
+- an ID number which specifies the fit estimator (e.g. least squares, etc.),
+- the size of the user info data,
+- the user info data, which may have multiple uses, for example to pass additional parameters to the fit functions,
+  or to include independent variables (e.g. X values) with the fit data.
+
+The outputs of *gpufit()* are:
+
+- the best fit model parameters for each fit,
+- an array of flags indicating, for example, whether each fit converged,
+- the final value of :math:`\chi^2` for each fit,
+- the number of iterations needed for each fit to converge.
+
+The *gpufit()* function call is defined below.
+
+.. code-block:: cpp
+
+    int gpufit
+    (
+        size_t n_fits,
+        size_t n_points,
+        float * data,
+        float * weights,
+        int model_id,
+        float * initial_parameters,
+        float tolerance,
+        int max_n_iterations,
+        int * parameters_to_fit,
+        int estimator_id,
+        size_t user_info_size,
+        char * user_info,
+        float * output_parameters,
+        int * output_states,
+        float * output_chi_squares,
+        int * output_n_iterations
+    ) ;
+
+.. _api-input-parameters:
+
+Description of input parameters
+...............................
+
+:n_fits: Number of fits to be performed
+
+    :type: size_t
+
+:n_points: Number of data points per fit
+
+    Gpufit is designed such that every fit in a call must have the same number of data points.
+
+    :type: size_t
+
+:data: Pointer to data values
+
+    A pointer to the data values.  The data must be passed in as a 1D array of floating point values, with the data
+    for each fit concatenated one after another.  In the case of multi-dimensional data, the data must be flattened
+    to a 1D array.  The number of elements in the array is equal to the product n_fits * n_points.
+
+    :type: float *
+    :length: n_points * n_fits
+
+:weights: Pointer to weights
+
+    The weights array includes unique weighting values for each fit. It is used only by the least squares estimator (LSE).
+    The size of the weights array and its organization is identical to that for the data array.
+    For statistical weighting, this parameter should be set equal to the inverse of the variance of the data
+    (i.e. weights = 1.0 / variance ).  The weights array is an optional input.
+
+    :type: float *
+    :length: n_points * n_fits
+    :special: Use a NULL pointer to indicate that no weights are provided.  In this case all data values will be weighted equally.
+
+:model_id: Model ID
+
+    Determines the model which is used for all fits in this call. See :ref:`fit-model-functions` for more details.
+
+    As defined in gpufit.h_:
+
+        :0: GAUSS_1D
+        :1: GAUSS_2D
+        :2: GAUSS_2D_ELLIPTIC
+        :3: GAUSS_2D_ROTATED
+        :4: CAUCHY_2D_ELLIPTIC
+        :5: LINEAR_1D
+
+    :type: int
+
+:initial_parameters: Pointer to initial parameter values
+
+    A 1D array containing the initial model parameter values for each fit.  If the number of parameters of the fit model
+    is defined by *n_parameters*, then the size of this array is *n_fits * n_parameters*.
+	
+    The parameter values for each fit are concatenated one after another. If there are *M* parameters per fit,
+    the parameters array is organized as follows: [(parameter 1), (parameter 2), ..., (parameter M), (parameter 1),
+    (parameter 2), ..., (parameter M), ...].
+
+    :type: float *
+    :length: n_fits * n_parameters
+
+:tolerance: Fit tolerance threshold
+
+    The fit tolerance determines when the fit has converged.  After each fit iteration, the change in the absolute value
+    of :math:`\chi^2` is calculated.  The fit has converged when one of two conditions is met.  First, if the change
+    in the absolute value of :math:`\chi^2` is less than the tolerance value, the fit has converged.
+    Alternatively, if the change in :math:`\chi^2` is less than the product of tolerance and the absolute value of
+    :math:`\chi^2` [tolerance * abs(:math:`\chi^2`)], then the fit has converged.
+	
+    Setting a lower value for the tolerance results in more precise values for the fit parameters, but requires more fit
+    iterations to reach convergence.
+	
+    A typical value for the tolerance settings is between 1.0E-3 and 1.0E-6.
+
+    :type: float
+
+:max_n_iterations: Maximum number of iterations
+
+    The maximum number of fit iterations permitted.  If the fit has not converged after this number of iterations,
+    the fit returns with a status value indicating that the maximum number of iterations was reached.
+
+    :type: int
+
+:parameters_to_fit: Pointer to array indicating which model parameters should be held constant during the fit
+
+    This is an array of ones or zeros, with a length equal to the number of parameters of the fit model function.
+    Each entry in the array is a flag which determines whether or not the corresponding model parameter will be held
+    constant during the fit.  To allow a parameter to vary during the fit, set the entry in *parameters_to_fit* equal
+    to one.  To hold the value constant, set the entry to zero.
+	
+    An array of ones, e.g. [1,1,1,1,1,...] will allow all parameters to vary during the fit.
+
+    :type: int *
+    :length: n_parameters
+
+:estimator_id: Estimator ID
+
+    Determines the fit estimator which is used. See :ref:`estimator-functions` for more details.
+
+    As defined in gpufit.h_:
+
+        :0: LSE
+        :1: MLE
+
+    :type: int
+
+:user_info_size: Size of user information data
+
+    Size of the user information data array, in bytes.
+
+    :type: size_t
+
+:user_info: Pointer to user information data
+
+    This parameter is intended to provide flexibility to the Gpufit interface.  The user information data is a generic
+    block of memory which is passed in to the *gpufit()* function, and which is accessible in shared GPU memory by the
+    fit model functions.  Possible uses for the user information data are to pass in values for independent variables
+    (e.g. X values) or to supply additional data to the fit model function.  For a coded example which makes use of
+    the user information data, see :ref:`linear-regression-example`. The user information data is an optional parameter;
+    if no user information is required, this parameter may be set to NULL.
+
+    :type: char *
+    :length: user_info_size
+    :special: Use a NULL pointer to indicate that no user information is available.
+
+.. _api-output-parameters:
+
+Description of output parameters
+................................
+
+:output_parameters: Pointer to array of best-fit model parameters
+
+    For each fit, this array contains the best-fit model parameters.  The array is organized identically to the input
+    parameters array.
+
+    :type: float *
+    :length: n_fits * n_parameters
+
+:output_states: Pointer to array of fit result state IDs
+
+    For each fit the result of the fit is indicated by a state ID.  The state ID codes are defined below.
+    A state ID of 0 indicates that the fit converged successfully.
+
+    As defined in gpufit.h_:
+
+        :0: The fit converged, tolerance is satisfied, the maximum number of iterations is not exceeded
+        :1: Maximum number of iterations exceeded
+        :2: During the Gauss-Jordan elimination the Hessian matrix is indicated as singular
+        :3: Non-positive curve values have been detected while using MLE (MLE requires only positive curve values)
+        :4: State not read from GPU Memory
+
+    :type: int *
+    :length: n_fits
+
+:output_chi_squares: Pointer to array of :math:`\chi^2` values
+
+    For each fit, this array contains the final :math:`\chi^2` value.
+
+    :type: float *
+    :length: n_fits
+
+:output_n_iterations: Pointer to array of iteration counts
+
+    For each fit, this array contains the number of fit iterations which were performed. 
+
+    :type: int *
+    :length: n_fits
+
+:return value: Status code
+
+    The return value of the function call indicates whether an error occurred.
+
+    :0: No error
+    :-1: Error
+
+gpufit_portable_interface()
++++++++++++++++++++++++++++
+
+This function is a simple wrapper around the *gpufit()* function, providing an alternative means of passing the function parameters.
+
+.. code-block:: cpp
+
+    int gpufit_portable_interface(int argc, void *argv[]);
+
+Description of parameters
+.........................
+
+:argc: The length of the argv pointer array
+
+:argv: Array of pointers to *gpufit* parameters, as defined above.  For reference, the type of each element of the *argv* array is listed below.
+
+	:argv[0]: Number of fits
+	
+		:type: size_t *
+		
+	:argv[1]: Number of points per fit
+	
+		:type: size_t *	
+		
+	:argv[2]: Fit data
+	
+		:type: float *	
+		
+	:argv[3]: Fit weights
+	
+		:type: float *	
+		
+	:argv[4]: Fit model ID
+	
+		:type: int *	
+		
+	:argv[5]: Initial parameters
+	
+		:type: float *	
+		
+	:argv[6]: Fit tolerance
+	
+		:type: float *	
+		
+	:argv[7]: Maximum number of iterations
+	
+		:type: int *	
+		
+	:argv[8]: Parameters to fit
+	
+		:type: int *	
+		
+	:argv[9]: Fit estimator ID
+	
+		:type: int *	
+		
+	:argv[10]: User info size
+	
+		:type: size_t *	
+		
+	:argv[11]: User info data
+	
+		:type: char *	
+		
+	:argv[12]: Output parameters
+	
+		:type: float *	
+		
+	:argv[13]: Output states
+	
+		:type: int *	
+		
+	:argv[14]: Output :math:`\chi^2` values
+	
+		:type: float *	
+		
+	:argv[15]: Output number of iterations
+	
+		:type: int *	
+	
+
+:return value: This function simply returns the *gpufit()* return status code.
+
+gpufit_get_last_error()
++++++++++++++++++++++++
+
+A function that returns a string representation of the last error.
+
+.. code-block:: cpp
+
+    char const * gpufit_get_last_error();
+
+:return value: Error message corresponding to the most recent error, or an empty string if no error occurred.
+
+    'CUDA driver version is insufficient for CUDA runtime version'
+        The graphics driver version installed on the computer is not supported by the CUDA Toolkit version which was used
+        to build Gpufit.dll.  Update the graphics driver or re-build Gpufit using a compatible CUDA Toolkit version.
+
+gpufit_cuda_available()
++++++++++++++++++++++++
+
+A function that calls a simple CUDA function to check if CUDA is available.
+
+.. code-block:: cpp
+
+    int gpufit_cuda_available();
+
+:return value: Returns 0 if CUDA is not available (no suitable device found, or driver version insufficient).
+               Use the function *gpufit_get_last_error()* to check the error message. Returns 1 if CUDA is available and CUDA runtime version and driver version are compatible.
+               
+gpufit_get_cuda_version()
++++++++++++++++++++++++++
+
+A function that returns the CUDA runtime version in *runtime_version* and the
+installed CUDA driver version in *driver_version*.
+
+.. code-block:: cpp
+
+    int gpufit_get_cuda_version(int * runtime_version, int * driver_version);
+
+:runtime_version: Pointer to the CUDA runtime version number (is 0 if the CUDA runtime version is incompatible with the installed CUDA driver version)
+        
+
+:driver_version: Pointer to the CUDA driver version number (is 0 if no CUDA enabled graphics card was detected)
+
+:return value: Returns 0 if an error occurred during collecting of the version information. Use the function
+               *gpufit_get_last_error()* to check the error message. Returns 1 if collecting of the version
+               information was successful.
+
+
+
+
diff --git a/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png
new file mode 100644
index 0000000..8617237
Binary files /dev/null and b/docs/images/GPUFIT_CPUFIT_Performance_Comparison.png differ
diff --git a/docs/images/GPUfit_PassmarkG3D_relative_performance.png b/docs/images/GPUfit_PassmarkG3D_relative_performance.png
new file mode 100644
index 0000000..8f2e17e
Binary files /dev/null and b/docs/images/GPUfit_PassmarkG3D_relative_performance.png differ
diff --git a/docs/images/algorithm_gpufit_flowchart.png b/docs/images/algorithm_gpufit_flowchart.png
new file mode 100644
index 0000000..b95d7cb
Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.png differ
diff --git a/docs/images/algorithm_gpufit_flowchart.vsdx b/docs/images/algorithm_gpufit_flowchart.vsdx
new file mode 100644
index 0000000..1b6bddb
Binary files /dev/null and b/docs/images/algorithm_gpufit_flowchart.vsdx differ
diff --git a/docs/images/gpufit_program_flow_skeleton_v2.png b/docs/images/gpufit_program_flow_skeleton_v2.png
new file mode 100644
index 0000000..d454681
Binary files /dev/null and b/docs/images/gpufit_program_flow_skeleton_v2.png differ
diff --git a/docs/images/gpufit_program_flow_v2.png b/docs/images/gpufit_program_flow_v2.png
new file mode 100644
index 0000000..8ead94a
Binary files /dev/null and b/docs/images/gpufit_program_flow_v2.png differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..6f89dc0
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,22 @@
+.. Gpufit documentation master file
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Gpufit Documentation
+====================
+
+.. toctree::
+   :maxdepth: 3
+
+   introduction
+   installation
+   gpufit_api
+   fit_model_functions
+   fit_estimator_functions
+   examples
+   customization
+   bindings
+   appendix
+   license
+   
+
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..8af76ba
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,220 @@
+.. _installation-and-testing:
+
+========================
+Installation and Testing
+========================
+
+The Gpufit library can be used in several ways. When using a pre-compiled
+binary version of Gpufit, the Gpufit functions may be accessed directly via 
+a dynamic linked library (e.g. Gpufit.dll) or via the external bindings to 
+Gpufit (e.g. the Matlab or Python bindings). For more information on the
+Gpufit interface, see :ref:`api-description`, or for details of the external
+bindings see :ref:`external-bindings`.
+
+This section describes how to compile Gpufit, including generating its 
+external bindings, from source code. Building from source is necessary when
+a fit model function is added or changed, or if a new fit estimator is required.  
+Building the library may also be useful for compiling the code using a 
+specific version of the CUDA toolkit, or for a particular CUDA compute 
+capability. 
+
+Gpufit binary distribution
+++++++++++++++++++++++++++
+
+A binary distribution of the Gpufit library is available for **Windows**.
+Use of this distribution requires only a CUDA-capable graphics card, and an
+updated Nvidia graphics driver. The binary package contains:
+
+- The Gpufit SDK, which consists of the 32-bit and 64-bit DLL files, and 
+  the Gpufit header file which contains the function definitions.  The Gpufit
+  SDK is intended to be used when calling Gpufit from an external application
+  written in e.g. C code.
+- The performance test application, which serves to test that Gpufit is 
+  correctly installed, and to check the performance of the CPU and GPU hardware.
+- Matlab 32 bit and 64 bit bindings, with Matlab examples.
+- Python version 2.x and version 3.x bindings (compiled as wheel files) and
+  Python examples.
+- This manual in PDF format.
+
+To re-build the binary distribution, see the instructions located in 
+package/README.md.
+
+Building from source code
++++++++++++++++++++++++++
+
+This section describes how to build Gpufit from source code.  Note that as of
+the initial release of Gpufit, the source code has been tested only with the 
+Microsoft Visual Studio compiler.
+
+Prerequisites
+-------------
+
+The following tools are required in order to build Gpufit from source.
+
+*Required*
+
+* CMake_ 3.7 or later
+* A C/C++ Compiler
+
+  * Linux: GCC 4.7
+  * Windows: Visual Studio 2013 or 2015
+
+* CUDA_ Toolkit 6.5 or later [#]_
+
+.. [#] Note that it is recommended to use the newest available stable release of the CUDA Toolkit which is compatible
+    with the compiler (e.g. Visual Studio 2015 is required in order to use CUDA Toolkit 8.0). Some older graphics cards
+    may only be supported by CUDA Toolkit version 6.5 or earlier. Also, when using CUDA Toolkit version 6.5, please use
+    the version with support for GTX9xx GPUs, available `here <https://developer.nvidia.com/cuda-downloads-geforce-gtx9xx>`__.
+
+*Optional*
+
+* Boost_ 1.58 or later (required if you want to build the tests)
+* MATLAB_ if building the MATLAB bindings (minimum version Matlab 2012a)
+* Python_ if building the Python bindings (Python version 2.x or 3.x)
+
+Source code availability
+------------------------
+
+The source code is available in an open repository hosted at Github, at the 
+following URL.
+
+.. code-block:: bash
+
+    https://github.com/gpufit/Gpufit.git
+
+To obtain the code, Git may be used to clone the repository, or a current 
+snapshot may be downloaded directly from Github as Gpufit-master.zip_.
+
+Compiler configuration via CMake
+--------------------------------
+
+CMake is an open-source tool designed to build, test, and package software. 
+It is used to control the software compilation process using compiler 
+independent configuration files, and generate native makefiles and workspaces 
+that can be used in the compiler environment.  In this section we provide a
+simple example of how to use CMake in order to generate the input files for the
+compiler (e.g. the Visual Studio solution file), which can then be used to 
+compile Gpufit.
+
+First, identify the directory which contains the Gpufit source code 
+(for example, on a Windows computer the Gpufit source code may be stored in 
+*C:\\Sources\\Gpufit*).  Next, create a build directory outside the 
+source code directory (e.g. *C:\\Sources\\Gpufit-build-64*). Finally, 
+run cmake to configure and generate the compiler input files.  The following
+commands, executed from the command prompt, assume that the cmake executable
+(e.g. *C:\\Program Files\\CMake\\bin\\cmake.exe*) is automatically found 
+via the PATH environment variable (if not, the full path to cmake.exe must be
+specified).  This example also assumes that the source and build directories 
+have been set up as specified above.
+
+.. code-block:: bash
+
+    cd C:\Sources\Gpufit-build-64
+    cmake -G "Visual Studio 12 2013 Win64" C:\Sources\Gpufit
+
+Note that in this example the *-G* flag has been used to specify the 
+64-bit version of the Visual Studio 12 compiler.  This flag should be changed
+depending on the compiler used, and the desired architecture 
+(e.g. 32- or 64-bit).  Further details of the CMake command line arguments 
+can be found `here <https://cmake.org/cmake/help/latest/manual/cmake.1.html>`__.
+
+There is also a graphical user interface available for CMake, which simplifies
+the configuration and generation steps.  For further details, see  
+`Running CMake <https://cmake.org/runningcmake/>`_.
+
+Common issues encountered during CMake configuration
+----------------------------------------------------
+
+**Boost NOT found - skipping tests!**
+
+If you want to build the tests and Boost is not found automatically, set the 
+CMake variable BOOST_ROOT to the corresponding directory, and configure again.
+
+**Specify CUDA_ARCHITECTURES set**
+
+If you need a specific CUDA architecture, set CUDA_ARCHITECTURES according 
+to CUDA_SELECT_NVCC_ARCH_FLAGS_.
+
+**CMake finds lowest installed CUDA version by default**
+
+If there are multiple CUDA toolkits installed on the computer, CMake 3.7.1 
+seems to find by default the lowest installed version. Set the desired CUDA 
+version manually (e.g. by editing the CUDA_TOOLKIT_ROOT_DIR variable in CMake).
+
+**Specify CUDA version to use**
+
+Set CUDA_BIN_PATH before running CMake or CUDA_TOOLKIT_ROOT_DIR after 
+first CMake configuration to the installation folder of the desired 
+CUDA version.
+
+**Required CUDA version**
+
+When using Microsoft Visual Studio 2015, the minimum required CUDA Toolkit 
+version is 8.0.
+
+**Python launcher**
+
+Set Python_WORKING_DIRECTORY to a valid directory, it will be added to the 
+Python path.
+
+**Matlab launcher**
+
+Set Matlab_WORKING_DIRECTORY to a valid directory, it will be added to 
+the Matlab path.
+
+Compiling Gpufit on Windows
+---------------------------
+
+After configuring and generating the solution files using CMake, go to the 
+desired build directory and open Gpufit.sln using Visual Studio.  Select the 
+"Debug" or "Release" build options, as appropriate.  Select the build target 
+"ALL_BUILD", and build this target.  If the build process completes
+without errors, the Gpufit binary files will be created in the corresponding 
+"Debug" or "Release" folders in the build directory.
+
+The unit tests can be executed by building the target "RUN_TESTS" or by 
+starting the created executables in the output directory from
+the command line.  
+
+Linux
+-----
+
+Gpufit has not yet been officially tested on a computer running a Linux variant 
+with a CUDA capable graphics card.  However, satisfying the Prerequisites_ and 
+using CMake, we estimate that the library should build in principle and one
+should also be able to run the examples on Linux.
+
+MacOS
+-----
+
+Gpufit has not yet been officially tested on a computer running MacOS with a 
+CUDA capable graphics card.  However, satisfying the Prerequisites_ and using 
+CMake, we estimate that the library should build in principle and one
+should also be able to run the examples on MacOS.
+
+Running the performance test
+++++++++++++++++++++++++++++
+
+The Gpufit performance test is a program which verifies the correct function
+of Gpufit, and tests the fitting speed in comparison with the same algorithm
+executed on the CPU.
+
+If Gpufit was built from source, running the build target 
+Gpufit_Cpufit_Performance_Comparison will run the test, which executes the 
+fitting process multiple times, varying the number of fits per function call.
+The execution time is measured in each case and the relative speed improvement 
+between the GPU and the CPU is calculated.  A successful run of the performance
+test also indicates that Gpufit is functioning correctly.
+
+The performance comparison is also included in the Gpufit binary distribution
+as a console application.  An example of the program's output is
+shown in :numref:`installation-gpufit-cpufit-performance-comparison`.
+
+.. _installation-gpufit-cpufit-performance-comparison:
+
+.. figure:: /images/GPUFIT_CPUFIT_Performance_Comparison.png
+   :width: 10 cm
+   :align: center
+
+   Output of the GPUFIT vs CPUFIT performance comparison
+
diff --git a/docs/introduction.rst b/docs/introduction.rst
new file mode 100644
index 0000000..2a6fc1f
--- /dev/null
+++ b/docs/introduction.rst
@@ -0,0 +1,87 @@
+============
+Introduction
+============
+
+Gpufit is a GPU-accelerated CUDA implementation of the Levenberg-Marquardt 
+algorithm. It was developed to meet the need for a high-performance,
+general-purpose nonlinear curve fitting software library which is publicly available
+and open source.
+
+Optimization algorithms are ubiquitous tools employed in many fields of science 
+and technology. One such algorithm for numerical, non-linear optimization is the 
+Levenberg-Marquardt algorithm (LMA).  The LMA combines elements of the method of 
+steepest descent and Newton's method, and has become a standard algorithm for 
+least-squares fitting.
+
+Although the LMA is, in itself, an efficient optimization algorithm, 
+applications requiring many iterations of this procedure may encounter 
+limitations due to the sheer number of calculations involved.  The time required 
+for the convergence of a fit, or a set of fits, can determine an application's 
+feasibility, e.g. in the context of real-time data processing and feedback 
+systems.  Alternatively, in the case of very large datasets, the time required 
+to solve a particular optimization problem may prove impractical.
+
+In recent years, advanced graphics processing units (GPUs) and the development 
+of general purpose GPU programming have enabled fast and parallelized computing 
+by shifting calculations from the CPU to the GPU.  The large number of 
+independent computing units available on a modern GPU enables the rapid 
+execution of many instructions in parallel, with an overall computation power 
+far exceeding that of a CPU.  Languages such as CUDA C and OpenCL allow
+GPU-based programs to be developed in a manner similar to conventional software, but 
+with an inherently parallelized structure.  These developments have led to the 
+creation of new GPU-accelerated tools, such as Gpufit.
+
+This manual describes how to install and build the Gpufit library and its 
+external bindings. Furthermore it details how to extend Gpufit by adding 
+custom model functions as well as custom fit estimator functions.
+
+The documentation includes:
+
+- Instructions for building and installing Gpufit
+- A detailed description of the C interface
+- A description of the built-in model functions
+- A description of the built-in goodness-of-fit estimator functions
+- A detailed description of the external bindings to Matlab and Python
+- Usage examples for C, Matlab, and Python
+- Instructions for adding custom model functions or custom estimator functions
+
+The current version of the Gpufit library is |GF_version| 
+(`see homepage <http://github.com/gpufit/Gpufit>`_). This manual was compiled 
+on |today|.
+
+Hardware requirements
+---------------------
+
+Because the fit algorithm is implemented in CUDA C, a CUDA_-compatible graphics
+card is required to run Gpufit.  The minimum supported compute capability is 
+2.0.  More advanced GPU hardware will result in higher fitting performance.
+
+Software requirements
+---------------------
+
+In addition to a compatible GPU, the graphics card driver installed on the 
+host computer must be compatible with the version of the CUDA toolkit which 
+was used to compile Gpufit.  This may present an issue for older graphics 
+cards or for computers running outdated graphics drivers.
+
+At the time of its initial release, Gpufit was compiled with CUDA toolkit 
+version 8.0.  Therefore, the Nvidia graphics driver installed on the host PC 
+must be at least version 367.48 (released July 2016) in order to be compatible
+with the binary files generated in this build.
+
+When compatibility issues arise, there are two possible solutions.  The best 
+option is to update the graphics driver to a version which is compatible with
+the CUDA toolkit used to build Gpufit.  The second option is to re-compile 
+Gpufit from source code, using an earlier version of the CUDA toolkit which is 
+compatible with the graphics driver in question.  However, this solution is 
+likely to result in slower performance of the Gpufit code, since older versions 
+of the CUDA toolkit are not as efficient.
+
+Note that all CUDA-supported graphics cards should be compatible with 
+CUDA toolkit version 6.5.  This is the last version of CUDA which supported 
+GPUs with compute capability 1.x.  In other words, an updated Nvidia graphics
+driver should be available for all CUDA-enabled GPUs which is compatible with
+toolkit version 6.5.  
+
+If you are unsure if your graphics card is CUDA-compatible, a list of
+CUDA-supported GPUs can be found `here <http://developer.nvidia.com/cuda-gpus>`_.
diff --git a/docs/license.rst b/docs/license.rst
new file mode 100644
index 0000000..1223cbc
--- /dev/null
+++ b/docs/license.rst
@@ -0,0 +1,25 @@
+=======================
+Gpufit software license
+=======================
+
+MIT License
+
+Copyright (c) 2017 Mark Bates, Adrian Przybylski, Björn Thiel, and Jan Keller-Findeisen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..6f53cb2
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+REM Usage: make TARGET   (run without a target to list the available ones)
+
+REM Use the sphinx-build command found on PATH unless the caller preset SPHINXBUILD.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+REM All output is written below BUILDDIR; ALLSPHINXOPTS additionally points
+REM Sphinx at a doctrees directory inside it, I18NSPHINXOPTS does not.
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+REM An optional PAPER value is forwarded to Sphinx as the latex_paper_size setting.
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+REM No target given: jump straight to the help text below.
+if "%1" == "" goto help
+
+REM help: list the supported targets. The help label sits inside this block so
+REM that both an explicit "help" argument and the jump above print the same text.
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  epub3      to make an epub3
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	echo.  dummy      to check syntax errors of document sources
+	goto end
+)
+
+REM clean: remove every subdirectory and every file below the build directory.
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+REM (9009 is the errorlevel cmd reports for a command that could not be found)
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+REM Fallback: invoke Sphinx through the Python interpreter instead of the
+REM sphinx-build command; give up with exit code 1 if that is not found either.
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+:sphinx_ok
+
+
+REM Builder targets. Each block below runs sphinx-build with the matching -b
+REM builder, leaves the script with exit code 1 if the build fails, and
+REM otherwise prints where the output was written before jumping to the end label.
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+REM The trailing caret continues the echo text on the next line.
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+REM qthelp: build the Qt help project and print the follow-up commands.
+REM qcollectiongenerator turns the .qhcp project into a .qhc collection file,
+REM which is what "assistant -collectionFile" expects.
+REM NOTE(review): the RTDSpielwiese file names look inherited from another
+REM project; confirm them against the qthelp base name used by conf.py.
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\RTDSpielwiese.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\RTDSpielwiese.qhc
+	goto end
+)
+
+REM devhelp, epub, epub3 and latex: run the matching builder, exit with code 1
+REM on failure, otherwise report the result and jump to the end label.
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+
+if "%1" == "epub3" (
+	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+REM latexpdf / latexpdfja: generate the LaTeX sources, then run make inside
+REM the latex output directory to produce the PDF and return to the directory
+REM of this script. As for the other targets, stop with exit code 1 when
+REM sphinx-build fails so the PDF step never runs on missing or stale sources.
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+REM Remaining builder targets: run the matching builder, exit with code 1 on
+REM failure, otherwise report the result and jump to the end label.
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+
+REM gettext is the only target that uses I18NSPHINXOPTS; its output goes to
+REM the locale directory rather than a directory named after the builder.
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+
+if "%1" == "dummy" (
+	%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. Dummy builder generates no files.
+	goto end
+)
+
+REM Common exit point; an unrecognised target falls through to here without output.
+:end
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..b8c2751
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Applications
+# add_example( modules name ): build <name>.cpp together with the shared Tests/utils sources into an
+# executable target <name> and link it against <modules>, a ;-separated list quoted as one argument.
+function( add_example modules name )
+  set( target ${name} )
+  add_executable( ${target} ${name}.cpp
+    ${PROJECT_SOURCE_DIR}/Tests/utils.h
+    ${PROJECT_SOURCE_DIR}/Tests/utils.cpp
+  )
+  target_include_directories( ${target} PRIVATE ${PROJECT_SOURCE_DIR} )
+  target_link_libraries( ${target} PRIVATE ${modules} )
+  set_property( TARGET ${target}
+    PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" )
+  set_property( TARGET ${target} PROPERTY FOLDER GpufitCpufitExamples )
+#  install( TARGETS ${target} RUNTIME DESTINATION bin )
+endfunction()
+
+add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Performance_Comparison )
+
+add_example( "Cpufit;Gpufit" Gpufit_Cpufit_Nvidia_Profiler_Test )
diff --git a/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp
new file mode 100644
index 0000000..41f72e2
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Nvidia_Profiler_Test.cpp
@@ -0,0 +1,340 @@
+/*
+ * Runs 100k fits on the CPU and 2m fits on the GPU, used with the Nvidia profiler to obtain
+ * running time information on the different CUDA kernels.
+ */
+
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include <stdexcept>
+#include <array>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <numeric>
+#include <chrono>
+#include <string>
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+
+/*
+	Names of parameters for the 2D Gaussian peak model (field order = order in the flat parameter arrays)
+*/
+struct Parameters
+{
+	float amplitude;   // height of the peak above the background
+	float center_x;    // x position of the peak center (in grid indices)
+	float center_y;    // y position of the peak center (in grid indices)
+	float width;       // width of the Gaussian, shared by x and y
+	float background;  // constant offset added to every data point
+};
+
+/*
+Reports one fitting run on std::cout: run time, speed in fits/second, the value of
+calculate_standard_deviation for the x centers ("x precision") and the mean iteration count.
+*/
+void print_result(
+    std::string const name,
+    std::vector<float> const & estimated_parameters,
+    std::vector<Parameters> const & test_parameters,
+    std::vector<int> states,
+    std::vector<int> const & n_iterations,
+    std::size_t const n_fits,
+    std::size_t const n_parameters,
+    std::chrono::milliseconds::rep const duration_in_ms)
+{
+    // the x center is stored at offset 1 of every fit's parameter tuple
+    std::size_t const x_offset = 1;
+
+    // collect fitted and true x centers side by side
+    std::vector<float> fitted_x;
+    std::vector<float> true_x;
+    fitted_x.reserve(n_fits);
+    true_x.reserve(n_fits);
+    for (std::size_t fit = 0; fit != n_fits; ++fit)
+    {
+        fitted_x.push_back(estimated_parameters[fit * n_parameters + x_offset]);
+        true_x.push_back(test_parameters[fit].center_x);
+    }
+
+    // statistics helpers are not defined in this file (presumably Tests/utils.h)
+    double const x_spread = calculate_standard_deviation(fitted_x, true_x, states);
+    double const iterations_avg = calculate_mean(n_iterations, states);
+
+    // timing: milliseconds -> seconds and fits per second
+    double const seconds = duration_in_ms / 1000.0;
+    double const speed = static_cast<double>(n_fits) / duration_in_ms * 1000;
+
+    // first line: name, run time and speed
+    std::cout << std::fixed << std::setw(5) << std::endl << "***" << name << "***"
+        << std::setprecision(3) << std::setw(12) << seconds << " s  "
+        << std::setprecision(2) << std::setw(12) << speed << " fits/s" << std::endl;
+
+    // second line: x precision and mean number of iterations
+    std::cout << std::setprecision(6) << "x precision: " << x_spread << " px  "
+        << std::setprecision(2) << "mean iterations: " << iterations_avg
+        << std::endl;
+}
+
+/*
+Derives the start values for the fits: every true parameter is scaled by its own random
+factor of roughly 0.9 to 1.1 and stored in the flat array parameters_set.
+*/
+void generate_initial_parameters(std::vector<float> & parameters_set, std::vector<Parameters> const & parameters)
+{
+    std::uniform_real_distribution<float> unit_interval(0, 1);
+    float const scale_min = 0.9f;
+    float const scale_span = 0.2f;
+    int const n_parameters = sizeof(Parameters) / sizeof(float);
+    for (std::size_t fit = 0, n_fits = parameters_set.size() / n_parameters; fit != n_fits; ++fit)
+    {
+        Parameters const & truth = parameters[fit];
+        parameters_set[fit * n_parameters + 0] = truth.amplitude * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 1] = truth.center_x * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 2] = truth.center_y * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 3] = truth.width * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 4] = truth.background * (scale_min + scale_span * unit_interval(rng));
+    }
+}
+
+/*
+Fills target with randomized copies of source: every field of every entry is the
+value from source scaled by its own random factor of roughly 0.9 to 1.1.
+*/
+void generate_test_parameters(std::vector<Parameters> & target, Parameters const source)
+{
+    std::uniform_real_distribution<float> unit_interval(0, 1);
+
+    float const scale_min = 0.9f;
+    float const scale_span = 0.2f;
+
+    // one random draw per field, in declaration order
+    for (Parameters & entry : target)
+    {
+        entry.amplitude = source.amplitude * (scale_min + scale_span * unit_interval(rng));
+        entry.center_x = source.center_x * (scale_min + scale_span * unit_interval(rng));
+        entry.center_y = source.center_y * (scale_min + scale_span * unit_interval(rng));
+        entry.width = source.width * (scale_min + scale_span * unit_interval(rng));
+        entry.background = source.background * (scale_min + scale_span * unit_interval(rng));
+    }
+}
+
+/*
+Adds zero-mean Gaussian noise to every element of vec. The noise standard deviation is
+the peak volume averaged over the fit area (both derived from parameters) divided by snr.
+*/
+void add_gauss_noise(std::vector<float> & vec, Parameters const & parameters, float const snr)
+{
+    // M_PI is not standard C++ (MSVC only defines it if _USE_MATH_DEFINES precedes the
+    // first math include), so use a local constant; it rounds to the same float value
+    float const pi = 3.14159265358979323846f;
+
+    float const gauss_fwtm = 4.292f * parameters.width; //only valid for circular gaussian
+    float const fit_area = gauss_fwtm*gauss_fwtm;
+    float const mean_amplitude = 2.f * pi * parameters.amplitude * parameters.width * parameters.width / fit_area;
+    float const std_dev = mean_amplitude / snr; // snr is expected to be > 0
+    std::normal_distribution<float> distribution(0.0, std_dev);
+
+    for (float & value : vec)
+        value += distribution(rng);
+}
+
+/*
+Fills data with n_fits noise-free 2D Gaussian peaks of n_points values each (row-major
+square grid with side floor(sqrt(n_points))), using parameters[i] for fit i.
+Prints a 50 character wide progress bar to std::cout while generating.
+*/
+void generate_gauss2d(
+	std::size_t const n_fits,
+	std::size_t const n_points,
+	std::vector<float> & data,
+	std::vector<Parameters> const & parameters)
+{
+	std::string const ruler(50, '-');
+	std::cout << "generating " << n_fits << " fits ..." << std::endl;
+	std::cout << ruler << std::endl;
+	std::size_t progress = 0;
+
+	// side length of the grid, computed once; using the truncated value as loop bound
+	// keeps all writes inside the fit's n_points values even if n_points is not a square
+	std::size_t const size_x = static_cast<std::size_t>(sqrt(static_cast<double>(n_points)));
+
+	for (std::size_t i = 0; i < n_fits; i++)
+	{
+		float const amplitude = parameters[i].amplitude;
+		float const x00 = parameters[i].center_x;
+		float const y00 = parameters[i].center_y;
+		float const width = parameters[i].width;
+		float const background = parameters[i].background;
+		std::size_t const fit_index = i * n_points;
+
+		for (std::size_t iy = 0; iy < size_x; iy++)
+		{
+			// the y factor is constant along a row
+			float const dy = (static_cast<float>(iy) - y00) / width;
+			float const argy = exp(-0.5f * dy * dy);
+			for (std::size_t ix = 0; ix < size_x; ix++)
+			{
+				float const dx = (static_cast<float>(ix) - x00) / width;
+				float const argx = exp(-0.5f * dx * dx);
+				data[fit_index + iy * size_x + ix] = amplitude * argx * argy + background;
+			}
+		}
+
+		// emit one bar segment every n_fits / 50 fits
+		progress += 1;
+		if (progress >= n_fits / 50)
+		{
+			progress = 0;
+			std::cout << "|";
+		}
+	}
+	std::cout << std::endl;
+	std::cout << ruler << std::endl;
+}
+
+/*
+Runs a fixed number of fits with Cpufit (n_fits_cpu) and with Gpufit (n_fits_gpu) on the
+same generated data and prints timing, fit statistics and the relative speed.
+No weights, Model: Gauss_2D, Estimator: LSE
+*/
+int main(int argc, char * argv[])
+{
+	// check for CUDA availability
+	if (!gpufit_cuda_available())
+	{
+		std::cout << "CUDA not available" << std::endl;
+		return -1;
+	}
+
+	// number of fits per library and size of one fit (size_x * size_x data points)
+	std::size_t const n_fits_gpu = 2000000;
+    std::size_t const n_fits_cpu = 100000;
+	std::size_t const size_x = 15;
+	std::size_t const n_points = size_x * size_x;
+
+	// fit parameters constant for every run
+	std::size_t const n_parameters = 5;
+	std::vector<int> parameters_to_fit(n_parameters, 1); // every entry is 1 (presumably: fit all parameters - confirm)
+	float const tolerance = 0.0001f;
+	int const max_n_iterations = 10;
+
+	// initial parameters
+	Parameters true_parameters;
+	true_parameters.amplitude = 500.f;
+	true_parameters.center_x = static_cast<float>(size_x) / 2.f - 0.5f; // middle of the grid (indices run from 0 to size_x - 1)
+	true_parameters.center_y = static_cast<float>(size_x) / 2.f - 0.5f;
+	true_parameters.width = 2.f;
+	true_parameters.background = 10.f;
+
+	//  test parameters
+	std::cout << "generate test parameters" << std::endl;
+	std::vector<Parameters> test_parameters(n_fits_gpu);
+	generate_test_parameters(test_parameters, true_parameters);
+
+	//  test data
+	std::vector<float> data(n_fits_gpu * n_points);
+	generate_gauss2d(n_fits_gpu, n_points, data, test_parameters);
+	std::cout << "add noise" << std::endl;
+	add_gauss_noise(data, true_parameters, 10.f); // last argument is the snr
+
+	// initial parameter set
+	std::vector<float> initial_parameters(n_parameters * n_fits_gpu);
+	generate_initial_parameters(initial_parameters, test_parameters);
+
+	std::cout << std::endl;
+	std::cout << n_fits_cpu << " fits on the CPU" << std::endl;
+
+	// Cpufit output
+	std::vector<float> cpufit_parameters(n_fits_cpu * n_parameters);
+	std::vector<int> cpufit_states(n_fits_cpu);
+	std::vector<float> cpufit_chi_squares(n_fits_cpu);
+	std::vector<int> cpufit_n_iterations(n_fits_cpu);
+
+	// run Cpufit and measure time
+	std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
+	int const cpu_status
+		= cpufit
+		(
+			n_fits_cpu,
+			n_points,
+			data.data(), // buffers are sized for n_fits_gpu fits; only n_fits_cpu are requested here
+			0, // NOTE(review): presumably the weights pointer (no weights) - confirm against cpufit.h
+			GAUSS_2D,
+			initial_parameters.data(),
+			tolerance,
+			max_n_iterations,
+			parameters_to_fit.data(),
+			LSE,
+			0, // NOTE(review): this and the next 0 are presumably user info size / pointer - confirm
+			0,
+			cpufit_parameters.data(),
+			cpufit_states.data(),
+			cpufit_chi_squares.data(),
+			cpufit_n_iterations.data()
+		);
+	std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t0).count();
+
+	if (cpu_status != 0)
+	{
+		// error in cpufit, should actually not happen
+		std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl;
+	}
+	else
+	{
+		// print
+		print_result("Cpufit", cpufit_parameters, test_parameters, cpufit_states, cpufit_n_iterations, n_fits_cpu, n_parameters, dt_cpufit);
+	}
+
+    std::cout << std::endl;
+    std::cout << n_fits_gpu << " fits on the GPU" << std::endl;
+
+	// Gpufit output parameters
+	std::vector<float> gpufit_parameters(n_fits_gpu * n_parameters);
+	std::vector<int> gpufit_states(n_fits_gpu);
+	std::vector<float> gpufit_chi_squares(n_fits_gpu);
+	std::vector<int> gpufit_n_iterations(n_fits_gpu);
+
+	// run Gpufit and measure time
+	t0 = std::chrono::high_resolution_clock::now();
+	int const gpu_status
+		= gpufit
+		(
+			n_fits_gpu,
+			n_points,
+			data.data(),
+			0, // same argument layout as the cpufit call above
+			GAUSS_2D,
+			initial_parameters.data(),
+			tolerance,
+			max_n_iterations,
+			parameters_to_fit.data(),
+			LSE,
+			0,
+			0,
+			gpufit_parameters.data(),
+			gpufit_states.data(),
+			gpufit_chi_squares.data(),
+			gpufit_n_iterations.data()
+		);
+	std::chrono::milliseconds::rep const dt_gpufit = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t0).count();
+
+	if (gpu_status != 0)
+	{
+		// error in gpufit
+		std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl;
+	}
+	else
+	{
+		// print results
+		print_result("Gpufit", gpufit_parameters, test_parameters, gpufit_states, gpufit_n_iterations, n_fits_gpu, n_parameters, dt_gpufit);
+	}
+	// gain = CPU time per fit / GPU time per fit; NOTE(review): printed even if a run failed or dt_gpufit is 0
+    std::cout << "\nPERFORMANCE GAIN Gpufit/Cpufit \t" << std::setw(10) << static_cast<double>(dt_cpufit) / dt_gpufit * n_fits_gpu / n_fits_cpu << std::endl;
+
+	return 0;
+}
\ No newline at end of file
diff --git a/examples/Gpufit_Cpufit_Performance_Comparison.cpp b/examples/Gpufit_Cpufit_Performance_Comparison.cpp
new file mode 100644
index 0000000..b25dd90
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Performance_Comparison.cpp
@@ -0,0 +1,450 @@
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include <stdexcept>
+#include <array>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <numeric>
+#include <chrono>
+#include <string>
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+
+/*
+    Names of parameters for the 2D Gaussian peak model (field order = order in the flat parameter arrays)
+*/
+struct Parameters
+{
+    float amplitude;   // height of the peak above the background
+    float center_x;    // x position of the peak center (in grid indices)
+    float center_y;    // y position of the peak center (in grid indices)
+    float width;       // width of the Gaussian, shared by x and y
+    float background;  // constant offset added to every data point
+};
+
+/*
+    Derives the start values for the fits: every true parameter is scaled by its own
+    random factor of roughly 0.9 to 1.1 and stored in the flat array parameters_set.
+*/
+void generate_initial_parameters(std::vector<float> & parameters_set, std::vector<Parameters> const & parameters)
+{
+    std::uniform_real_distribution<float> unit_interval(0, 1);
+    float const scale_min = 0.9f;
+    float const scale_span = 0.2f;
+    int const n_parameters = sizeof(Parameters) / sizeof(float);
+    for (std::size_t fit = 0, n_fits = parameters_set.size() / n_parameters; fit != n_fits; ++fit)
+    {
+        Parameters const & truth = parameters[fit];
+        parameters_set[fit * n_parameters + 0] = truth.amplitude * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 1] = truth.center_x * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 2] = truth.center_y * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 3] = truth.width * (scale_min + scale_span * unit_interval(rng));
+        parameters_set[fit * n_parameters + 4] = truth.background * (scale_min + scale_span * unit_interval(rng));
+    }
+}
+
+/*
+    Fills target with randomized copies of source (every field scaled by its own
+    random factor of roughly 0.9 to 1.1) and draws a progress bar on std::cout.
+*/
+void generate_test_parameters(std::vector<Parameters> & target,    Parameters const source)
+{
+    // layout of the console output
+    int const text_width = 30;
+    int const progress_width = 25;
+    std::string const ruler(progress_width, '-');
+
+    // random scaling factor = scale_min + scale_span * [0, 1)
+    std::uniform_real_distribution<float> unit_interval(0, 1);
+    float const scale_min = 0.9f;
+    float const scale_span = 0.2f;
+
+    // upper ruler and label
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+    std::cout << std::setw(text_width) << std::left << "Generating test parameters";
+
+    // number of entries per printed bar segment
+    std::size_t const step = target.size() / progress_width;
+    std::size_t since_last_tick = 0;
+
+    for (Parameters & entry : target)
+    {
+        entry.amplitude = source.amplitude * (scale_min + scale_span * unit_interval(rng));
+        entry.center_x = source.center_x * (scale_min + scale_span * unit_interval(rng));
+        entry.center_y = source.center_y * (scale_min + scale_span * unit_interval(rng));
+        entry.width = source.width * (scale_min + scale_span * unit_interval(rng));
+        entry.background = source.background * (scale_min + scale_span * unit_interval(rng));
+
+        ++since_last_tick;
+        if (since_last_tick >= step)
+        {
+            since_last_tick = 0;
+            std::cout << "|";
+        }
+    }
+
+    // lower ruler
+    std::cout << std::endl;
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+    // note: std::left set above stays active on std::cout after returning
+}
+
+/*
+    Adds zero-mean Gaussian noise to every element of vec and draws a progress bar on
+    std::cout. The noise standard deviation is the peak volume averaged over the fit
+    area (both derived from parameters) divided by snr.
+*/
+void add_gauss_noise(std::vector<float> & vec, Parameters const & parameters, float const snr)
+{
+    // M_PI is not standard C++ (MSVC only defines it if _USE_MATH_DEFINES precedes the
+    // first math include), so use a local constant; it rounds to the same float value
+    float const pi = 3.14159265358979323846f;
+
+    float const gauss_fwtm = 4.292f * parameters.width; //only valid for circular gaussian
+    float const fit_area = gauss_fwtm*gauss_fwtm;
+    float const mean_amplitude = 2.f * pi * parameters.amplitude * parameters.width * parameters.width / fit_area;
+    float const std_dev = mean_amplitude / snr; // snr is expected to be > 0
+    std::normal_distribution<float> distribution(0.0, std_dev);
+
+    int const text_width = 30;
+    int const progress_width = 25;
+    std::string const ruler(progress_width, '-');
+
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+    std::cout << std::setw(text_width) << std::left << "Adding noise";
+
+    // values per printed bar segment, computed once instead of in every iteration
+    std::size_t const step = vec.size() / progress_width;
+    std::size_t progress = 0;
+
+    for (float & value : vec)
+    {
+        value += distribution(rng);
+
+        progress += 1;
+        if (progress >= step)
+        {
+            progress = 0;
+            std::cout << "|";
+        }
+    }
+
+    std::cout << std::endl;
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+}
+
+/*
+    Fills data with n_fits noise-free 2D Gaussian peaks of n_points values each
+    (row-major square grid with side floor(sqrt(n_points))), using parameters[i] for
+    fit i, and draws a progress bar on std::cout.
+*/
+void generate_gauss2d(
+    std::size_t const n_fits,
+    std::size_t const n_points,
+    std::vector<float> & data,
+    std::vector<Parameters> const & parameters)
+{
+    int const text_width = 30;
+    int const progress_width = 25;
+    std::string const ruler(progress_width, '-');
+
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+    std::cout << std::setw(text_width) << std::left << "Generating data";
+
+    // side length of the grid, computed once; using the truncated value as loop bound
+    // keeps all writes inside the fit's n_points values even if n_points is not a square
+    std::size_t const size_x = static_cast<std::size_t>(sqrt(static_cast<double>(n_points)));
+
+    std::size_t progress = 0;
+
+    for (std::size_t i = 0; i < n_fits; i++)
+    {
+        float const amplitude = parameters[i].amplitude;
+        float const x00 = parameters[i].center_x;
+        float const y00 = parameters[i].center_y;
+        float const width = parameters[i].width;
+        float const background = parameters[i].background;
+
+        std::size_t const fit_index = i * n_points;
+
+        for (std::size_t iy = 0; iy < size_x; iy++)
+        {
+            // the y factor is constant along a row
+            float const dy = (static_cast<float>(iy) - y00) / width;
+            float const argy = exp(-0.5f * dy * dy);
+
+            for (std::size_t ix = 0; ix < size_x; ix++)
+            {
+                float const dx = (static_cast<float>(ix) - x00) / width;
+                float const argx = exp(-0.5f * dx * dx);
+                data[fit_index + iy * size_x + ix] = amplitude * argx * argy + background;
+            }
+        }
+
+        // emit one bar segment every n_fits / progress_width fits
+        progress += 1;
+        if (progress >= n_fits / progress_width)
+        {
+            progress = 0;
+            std::cout << "|";
+        }
+    }
+    std::cout << std::endl;
+    std::cout << std::setw(text_width) << " " << ruler << std::endl;
+}
+
+/*
+Runs Cpufit and Gpufit for increasing numbers of fits on the same generated data and
+prints the speed of both (fits/s) and their ratio as a table.
+No weights, Model: Gauss_2D, Estimator: LSE
+*/
+int main(int argc, char * argv[])
+{
+    // title 
+    std::cout << "----------------------------------------" << std::endl;
+    std::cout << "Performance comparison Gpufit vs. Cpufit" << std::endl;
+    std::cout << "----------------------------------------" << std::endl << std::endl;
+
+    std::cout << "Please note that execution speed test results depend on" << std::endl;
+    std::cout << "the details of the CPU and GPU hardware." << std::endl;
+    std::cout << std::endl;
+
+
+    // check for CUDA availability
+    int cuda_runtime_version = 0;
+    int cuda_driver_version = 0;
+    bool const version_available = gpufit_get_cuda_version(&cuda_runtime_version, &cuda_driver_version) != 0; // non-zero return is treated as success
+    int const cuda_runtime_major = cuda_runtime_version / 1000; // versions are decoded as 1000 * major + 10 * minor
+    int const cuda_runtime_minor = cuda_runtime_version % 1000 / 10;
+    int const cuda_driver_major = cuda_driver_version / 1000;
+    int const cuda_driver_minor = cuda_driver_version % 1000 / 10;
+
+    bool do_gpufits = false; // only set to true if driver and runtime versions allow running Gpufit
+    if (version_available)
+    {
+        std::cout << "CUDA runtime version: ";
+        std::cout << cuda_runtime_major << "." << cuda_runtime_minor << std::endl;
+        std::cout << "CUDA driver version:  ";
+        std::cout << cuda_driver_major << "." << cuda_driver_minor << std::endl;
+        std::cout << std::endl;
+
+        bool const cuda_available = cuda_driver_version > 0;
+        if (cuda_available)
+        {
+            // the driver must be at least as new as the runtime, and a runtime must have been reported
+            bool const version_compatible
+                = cuda_driver_version >= cuda_runtime_version
+                && cuda_runtime_version > 0;
+            if (version_compatible)
+            {
+                do_gpufits = true;
+            }
+            else
+            {
+                std::cout << "The CUDA runtime version is not compatible with the" << std::endl;
+                std::cout << "current graphics driver. Please update the driver, or" << std::endl;
+                std::cout << "re - build Gpufit from source using a compatible version" << std::endl;
+                std::cout << "of the CUDA toolkit." << std::endl;
+                std::cout << std::endl;
+            }
+        }
+        else
+        {
+            std::cout << "No CUDA enabled graphics card detected." << std::endl;
+            std::cout << std::endl;
+        }
+    }
+    else
+    {
+        std::cout << "CUDA error detected. Error string: ";
+        std::cout << gpufit_get_last_error() << std::endl;
+        std::cout << std::endl;
+    }
+    if (!do_gpufits)
+    {
+        std::cout << "Skipping Gpufit computations." << std::endl << std::endl;
+    }
+
+    // all numbers of fits
+    std::vector<std::size_t> n_fits_all;
+    if (sizeof(void*) < 8) // pointers smaller than 8 bytes: the run with 10 million fits is left out
+    {
+        n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000};
+    }
+    else
+    {
+        n_fits_all = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 };
+    }
+
+    std::size_t const max_n_fits = n_fits_all.back(); // data is generated once for the largest run
+
+    // fit parameters constant for every run
+    std::size_t const size_x = 5;
+    std::size_t const n_points = size_x * size_x;
+    std::size_t const n_parameters = 5;
+    std::vector<int> parameters_to_fit(n_parameters, 1); // every entry is 1 (presumably: fit all parameters - confirm)
+    float const tolerance = 0.0001f;
+    int const max_n_iterations = 10;
+
+    // initial parameters
+    Parameters true_parameters;
+    true_parameters.amplitude = 500.f;
+    true_parameters.center_x = static_cast<float>(size_x) / 2.f - 0.5f; // middle of the grid (indices run from 0 to size_x - 1)
+    true_parameters.center_y = static_cast<float>(size_x) / 2.f - 0.5f;
+    true_parameters.width = 1.f;
+    true_parameters.background = 10.f;
+
+    // test parameters
+    std::vector<Parameters> test_parameters(max_n_fits);
+    generate_test_parameters(test_parameters, true_parameters);
+
+    //  test data
+    std::vector<float> data(max_n_fits * n_points);
+    generate_gauss2d(max_n_fits, n_points, data, test_parameters);
+    add_gauss_noise(data, true_parameters, 10.f); // last argument is the snr
+
+    // initial parameter set
+    std::vector<float> initial_parameters(n_parameters * max_n_fits);
+    generate_initial_parameters(initial_parameters, test_parameters);
+
+    // print column identifiers
+    std::cout << std::endl << std::right;
+    std::cout << std::setw(8) << "Number" << std::setw(3) << "|";
+    std::cout << std::setw(13) << "Cpufit speed" << std::setw(3) << "|";
+    std::cout << std::setw(13) << "Gpufit speed" << std::setw(3) << "|";
+    std::cout << std::setw(12) << "Performance";
+    std::cout << std::endl;
+    std::cout << std::setw(8) << "of fits" << std::setw(3) << "|";
+    std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|";
+    std::cout << std::setw(13) << "(fits/s)" << std::setw(3) << "|";
+    std::cout << std::setw(12) << "gain factor";
+    std::cout << std::endl;
+    std::cout << "-------------------------------------------------------";
+    std::cout << std::endl;
+
+    // loop over number of fits
+    for (std::size_t fit_index = 0; fit_index < n_fits_all.size(); fit_index++)
+    {
+        // number of fits
+        std::size_t n_fits = n_fits_all[fit_index];
+        std::cout << std::setw(8) << n_fits << std::setw(3) << "|";
+
+        // Cpufit output
+        std::vector<float> cpufit_parameters(n_fits * n_parameters);
+        std::vector<int> cpufit_states(n_fits);
+        std::vector<float> cpufit_chi_squares(n_fits);
+        std::vector<int> cpufit_n_iterations(n_fits);
+
+        // run Cpufit and measure time
+        std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
+        int const cpu_status
+            = cpufit
+            (
+                n_fits,
+                n_points,
+                data.data(), // buffers are sized for max_n_fits fits; only n_fits are requested here
+                0, // NOTE(review): presumably the weights pointer (no weights) - confirm against cpufit.h
+                GAUSS_2D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0, // NOTE(review): this and the next 0 are presumably user info size / pointer - confirm
+                0,
+                cpufit_parameters.data(),
+                cpufit_states.data(),
+                cpufit_chi_squares.data(),
+                cpufit_n_iterations.data()
+            );
+        std::chrono::milliseconds::rep const dt_cpufit = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t0).count();
+
+        if (cpu_status != 0)
+        {
+            // error in cpufit, should actually not happen
+            std::cout << "Error in cpufit: " << cpufit_get_last_error() << std::endl;
+        }
+
+        std::chrono::milliseconds::rep dt_gpufit = 0; // stays 0 if Gpufit is skipped
+
+        // if we do not do gpufit, we skip the rest of the loop
+        if (do_gpufits)
+        {
+            // Gpufit output parameters
+            std::vector<float> gpufit_parameters(n_fits * n_parameters);
+            std::vector<int> gpufit_states(n_fits);
+            std::vector<float> gpufit_chi_squares(n_fits);
+            std::vector<int> gpufit_n_iterations(n_fits);
+
+            // run Gpufit and measure time
+            t0 = std::chrono::high_resolution_clock::now();
+            int const gpu_status
+                = gpufit
+                (
+                n_fits,
+                n_points,
+                data.data(),
+                0, // same argument layout as the cpufit call above
+                GAUSS_2D,
+                initial_parameters.data(),
+                tolerance,
+                max_n_iterations,
+                parameters_to_fit.data(),
+                LSE,
+                0,
+                0,
+                gpufit_parameters.data(),
+                gpufit_states.data(),
+                gpufit_chi_squares.data(),
+                gpufit_n_iterations.data()
+                );
+            dt_gpufit = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t0).count();
+
+            if (gpu_status != 0)
+            {
+                // error in gpufit
+                std::cout << "Error in gpufit: " << gpufit_get_last_error() << std::endl;
+                do_gpufits = false; // no further Gpufit runs after the first failure
+            }
+        }
+
+        // print the calculation speed in fits/s
+        std::cout << std::fixed << std::setprecision(0);
+        if (dt_cpufit) // a measured time of 0 ms is reported as "inf"
+        {
+            std::cout << std::setw(13) << static_cast<double>(n_fits) / static_cast<double>(dt_cpufit)* 1000.0 << std::setw(3) << "|";
+        }
+        else
+        {
+            std::cout << std::setw(13) << "inf" << std::setw(3) << "|";
+        }
+        if (dt_gpufit)
+        {
+            std::cout << std::setw(13) << static_cast<double>(n_fits) / static_cast<double>(dt_gpufit)* 1000.0 << std::setw(3) << "|";
+            std::cout << std::fixed << std::setprecision(2);
+            std::cout << std::setw(12) << static_cast<double>(dt_cpufit) / static_cast<double>(dt_gpufit); // gain factor = CPU time / GPU time
+        }
+        else if (!do_gpufits) // Gpufit was skipped or has failed
+        {
+            std::cout << std::setw(13) << "--" << std::setw(3) << "|";
+            std::cout << std::setw(12) << "--";
+        }
+        else
+        {
+            std::cout << std::setw(13) << "inf" << std::setw(3) << "|";
+            std::cout << std::setw(12) << "inf";
+        }
+        
+        std::cout << std::endl;        
+    }
+    std::cout << std::endl << "Test completed!" << std::endl;
+    std::cout << "Press ENTER to exit" << std::endl;
+    std::getchar(); // blocks until input is available on stdin
+
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt
new file mode 100644
index 0000000..92339af
--- /dev/null
+++ b/examples/Gpufit_Cpufit_Performance_Comparison_readme.txt
@@ -0,0 +1,106 @@
+Example application for the Gpufit library (https://github.com/gpufit/Gpufit)
+which implements Levenberg Marquardt curve fitting in CUDA.
+
+Requirements
+------------
+
+- A CUDA capable graphics card with a recent Nvidia graphics driver
+  (at least 367.48 / July 2016)
+- Windows
+- >1.5 GB of free RAM
+
+Running
+-------
+
+Start "Gpufit_Cpufit_Performance_Comparison.exe" to see a speed comparison of
+GPU and CPU implementation.
+
+Output
+------
+
+When run successfully, the performance comparison example first shows the version
+number of the installed CUDA driver and the CUDA runtime Gpufit was built with.
+
+EXAMPLE:
+  CUDA runtime version: 8.0
+  CUDA driver version:  9.0
+
+In the next step the successful generation of test data is indicated by three
+full progress bars.
+
+EXAMPLE:
+
+                                -------------------------
+  Generating test parameters    |||||||||||||||||||||||||
+                                -------------------------
+                                -------------------------
+  Generating data               |||||||||||||||||||||||||
+                                -------------------------
+                                -------------------------
+  Adding noise                  |||||||||||||||||||||||||
+                                -------------------------
+								
+The results of the performance comparison between Gpufit and Cpufit are shown
+in a table. The results demonstrate the performance benefit of Gpufit compared
+to Cpufit executing the fitting process for various numbers of fits in a range
+of 10 - 10000000. The execution speed is expressed in fits per second. If the
+execution time was not measurable, the speed is expressed as infinite.
+
+EXAMPLE:
+
+    Number  | Cpufit speed  | Gpufit speed  | Performance
+   of fits  |     (fits/s)  |     (fits/s)  | gain factor
+  -------------------------------------------------------
+        10  |          inf  |           92  |        0.00
+       100  |          inf  |         6667  |        0.00
+      1000  |        66667  |          inf  |         inf
+     10000  |        58480  |       666667  |       11.40
+    100000  |        59916  |      2173913  |       36.28
+   1000000  |        59898  |      2469136  |       41.22
+  10000000  |        60957  |      3038590  |       49.85
+
+Troubleshooting
+---------------
+
+MESSAGE:
+
+  CUDA runtime version: 0.0
+  CUDA driver version:  7.5
+
+  The CUDA runtime version is not compatible with the current graphics driver.
+  Please update the driver, or re-build Gpufit from source using a compatible
+  version of the CUDA toolkit.
+  
+  Skipping Gpufit computations.
+  
+BEHAVIOR:
+
+  The example executes Cpufit skipping Gpufit. Only computation speed of Cpufit
+  is shown in the results table.
+  
+SOLUTION:
+
+  A common reason for this error message is an outdated Nvidia graphics driver.
+  In most cases updating the graphics card driver will solve this error. For
+  older graphics cards which are not supported by the CUDA toolkit used for
+  building Gpufit, re-compile Gpufit using an earlier version of the CUDA
+  toolkit which is compatible with the graphics driver.
+
+MESSAGE:
+  
+  CUDA runtime version: 0.0
+  CUDA driver version:  0.0
+  
+  No CUDA enabled graphics card detected.
+  
+  Skipping Gpufit computations.
+
+BEHAVIOR:
+
+  The example executes Cpufit skipping Gpufit. Only computation speed of Cpufit
+  is shown in the results table.
+  
+SOLUTION:
+
+  The execution of Gpufit requires a CUDA enabled graphics card.
+  Ensure that a CUDA enabled graphics card is installed in the host PC.
\ No newline at end of file
diff --git a/package/README.md b/package/README.md
new file mode 100644
index 0000000..ebf9279
--- /dev/null
+++ b/package/README.md
@@ -0,0 +1,48 @@
+# Creating a binary package
+
+The binary package bundles different build outputs into a single distributable binary package containing the Gpufit dll,
+the performance comparison example, the Matlab bindings and the Python bindings.
+
+## Calling the script
+
+create_package.bat %1 %2 %3
+
+with 
+
+- %1 is the BUILD_BASE_PATH (the path containing the various (see below) CMake generated Visual Studio projects)
+
+- %2 is the VERSION (e.g. 1.0.0)
+
+- %3 is the SOURCE_BASE_PATH (the path containing the sources)
+
+The output is a folder (BUILD_BASE_PATH/Gpufit_VERSION_win32_win64_buildDATECODE, where DATECODE is the date and time of the packaging run) which is also zipped if 7-Zip is available.
+
+## Requirements
+
+Note: The script has no way of checking that the requirements are fulfilled!
+
+See also [Build from sources](http://Gpufit.readthedocs.io/en/latest/installation.html#build-from-sources) for instructions.
+
+CMake
+
+- CUDA_ARCHITECTURE must be set to All (it is by default)
+
+- CUDA toolkit 8.0 is used for all builds (must be installed before)
+
+- Build directory for MSVC14 Win64 is BUILD_BASE_PATH/VC14x64-8.0
+
+- Build directory for MSVC14 Win32 is BUILD_BASE_PATH/VC14x32-8.0
+
+- Matlab and Python must be available
+
+Build
+
+- Configuration RelWithDebInfo is used for all builds!
+
+- With MSVC14 Win64 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example
+
+- With MSVC14 Win32 build target PYTHON_WHEEL, MATLAB_GPUFIT_PACKAGE and the Gpufit_Cpufit_Performance_Comparison example
+
+Documentation
+
+- An up-to-date version of the documentation must exist at SOURCE_BASE_PATH\docs\_build\latex\Gpufit.pdf (must be created before).
\ No newline at end of file
diff --git a/package/create_package.bat b/package/create_package.bat
new file mode 100644
index 0000000..75ba751
--- /dev/null
+++ b/package/create_package.bat
@@ -0,0 +1,170 @@
+@ECHO OFF
+
+REM Create the binary package for Gpufit, assumes everything is compiled.
+REM Usage: create_package.bat BUILD_BASE_PATH VERSION SOURCE_BASE_PATH
+
+REM All three arguments are mandatory. The tilde modifier removes surrounding
+REM quotes, so a quoted argument (e.g. a path with spaces) does not break the
+REM comparison with the empty string.
+
+if "%~1" == "" (
+	echo specify build base path
+	goto end
+)
+
+if "%~2" == "" (
+	echo specify version
+	goto end
+)
+
+if "%~3" == "" (
+	echo specify source base path
+	goto end
+)
+
+REM date and time from https://stackoverflow.com/a/30343827/1536976
+REM Builds DATECODE as YYYYMMDDHHMM from the local time reported by WMIC.
+
+@SETLOCAL ENABLEDELAYEDEXPANSION
+
+@REM Use WMIC to retrieve date and time
+@REM The first output line is skipped (skip=1). The loop treats token A as day,
+@REM B as hour, C as minute, D as month, E as second and F as year. Lines
+@REM without a sixth token are ignored.
+@echo off
+FOR /F "skip=1 tokens=1-6" %%A IN ('WMIC Path Win32_LocalTime Get Day^,Hour^,Minute^,Month^,Second^,Year /Format:table') DO (
+    IF NOT "%%~F"=="" (
+        SET /A SortDate = 10000 * %%F + 100 * %%D + %%A
+        set YEAR=!SortDate:~0,4!
+        set MON=!SortDate:~4,2!
+        set DAY=!SortDate:~6,2!
+        @REM Add 1000000 so as to force a prepended 0 if hours less than 10
+        SET /A SortTime = 1000000 + 10000 * %%B + 100 * %%C + %%E
+        set HOUR=!SortTime:~1,2!
+        set MIN=!SortTime:~3,2!
+        set SEC=!SortTime:~5,2!
+    )
+)
+
+REM SEC is computed above but is not part of the date code
+set DATECODE=!YEAR!!MON!!DAY!!HOUR!!MIN!
+echo %DATECODE%
+
+REM define paths
+
+REM The tilde modifier strips surrounding quotes from the arguments. Every later
+REM use of these variables adds its own quotes, so keeping the caller's quotes
+REM here would produce nested quotes and broken paths.
+set "BUILD_BASE=%~1"
+set "VERSION=%~2"
+set "SOURCE_BASE=%~3"
+
+REM name and location of the package folder and of the zip file
+set OUTPUT_NAME=Gpufit_%VERSION%_win32_win64_build%DATECODE%
+set ROOT_INSTALL=%BUILD_BASE%\%OUTPUT_NAME%
+set OUTPUT_ZIP=%BUILD_BASE%\%OUTPUT_NAME%.zip
+
+REM sub folders inside the package folder
+set PERFORMANCE_TEST_INSTALL=%ROOT_INSTALL%\gpufit_performance_test
+set PYTHON_INSTALL=%ROOT_INSTALL%\python
+set x32_MATLAB_INSTALL=%ROOT_INSTALL%\matlab32
+set x64_MATLAB_INSTALL=%ROOT_INSTALL%\matlab64
+set SDK_INSTALL_ROOT=%ROOT_INSTALL%\gpufit_sdk
+
+REM build outputs (configuration RelWithDebInfo) of the 64 bit and 32 bit builds
+set x64_BUILD=%BUILD_BASE%\VC14x64-8.0\RelWithDebInfo
+set x64_BUILD_LIB=%BUILD_BASE%\VC14x64-8.0\Gpufit\RelWithDebInfo
+set x32_BUILD=%BUILD_BASE%\VC14x32-8.0\RelWithDebInfo
+set x32_BUILD_LIB=%BUILD_BASE%\VC14x32-8.0\Gpufit\RelWithDebInfo
+
+set x64_PYTHON_BUILD=%x64_BUILD%\pyGpufit\dist
+set x32_PYTHON_BUILD=%x32_BUILD%\pyGpufit\dist
+
+set x64_MATLAB_BUILD=%x64_BUILD%\matlab
+set x32_MATLAB_BUILD=%x32_BUILD%\matlab
+
+REM files and folders taken directly from the source tree
+set EXAMPLES_SOURCE=%SOURCE_BASE%\examples
+set PYTHON_SOURCE=%SOURCE_BASE%\Gpufit\python
+set MATLAB_SOURCE=%SOURCE_BASE%\Gpufit\matlab
+set SDK_README_SOURCE=%SOURCE_BASE%\package\sdk_readme.txt
+
+REM the manual must have been generated before running this script
+set MANUAL_SOURCE=%SOURCE_BASE%\docs\_build\latex\Gpufit.pdf
+set MANUAL_INSTALL=%ROOT_INSTALL%\Gpufit_%VERSION%_Manual.pdf
+
+REM clean up (if necessary): remove a package folder and zip file of the same name
+
+if exist "%ROOT_INSTALL%" rmdir /s /q "%ROOT_INSTALL%"
+if exist "%OUTPUT_ZIP%" del "%OUTPUT_ZIP%"
+
+REM create root folder
+
+echo create root directory
+mkdir "%ROOT_INSTALL%"
+
+REM copy main readme (is markdown, written as txt) and license
+
+copy "%SOURCE_BASE%\README.md" "%ROOT_INSTALL%\README.txt"
+copy "%SOURCE_BASE%\LICENSE.txt" "%ROOT_INSTALL%"
+
+REM copy manual (the script stops here if the manual has not been generated)
+
+if not exist "%MANUAL_SOURCE%" (
+	echo file %MANUAL_SOURCE% required, does not exist
+	goto end
+)
+copy "%MANUAL_SOURCE%" "%MANUAL_INSTALL%"
+
+REM copy performance test (readme plus executable and both DLLs for each architecture)
+
+echo collect performance test application
+mkdir "%PERFORMANCE_TEST_INSTALL%"
+copy "%EXAMPLES_SOURCE%\Gpufit_Cpufit_Performance_Comparison_readme.txt" "%PERFORMANCE_TEST_INSTALL%\README.txt"
+
+mkdir "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64"
+copy "%x64_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win64"
+
+mkdir "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Gpufit_Cpufit_Performance_Comparison.exe" "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Gpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32"
+copy "%x32_BUILD%\Cpufit.dll" "%PERFORMANCE_TEST_INSTALL%\win32"
+
+REM copy Python packages
+REM Both builds produce a wheel with the same file name, so each wheel is renamed
+REM with a platform specific suffix while copying.
+
+echo collect python
+mkdir "%PYTHON_INSTALL%"
+copy "%x64_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win_amd64.whl"
+copy "%x32_PYTHON_BUILD%\pyGpufit-%VERSION%-py2.py3-none-any.whl" "%PYTHON_INSTALL%\pyGpufit-%VERSION%-py2.py3-none-win32.whl"
+copy "%PYTHON_SOURCE%\README.txt" "%PYTHON_INSTALL%"
+xcopy "%PYTHON_SOURCE%\examples" "%PYTHON_INSTALL%\examples" /i /q
+
+REM copy Matlab 32 bit (build output plus the examples from the source tree)
+
+echo collect matlab32
+mkdir "%x32_MATLAB_INSTALL%"
+xcopy "%x32_MATLAB_BUILD%" "%x32_MATLAB_INSTALL%" /q
+xcopy "%MATLAB_SOURCE%\examples" "%x32_MATLAB_INSTALL%\examples" /i /q
+
+REM copy Matlab 64 bit (build output plus the examples from the source tree)
+
+echo collect matlab64
+mkdir "%x64_MATLAB_INSTALL%"
+xcopy "%x64_MATLAB_BUILD%" "%x64_MATLAB_INSTALL%" /q
+xcopy "%MATLAB_SOURCE%\examples" "%x64_MATLAB_INSTALL%\examples" /i /q
+
+REM copy SDK_INSTALL_ROOT (readme, header file, DLL and import library per architecture)
+
+echo collect SDK
+mkdir "%SDK_INSTALL_ROOT%"
+copy "%SDK_README_SOURCE%" "%SDK_INSTALL_ROOT%\README.txt"
+
+mkdir "%SDK_INSTALL_ROOT%\include"
+copy "%SOURCE_BASE%\Gpufit\gpufit.h" "%SDK_INSTALL_ROOT%\include"
+
+mkdir "%SDK_INSTALL_ROOT%\win32"
+copy "%x32_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win32"
+copy "%x32_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win32"
+
+mkdir "%SDK_INSTALL_ROOT%\win64"
+copy "%x64_BUILD%\Gpufit.dll" "%SDK_INSTALL_ROOT%\win64"
+copy "%x64_BUILD_LIB%\Gpufit.lib" "%SDK_INSTALL_ROOT%\win64"
+
+REM zip content of the package folder with 7-Zip if available
+
+REM 7-Zip is only looked for at its default installation path
+set "ZIP=C:\Program Files\7-Zip\7z.exe"
+
+if not exist "%ZIP%" (
+	echo 7-Zip not installed, zip manually
+	goto end
+)
+
+REM a: add to archive, -y: answer yes to all queries, -r: recurse into sub folders
+echo zip result
+"%ZIP%" a -y -r -mem=AES256 "%OUTPUT_ZIP%" "%ROOT_INSTALL%" > nul
+
+:end
+PAUSE
\ No newline at end of file
diff --git a/package/sdk_readme.txt b/package/sdk_readme.txt
new file mode 100644
index 0000000..59fc094
--- /dev/null
+++ b/package/sdk_readme.txt
@@ -0,0 +1,10 @@
+Software development kit for the Gpufit library (https://github.com/gpufit/Gpufit)
+which implements Levenberg Marquardt curve fitting in CUDA.
+
+Compiled with the Microsoft Visual Studio 2015 C++ compiler and CUDA toolkit 8.0.
+
+Folder include contains the gpufit.h header file representing the C API.
+
+Folder win32 contains the 32 bit compiled dynamic link library and import library.
+
+Folder win64 contains the 64 bit compiled dynamic link library and import library.
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..c524ac3
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+# Tests
+#
+# add_boost_test is defined in the top-level CMakeLists.txt (only when Boost was
+# found). The module list "Cpufit;Gpufit" is joined with "_" to form the target
+# prefix, so this creates the target Cpufit_Gpufit_Test_Consistency from
+# Consistency.cpp.
+
+add_boost_test( "Cpufit;Gpufit" Consistency )
diff --git a/tests/Consistency.cpp b/tests/Consistency.cpp
new file mode 100644
index 0000000..feb1032
--- /dev/null
+++ b/tests/Consistency.cpp
@@ -0,0 +1,220 @@
+#define BOOST_TEST_MODULE Gpufit
+
+#include "Cpufit/cpufit.h"
+#include "Gpufit/gpufit.h"
+#include "Tests/utils.h"
+
+#include <boost/test/included/unit_test.hpp>
+
+#include <vector>
+
+/*
+    Fills i with a minimal LINEAR_1D problem: one fit over two data points with
+    unit weights, least squares estimator, all start values zero.
+*/
+void generate_input_linear_fit_1d(FitInput & i)
+{
+	// problem dimensions
+	i.n_fits = 1;
+	i.n_points = 2;
+	i.n_parameters = 2; // LINEAR_1D has two parameters
+
+	// fit configuration: model, estimator and stop criteria
+	i.model_id = LINEAR_1D;
+	i.estimator_id = LSE;
+	i.tolerance = 0.001f;
+	i.max_n_iterations = 10;
+
+	// two data values, each with weight one
+	i.data = { 0.f, 1.f };
+	i.weights_ = { 1.f, 1.f };
+
+	// start values and the per-parameter fit flags (all set to 1)
+	i.initial_parameters = { 0.f, 0.f };
+	i.parameters_to_fit = { 1, 1 };
+
+	// user info: two floats handed over as raw bytes
+	// (presumably the x coordinates of the data points - confirm against the LINEAR_1D model)
+	i.user_info_ = { 0.f, 1.f };
+}
+
+/*
+    Fills i with a GAUSS_1D problem: one fit over five data points that are
+    generated from known parameters, no weights, no user info.
+*/
+void generate_input_gauss_fit_1d(FitInput & i)
+{
+	// problem dimensions
+	i.n_fits = 1;
+	i.n_points = 5;
+	i.n_parameters = 4; // GAUSS_1D has four parameters
+
+	// fit configuration: model, estimator and stop criteria
+	i.model_id = GAUSS_1D;
+	i.estimator_id = LSE;
+	i.tolerance = 0.001f;
+	i.max_n_iterations = 10;
+
+	// synthesize the data from the parameters the fit is expected to recover
+	std::vector< float > const true_parameters = { 4.f, 2.f, 0.5f, 1.f };
+	clean_resize(i.data, i.n_fits * i.n_points);
+	generate_gauss_1d(i.data, true_parameters);
+
+	// neither weights nor user info are used
+	i.weights_.clear();
+	i.user_info_.clear();
+
+	// start values and the per-parameter fit flags (all set to 1)
+	i.initial_parameters = { 2.f, 1.5f, 0.3f, 0.f };
+	i.parameters_to_fit = { 1, 1, 1, 1 };
+}
+
+/*
+    Fills i with a GAUSS_2D problem: one fit over 25 data points (a 5 x 5 grid)
+    that are generated from known parameters, no weights, no user info.
+*/
+void generate_input_gauss_fit_2d(FitInput & i)
+{
+	// problem dimensions
+	i.n_fits = 1;
+	i.n_points = 25;
+	i.n_parameters = 5; // GAUSS_2D has five parameters
+
+	// fit configuration: model, estimator and stop criteria
+	i.model_id = GAUSS_2D;
+	i.estimator_id = LSE;
+	i.tolerance = 0.0001f;
+	i.max_n_iterations = 20;
+
+	// synthesize the data from the parameters the fit is expected to recover
+	std::vector< float > const true_parameters = { 4.f, 1.8f, 2.2f, 0.5f, 1.f };
+	clean_resize(i.data, i.n_fits * i.n_points);
+	generate_gauss_2d(i.data, true_parameters);
+
+	// neither weights nor user info are used
+	i.weights_.clear();
+	i.user_info_.clear();
+
+	// start values and the per-parameter fit flags (all set to 1)
+	i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.4f, 0.f };
+	i.parameters_to_fit = { 1, 1, 1, 1, 1 };
+}
+
+/*
+    Fills i with a GAUSS_2D_ELLIPTIC problem: one fit over a 5 x 5 grid of data
+    points that are generated from known parameters, no weights, no user info.
+*/
+void generate_input_gauss_fit_2d_elliptic(FitInput & i)
+{
+    // number fits, points, parameters
+    i.n_fits = 1;
+    std::size_t const size_x = 5;
+    i.n_points = size_x * size_x;
+    i.n_parameters = 6; // GAUSS_2D_ELLIPTIC has six parameters
+
+    // data and weights
+    clean_resize(i.data, i.n_fits * i.n_points);
+
+    // the peak is centered on the grid
+    float const center_x = (static_cast<float>(size_x) - 1.f) / 2.f;
+    std::vector< float > const true_parameters{ { 4.f, center_x, center_x, 0.4f, 0.6f, 1.f} };
+    generate_gauss_2d_elliptic(i.data, true_parameters);
+    i.weights_.clear(); // no weights
+
+    // model id and estimator id
+    i.model_id = GAUSS_2D_ELLIPTIC;
+    i.estimator_id = LSE;
+
+    // initial parameters and parameters to fit
+    // Both vectors need one entry per model parameter: the fit routines only
+    // receive raw pointers, so a shorter vector would be read past its end.
+    i.initial_parameters = { 2.f, 1.8f, 2.2f, 0.5f, 0.5f, 0.f };
+    i.parameters_to_fit = { 1, 1, 1, 1, 1, 1 };
+
+    // tolerance and max_n_iterations
+    i.tolerance = 0.001f;
+    i.max_n_iterations = 10;
+
+    // user info
+    i.user_info_.clear(); // no user info
+}
+
+/*
+    Generates a fit problem with func, runs it through cpufit and gpufit with
+    identical arguments and checks that both calls succeed and that their
+    outputs agree (states and iteration counts exactly, parameters and
+    chi-squares within the tolerances of close_or_equal).
+*/
+void perform_cpufit_gpufit_and_check(void (*func)(FitInput &))
+{
+	// let the supplied generator describe the problem
+	FitInput i;
+	func(i);
+
+	// we don't want to feed faulty data into the fits
+	BOOST_CHECK(i.sanity_check());
+
+	// sizes and zeroes every buffer of one result set
+	auto const reset_output = [&i](FitOutput & output)
+	{
+		clean_resize(output.parameters, i.n_fits * i.n_parameters);
+		clean_resize(output.states, i.n_fits);
+		clean_resize(output.chi_squares, i.n_fits);
+		clean_resize(output.n_iterations, i.n_fits);
+	};
+
+	FitOutput gpu, cpu;
+	reset_output(gpu);
+	reset_output(cpu);
+
+	// run the fit on the CPU
+	int const cpu_status = cpufit(
+		i.n_fits,
+		i.n_points,
+		i.data.data(),
+		i.weights(),
+		i.model_id,
+		i.initial_parameters.data(),
+		i.tolerance,
+		i.max_n_iterations,
+		i.parameters_to_fit.data(),
+		i.estimator_id,
+		i.user_info_size(),
+		i.user_info(),
+		cpu.parameters.data(),
+		cpu.states.data(),
+		cpu.chi_squares.data(),
+		cpu.n_iterations.data());
+
+	BOOST_CHECK(cpu_status == 0);
+
+	// run the very same fit on the GPU
+	int const gpu_status = gpufit(
+		i.n_fits,
+		i.n_points,
+		i.data.data(),
+		i.weights(),
+		i.model_id,
+		i.initial_parameters.data(),
+		i.tolerance,
+		i.max_n_iterations,
+		i.parameters_to_fit.data(),
+		i.estimator_id,
+		i.user_info_size(),
+		i.user_info(),
+		gpu.parameters.data(),
+		gpu.states.data(),
+		gpu.chi_squares.data(),
+		gpu.n_iterations.data());
+
+	BOOST_CHECK(gpu_status == 0);
+
+	// integer outputs must match exactly, floating point outputs approximately
+	BOOST_CHECK(cpu.states == gpu.states);
+	BOOST_CHECK(cpu.n_iterations == gpu.n_iterations);
+	BOOST_CHECK(close_or_equal(cpu.parameters, gpu.parameters));
+	BOOST_CHECK(close_or_equal(cpu.chi_squares, gpu.chi_squares));
+}
+
+// Runs the CPU/GPU comparison once for every input generator, in a fixed order.
+BOOST_AUTO_TEST_CASE( Consistency )
+{
+	// label written to the test log, and the generator that creates the fit input
+	struct Scenario
+	{
+		char const * label;
+		void (*generator)(FitInput &);
+	};
+
+	Scenario const scenarios[] =
+	{
+		{ "linear_fit_1d", &generate_input_linear_fit_1d },
+		{ "gauss_fit_1d", &generate_input_gauss_fit_1d },
+		{ "gauss_fit_2d", &generate_input_gauss_fit_2d },
+		{ "gauss_fit_2d_elliptic", &generate_input_gauss_fit_2d_elliptic }
+	};
+
+	for (Scenario const & scenario : scenarios)
+	{
+		BOOST_TEST_MESSAGE( scenario.label );
+		perform_cpufit_gpufit_and_check(scenario.generator);
+	}
+}
diff --git a/tests/utils.cpp b/tests/utils.cpp
new file mode 100644
index 0000000..16f3970
--- /dev/null
+++ b/tests/utils.cpp
@@ -0,0 +1,60 @@
+#include "utils.h"
+
+// Global Mersenne Twister engine (declared extern in utils.h), constructed
+// with the fixed seed 0.
+std::mt19937 rng(0);
+
+/*
+    Fills v with a 1D Gaussian peak sampled at the x values 0,..,v.size() - 1.
+
+    p needs 4 entries: p[0] scales the peak, p[1] is its center, p[2] its width
+    and p[3] is added to every value.
+*/
+void generate_gauss_1d(std::vector< float > & v, std::vector< float > const & p)
+{
+	// the denominator of the exponent is the same for every point
+	float const two_width_squared = 2.f * p[2] * p[2];
+
+	std::size_t x = 0;
+	for (float & value : v)
+	{
+		float const distance = x - p[1];
+		float const argx = (distance * distance) / two_width_squared;
+		value = p[0] * exp(-argx) + p[3];
+		++x;
+	}
+}
+
+/*
+    Given a parameters vector p with 5 entries (amplitude, center x, center y, width, offset), constructs a 2D Gaussian
+    peak function with x, y values 0, .., sqrt(v.size()) - 1 and stores it row by row in v.
+
+    Throws std::runtime_error if v.size() is not a perfect square number.
+*/
+void generate_gauss_2d(std::vector< float > & v, std::vector< float > const & p)
+{
+    std::size_t const n = static_cast<std::size_t>(std::sqrt(v.size()));
+    if (n * n != v.size())
+    {
+        throw std::runtime_error("v.size() is not a perfect square number");
+    }
+
+    // the denominator of the exponent is the same for every point
+    float const two_width_squared = 2.f * p[3] * p[3];
+
+    for (std::size_t j = 0; j < n; j++)
+    {
+        float const argy = ((j - p[2]) * (j - p[2]));
+        for (std::size_t i = 0; i < n; i++)
+        {
+            float const argx = ((i - p[1]) * (i - p[1]));
+            float const ex = std::exp(-(argx + argy) / two_width_squared);
+            // the offset is the fifth parameter; p[3] is the width and must not be added here
+            v[j * n + i] = p[0] * ex + p[4];
+        }
+    }
+}
+
+/*
+    Given a parameters vector p with 6 entries (amplitude, center x, center y, width x, width y, offset), constructs an
+    elliptic 2D Gaussian peak function with x, y values 0, .., sqrt(v.size()) - 1 and stores it row by row in v.
+
+    Throws std::runtime_error if v.size() is not a perfect square number.
+*/
+void generate_gauss_2d_elliptic(std::vector< float > & v, std::vector< float > const & p)
+{
+    std::size_t const n = static_cast<std::size_t>(std::sqrt(v.size()));
+    if (n * n != v.size())
+    {
+        throw std::runtime_error("v.size() is not a perfect square number");
+    }
+
+    for (std::size_t j = 0; j < n; j++)
+    {
+        float const argy = ((j - p[2]) * (j - p[2])) / (2.f * p[4] * p[4]);
+        for (std::size_t i = 0; i < n; i++)
+        {
+            float const argx = ((i - p[1]) * (i - p[1])) / (2.f * p[3] * p[3]);
+            float const ex = std::exp(-(argx + argy));
+            // the offset is the sixth parameter; p[3] is the width in x and must not be added here
+            v[j * n + i] = p[0] * ex + p[5];
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..dd0caa7
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,176 @@
+#ifndef TEST_UTILS_H_INCLUDED
+#define TEST_UTILS_H_INCLUDED
+
+#include<vector>
+#include<random>
+
+#define CHK(x) if (!x) return false
+
+extern std::mt19937 rng;
+
+/*
+Gives v exactly n elements, all of them zero. Previous content is discarded.
+*/
+template<typename T> void clean_resize(std::vector<T> & v, std::size_t const n)
+{
+	v.assign(n, static_cast<T>(0));
+}
+
+/*
+Returns the largest value of |a[k] - b[k]| / (|a[k]| + |b[k]|) over all elements
+of a, or 0 if a is empty. b is read at the same positions as a and is not
+checked for its length.
+*/
+template<typename T> double max_relative_difference(std::vector<T> const & a, std::vector<T> const & b)
+{
+	double largest = 0;
+
+	for (std::size_t k = 0; k < a.size(); ++k)
+	{
+		T const lhs = a[k];
+		T const rhs = b[k];
+		double const ratio = static_cast<double>(std::abs(lhs - rhs)) / (std::abs(lhs) + std::abs(rhs));
+		// written as a comparison so that a ratio that is not a number never replaces the maximum
+		if (largest < ratio)
+		{
+			largest = ratio;
+		}
+	}
+	return largest;
+}
+
+/*
+Returns the largest value of |a[k] - b[k]| over all elements of a, or 0 if a is
+empty. b is read at the same positions as a and is not checked for its length.
+*/
+template<typename T> double max_absolute_difference(std::vector<T> const & a, std::vector<T> const & b)
+{
+    double largest = 0;
+
+    for (std::size_t k = 0; k < a.size(); ++k)
+    {
+        T const lhs = a[k];
+        T const rhs = b[k];
+        double const distance = static_cast<double>(std::abs(lhs - rhs));
+        if (largest < distance)
+        {
+            largest = distance;
+        }
+    }
+    return largest;
+}
+
+/*
+Compares two vectors approximately. Vectors of different size never match and
+two empty vectors always match. Otherwise the vectors match if their maximal
+relative difference is below relative_threshold or their maximal absolute
+difference is below absolute_threshold.
+*/
+template<typename T> bool close_or_equal(std::vector<T> const & a, std::vector<T> const & b, double relative_threshold = 1e-3, double absolute_threshold = 1e-6)
+{
+	if (a.size() != b.size())
+	{
+		return false;
+	}
+	// same size, so b is empty as well
+	if (a.empty())
+	{
+		return true;
+	}
+	// meeting one of the two criteria is sufficient
+	bool const relative_ok = max_relative_difference(a, b) < relative_threshold;
+	bool const absolute_ok = max_absolute_difference(a, b) < absolute_threshold;
+	return relative_ok || absolute_ok;
+}
+
+/*
+Returns the square root of the mean squared difference a[i] - b[i], taken over
+all positions i with use[i] == 0. The three vectors are expected to have equal
+length. If no position is selected the result is not a number.
+*/
+template<typename T> double calculate_standard_deviation(std::vector<T> const & a, std::vector<T> const & b, std::vector<int> const & use)
+{
+    std::size_t n_used = 0;
+    double sum_of_squares = 0;
+
+    for (std::size_t i = 0; i < a.size(); i++)
+    {
+        // every flag other than zero excludes the position
+        if (use[i] != 0)
+        {
+            continue;
+        }
+        T const difference = a[i] - b[i];
+        sum_of_squares += static_cast<double>(difference) * difference;
+        n_used++;
+    }
+
+    return std::sqrt(sum_of_squares / n_used);
+}
+
+/*
+Returns the mean of all a[i] with use[i] == 0. Both vectors are expected to have
+equal length. If no position is selected the result is not a number.
+*/
+template<typename T> double calculate_mean(std::vector<T> const & a, std::vector<int> const & use)
+{
+    std::size_t n_used = 0;
+    double sum = 0;
+
+    for (std::size_t i = 0; i < a.size(); i++)
+    {
+        // every flag other than zero excludes the position
+        if (use[i] != 0)
+        {
+            continue;
+        }
+        sum += static_cast<double>(a[i]);
+        n_used++;
+    }
+    return sum / n_used;
+}
+
+// The following generators are defined in utils.cpp. Each overwrites every
+// element of v with values of a Gaussian peak described by the parameters p.
+
+// 1D peak sampled at the x values 0, .., v.size() - 1; p needs 4 entries.
+void generate_gauss_1d(std::vector< float > & v, std::vector< float > const & p);
+
+// 2D peak on a square grid with sqrt(v.size()) points per side; throws
+// std::runtime_error if v.size() is not a perfect square number.
+void generate_gauss_2d(std::vector< float > & v, std::vector< float > const & p);
+
+// Like generate_gauss_2d, but with separate widths for x and y.
+void generate_gauss_2d_elliptic(std::vector< float > & v, std::vector< float > const & p);
+
+/*
+Bundles all input arguments of one fit call. The accessor functions convert the
+optional vectors (weights, user info) into the raw pointers and sizes that are
+passed on to the fit routines.
+*/
+struct FitInput
+{
+	// problem dimensions
+	std::size_t n_fits;
+	std::size_t n_points;
+	std::size_t n_parameters;
+
+	std::vector< float > data;
+	std::vector< float > weights_; // size 0 means no weights
+
+	int model_id;
+	int estimator_id;
+
+	std::vector< float > initial_parameters;
+	std::vector< int > parameters_to_fit;
+
+	// stop criteria
+	float tolerance;
+	int max_n_iterations;
+
+	std::vector< float > user_info_; // user info is float
+
+	// Pointer to the weights, or a null pointer if there are none.
+	float * weights()
+	{
+		return this->weights_.empty() ? nullptr : this->weights_.data();
+	}
+
+	// Pointer to the user info seen as raw bytes, or a null pointer if there is none.
+	char * user_info()
+	{
+		if (this->user_info_.empty())
+		{
+			return nullptr;
+		}
+		return reinterpret_cast<char *>(this->user_info_.data());
+	}
+
+	// Size of the user info in bytes.
+	std::size_t user_info_size()
+	{
+		return this->user_info_.size() * sizeof(float); // type of user_info is float
+	}
+
+	// Checks that the vector sizes agree with the problem dimensions.
+	bool sanity_check()
+	{
+		// data and (if present) weights hold one value per fit and point
+		CHK(this->data.size() == this->n_fits * this->n_points);
+		if (!this->weights_.empty())
+		{
+			CHK(this->weights_.size() == this->n_fits * this->n_points);
+		}
+		// one start value per fit and parameter
+		CHK(this->initial_parameters.size() == this->n_fits * this->n_parameters);
+		return true;
+	}
+};
+
+/*
+Output buffers of one fit call. The consistency test sizes parameters to
+n_fits * n_parameters and the other three vectors to n_fits before the call.
+*/
+struct FitOutput
+{
+	std::vector< float > parameters;
+	std::vector< int > states;
+	std::vector< float > chi_squares;
+	std::vector< int > n_iterations;
+};
+
+#endif
\ No newline at end of file