Commit 0f9317b
Momentum (in training) implemented
milakov committed Jul 12, 2014
1 parent 483d7d6 commit 0f9317b
Showing 12 changed files with 203 additions and 74 deletions.
111 changes: 90 additions & 21 deletions nnforge/cuda/network_updater_cuda.cu
@@ -94,6 +94,31 @@ namespace nnforge
}
}

__global__ void apply_gradient_with_momentum_kernel(
float * __restrict data,
float * __restrict gradient,
float * __restrict previous_upd,
const float * __restrict learning_rate,
float normalizer,
float weight_decay,
float momentum,
int elem_count)
{
int elem_id = blockDim.x * (blockIdx.y * gridDim.x + blockIdx.x) + threadIdx.x;
if (elem_id < elem_count)
{
float current_weight = data[elem_id];
float lr = learning_rate[elem_id];
float gr = gradient[elem_id];
float prev_upd = previous_upd[elem_id];
float upd = prev_upd * momentum + lr * (gr * normalizer - current_weight * weight_decay);
float new_weight = current_weight + upd;
data[elem_id] = new_weight;
gradient[elem_id] = 0.0F;
previous_upd[elem_id] = upd;
}
}

unsigned int network_updater_cuda::max_entry_count_in_single_batch = 1024;

network_updater_cuda::network_updater_cuda(
@@ -155,7 +180,8 @@ namespace nnforge
network_data_const_smart_ptr learning_rate,
network_data_smart_ptr data,
unsigned int batch_size,
float weight_decay)
float weight_decay,
float momentum)
{
testing_result_smart_ptr res(new testing_result(ef));

@@ -193,6 +219,9 @@ namespace nnforge
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> > net_data = get_data(data);
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> > learning_rate_data = get_learning_rate(learning_rate);
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> > gradient = get_zero_gradient(net_data);
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> > previous_upd;
if (momentum > 0.0F)
previous_upd = get_zero_gradient(net_data);

{
buffer_cuda_size_configuration buffers_config;
@@ -217,6 +246,9 @@ namespace nnforge
for(std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::const_iterator it = gradient.begin(); it != gradient.end(); ++it)
for(std::vector<cuda_linear_buffer_device_smart_ptr>::const_iterator it2 = it->begin(); it2 != it->end(); ++it2)
buffers_config.add_constant_buffer((*it2)->get_size());
for(std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::const_iterator it = previous_upd.begin(); it != previous_upd.end(); ++it)
for(std::vector<cuda_linear_buffer_device_smart_ptr>::const_iterator it2 = it->begin(); it2 != it->end(); ++it2)
buffers_config.add_constant_buffer((*it2)->get_size());

unsigned int max_entry_count = std::min(std::min(cuda_config->get_max_entry_count(buffers_config), reader.get_entry_count()), max_entry_count_in_single_batch);
if (entry_read_count_list.empty() || (max_entry_count >= batch_size))
@@ -557,9 +589,11 @@ namespace nnforge
*command_stream,
net_data,
gradient,
previous_upd,
learning_rate_data,
gradient_normalizer,
weight_decay);
weight_decay,
momentum);
entry_gradient_calculated_count = 0;
}

@@ -600,9 +634,11 @@ namespace nnforge
*command_stream,
net_data,
gradient,
previous_upd,
learning_rate_data,
gradient_normalizer,
weight_decay);
weight_decay,
momentum);
entry_gradient_calculated_count = 0;
}

@@ -757,29 +793,62 @@ namespace nnforge
cudaStream_t stream_id,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& data,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& gradient,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& prev_upd,
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> >& learning_rate,
float gradient_normalizer,
float weight_decay)
float weight_decay,
float momentum)
{
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator gradient_it = gradient.begin();
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> >::iterator learning_rate_it = learning_rate.begin();
for(std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator data_it = data.begin(); data_it != data.end(); ++data_it, ++gradient_it, ++learning_rate_it)
if (momentum > 0.0F)
{
std::vector<cuda_linear_buffer_device_smart_ptr>::iterator gradient_it2 = gradient_it->begin();
std::vector<const_cuda_linear_buffer_device_smart_ptr>::iterator learning_rate_it2 = learning_rate_it->begin();
for(std::vector<cuda_linear_buffer_device_smart_ptr>::iterator data_it2 = data_it->begin(); data_it2 != data_it->end(); ++data_it2, ++gradient_it2, ++learning_rate_it2)
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator gradient_it = gradient.begin();
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator prev_upd_it = prev_upd.begin();
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> >::iterator learning_rate_it = learning_rate.begin();
for(std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator data_it = data.begin(); data_it != data.end(); ++data_it, ++gradient_it, ++prev_upd_it, ++learning_rate_it)
{
int elem_count = (*data_it2)->get_size() / sizeof(float);
std::pair<dim3, dim3> kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access(
*cuda_config,
elem_count);
apply_gradient_kernel<<<kernel_dims.first, kernel_dims.second, 0, stream_id>>>(
**data_it2,
**gradient_it2,
**learning_rate_it2,
gradient_normalizer,
weight_decay,
elem_count);
std::vector<cuda_linear_buffer_device_smart_ptr>::iterator gradient_it2 = gradient_it->begin();
std::vector<cuda_linear_buffer_device_smart_ptr>::iterator prev_upd_it2 = prev_upd_it->begin();
std::vector<const_cuda_linear_buffer_device_smart_ptr>::iterator learning_rate_it2 = learning_rate_it->begin();
for(std::vector<cuda_linear_buffer_device_smart_ptr>::iterator data_it2 = data_it->begin(); data_it2 != data_it->end(); ++data_it2, ++gradient_it2, ++prev_upd_it2, ++learning_rate_it2)
{
int elem_count = (*data_it2)->get_size() / sizeof(float);
std::pair<dim3, dim3> kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access(
*cuda_config,
elem_count);
apply_gradient_with_momentum_kernel<<<kernel_dims.first, kernel_dims.second, 0, stream_id>>>(
**data_it2,
**gradient_it2,
**prev_upd_it2,
**learning_rate_it2,
gradient_normalizer,
weight_decay,
momentum,
elem_count);
}
}
}
else
{
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator gradient_it = gradient.begin();
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> >::iterator learning_rate_it = learning_rate.begin();
for(std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >::iterator data_it = data.begin(); data_it != data.end(); ++data_it, ++gradient_it, ++learning_rate_it)
{
std::vector<cuda_linear_buffer_device_smart_ptr>::iterator gradient_it2 = gradient_it->begin();
std::vector<const_cuda_linear_buffer_device_smart_ptr>::iterator learning_rate_it2 = learning_rate_it->begin();
for(std::vector<cuda_linear_buffer_device_smart_ptr>::iterator data_it2 = data_it->begin(); data_it2 != data_it->end(); ++data_it2, ++gradient_it2, ++learning_rate_it2)
{
int elem_count = (*data_it2)->get_size() / sizeof(float);
std::pair<dim3, dim3> kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access(
*cuda_config,
elem_count);
apply_gradient_kernel<<<kernel_dims.first, kernel_dims.second, 0, stream_id>>>(
**data_it2,
**gradient_it2,
**learning_rate_it2,
gradient_normalizer,
weight_decay,
elem_count);
}
}
}
}
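The new apply_gradient_with_momentum_kernel applies the classical momentum update to each weight: the update is the previous update scaled by momentum, plus the per-weight learning rate applied to the normalized accumulated gradient and the L2 weight-decay term. A minimal host-side sketch of the same per-element rule, assuming one float per weight; the function name and plain loop here are illustrative only, not part of this codebase:

void apply_gradient_with_momentum_reference(
	float * data,
	float * gradient,
	float * previous_upd,
	const float * learning_rate,
	float normalizer,
	float weight_decay,
	float momentum,
	int elem_count)
{
	for (int i = 0; i < elem_count; ++i)
	{
		// Momentum-smoothed step: previous update plus the learning-rate-scaled
		// normalized gradient and weight-decay contribution for this weight.
		float upd = previous_upd[i] * momentum
			+ learning_rate[i] * (gradient[i] * normalizer - data[i] * weight_decay);
		data[i] += upd;          // apply the update to the weight
		gradient[i] = 0.0F;      // the accumulated gradient has been consumed
		previous_upd[i] = upd;   // remembered for the next batch
	}
}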
7 changes: 5 additions & 2 deletions nnforge/cuda/network_updater_cuda.h
@@ -46,7 +46,8 @@ namespace nnforge
network_data_const_smart_ptr learning_rate,
network_data_smart_ptr data,
unsigned int batch_size,
float weight_decay);
float weight_decay,
float momentum);

// The method is called when client calls set_input_configuration_specific and the convolution specific configuration is modified.
// The layer_config_list is guaranteed to be compatible with schema
@@ -88,9 +89,11 @@ namespace nnforge
cudaStream_t stream_id,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& data,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& gradient,
std::vector<std::vector<cuda_linear_buffer_device_smart_ptr> >& prev_upd,
std::vector<std::vector<const_cuda_linear_buffer_device_smart_ptr> >& learning_rate,
float gradient_normalizer,
float weight_decay);
float weight_decay,
float momentum);

cuda_running_configuration_const_smart_ptr cuda_config;

2 changes: 2 additions & 0 deletions nnforge/network_trainer.cpp
@@ -28,6 +28,8 @@ namespace nnforge
, learning_rate_decay_tail_epoch_count(0)
, learning_rate_decay_rate(0.5F)
, learning_rate(0.02F)
, batch_size(1)
, momentum(0.0F)
{
}

1 change: 1 addition & 0 deletions nnforge/network_trainer.h
@@ -47,6 +47,7 @@ namespace nnforge
unsigned int learning_rate_rise_head_epoch_count;
float learning_rate_rise_rate;
float weight_decay;
float momentum;

protected:
network_trainer(network_schema_smart_ptr schema);
3 changes: 2 additions & 1 deletion nnforge/network_trainer_sdlm.cpp
@@ -72,7 +72,8 @@ namespace nnforge
learning_rate,
task.data,
batch_size,
weight_decay);
weight_decay,
momentum);

boost::chrono::duration<float> sec = (boost::chrono::high_resolution_clock::now() - start);

3 changes: 2 additions & 1 deletion nnforge/network_trainer_sgd.cpp
@@ -50,7 +50,8 @@ namespace nnforge
lr_and_comment.first,
task.data,
batch_size,
weight_decay);
weight_decay,
momentum);

boost::chrono::duration<float> sec = (boost::chrono::high_resolution_clock::now() - start);

5 changes: 3 additions & 2 deletions nnforge/network_updater.cpp
@@ -79,7 +79,8 @@ namespace nnforge
network_data_const_smart_ptr learning_rate,
network_data_smart_ptr data,
unsigned int batch_size,
float weight_decay)
float weight_decay,
float momentum)
{
// Check data-schema consistency
data->check_network_data_consistency(*schema);
@@ -96,7 +97,7 @@

data->apply_dropout_layer_config(layer_id_to_dropout_config_map, false);

testing_result_smart_ptr res = actual_update(reader, learning_rate, data, batch_size, weight_decay);
testing_result_smart_ptr res = actual_update(reader, learning_rate, data, batch_size, weight_decay, momentum);

data->apply_dropout_layer_config(layer_id_to_dropout_config_map, true);

6 changes: 4 additions & 2 deletions nnforge/network_updater.h
@@ -43,7 +43,8 @@ namespace nnforge
network_data_const_smart_ptr learning_rate,
network_data_smart_ptr data,
unsigned int batch_size,
float weight_decay);
float weight_decay,
float momentum);

// set_input_configuration_specific should be called prior to this method call for this method to succeed
float get_flops_for_single_entry() const;
@@ -60,7 +61,8 @@
network_data_const_smart_ptr learning_rate,
network_data_smart_ptr data,
unsigned int batch_size,
float weight_decay) = 0;
float weight_decay,
float momentum) = 0;

// The method is called when client calls set_input_configuration_specific and the convolution specific configuration is modified.
// The layer_config_list is guaranteed to be compatible with schema
6 changes: 5 additions & 1 deletion nnforge/neural_network_toolset.cpp
@@ -209,6 +209,7 @@ namespace nnforge
("epoch_count_in_training_set", boost::program_options::value<unsigned int>(&epoch_count_in_training_set)->default_value(1), "The whole should be split in this amount of epochs.")
("weight_decay", boost::program_options::value<float>(&weight_decay)->default_value(0.0F), "Weight decay.")
("batch_size,B", boost::program_options::value<unsigned int>(&batch_size)->default_value(1), "Training mini-batch size.")
("momentum,M", boost::program_options::value<float>(&momentum)->default_value(0.0F), "Momentum in training.")
;

{
@@ -369,6 +370,7 @@ namespace nnforge
std::cout << "epoch_count_in_training_set" << "=" << epoch_count_in_training_set << std::endl;
std::cout << "weight_decay" << "=" << weight_decay << std::endl;
std::cout << "batch_size" << "=" << batch_size << std::endl;
std::cout << "momentum" << "=" << momentum << std::endl;
}
{
std::vector<string_option> additional_string_options = get_string_options();
@@ -490,6 +492,7 @@ namespace nnforge
res->learning_rate_rise_rate = learning_rate_rise_rate;
res->weight_decay = weight_decay;
res->batch_size = batch_size;
res->momentum = momentum;

return res;
}
@@ -1456,7 +1459,8 @@ namespace nnforge
learning_rates,
data,
batch_size,
weight_decay);
weight_decay,
momentum);
boost::chrono::duration<float> sec = boost::chrono::high_resolution_clock::now() - start;
/*
{
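With the option added above, momentum is controlled from the command line: it defaults to 0.0 (plain SGD behaviour) and can be set per run via --momentum <value> or the short form -M <value>, alongside the existing --batch_size/-B and --weight_decay options; for example, --momentum 0.9 -B 16 (values shown here are illustrative, not defaults from this commit).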
1 change: 1 addition & 0 deletions nnforge/neural_network_toolset.h
@@ -187,6 +187,7 @@ namespace nnforge
float weight_decay;
unsigned int snapshot_scale;
unsigned int batch_size;
float momentum;

protected:
std::vector<output_neuron_value_set_smart_ptr> run_batch(