Skip to content

Commit

Permalink
follow-up codereview
Browse files Browse the repository at this point in the history
  • Loading branch information
ahnyoung-paul committed Nov 29, 2024
1 parent 8869544 commit 1af1b5c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 35 deletions.
5 changes: 3 additions & 2 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,6 @@ class primitive_inst {
bool _can_share_buffer = true;
bool _is_constant = false;
bool _needs_completion_event = false;
bool _no_execution_prev = false;

std::vector<size_t> _max_output_layout_count;
std::vector<size_t> max_intermediates_memory_sizes;
Expand All @@ -428,7 +427,7 @@ class primitive_inst {
bool use_async_compilation();
// if primitive_inst doesn't replace its impl with a new impl (static impl with opt kernel or dynamic impl), return false
void update_impl(bool use_async_compilation);
void realloc_if_needed();
void realloc_if_needed(bool prev_execution_skipped=false);

cldnn::network::ptr get_unfused_subgraph();

Expand Down Expand Up @@ -482,6 +481,8 @@ class primitive_inst {
return false;
}

void clear_output_memory();

// This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
// but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
// and store the mapping onto the original perf_counter_key for further data analysis and dumps
Expand Down
62 changes: 29 additions & 33 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,12 @@ bool primitive_inst::all_dependencies_cpu_impl() const {
return check_all_deps_cpu(this);
}

void primitive_inst::realloc_if_needed() {
// Forgets the primary (index 0) output of this instance:
//  * zeroes the recorded max output layout count for output 0
//  * sets the output 0 memory pointer to nullptr
// Only index 0 is touched; any other outputs are left as they are.
// This helper does not call into the memory pool itself.
// NOTE(review): assumes both containers hold at least one element - confirm against callers.
void primitive_inst::clear_output_memory() {
    constexpr size_t primary_output_idx = 0;
    _max_output_layout_count[primary_output_idx] = 0;
    _outputs[primary_output_idx] = nullptr;
}

void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
Expand Down Expand Up @@ -738,21 +743,20 @@ void primitive_inst::realloc_if_needed() {

// Clear out memory if it was previously reused, but now the primitive can't be optimized
if (!_node->is_type<concatenation>() && (_node->is_runtime_skippable() || _node->is_type<crop>())) {
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory;
reset_user_output_memory = [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr input_mem_ptr) {
auto curr_output_memory_ptr = curr_inst->output_memory_ptr(0);
if (curr_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *input_mem_ptr))) {
if (curr_inst->mem_allocated()) {
get_network().get_memory_pool().release_memory(curr_inst->_outputs[0].get(),
curr_inst->get_node().get_unique_id(), curr_inst->id(), get_network_id());
_mem_allocated = false;
}
curr_inst->_outputs[0] = nullptr;
curr_inst->_max_output_layout_count[0] = 0;
for (auto& user_inst : curr_inst->get_user_insts()) {
reset_user_output_memory(user_inst, input_mem_ptr);
std::function<void(cldnn::primitive_inst*, cldnn::memory::ptr)> reset_user_output_memory
= [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr target_mem_ptr) {
for (auto& user_inst : curr_inst->get_user_insts()) {
auto curr_output_memory_ptr = user_inst->output_memory_ptr(0);
if (user_inst->can_be_optimized()
&& (curr_output_memory_ptr
&& get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *target_mem_ptr))) {
if (user_inst->mem_allocated()) {
get_network().get_memory_pool().release_memory(user_inst->_outputs[0].get(),
user_inst->get_node().get_unique_id(), user_inst->id(), get_network_id());
_mem_allocated = false;
}
user_inst->clear_output_memory();
reset_user_output_memory(user_inst, target_mem_ptr);
}
}
};
Expand All @@ -766,9 +770,7 @@ void primitive_inst::realloc_if_needed() {
// * iter1: node1(skipped) -> node2(skipped) -> node3(executed)
if (_outputs[0] && dep_memory_ptr(0)
&& !_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) {
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
}
reset_user_output_memory(this, dep_memory_ptr(0));
}
return;
} else if (_outputs[0] && dep_memory_ptr(0) &&
Expand All @@ -778,28 +780,22 @@ void primitive_inst::realloc_if_needed() {
get_node().get_unique_id(), id(), get_network_id());
_mem_allocated = false;
}
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
clear_output_memory();
// Check users recursively: if a user is can_be_optimized && runtime_skippable
// && the user's output memory is the same as the current input memory,
// then reset that user's output memory too.
// Ex.
// * iter0: node1(skipped) -> node2(skipped) -> node3(skipped)
// * iter1: node1(executed) -> node2(skipped) -> node3(executed)
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, dep_memory_ptr(0));
}
reset_user_output_memory(this, dep_memory_ptr(0));
} else {
// When this inst was not executed at the previous iteration,
// reset the output memory because the current output memory is invalid.
if (_no_execution_prev) {
if (prev_execution_skipped) {
if (_outputs[0]) {
for (auto& user_inst : get_user_insts()) {
reset_user_output_memory(user_inst, _outputs[0]);
}
reset_user_output_memory(this, _outputs[0]);
}
_outputs[0] = nullptr;
_max_output_layout_count[0] = 0;
clear_output_memory();
}
}
}
Expand Down Expand Up @@ -1794,7 +1790,7 @@ void primitive_inst::prepare_primitive() {

// If it was optimized out or skipped for zero dimension at the previous iteration,
// set this flag to true to reset output memory in realloc_if_needed.
_no_execution_prev = can_be_optimized()
const bool prev_execution_skipped = can_be_optimized()
|| (_impl_params->output_layouts[0].is_static() && _impl_params->output_layouts[0].count() == 0);
const auto orig_outputs = _outputs;
if ((is_dynamic() || _node->is_in_shape_of_subgraph()) && !has_inner_networks()) {
Expand Down Expand Up @@ -1855,7 +1851,7 @@ void primitive_inst::prepare_primitive() {
update_impl(can_use_async_compilation);
if (get_flag(ExecutionFlags::IMPL_CHANGED)) {
update_weights();
realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}
}

Expand All @@ -1864,7 +1860,7 @@ void primitive_inst::prepare_primitive() {
if (_node->is_type<paged_attention>() && !get_flag(ExecutionFlags::IMPL_CHANGED) && _impl->requires_update(*this, *_impl_params)) {
_impl->update(*this, *_impl_params);

realloc_if_needed();
realloc_if_needed(prev_execution_skipped);
}

OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(),
Expand Down

0 comments on commit 1af1b5c

Please sign in to comment.