Skip to content

Commit

Permalink
Introduced BrgemmExternalRepackingAdjuster
Browse files Browse the repository at this point in the history
  • Loading branch information
v-Golubev committed Nov 12, 2024
1 parent a6e585c commit bcdb12e
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI
if (linear_ir->is_dynamic()) {
loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir);
}
externalRepackingAdjuster = BrgemmExternalRepackingAdjuster(linear_ir, this);
}

void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
Expand All @@ -60,8 +61,6 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l
auto shapes = extract_shapes();
auto layouts = extract_layouts();
m_optimizer.optimize(shapes, layouts);
// Why must it be called before kernel executor table update?
update_requested_descs(linear_ir, shapes, layouts);

if (linear_ir->is_dynamic())
loopPortsAdjuster.optimize();
Expand All @@ -75,7 +74,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l
update_loop_args(linear_ir);
}
update_data_offsets(shapes, layouts);
adjust_offsets_from_descs(shapes, layouts);
externalRepackingAdjuster.optimize(linear_ir, shapes, layouts);
m_latest_shapes = std::move(shapes);
}

Expand Down Expand Up @@ -110,11 +109,14 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea
}
}
#ifdef OPENVINO_ARCH_ARM64
CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
}

void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() {
}
CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(
const ov::snippets::lowered::LinearIRCPtr& linear_ir,
CPURuntimeConfigurator* configurator) {}

void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize(
const ov::snippets::lowered::LinearIRCPtr& linear_ir,
const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layouts) {}
#else
CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
const auto& pass = std::make_shared<intel_cpu::pass::AdjustBrgemmCopyBLoopPorts>();
Expand Down Expand Up @@ -144,62 +146,68 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() {
}
#endif

void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layots) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
auto& optimal_descs = cpu_config->m_in_requested_descs;
#ifdef OPENVINO_ARCH_ARM64
CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
}

void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize() {
}
#else
// Scans the LinearIR parameters once at initialization time and remembers the indices
// of inputs that feed the second port (B matrix) of a BrgemmCPU node with extracted
// (external) repacking. Those indices are later used by optimize() to request blocked
// memory descriptors and recompute data offsets on each shape update.
// @param linear_ir    LinearIR whose parameters are inspected (read-only)
// @param configurator owning configurator; stored as a non-owning back-pointer
//                     (the adjuster is a member of the configurator, so the pointer
//                     cannot outlive it)
CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(
    const ov::snippets::lowered::LinearIRCPtr& linear_ir,
    CPURuntimeConfigurator* configurator) : m_configurator(configurator) {
    const auto& params = linear_ir->get_parameters();
    for (size_t i = 0; i < params.size(); ++i) {
        const auto& param = params[i];
        const auto consumers = param->get_output_port_connector(0)->get_consumers();
        // An input needs external repacking only if it is consumed by port 1 (the B matrix)
        // of a BrgemmCPU whose type implies repacking extracted out of the kernel.
        const bool brgemm_with_extracted_repacking =
            std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) {
                auto brgemm = ov::as_type_ptr<ov::intel_cpu::BrgemmCPU>(port.get_expr()->get_node());
                return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type());
            });
        if (brgemm_with_extracted_repacking) {
            m_param_idces_with_external_repacking.insert(i);
        }
    }
}
void CPURuntimeConfigurator::adjust_offsets_from_descs(const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layouts) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
for (const auto& map_elem : cpu_config->m_in_requested_descs) {
const auto input_idx = map_elem.first;
const auto& optimal_desc = map_elem.second;
const auto& original_shape = shapes[input_idx];
const auto& blocked_shape = optimal_desc->getBlockDims();

ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1);
shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end());
auto& offsets = m_config->io_data_offsets[input_idx];
compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[input_idx], 0);

// For every input previously marked for external repacking (see the adjuster's ctor),
// builds a VNNI-blocked memory descriptor for the B matrix and recomputes the input's
// data offsets against the blocked shape, so the external repacking produces data in
// the layout the Brgemm kernel expects.
// @param linear_ir LinearIR used to query the parameter's output precision
// @param shapes    per-input planar shapes for the current inference
// @param layouts   per-input layouts; only planar layouts are supported (asserted below)
void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize(
    const ov::snippets::lowered::LinearIRCPtr& linear_ir,
    const std::vector<ov::snippets::VectorDims>& shapes,
    const std::vector<std::vector<size_t>>& layouts) {
    const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_configurator->m_config);
    auto& optimal_descs = cpu_config->m_in_requested_descs;
    for (const auto& i : m_param_idces_with_external_repacking) {
        const auto& shape = shapes[i];
        // TODO: support arbitrary order
        // K and N are taken from the two innermost dims, which presumes a planar layout
        // (enforced by the assert at the end of the loop body).
        const auto& K = *++shape.rbegin();
        const auto& N = *shape.rbegin();

        const auto& precision = linear_ir->get_parameters()[i]->get_node()->get_output_element_type(0);
        // vnni_factor = number of K elements packed together for this precision
        const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision);
        // Firstly, batch dims are set
        VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_configurator->m_config->tile_rank);
        // Then, the blocked dims are formed: K is split by the VNNI factor, and N is
        // padded up to at least the kernel's inner N block.
        requested_blocked_shape.insert(
            requested_blocked_shape.end(),
            {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor});

        // Order: identity over batch dims, then {K, N, K-inner} for the blocked tail.
        VectorDims requested_order(shape.size() - m_configurator->m_config->tile_rank);
        std::iota(requested_order.begin(), requested_order.end(), 0);
        const auto last_idx = shape.size() - 1;
        requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1});

        optimal_descs[i] = std::make_shared<CpuBlockedMemoryDesc>(precision, Shape(shape), requested_blocked_shape, requested_order);

        // Offsets are computed over the blocked shape, left-padded with 1s up to tensor_rank.
        ov::snippets::VectorDims shape_for_offset(m_configurator->m_config->tensor_rank - shape.size(), 1);
        shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end());
        auto& offsets = m_configurator->m_config->io_data_offsets[i];
        compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->m_io_data_sizes[i], 0);
        // TODO: Support non-planar layout
        OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i]));
    }
}
#endif

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
CPURuntimeConfigurator();

protected:
/**
* @brief Update RuntimeConfig based on LinearIR
* @param linear_ir LinearIR
*/
void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override;
/**
* @brief Update tensor rank based on master shape
* @param master_shape Master shape
*/
void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override;
/**
* @brief Initializes tensor rank of config
* @param linear_ir LinearIR
*/
void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override;
void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override;
/**
Expand All @@ -54,12 +42,6 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
*/
void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;

void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layouts) const;
void adjust_offsets_from_descs(const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layouts) const;

static const size_t rank6D;

class BrgemmCopyBLoopPortsAdjuster {
Expand All @@ -73,6 +55,20 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
std::unordered_map<snippets::lowered::UnifiedLoopInfoPtr,
std::vector<snippets::lowered::ExpandedLoopInfoPtr>> m_affected_uni2exp_map;
} loopPortsAdjuster;

/**
 * @brief Requests blocked (repacking-friendly) input descriptors and adjusts data
 *        offsets for Brgemm inputs whose repacking is extracted out of the kernel.
 *        The constructor collects the affected parameter indices once; optimize()
 *        is invoked on every config update with the current shapes/layouts.
 */
class BrgemmExternalRepackingAdjuster {
public:
BrgemmExternalRepackingAdjuster() = default;
BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator);

void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
const std::vector<ov::snippets::VectorDims>& shapes,
const std::vector<std::vector<size_t>>& layouts);

private:
// Non-owning back-pointer to the enclosing configurator (the adjuster is its member).
CPURuntimeConfigurator* m_configurator = nullptr;
// Indices of input parameters that require external repacking, filled by the ctor.
std::set<size_t> m_param_idces_with_external_repacking;
} externalRepackingAdjuster;
};

} // namespace intel_cpu
Expand Down

0 comments on commit bcdb12e

Please sign in to comment.