Skip to content

Commit

Permalink
Merge branch 'wangwang/support_tp_vllm' into river/gpu_p2p_support
Browse files Browse the repository at this point in the history
  • Loading branch information
riverlijunjie committed Sep 18, 2024
2 parents 23980a6 + f960227 commit 5f8e0e2
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 47 deletions.
16 changes: 13 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/op/rank_constant.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,29 @@ class RankConstant : public ov::op::v0::Constant {
RankConstant(const std::shared_ptr<ov::Node>& other,
const size_t world_size,
const size_t world_rank,
const TP_MODE tp_mode = TP_MODE::ALL_GATHERH);
const TP_MODE tp_mode = TP_MODE::ALL_GATHERH,
const std::vector<int64_t> qkv_parts = {1, 1, 1});

bool visit_attributes(ov::AttributeVisitor& visitor) override;
void validate_and_infer_types() override;
std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
TP_MODE get_tp_mode() const { return m_tp_mode; }
int get_size() const { return m_world_size; }
int get_rank() const { return m_world_rank; }
std::vector<int64_t> get_qkv_parts() const {
return m_qkv_parts;
}
// World size (number of ranks) this constant was partitioned for.
int get_size() const { return m_world_size; }
// Rank index of this node within the tensor-parallel world.
int get_rank() const { return m_world_rank; }

protected:
ov::element::Type m_output_type;
int m_world_size;
int m_world_rank;
TP_MODE m_tp_mode;
std::vector<int64_t> m_qkv_parts;
Shape m_shape{};
element::Type m_element_type{};
};
Expand Down
17 changes: 12 additions & 5 deletions src/plugins/intel_gpu/src/plugin/ops/rank_constant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ static void CreateRankConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::in
auto buf = lock.data();
auto bufSize = constLayout.bytes_count();
int rank = op->get_rank();
int w_size = op->get_size();
switch (op->get_tp_mode()) {
case ov::intel_gpu::op::TP_MODE::ALL_GATHERH: {
int offset = rank * bufSize;
Expand All @@ -127,17 +128,23 @@ static void CreateRankConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::in
break;
case ov::intel_gpu::op::TP_MODE::ALL_REDUCE: {
int step_r = bufSize / const_shape[0];
int step_h = step_r * 2;
int step_h = step_r * w_size;
for (size_t i = 0; i < const_shape[0]; i++) {
std::memcpy(&buf[0] + i * step_r, (&data[0] + (rank * step_r)) + i * step_h, step_r);
}
break;
}
case ov::intel_gpu::op::TP_MODE::ALL_GATHERQKV: {
int copysize = bufSize / 3;
std::memcpy(&buf[0], &data[0] + rank * copysize, copysize);
std::memcpy(&buf[0] + copysize, &data[0] + rank * copysize + copysize * 2, copysize);
std::memcpy(&buf[0] + copysize * 2, &data[0] + rank * copysize + copysize * 4, copysize);
auto qkv_parts = op->get_qkv_parts();
int32_t copysize = bufSize / std::accumulate(qkv_parts.begin(), qkv_parts.end(), 0);
int32_t q_copysize = copysize * qkv_parts[0];
int32_t k_copysize = copysize * qkv_parts[1];
int32_t v_copysize = copysize * qkv_parts[2];
std::memcpy(&buf[0], &data[0] + rank * q_copysize, q_copysize);
std::memcpy(&buf[0] + q_copysize, &data[0] + (w_size * q_copysize) + (rank * k_copysize), k_copysize);
std::memcpy(&buf[0] + q_copysize + k_copysize,
&data[0] + (w_size * (q_copysize + k_copysize)) + (rank * v_copysize),
v_copysize);
break;
}
default: {
Expand Down
11 changes: 3 additions & 8 deletions src/plugins/intel_gpu/src/plugin/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
}
auto parse_devices_id = [&](const std::string devices_for_tp,
const std::string delimiter = ",") -> std::vector<std::string> {
std::cout << "devices_for_tp: " << devices_for_tp << std::endl;
bool is_set_device_id = orig_config.find(ov::device::id.name()) != orig_config.end();
std::vector<std::string> ret;
if (is_set_device_id)
Expand Down Expand Up @@ -248,7 +249,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
}
std::string target_device = is_set_device_id ? std::string("GPU.") + device_id : "GPU";
if (is_set_device_id) {
if (ret.size() < 2) {
if (ret.size() < 4) {
OPENVINO_THROW("Invalid number of parsed device found for TP from specified device candidate list: ",
devices_for_tp,
" when compiling model to target device: ",
Expand All @@ -273,17 +274,11 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
}
}
}
if (ret.size() > 2) {
GPU_DEBUG_LOG << "Will only select 2 devices for TP." << std::endl;
std::cout << "[WY-DEBUG][" << __FILE__ << ":" << __LINE__
<< "] will keep the first 2 device from list.";
ret = std::vector<std::string>(ret.begin(), ret.begin() + 2);
}
}
return ret;
};
auto devices_id_for_tp = parse_devices_id(devices_for_tp);
std::cout << "[WY-DEBUG][" << __FILE__ << ":" << __LINE__ << "] device priorities after filtered: ";
std::cout << "[DEBUG][" << __FILE__ << ":" << __LINE__ << "] device priorities after filtered: ";
for (const auto& device_id : devices_id_for_tp)
std::cout << "\tGPU." << device_id;
std::cout << std::endl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ namespace op {
RankConstant::RankConstant(const std::shared_ptr<ov::Node>& constant_data,
const size_t world_size,
const size_t world_rank,
const TP_MODE tp_mode)
const TP_MODE tp_mode,
const std::vector<int64_t> qkv_parts)
: ov::op::v0::Constant(*std::dynamic_pointer_cast<ov::op::v0::Constant>(constant_data)),
m_world_size(world_size),
m_world_rank(world_rank),
m_tp_mode(tp_mode) {
m_tp_mode(tp_mode),
m_qkv_parts(qkv_parts) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(constant_data);
m_shape = constant->get_shape();
m_element_type = constant->get_element_type();
Expand Down
Loading

0 comments on commit 5f8e0e2

Please sign in to comment.