This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Marlin moe integration #266

Closed
wants to merge 48 commits
Changes from 21 commits
Commits (48)
15d0f20
Start working on linking the elements together
ElizaWszola May 17, 2024
60c097c
Runs to completion
ElizaWszola May 21, 2024
42750bc
Commit before moving things around in the kernel
ElizaWszola May 22, 2024
b312ad5
Fix shared memory issue
ElizaWszola May 23, 2024
289cc8c
Fused marlin moe in model executor
ElizaWszola May 24, 2024
f7b106e
Various small fixes
ElizaWszola May 24, 2024
6765bde
quantize outside fused_marlin_moe function
ElizaWszola May 24, 2024
0886f76
Test shapes
ElizaWszola May 27, 2024
8ee85dd
Better test, some passing
ElizaWszola May 28, 2024
cb4dbaa
Working expert chunks
ElizaWszola Jun 3, 2024
cd4410f
working
ElizaWszola Jun 3, 2024
e9ee483
Combined kernel with fused op, scores
ElizaWszola Jun 11, 2024
c839319
Debugging
ElizaWszola Jun 12, 2024
5ac5ba6
Rand for everything
ElizaWszola Jun 12, 2024
7a9f453
Working A copy
ElizaWszola Jun 13, 2024
c412ab8
cleanup
ElizaWszola Jun 14, 2024
2d4ac6a
Merge branch 'main' into marlin-moe-integration
ElizaWszola Jun 14, 2024
ffd5f64
Try to fix pybind11 builds after merge
ElizaWszola Jun 14, 2024
a22272a
Continue work on bindings
ElizaWszola Jun 17, 2024
d2cba06
impl
ElizaWszola Jun 17, 2024
4b5a9f1
typo fix
ElizaWszola Jun 17, 2024
0991c74
all links but maxdiff (must fix assertions)
ElizaWszola Jun 17, 2024
9dd97b5
it works
ElizaWszola Jun 17, 2024
c62bc7f
Merge branch 'main' into marlin-moe-integration
ElizaWszola Jun 17, 2024
e65c195
lots of debugging
ElizaWszola Jun 24, 2024
5f1ebdb
Fixed types, it's working!
ElizaWszola Jun 24, 2024
836d627
Cleanup
ElizaWszola Jun 25, 2024
d3665fa
Tiny cleanups
ElizaWszola Jun 25, 2024
4bcfde6
Some renaming
ElizaWszola Jun 25, 2024
7701ee7
Renaming, Bill's feedback
ElizaWszola Jun 25, 2024
b9fdda3
Format
ElizaWszola Jun 25, 2024
1b05843
add act_order to marlin moe
ElizaWszola Jun 26, 2024
8adcb26
Tensor constness, factor common fused_moe parts into smaller functions
ElizaWszola Jun 26, 2024
6a3ef46
Spelling
ElizaWszola Jun 26, 2024
c676bea
single marlin test should still be disabled
ElizaWszola Jun 27, 2024
53c8ff7
Pass unit tests
ElizaWszola Jul 1, 2024
0fd62c8
integrate with model
ElizaWszola Jul 2, 2024
458c83f
cleanups
ElizaWszola Jul 2, 2024
400e35a
Merge branch 'marlin-moe-act_order' into marlin-moe-integration
ElizaWszola Jul 2, 2024
a0d7f77
Merge branch 'main' into marlin-moe-integration
ElizaWszola Jul 4, 2024
f629593
format
ElizaWszola Jul 4, 2024
c8b79f3
Merge branch 'main' into marlin-moe-integration
ElizaWszola Jul 11, 2024
e405879
Start work on integrating with refactored mixtral code
ElizaWszola Jul 11, 2024
cda9a0f
Runs to completion, but produces garbage
ElizaWszola Jul 12, 2024
c469b74
it works!
ElizaWszola Jul 12, 2024
d8b455f
Cleanup, format, minor fixes
ElizaWszola Jul 12, 2024
7504696
more efficient m blocking, a couple small fixes
ElizaWszola Jul 23, 2024
3641692
Multi-GPU works, but could make it faster
ElizaWszola Jul 31, 2024
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -158,7 +158,8 @@ set(VLLM_EXT_SRC
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/torch_bindings.cpp")
"csrc/torch_bindings.cpp"
)

if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
@@ -214,7 +215,9 @@ define_gpu_extension_target(

set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu")
"csrc/moe/topk_softmax_kernels.cu"
"csrc/moe/marlin_moe_ops.cu"
)

define_gpu_extension_target(
_moe_C
1,320 changes: 1,320 additions & 0 deletions csrc/moe/marlin_moe_ops.cu
Member

How much of this file is copy-pasted from the original Marlin code? Could we factor out common functions? It would make it much easier to review if we could see what the new code is.

Author
@ElizaWszola Jun 26, 2024

There is quite a bit of overlap, and many of the changes boil down to adding one variable or an extra condition here and there. I don't really want to refactor into common functions until act_order is done, because there might be more of these tiny modifications (or is it better to do the refactor now?).

In any case, running a comparison of this file against csrc/quantization/gptq_marlin/gptq_marlin.cu helps to see what changed.

Edit: fixed file name

Member

That's fair for things that may be changed by act_order, but any functions that are copied over unmodified should be factored out, IMO.

Large diffs are not rendered by default.
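
To illustrate the kind of factoring discussed above, here is a minimal sketch of a shared header that both gptq_marlin.cu and marlin_moe_ops.cu could include. The header path, namespace, and the helper shown are illustrative assumptions, not part of this PR:

// csrc/quantization/gptq_marlin/marlin_common.cuh -- hypothetical shared header
#pragma once

namespace marlin {

// Small helpers of this kind are defined identically in both kernels and
// could live in one place instead of being duplicated.
__host__ __device__ constexpr int ceildiv(int a, int b) {
  return (a + b - 1) / b;
}

}  // namespace marlin

// gptq_marlin.cu and marlin_moe_ops.cu would then both #include this header,
// leaving only the MoE-specific logic (expert offsets, sorted token ids,
// top-k weighting) in csrc/moe/marlin_moe_ops.cu.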

7 changes: 7 additions & 0 deletions csrc/moe/marlin_moe_ops.h
@@ -0,0 +1,7 @@
#pragma once

#include <torch/all.h>

torch::Tensor marlin_gemm_moe(torch::Tensor& a, torch::Tensor& b_q_weights, torch::Tensor& sorted_ids, torch::Tensor& topk_weights,
torch::Tensor& b_scales, torch::Tensor& expert_offsets, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
int64_t num_tokens_post_padded, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights);
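
A rough reading of the arguments, inferred from their names and from how the op is wired up in torch_bindings.cpp below; the notes in the following comment block are assumptions, not documentation from the PR:

// Assumed meaning of the arguments (inferred, not authoritative):
//   a                        activations for the tokens being routed
//   b_q_weights              Marlin-packed quantized expert weight matrices
//   sorted_ids               token indices sorted so each expert's tokens are contiguous
//   topk_weights             per-token routing weights from topk_softmax
//   b_scales                 quantization scales for the expert weights
//   expert_offsets           start offset of each expert's segment in sorted_ids
//   workspace                scratch/lock buffer required by the Marlin kernel
//   size_m, size_n, size_k   GEMM problem sizes
//   num_tokens_post_padded   token count after padding to moe_block_size
//   num_experts, topk        number of experts and experts selected per token
//   moe_block_size           block size used by moe_align_block_size
//   replicate_input          whether the input is shared across the top-k expert copies
//   apply_weights            whether outputs are multiplied by topk_weights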
24 changes: 24 additions & 0 deletions csrc/moe/moe_ops.cpp
@@ -0,0 +1,24 @@
// #include <Python.h>

// #include "moe_ops.h"
// #include "marlin_moe_ops.h"

// #include <torch/extension.h>

// #include <pybind11/numpy.h>

// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// // m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs.");
// m.def("marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, Tensor! topk_weights,
// Tensor! b_scales, py::array_t<int>& expert_offsets, Tensor! workspace, size_m, size_n, size_k,
// num_tokens_post_padded, num_experts, topk, moe_block_size, replicate_input, apply_weights)")
// m.impl("marlin_gemm_moe", torch::kCUDA, [](torch::Tensor& a, torch::Tensor& b_q_weights, torch::Tensor& sorted_ids, torch::Tensor& topk_weights,
// torch::Tensor& b_scales, py::array_t<int>& expert_offsets, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
// int64_t num_tokens_post_padded, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights){
// py::buffer_info expert_offsets_bo = expert_offsets.request();
// return marlin_gemm_moe(a, b_q_weights, sorted_ids, topk_weights, b_scales,
// static_cast<int*>(expert_offsets_bo.ptr),
// workspace, size_m, size_n, size_k, num_tokens_post_padded,
// num_experts, topk, moe_block_size, replicate_input, apply_weights);
// }, "Marlin gemm moe kernel.");
// }
17 changes: 17 additions & 0 deletions csrc/moe/torch_bindings.cpp
@@ -1,12 +1,29 @@
#include "registration.h"
#include "moe_ops.h"
#include "marlin_moe_ops.h"

#include <torch/library.h>

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs.
m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

m.def("marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, Tensor! topk_weights, "
"Tensor! b_scales, Tensor! expert_offsets, Tensor! workspace, size_m, size_n, size_k, "
"num_tokens_post_padded, num_experts, topk, moe_block_size, replicate_input, apply_weights) -> Tensor");
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
// m.impl("marlin_gemm_moe", torch::kCUDA, [](torch::Tensor& a, torch::Tensor& b_q_weights, torch::Tensor& sorted_ids, torch::Tensor& topk_weights,
// torch::Tensor& b_scales, py::array_t<int>& expert_offsets, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
// int64_t num_tokens_post_padded, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights){
// py::buffer_info expert_offsets_bo = expert_offsets.request();
// return marlin_gemm_moe(a, b_q_weights, sorted_ids, topk_weights, b_scales,
// static_cast<int*>(expert_offsets_bo.ptr),
// workspace, size_m, size_n, size_k, num_tokens_post_padded,
// num_experts, topk, moe_block_size, replicate_input, apply_weights);
// }, "Marlin gemm moe kernel.");
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
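
For comparison, a sketch of the same schema string with explicit scalar types, which is what the dispatcher-facing signature would look like if every argument were typed; the int/bool annotations are assumptions derived from the C++ declaration in marlin_moe_ops.h, not necessarily what this PR ultimately landed:

m.def(
    "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
    "Tensor! topk_weights, Tensor! b_scales, Tensor! expert_offsets, "
    "Tensor! workspace, int size_m, int size_n, int size_k, "
    "int num_tokens_post_padded, int num_experts, int topk, "
    "int moe_block_size, bool replicate_input, bool apply_weights) -> Tensor");

In this schema syntax, int corresponds to a C++ int64_t and bool to bool, so the plain m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe) registration can bind directly against the declaration above without the commented-out pybind11 lambda.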
4 changes: 4 additions & 0 deletions csrc/ops.h
@@ -73,6 +73,10 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& workspace,
int64_t size_m, int64_t size_n, int64_t size_k);

// torch::Tensor marlin_gemm_moe(torch::Tensor& a, torch::Tensor& b_q_weights, torch::Tensor& sorted_ids, torch::Tensor& topk_weights,
// torch::Tensor& b_scales, torch::Tensor& expert_offsets, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
// int64_t num_tokens_post_padded, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights);

torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_meta,
torch::Tensor& b_scales,
3 changes: 3 additions & 0 deletions csrc/torch_bindings.cpp
@@ -121,6 +121,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("marlin_gemm", &marlin_gemm);
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);

// ops.def("marlin_gemm_moe", &marlin_gemm_moe);
// ops.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);

// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);