Commit

working e2e
robertgshaw2-neuralmagic committed Nov 18, 2024
1 parent df462b5 commit 14f6141
Showing 2 changed files with 6 additions and 22 deletions.

@@ -55,7 +55,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
"""

w_compressed, meta = ops.cutlass_compress_entry(layer.weight)
layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
layer.w_compressed = torch.nn.Parameter(w_compressed, requires_grad=False)
layer.meta = torch.nn.Parameter(meta, requires_grad=False)

def apply_weights(self,
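
For context, a minimal sketch of the compression step after this rename, reconstructed from the hunk above. `ops.cutlass_compress_entry` is taken as given from the diff; the `from vllm import _custom_ops as ops` import and the 2:4 structured-sparsity reading of `meta` are assumptions.

```python
import torch

from vllm import _custom_ops as ops


def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Pack the sparse weight into its surviving values plus sparsity
    # metadata (presumably 2:4 structured sparsity for the CUTLASS kernel).
    w_compressed, meta = ops.cutlass_compress_entry(layer.weight)
    # Store the packed tensor under a new attribute instead of overwriting
    # layer.weight, so consumers cannot mistake it for the dense weight.
    layer.w_compressed = torch.nn.Parameter(w_compressed, requires_grad=False)
    layer.meta = torch.nn.Parameter(meta, requires_grad=False)
```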
@@ -73,20 +73,12 @@ def apply_weights(self,
         :param bias: The bias to be added to the output tensor
         :return: The output tensor of the layer
         """
-
-        PAD_MULTIPLE = 16
-        remainder = x.shape[0] % 16
-        pad_size = PAD_MULTIPLE - remainder if remainder > 0 else 0
-        if pad_size > 0:
-            input = torch.nn.functional.pad(x, (0,0,0,pad_size), value=0)
-        else:
-            input = x
-
-
         q_input, input_scale = ops.scaled_fp8_quant(
-            input, use_per_token_if_dynamic=True)
+            x, use_per_token_if_dynamic=True)

         out = ops.cutlass_scaled_sparse_mm(
-            a=layer.weight,
+            a=layer.w_compressed,
             e=layer.meta,
             b=q_input.t(),
             scale_a=layer.weight_scale,
@@ -96,12 +88,7 @@ def apply_weights(self,
         )
 
         out = out.t()
-        if pad_size > 0:
-            out = out[:-pad_size,:].contiguous()
-            # ^ this is of shape [5, 6144]
-        else:
-            out = out.contiguous()
-
+        assert out.is_contiguous()
         return out
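
The deleted block was a pad-to-a-multiple-of-16 workaround: activation rows were zero-padded before quantization, and the padding was sliced back off the transposed output. Its removal implies the sparse kernel now accepts arbitrary row counts, with `assert out.is_contiguous()` left to document that the transposed result is expected to be contiguous as-is. A standalone sketch of the removed pattern, with illustrative helper names:

```python
import torch

PAD_MULTIPLE = 16  # the kernel previously required rows % 16 == 0


def pad_rows(x: torch.Tensor) -> tuple[torch.Tensor, int]:
    # F.pad pads from the last dimension inward: (left, right, top, bottom),
    # so (0, 0, 0, pad_size) appends zero rows at the bottom only.
    remainder = x.shape[0] % PAD_MULTIPLE
    pad_size = PAD_MULTIPLE - remainder if remainder > 0 else 0
    if pad_size > 0:
        x = torch.nn.functional.pad(x, (0, 0, 0, pad_size), value=0)
    return x, pad_size


def unpad_rows(out: torch.Tensor, pad_size: int) -> torch.Tensor:
    # Drop the rows that correspond to padding and force a contiguous copy.
    return out[:-pad_size, :].contiguous() if pad_size > 0 else out.contiguous()
```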


vllm/model_executor/layers/quantization/utils/w8a8_utils.py (3 changes: 0 additions & 3 deletions)

@@ -109,9 +109,6 @@ def apply_fp8_linear(
         use_per_token_if_dynamic=use_per_token_if_dynamic)
 
     # Fused GEMM_DQ
-    breakpoint()
-
-
     output = ops.cutlass_scaled_mm(qinput,
                                    weight,
                                    out_dtype=input.dtype,
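The only change in this file drops a stray `breakpoint()` left over from debugging, which would have suspended every forward pass right before the fused GEMM_DQ. For reference, a sketch of the surrounding call as reconstructed from the visible context; the diff truncates the argument list, so the `scale_a`/`scale_b` keywords are assumptions:

```python
# Dynamic per-token FP8 quantization of the activations, then a fused
# GEMM_DQ that applies both dequantization scales inside the matmul.
qinput, x_scale = ops.scaled_fp8_quant(
    input, use_per_token_if_dynamic=use_per_token_if_dynamic)

output = ops.cutlass_scaled_mm(qinput,
                               weight,
                               out_dtype=input.dtype,
                               scale_a=x_scale,       # per-token activation scale (assumed)
                               scale_b=weight_scale)  # per-channel weight scale (assumed)
```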
