Commit

working e2e
robertgshaw2-neuralmagic committed Nov 18, 2024
1 parent df462b5 commit 14f6141
Showing 2 changed files with 6 additions and 22 deletions.

@@ -55,7 +55,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
"""

w_compressed, meta = ops.cutlass_compress_entry(layer.weight)
layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
layer.w_compressed = torch.nn.Parameter(w_compressed, requires_grad=False)
layer.meta = torch.nn.Parameter(meta, requires_grad=False)

def apply_weights(self,
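
For context, a minimal sketch of the compression step after this rename, reconstructed from the hunk above. `ops.cutlass_compress_entry` is taken as given from the diff; the `from vllm import _custom_ops as ops` import and the 2:4 structured-sparsity reading of `meta` are assumptions.

```python
import torch

from vllm import _custom_ops as ops


def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    # Pack the sparse weight into its surviving values plus sparsity
    # metadata (presumably 2:4 structured sparsity for the CUTLASS kernel).
    w_compressed, meta = ops.cutlass_compress_entry(layer.weight)
    # Store the packed tensor under a new attribute instead of overwriting
    # layer.weight, so consumers cannot mistake it for the dense weight.
    layer.w_compressed = torch.nn.Parameter(w_compressed, requires_grad=False)
    layer.meta = torch.nn.Parameter(meta, requires_grad=False)
```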
@@ -73,20 +73,12 @@ def apply_weights(self,
         :param bias: The bias to be added to the output tensor
         :return: The output tensor of the layer
         """
-
-        PAD_MULTIPLE = 16
-        remainder = x.shape[0] % 16
-        pad_size = PAD_MULTIPLE - remainder if remainder > 0 else 0
-        if pad_size > 0:
-            input = torch.nn.functional.pad(x, (0,0,0,pad_size), value=0)
-        else:
-            input = x
-
-
         q_input, input_scale = ops.scaled_fp8_quant(
-            input, use_per_token_if_dynamic=True)
+            x, use_per_token_if_dynamic=True)

         out = ops.cutlass_scaled_sparse_mm(
-            a=layer.weight,
+            a=layer.w_compressed,
             e=layer.meta,
             b=q_input.t(),
             scale_a=layer.weight_scale,
@@ -96,12 +88,7 @@ def apply_weights(self,
         )
 
         out = out.t()
-        if pad_size > 0:
-            out = out[:-pad_size,:].contiguous()
-            # ^ this is of shape [5, 6144]
-        else:
-            out = out.contiguous()
-
+        assert out.is_contiguous()
         return out
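
The deleted block was a pad-to-a-multiple-of-16 workaround: activation rows were zero-padded before quantization, and the padding was sliced back off the transposed output. Its removal implies the sparse kernel now accepts arbitrary row counts, with `assert out.is_contiguous()` left to document that the transposed result is expected to be contiguous as-is. A standalone sketch of the removed pattern, with illustrative helper names:

```python
import torch

PAD_MULTIPLE = 16  # the kernel previously required rows % 16 == 0


def pad_rows(x: torch.Tensor) -> tuple[torch.Tensor, int]:
    # F.pad pads from the last dimension inward: (left, right, top, bottom),
    # so (0, 0, 0, pad_size) appends zero rows at the bottom only.
    remainder = x.shape[0] % PAD_MULTIPLE
    pad_size = PAD_MULTIPLE - remainder if remainder > 0 else 0
    if pad_size > 0:
        x = torch.nn.functional.pad(x, (0, 0, 0, pad_size), value=0)
    return x, pad_size


def unpad_rows(out: torch.Tensor, pad_size: int) -> torch.Tensor:
    # Drop the rows that correspond to padding and force a contiguous copy.
    return out[:-pad_size, :].contiguous() if pad_size > 0 else out.contiguous()
```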


vllm/model_executor/layers/quantization/utils/w8a8_utils.py (3 changes: 0 additions & 3 deletions)

@@ -109,9 +109,6 @@ def apply_fp8_linear(
         use_per_token_if_dynamic=use_per_token_if_dynamic)
 
     # Fused GEMM_DQ
-    breakpoint()
-
-
     output = ops.cutlass_scaled_mm(qinput,
                                    weight,
                                    out_dtype=input.dtype,
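The only change in this file drops a stray `breakpoint()` left over from debugging, which would have suspended every forward pass right before the fused GEMM_DQ. For reference, a sketch of the surrounding call as reconstructed from the visible context; the diff truncates the argument list, so the `scale_a`/`scale_b` keywords are assumptions:

```python
# Dynamic per-token FP8 quantization of the activations, then a fused
# GEMM_DQ that applies both dequantization scales inside the matmul.
qinput, x_scale = ops.scaled_fp8_quant(
    input, use_per_token_if_dynamic=use_per_token_if_dynamic)

output = ops.cutlass_scaled_mm(qinput,
                               weight,
                               out_dtype=input.dtype,
                               scale_a=x_scale,       # per-token activation scale (assumed)
                               scale_b=weight_scale)  # per-channel weight scale (assumed)
```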
