
[Misc] Fix get_min_capability (vllm-project#5971)
dsikka authored and robertgshaw2-neuralmagic committed Jul 1, 2024
1 parent 9c74b00 commit 4153e58
Showing 5 changed files with 17 additions and 6 deletions.
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/awq.py
@@ -43,7 +43,8 @@ def get_name(self) -> str:
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]

-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         # The AWQ kernel only supports Turing or newer GPUs.
         return 75

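A quick illustration of the practical effect of the classmethod change (a sketch, not part of the commit): the capability floor can now be read straight off the class, without constructing a config object first. This assumes the config class defined in awq.py is AWQConfig.

from vllm.model_executor.layers.quantization.awq import AWQConfig

# No AWQConfig instance is needed; the minimum capability is a class-level fact.
assert AWQConfig.get_min_capability() == 75  # Turing or newer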
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/base_config.py
@@ -44,8 +44,9 @@ def get_supported_act_dtypes(self) -> List[torch.dtype]:
         """List of supported activation dtypes."""
         raise NotImplementedError

+    @classmethod
     @abstractmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         """Minimum GPU capability to support the quantization method.

         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
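For context, the numbers in the docstring follow CUDA's compute-capability encoding: major * 10 + minor (Volta 7.0 is 70, Turing 7.5 is 75, Ampere 8.0 is 80). Below is a minimal sketch of how a caller might compare the current device against this hook; the helper name is illustrative, not vLLM's own API.

import torch

def device_meets_min_capability(quant_config_cls) -> bool:
    # torch reports (major, minor), e.g. (7, 5) on a Turing GPU.
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor >= quant_config_cls.get_min_capability()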
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -38,7 +38,7 @@ def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.float32, torch.float16, torch.bfloat16]

     @classmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         return 70

     @staticmethod
12 changes: 10 additions & 2 deletions vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -33,10 +33,9 @@ def get_scaled_act_names(self) -> List[str]:
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
         return [torch.float16, torch.bfloat16]

-    # Need to figure it out
     @classmethod
     def get_min_capability(cls) -> int:
-        return 60
+        return 75

     def get_name(self) -> str:
         return "compressed_tensors"
@@ -84,6 +83,14 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
     def get_config_filenames(cls) -> List[str]:
         return []

+    def _check_gptq_and_marlin_can_run(self):
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability < 80:
+            raise RuntimeError("The quantization config is not supported for ",
+                               "the current GPU. Minimum capability: 80. ",
+                               f"Current capability: {capability}.")
+
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
@@ -126,6 +133,7 @@ def _get_schema(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> "CompressedTensorsScheme":

         if self._is_wNa16_group_channel(weight_quant, input_quant):
+            self._check_gptq_and_marlin_can_run()
             if (self.quant_format == CompressionFormat.marlin_24.value
                     and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
                 return CompressedTensorsW4A16Sparse24(
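A worked example of the arithmetic in the new _check_gptq_and_marlin_can_run (device names are illustrative, not part of the commit): an A100 reports (8, 0), so 8 * 10 + 0 = 80 and the check passes; a Turing T4 reports (7, 5), so 75 < 80 and the RuntimeError is raised. As the _get_schema hunk shows, the check only runs on the wNa16 group/channel path, so other compressed-tensors schemes are unaffected.

# Sketch of the same check in isolation (illustrative values):
for major, minor in [(8, 0), (7, 5)]:    # e.g. A100, T4
    capability = major * 10 + minor      # 80, 75
    print(capability, "supported" if capability >= 80 else "unsupported")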
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/squeezellm.py
@@ -39,7 +39,8 @@ def get_name(self) -> str:
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]

-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         return 70

     @staticmethod
