From 70f40690f158c1a6e02ea7c8f8b5f73e40eca1ce Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Fri, 1 Nov 2024 18:02:18 +0000
Subject: [PATCH 1/4] change defaults and name

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/observers/min_max.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py
index 5c59a6573..319660c27 100644
--- a/src/llmcompressor/observers/min_max.py
+++ b/src/llmcompressor/observers/min_max.py
@@ -21,18 +21,18 @@
 from llmcompressor.observers.base import Observer
 
-__all__ = ["MovingAverageMinMaxObserver"]
+__all__ = ["MinMaxObserver"]
 
 
 @Observer.register("minmax")
-class MovingAverageMinMaxObserver(Observer):
+class MinMaxObserver(Observer):
     """
     Implements a dynamic quantization observer that sets the scale and
     zero point based on a moving average of the overall min and max observed values
     """
 
     def __init__(
-        self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01
+        self, quantization_args: QuantizationArgs, averaging_constant: float = 1.0
     ):
         super().__init__(quantization_args=quantization_args)
 
@@ -66,6 +66,10 @@ def calculate_qparams(
         min_val = torch.amin(observed, dim=reduce_dims, keepdims=True)
         max_val = torch.amax(observed, dim=reduce_dims, keepdims=True)
 
+        # early stopping, save some computation and memory
+        if self.averaging_constant == 1.0:
+            return calculate_qparams(min_val, max_val, self.quantization_args)
+
         running_min_val = self.min_val.get(tensor_id, None)
         running_max_val = self.max_val.get(tensor_id, None)

From 56214ae66ecd320c64c7fc9c1c5d4bb9bcfc993b Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Fri, 1 Nov 2024 18:06:39 +0000
Subject: [PATCH 2/4] update docstring, typehints

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/observers/min_max.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py
index 319660c27..b15954692 100644
--- a/src/llmcompressor/observers/min_max.py
+++ b/src/llmcompressor/observers/min_max.py
@@ -17,7 +17,6 @@
 import torch
 from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.utils import calculate_qparams
-from torch import FloatTensor, IntTensor, Tensor
 
 from llmcompressor.observers.base import Observer
 
@@ -27,8 +26,9 @@
 @Observer.register("minmax")
 class MinMaxObserver(Observer):
     """
-    Implements a dynamic quantization observer that sets the scale and
-    zero point based on a moving average of the overall min and max observed values
+    Implements a quantization observer that calculates scale and zero point based on the
+    minimum and maximum values of the tensor being observed. If averaging_constant is
+    specified, then the scales are updated using a moving average
     """
 
     def __init__(
@@ -42,13 +42,13 @@ def __init__(
 
     def calculate_qparams(
         self,
-        observed: Tensor,
+        observed: torch.Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
         tensor_id: Optional[Any] = None,
-    ) -> Tuple[FloatTensor, IntTensor]:
+    ) -> Tuple[torch.FloatTensor, torch.IntTensor]:
         """
-        Updates the observed min and max using a moving average smoothed by the
-        averaging_constant
+        Updates the observed min and max values. If averaging_constant is provided, then
+        the values are updated using a moving average smoothed by the averaging_constant
 
         :param observed: observed tensor to calculate quantization parameters for
         :param reduce_dims: optional tuple of dimensions to reduce along,
@@ -92,8 +92,11 @@ def calculate_qparams(
         )
 
     def get_qparams_along_dim(
-        self, observed, dim: int, tensor_id: Optional[Any] = None
+        self, observed: torch.Tensor, dim: int, tensor_id: Optional[Any] = None
     ):
+        """
+        Calculate quantization parameters along the specified dimension
+        """
         reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
         return self.calculate_qparams(
             observed, reduce_dims=reduce_dims, tensor_id=tensor_id

From 6049f4f6034d11c555884f90cdb1358bec4784ce Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Fri, 1 Nov 2024 18:11:55 +0000
Subject: [PATCH 3/4] change defaulting averaging_constant

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/modifiers/quantization/calibration.py | 5 ++---
 src/llmcompressor/observers/min_max.py                  | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index 9b6f900e4..69079f2d8 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -49,10 +49,9 @@ def initialize_observer(
     quantization_args = getattr(quantization_scheme, arg_name, None)
 
     # dont need observers for dynamic
-    if quantization_args and not quantization_args.dynamic:
-        observer = quantization_args.get_observer()
+    if quantization_args is not None and not quantization_args.dynamic:
         observer = Observer.load_from_registry(
-            observer, quantization_args=quantization_args
+            quantization_args.observer, quantization_args=quantization_args
         )
         module.register_module(f"{base_name}_observer", observer)
 
diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py
index b15954692..33b0d422f 100644
--- a/src/llmcompressor/observers/min_max.py
+++ b/src/llmcompressor/observers/min_max.py
@@ -32,7 +32,7 @@ class MinMaxObserver(Observer):
     """
 
     def __init__(
-        self, quantization_args: QuantizationArgs, averaging_constant: float = 1.0
+        self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01
     ):
         super().__init__(quantization_args=quantization_args)
 

From 687b3e92422e61c8c4c677d45fb7e5b1312b4d5d Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 4 Nov 2024 21:40:15 +0000
Subject: [PATCH 4/4] update docstring

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/observers/min_max.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py
index 33b0d422f..f3a5f6b74 100644
--- a/src/llmcompressor/observers/min_max.py
+++ b/src/llmcompressor/observers/min_max.py
@@ -47,8 +47,8 @@ def calculate_qparams(
         tensor_id: Optional[Any] = None,
     ) -> Tuple[torch.FloatTensor, torch.IntTensor]:
         """
-        Updates the observed min and max values. If averaging_constant is provided, then
-        the values are updated using a moving average smoothed by the averaging_constant
+        Updates the observed min and max using a moving average smoothed by the
+        averaging_constant. Set the averaging_constant to 1.0 to disable averaging.
 
         :param observed: observed tensor to calculate quantization parameters for
         :param reduce_dims: optional tuple of dimensions to reduce along,
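
The net behavior of the series is easiest to see outside the diff context. Below
is a minimal sketch of the moving-average min/max update, assuming per-tensor
(scalar) statistics; the observer's tensor_id bookkeeping and reduce_dims
handling are omitted, so this is an illustration rather than the library's
implementation. It also shows why patch 1's early return is safe: with
averaging_constant=1.0 the running values are always replaced outright.

from typing import Optional, Tuple

import torch


def update_min_max(
    observed: torch.Tensor,
    running_min: Optional[torch.Tensor],
    running_max: Optional[torch.Tensor],
    averaging_constant: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
    min_val = torch.amin(observed)
    max_val = torch.amax(observed)

    # first observation, or averaging_constant == 1.0: the running values are
    # replaced outright, so no running-value bookkeeping is needed (this is
    # the early return added in patch 1)
    if running_min is None or averaging_constant == 1.0:
        return min_val, max_val

    # exponential moving average: new = old + c * (observed - old)
    new_min = running_min + averaging_constant * (min_val - running_min)
    new_max = running_max + averaging_constant * (max_val - running_max)
    return new_min, new_max

With the 0.01 default restored in patch 3, each calibration batch nudges the
running statistics by 1% of the gap to the new values, smoothing over outlier
batches.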
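
get_qparams_along_dim (documented in patch 2) reduces over every dimension
except the one of interest, so one min/max pair is kept per slice along that
dimension. A standalone illustration of the same reduction, with a hypothetical
2-D weight shape:

import torch

observed = torch.randn(128, 512)  # e.g. a linear layer's weight
dim = 0                           # keep statistics per output channel

# reduce over all axes except `dim`, keeping dims so the resulting min/max
# broadcast back against the observed tensor
reduce_dims = tuple(idx for idx in range(observed.ndim) if idx != dim)
min_val = torch.amin(observed, dim=reduce_dims, keepdim=True)  # shape (128, 1)
max_val = torch.amax(observed, dim=reduce_dims, keepdim=True)  # shape (128, 1)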
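
Patch 3 also changes how calibration resolves the observer: it is loaded from
the registry by the name stored on quantization_args.observer rather than via
the removed get_observer() call. A usage sketch, under the assumption that
QuantizationArgs can be constructed directly with the observer and dynamic
fields the diff references (the real calibration code pulls it off a module's
quantization_scheme instead):

from compressed_tensors.quantization.quant_args import QuantizationArgs

from llmcompressor.observers.base import Observer
from llmcompressor.observers.min_max import MinMaxObserver  # registers "minmax"

quantization_args = QuantizationArgs(observer="minmax")  # assumed construction
if not quantization_args.dynamic:  # dynamic quantization needs no observer
    observer = Observer.load_from_registry(
        quantization_args.observer, quantization_args=quantization_args
    )
    assert isinstance(observer, MinMaxObserver)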