From 0e6f64ef20da96f27f66fc5d40f0c0b7f470201c Mon Sep 17 00:00:00 2001 From: Anjok07 <68268275+Anjok07@users.noreply.github.com> Date: Mon, 13 Jun 2022 02:10:10 -0500 Subject: [PATCH] Delete diffq directory --- diffq/__init__.py | 18 --- diffq/base.py | 262 ------------------------------------------ diffq/diffq.py | 286 ---------------------------------------------- diffq/uniform.py | 121 -------------------- diffq/utils.py | 37 ------ 5 files changed, 724 deletions(-) delete mode 100644 diffq/__init__.py delete mode 100644 diffq/base.py delete mode 100644 diffq/diffq.py delete mode 100644 diffq/uniform.py delete mode 100644 diffq/utils.py diff --git a/diffq/__init__.py b/diffq/__init__.py deleted file mode 100644 index 2b997ee..0000000 --- a/diffq/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -# flake8: noqa -""" -This package implements different quantization strategies: - -- `diffq.uniform.UniformQuantizer`: classic uniform quantization over n bits. -- `diffq.diffq.DiffQuantizer`: differentiable quantizer based on scaled noise injection. - -Also, do check `diffq.base.BaseQuantizer` for the common methods of all Quantizers. -""" - -from .uniform import UniformQuantizer -from .diffq import DiffQuantizer diff --git a/diffq/base.py b/diffq/base.py deleted file mode 100644 index 9bd5276..0000000 --- a/diffq/base.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass -from concurrent import futures -from fnmatch import fnmatch -from functools import partial -import io -import math -from multiprocessing import cpu_count -import typing as tp -import zlib - -import torch - - -class BaseQuantizer: - @dataclass - class _QuantizedParam: - name: str - param: torch.nn.Parameter - module: torch.nn.Module - # If a Parameter is used multiple times, `other` can be used - # to share state between the different Quantizers - other: tp.Optional[tp.Any] - - def __init__(self, model: torch.nn.Module, min_size: float = 0.01, float16: bool = False, - exclude: tp.Optional[tp.List[str]] = [], detect_bound: bool = True): - self.model = model - self.min_size = min_size - self.float16 = float16 - self.exclude = exclude - self.detect_bound = detect_bound - self._quantized = False - self._pre_handle = self.model.register_forward_pre_hook(self._forward_pre_hook) - self._post_handle = self.model.register_forward_hook(self._forward_hook) - - self._quantized_state = None - self._qparams = [] - self._float16 = [] - self._others = [] - self._rnns = [] - - self._saved = [] - - self._find_params() - - def _find_params(self): - min_params = self.min_size * 2**20 // 4 - previous = {} - for module_name, module in self.model.named_modules(): - if isinstance(module, torch.nn.RNNBase): - self._rnns.append(module) - for name, param in list(module.named_parameters(recurse=False)): - full_name = f"{module_name}.{name}" - matched = False - for pattern in self.exclude: - if fnmatch(full_name, pattern) or fnmatch(name, pattern): - matched = True - break - - if param.numel() <= min_params or matched: - if id(param) in previous: - continue - if self.detect_bound: - previous[id(param)] = None - if self.float16: - 
self._float16.append(param) - else: - self._others.append(param) - else: - qparam = self._register_param(name, param, module, previous.get(id(param))) - if self.detect_bound: - previous[id(param)] = qparam - self._qparams.append(qparam) - - def _register_param(self, name, param, module, other): - return self.__class__._QuantizedParam(name, param, module, other) - - def _forward_pre_hook(self, module, input): - if self.model.training: - self._quantized_state = None - if self._quantized: - self.unquantize() - if self._pre_forward_train(): - self._fix_rnns() - else: - self.quantize() - - def _forward_hook(self, module, input, output): - if self.model.training: - if self._post_forward_train(): - self._fix_rnns(flatten=False) # Hacky, next forward will flatten - - def quantize(self, save=True): - """ - Immediately apply quantization to the model parameters. - If `save` is True, save a copy of the unquantized parameters, that can be - restored with `unquantize()`. - """ - if self._quantized: - return - if save: - self._saved = [qp.param.data.to('cpu', copy=True) - for qp in self._qparams if qp.other is None] - self.restore_quantized_state(self.get_quantized_state()) - self._quantized = True - self._fix_rnns() - - def unquantize(self): - """ - Revert a previous call to `quantize()`. - """ - if not self._quantized: - raise RuntimeError("Can only be called on a quantized model.") - if not self._saved: - raise RuntimeError("Nothing to restore.") - for qparam in self._qparams: - if qparam.other is None: - qparam.param.data[:] = self._saved.pop(0) - assert len(self._saved) == 0 - self._quantized = False - self._fix_rnns() - - def _pre_forward_train(self) -> bool: - """ - Called once before each forward for continuous quantization. - Should return True if parameters were changed. - """ - return False - - def _post_forward_train(self) -> bool: - """ - Called once after each forward (to restore state for instance). - Should return True if parameters were changed. - """ - return False - - def _fix_rnns(self, flatten=True): - """ - To be called after quantization happened to fix RNNs. - """ - for rnn in self._rnns: - rnn._flat_weights = [ - (lambda wn: getattr(rnn, wn) if hasattr(rnn, wn) else None)(wn) - for wn in rnn._flat_weights_names] - if flatten: - rnn.flatten_parameters() - - def get_quantized_state(self): - """ - Returns sufficient quantized information to rebuild the model state. - - ..Note:: - To achieve maximum compression, you should compress this with - gzip or other, as quantized weights are not optimally coded! - """ - if self._quantized_state is None: - self._quantized_state = self._get_quantized_state() - return self._quantized_state - - def _get_quantized_state(self): - """ - Actual implementation for `get_quantized_state`. - """ - float16_params = [] - for p in self._float16: - q = p.data.half() - float16_params.append(q) - - return { - "quantized": [self._quantize_param(qparam) for qparam in self._qparams - if qparam.other is None], - "float16": float16_params, - "others": [p.data.clone() for p in self._others], - } - - def _quantize_param(self, qparam: _QuantizedParam) -> tp.Any: - """ - To be overriden. - """ - raise NotImplementedError() - - def _unquantize_param(self, qparam: _QuantizedParam, quantized: tp.Any) -> torch.Tensor: - """ - To be overriden. - """ - raise NotImplementedError() - - def restore_quantized_state(self, state) -> None: - """ - Restore the state of the model from the quantized state. 
- """ - for p, q in zip(self._float16, state["float16"]): - p.data[:] = q.to(p) - - for p, q in zip(self._others, state["others"]): - p.data[:] = q - - remaining = list(state["quantized"]) - for qparam in self._qparams: - if qparam.other is not None: - # Only unquantize first appearance of nn.Parameter. - continue - quantized = remaining.pop(0) - qparam.param.data[:] = self._unquantize_param(qparam, quantized) - self._fix_rnns() - - def detach(self) -> None: - """ - Detach from the model, removes hooks and anything else. - """ - self._pre_handle.remove() - self._post_handle.remove() - - def model_size(self) -> torch.Tensor: - """ - Returns an estimate of the quantized model size. - """ - total = torch.tensor(0.) - for p in self._float16: - total += 16 * p.numel() - for p in self._others: - total += 32 * p.numel() - return total / 2**20 / 8 # bits to MegaBytes - - def true_model_size(self) -> float: - """ - Return the true quantized model size, in MB, without extra - compression. - """ - return self.model_size().item() - - def compressed_model_size(self, compress_level=-1, num_workers=8) -> float: - """ - Return the compressed quantized model size, in MB. - - Args: - compress_level (int): compression level used with zlib, - see `zlib.compress` for details. - num_workers (int): will split the final big byte representation in that - many chunks processed in parallels. - """ - out = io.BytesIO() - torch.save(self.get_quantized_state(), out) - ms = _parallel_compress_len(out.getvalue(), compress_level, num_workers) - return ms / 2 ** 20 - - -def _compress_len(data, compress_level): - return len(zlib.compress(data, level=compress_level)) - - -def _parallel_compress_len(data, compress_level, num_workers): - num_workers = min(cpu_count(), num_workers) - chunk_size = int(math.ceil(len(data) / num_workers)) - chunks = [data[offset:offset + chunk_size] for offset in range(0, len(data), chunk_size)] - with futures.ProcessPoolExecutor(num_workers) as pool: - return sum(pool.map(partial(_compress_len, compress_level=compress_level), chunks)) diff --git a/diffq/diffq.py b/diffq/diffq.py deleted file mode 100644 index b475ec7..0000000 --- a/diffq/diffq.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -""" -Differentiable quantizer based on scaled noise injection. -""" -from dataclasses import dataclass -import math -import typing as tp - -import torch - -from .base import BaseQuantizer -from .uniform import uniform_quantize, uniform_unquantize -from .utils import simple_repr - - -class DiffQuantizer(BaseQuantizer): - @dataclass - class _QuantizedParam(BaseQuantizer._QuantizedParam): - logit: torch.nn.Parameter - - def __init__(self, model: torch.nn.Module, min_size: float = 0.01, float16: bool = False, - group_size: int = 1, min_bits: float = 2, max_bits: float = 15, - param="bits", noise="gaussian", - init_bits: float = 8, extra_bits: float = 0, suffix: str = "_diffq", - exclude: tp.List[str] = [], detect_bound: bool = True): - """ - Differentiable quantizer based on scaled noise injection. - For every parameter `p` in the model, this introduces a number of bits parameter - `b` with the same dimensions (when group_size = 1). - Before each forward, `p` is replaced by `p + U` - with U uniform iid noise with range [-d/2, d/2], with `d` the uniform quantization - step for `b` bits. 
- This noise approximates the quantization noise in a differentiable manner, both - with respect to the unquantized parameter `p` and the number of bits `b`. - - At eveluation (as detected with `model.eval()`), the model is replaced - by its true quantized version, and restored when going back to training. - - When doing actual quantization (for serialization, or evaluation), - the number of bits is rounded to the nearest integer, and needs to be stored along. - This will cost a few bits per dimension. To reduce this cost, one can use `group_size`, - which will use a single noise level for multiple weight entries. - - You can use the `DiffQuantizer.model_size` method to get a differentiable estimate of the - model size in MB. You can then use this estimate as a penalty in your training loss. - - Args: - model (torch.nn.Module): model to quantize - min_size (float): minimum size in MB of a parameter to be quantized. - float16 (bool): if a layer is smaller than min_size, should we still do float16? - group_size (int): weight entries are groupped together to reduce the number - of noise scales to store. This should divide the size of all parameters - bigger than min_size. - min_bits (float): minimal number of bits. - max_bits (float): maximal number of bits. - init_bits (float): initial number of bits. - extra_bits (float): extra bits to add for actual quantization (before roundoff). - suffix (str): suffix used for the name of the extra noise scale parameters. - exclude (list[str]): list of patterns used to match parameters to exclude. - For instance `['bias']` to exclude all bias terms. - detect_bound (bool): if True, will detect bound parameters and reuse - the same quantized tensor for both, as well as the same number of bits. - - ..Warning:: - You must call `model.training()` and `model.eval()` for `DiffQuantizer` work properly. - - """ - self.group_size = group_size - self.min_bits = min_bits - self.max_bits = max_bits - self.init_bits = init_bits - self.extra_bits = extra_bits - self.suffix = suffix - self.param = param - self.noise = noise - assert noise in ["gaussian", "uniform"] - self._optimizer_setup = False - - self._min_noise = 1 / (2 ** self.max_bits - 1) - self._max_noise = 1 / (2 ** self.min_bits - 1) - - assert group_size >= 0 - assert min_bits < init_bits < max_bits, \ - "init_bits must be between min_bits and max_bits excluded3" - - for name, _ in model.named_parameters(): - if name.endswith(suffix): - raise RuntimeError("The model already has some noise scales parameters, " - "maybe you used twice a DiffQuantizer on the same model?.") - - super().__init__(model, min_size, float16, exclude, detect_bound) - - def _get_bits(self, logit: torch.Tensor): - if self.param == "noise": - return torch.log2(1 + 1 / self._get_noise_scale(logit)) - else: - t = torch.sigmoid(logit) - return self.max_bits * t + (1 - t) * self.min_bits - - def _get_noise_scale(self, logit: torch.Tensor): - if self.param == "noise": - t = torch.sigmoid(logit) - return torch.exp(t * math.log(self._min_noise) + (1 - t) * math.log(self._max_noise)) - else: - return 1 / (2 ** self._get_bits(logit) - 1) - - def _register_param(self, name, param, module, other): - if other is not None: - return self.__class__._QuantizedParam( - name=name, param=param, module=module, logit=other.logit, other=other) - assert self.group_size == 0 or param.numel() % self.group_size == 0 - # we want the initial number of bits to be init_bits. 
- if self.param == "noise": - noise_scale = 1 / (2 ** self.init_bits - 1) - t = (math.log(noise_scale) - math.log(self._max_noise)) / ( - math.log(self._min_noise) - math.log(self._max_noise)) - else: - t = (self.init_bits - self.min_bits) / (self.max_bits - self.min_bits) - assert 0 < t < 1 - logit = torch.logit(torch.tensor(float(t))) - assert abs(self._get_bits(logit) - self.init_bits) < 1e-5 - if self.group_size > 0: - nparam = param.numel() // self.group_size - else: - nparam = 1 - logit = torch.nn.Parameter( - torch.full( - (nparam,), - logit, - device=param.device)) - module.register_parameter(name + self.suffix, logit) - return self.__class__._QuantizedParam( - name=name, param=param, module=module, logit=logit, other=None) - - def clear_optimizer(self, optimizer: torch.optim.Optimizer): - params = [qp.logit for qp in self._qparams] - - for group in optimizer.param_groups: - new_params = [] - for q in list(group["params"]): - matched = False - for p in params: - if p is q: - matched = True - if not matched: - new_params.append(q) - group["params"][:] = new_params - - def setup_optimizer(self, optimizer: torch.optim.Optimizer, - lr: float = 1e-3, **kwargs): - """ - Setup the optimizer to tune the number of bits. In particular, this will deactivate - weight decay for the bits parameters. - - Args: - optimizer (torch.Optimizer): optimizer to use. - lr (float): specific learning rate for the bits parameters. 1e-3 - is perfect for Adam.,w - kwargs (dict): overrides for other optimization parameters for the bits. - """ - assert not self._optimizer_setup - self._optimizer_setup = True - - params = [qp.logit for qp in self._qparams] - - for group in optimizer.param_groups: - for q in list(group["params"]): - for p in params: - if p is q: - raise RuntimeError("You should create the optimizer " - "before the quantizer!") - - group = {"params": params, "lr": lr, "weight_decay": 0} - group.update(kwargs) - optimizer.add_param_group(group) - - def no_optimizer(self): - """ - Call this if you do not want to use an optimizer. - """ - self._optimizer_setup = True - - def check_unused(self): - for qparam in self._qparams: - if qparam.other is not None: - continue - grad = qparam.param.grad - if grad is None or (grad == 0).all(): - if qparam.logit.grad is not None: - qparam.logit.grad.data.zero_() - - def model_size(self, exact=False): - """ - Differentiable estimate of the model size. - The size is returned in MB. - - If `exact` is True, then the output is no longer differentiable but - reflect exactly an achievable size, even without compression, - i.e.same as returned by `naive_model_size()`. - """ - total = super().model_size() - subtotal = 0 - for qparam in self._qparams: - # only count the first appearance of a Parameter - if qparam.other is not None: - continue - bits = self.extra_bits + self._get_bits(qparam.logit) - if exact: - bits = bits.round().clamp(1, 15) - if self.group_size == 0: - group_size = qparam.param.numel() - else: - group_size = self.group_size - subtotal += group_size * bits.sum() - subtotal += 2 * 32 # param scale - - # Number of bits to represent each number of bits - bits_bits = math.ceil(math.log2(1 + (bits.max().round().item() - self.min_bits))) - subtotal += 8 # 8 bits for bits_bits - subtotal += bits_bits * bits.numel() - - subtotal /= 2 ** 20 * 8 # bits -> MegaBytes - return total + subtotal - - def true_model_size(self): - """ - Naive model size without zlib compression. 
- """ - return self.model_size(exact=True).item() - - def _pre_forward_train(self): - if not self._optimizer_setup: - raise RuntimeError("You must call `setup_optimizer()` on your optimizer " - "before starting training.") - for qparam in self._qparams: - if qparam.other is not None: - noisy = qparam.other.module._parameters[qparam.other.name] - else: - bits = self._get_bits(qparam.logit)[:, None] - if self.group_size == 0: - p_flat = qparam.param.view(-1) - else: - p_flat = qparam.param.view(-1, self.group_size) - scale = p_flat.max() - p_flat.min() - unit = 1 / (2**bits - 1) - if self.noise == "uniform": - noise_source = (torch.rand_like(p_flat) - 0.5) - elif self.noise == "gaussian": - noise_source = torch.randn_like(p_flat) / 2 - noise = scale * unit * noise_source - noisy = p_flat + noise - # We bypass the checks by PyTorch on parameters being leafs - qparam.module._parameters[qparam.name] = noisy.view_as(qparam.param) - return True - - def _post_forward_train(self): - for qparam in self._qparams: - qparam.module._parameters[qparam.name] = qparam.param - return True - - def _quantize_param(self, qparam: _QuantizedParam) -> tp.Any: - bits = self.extra_bits + self._get_bits(qparam.logit) - bits = bits.round().clamp(1, 15)[:, None].byte() - if self.group_size == 0: - p = qparam.param.data.view(-1) - else: - p = qparam.param.data.view(-1, self.group_size) - levels, scales = uniform_quantize(p, bits) - return levels, scales, bits - - def _unquantize_param(self, qparam: _QuantizedParam, quantized: tp.Any) -> torch.Tensor: - levels, param_scale, bits = quantized - return uniform_unquantize(levels, param_scale, bits).view_as(qparam.param.data) - - def detach(self): - super().detach() - for qparam in self._qparams: - delattr(qparam.module, qparam.name + self.suffix) - - def __repr__(self): - return simple_repr(self) diff --git a/diffq/uniform.py b/diffq/uniform.py deleted file mode 100644 index f61e912..0000000 --- a/diffq/uniform.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -""" -Classic uniform quantization over n bits. -""" -from typing import Tuple -import torch - -from .base import BaseQuantizer -from .utils import simple_repr - - -def uniform_quantize(p: torch.Tensor, bits: torch.Tensor = torch.tensor(8.)): - """ - Quantize the given weights over `bits` bits. - - Returns: - - quantized levels - - (min, max) range. - - """ - assert (bits >= 1).all() and (bits <= 15).all() - num_levels = (2 ** bits.float()).long() - mn = p.min().item() - mx = p.max().item() - p = (p - mn) / (mx - mn) # put p in [0, 1] - unit = 1 / (num_levels - 1) # quantization unit - levels = (p / unit).round() - if (bits <= 8).all(): - levels = levels.byte() - else: - levels = levels.short() - return levels, (mn, mx) - - -def uniform_unquantize(levels: torch.Tensor, scales: Tuple[float, float], - bits: torch.Tensor = torch.tensor(8.)): - """ - Unquantize the weights from the levels and scale. Return a float32 tensor. 
- """ - mn, mx = scales - num_levels = 2 ** bits.float() - unit = 1 / (num_levels - 1) - levels = levels.float() - p = levels * unit # in [0, 1] - return p * (mx - mn) + mn - - -class UniformQuantizer(BaseQuantizer): - def __init__(self, model: torch.nn.Module, bits: float = 8., min_size: float = 0.01, - float16: bool = False, qat: bool = False, exclude=[], detect_bound=True): - """ - Args: - model (torch.nn.Module): model to quantize - bits (float): number of bits to quantize over. - min_size (float): minimum size in MB of a parameter to be quantized. - float16 (bool): if a layer is smaller than min_size, should we still do float16? - qat (bool): perform quantized aware training. - exclude (list[str]): list of patterns used to match parameters to exclude. - For instance `['bias']` to exclude all bias terms. - detect_bound (bool): if True, will detect bound parameters and reuse - the same quantized tensor for both. - """ - self.bits = float(bits) - self.qat = qat - - super().__init__(model, min_size, float16, exclude, detect_bound) - - def __repr__(self): - return simple_repr(self, ) - - def _pre_forward_train(self): - if self.qat: - for qparam in self._qparams: - if qparam.other is not None: - new_param = qparam.other.module._parameters[qparam.other.name] - else: - quantized = self._quantize_param(qparam) - qvalue = self._unquantize_param(qparam, quantized) - new_param = qparam.param + (qvalue - qparam.param).detach() - qparam.module._parameters[qparam.name] = new_param - return True - return False - - def _post_forward_train(self): - if self.qat: - for qparam in self._qparams: - qparam.module._parameters[qparam.name] = qparam.param - return True - return False - - def _quantize_param(self, qparam): - levels, scales = uniform_quantize(qparam.param.data, torch.tensor(self.bits)) - return (levels, scales) - - def _unquantize_param(self, qparam, quantized): - levels, scales = quantized - return uniform_unquantize(levels, scales, torch.tensor(self.bits)) - - def model_size(self): - """ - Non differentiable model size in MB. - """ - total = super().model_size() - subtotal = 0 - for qparam in self._qparams: - if qparam.other is None: # if parameter is bound, count only one copy. - subtotal += self.bits * qparam.param.numel() + 64 # 2 float for the overall scales - subtotal /= 2**20 * 8 # bits to MegaBytes - return total + subtotal - - def true_model_size(self): - """ - Return the true quantized model size, in MB, without extra - compression. - """ - return self.model_size().item() diff --git a/diffq/utils.py b/diffq/utils.py deleted file mode 100644 index be6ab52..0000000 --- a/diffq/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -import inspect -from typing import Optional, List - - -def simple_repr(obj, attrs: Optional[List[str]] = None, overrides={}): - """ - Return a simple representation string for `obj`. - If `attrs` is not None, it should be a list of attributes to include. 
- """ - params = inspect.signature(obj.__class__).parameters - attrs_repr = [] - if attrs is None: - attrs = params.keys() - for attr in attrs: - display = False - if attr in overrides: - value = overrides[attr] - elif hasattr(obj, attr): - value = getattr(obj, attr) - else: - continue - if attr in params: - param = params[attr] - if param.default is inspect._empty or value != param.default: - display = True - else: - display = True - - if display: - attrs_repr.append(f"{attr}={value}") - return f"{obj.__class__.__name__}({','.join(attrs_repr)})"