Skip to content

Commit

Permalink
[Tensor] Binary-code-based quantized tensor
Browse files Browse the repository at this point in the history
This PR adds a Tensor class that supports the Binary-code-based quantization type.
Binary-code-based quantization (BCQ) allows extremely low bit quantization and BCQTensor aims to store and manage such quantized data.
Note that the BiQGEMM is utilized for quantized inference and the BiQGEMM library is needed to use BCQTensor fully.

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong <[email protected]>
  • Loading branch information
djeong20 committed Dec 9, 2024
1 parent 5f1ee4c commit 44ee96b
Show file tree
Hide file tree
Showing 8 changed files with 719 additions and 0 deletions.
1 change: 1 addition & 0 deletions api/ccapi/include/tensor_dim.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class TensorDim {
enum class DataType {
QINT4, /** quantized int 4*/
QINT8, /** quantized int 8*/
BCQ, /** binary-code-based quantized*/
UINT8, /** unsigned int 8 bit */
UINT16, /** unsigned int 16 bit */
UINT32, /** unsigned int 32 bit */
Expand Down
11 changes: 11 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,17 @@ if get_option('opencl-kernel-path') != ''
message ('OpenCL kernel path set to: @0@'.format(get_option('opencl-kernel-path')))
extra_defines += '-DOPENCL_KERNEL_PATH=@0@'.format(get_option('opencl-kernel-path'))
endif

if get_option('enable-biqgemm')
# check if BiQGEMM directory exist. otherwise, throw an error
fs = import('fs')
if fs.is_dir('../BiQGEMM')
extra_defines += '-DENABLE_BIQGEMM=1'
biqgemm_inc = include_directories('../BiQGEMM')
else
error ('BiQGEMM cannot be enabled without BiQGEMM library.')
endif
endif

foreach extra_arg : warning_flags
if cc.has_argument (extra_arg)
Expand Down
1 change: 1 addition & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
option('enable-avx', type: 'boolean', value: true)
option('enable-opencl', type: 'boolean', value: false)
option('enable-biqgemm', type: 'boolean', value: false)

# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
# To inter-operate with nnstreamer and ML-API packages, you need to enable this.
Expand Down
365 changes: 365 additions & 0 deletions nntrainer/tensor/bcq_tensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,365 @@
// SPDX-License-Identifier: Apache-2.0
/**
* @file bcq_tensor.cpp
* @date 06 December 2024
* @brief This is BCQTensor class for binary-code-based quantization
* @see https://github.com/nnstreamer/nntrainer
* @author Donghyeon Jeong <[email protected]>
* @bug No known bugs except for NYI items
*/

#include <iomanip>
#include <iostream>

#include <bcq_tensor.h>
#include <blas_interface.h>
#include <tensor.h>
#include <util_func.h>

#include "BiQGEMM.h"

namespace nntrainer {

BCQTensor::BCQTensor(std::string name_, Tformat fm) :
TensorBase(name_, fm, Tdatatype::BCQ) {}

BCQTensor::BCQTensor(const TensorDim &d, bool alloc_now, Initializer init,
std::string name) :
TensorBase(d, alloc_now, init, name) {
if (alloc_now)
allocate();
}

BCQTensor::BCQTensor(const TensorDim &d, const void *buf) : BCQTensor(d, true) {
if (d.getDataLen() != 0) {
if (buf != nullptr)
copy(buf);
}
}

bool BCQTensor::operator==(const BCQTensor &rhs) const {
const uint32_t *_data = (uint32_t *)getData();
const uint32_t *_rdata = (uint32_t *)rhs.getData();

for (size_t i = 0; i < size() + scale_size(); ++i) {
if (_data[i] - _rdata[i])
return false;
}

return true;
}

void BCQTensor::allocate() {
if (empty() || data)
return;

if (src_tensor) {
/// allocate data based on the source tensor
allocateSrcTensor();
/** as this memory is shared, do NOT initialize */
} else {
/// allocate new memory for the tensor data
MemoryData *mem_data;

mem_data = new MemoryData((void *)(new uint32_t[size() + scale_size()]{}));
data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
delete[] mem_data->template getAddr<uint32_t>();
delete mem_data;
});

offset = 0;
initialize();
}
}

void BCQTensor::deallocate() {
data = nullptr;
offset = 0;
}

void *BCQTensor::getData() const {
if (!data)
return nullptr;

data->validate();
return data->getAddr<uint32_t>() + offset;
}

void *BCQTensor::getData(size_t idx) const {
NNTR_THROW_IF(idx > dim.getDataLen(), std::invalid_argument)
<< "Tensor::getData() index is not valid";

if (!data)
return nullptr;

data->validate();
return data->getAddr<uint32_t>() + offset + (idx / 32);
}

void *BCQTensor::getScale() const {
if (!data)
return nullptr;

data->validate();
return ((uint32_t *)getData()) + size();
}

void *BCQTensor::getScale(size_t idx) const {
NNTR_THROW_IF(idx > scale_size(), std::invalid_argument)
<< "Tensor::getScale() index is not valid";

if (!data)
return nullptr;

data->validate();
return ((uint32_t *)getScale()) + idx;
}

void *BCQTensor::getAddress(unsigned int i) {
size_t index = getIndex(batch(), channel(), height(), width() / 32);
if (i > index) {
return nullptr;
}
return &((uint32_t *)getData())[i];
}

const void *BCQTensor::getAddress(unsigned int i) const {
size_t index = getIndex(batch(), channel(), height(), width() / 32);
if (i > index) {
return nullptr;
}
return &((uint32_t *)getData())[i];
}

const uint32_t &BCQTensor::getValue(unsigned int i) const {
return ((uint32_t *)getData())[i];
}

uint32_t &BCQTensor::getValue(unsigned int i) {
return ((uint32_t *)getData())[i];
}

const uint32_t &BCQTensor::getValue(unsigned int b, unsigned int c,
unsigned int h, unsigned int w) const {
return getValue(getIndex(b, c, h, w / 32));
}

uint32_t &BCQTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
unsigned int w) {
return getValue(getIndex(b, c, h, w / 32));
}

void BCQTensor::setValue(float value) {
uint32_t *data = (uint32_t *)getData();
std::fill(data, data + size(), (uint32_t)value);
}

void BCQTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
unsigned int w, float value) {
((uint32_t *)getData())[getIndex(b, c, h, w / 32)] = (uint32_t)value;
}

void BCQTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
unsigned int w, float value, float beta) {
throw std::invalid_argument("addValue() is not valid for " +
getStringDataType());
}

void BCQTensor::setZero() {
/// @todo replace with apply_i or scal
setValue(0);
}

void BCQTensor::initialize() {
if (empty() || !isAllocated())
return;

/// @note Sampling from the normal/uniform distribution is invalid
switch (initializer) {
case Initializer::ZEROS:
setZero();
break;
case Initializer::ONES:
setValue(1.0f);
break;
case Initializer::NONE:
break;
default:
throw std::invalid_argument("Initializer not valid for " +
getStringDataType());
break;
}

putData();
}

void BCQTensor::initialize(Initializer init) {
initializer = init;
initialize();
}

Tensor &BCQTensor::dot(Tensor const &input, Tensor &output, bool trans,
bool trans_in, float beta) const {
size_t qbit_of_clusters[] = {quantized_bit_size};
size_t size_of_clusters[] = {height()};
const size_t number_of_cluster = 1;

/// @note hidden_tile_size should be set as a multiple of 32. This variable is
/// related to the speed of matrixDotMatrix. The optimal value should be found
/// with various values according to the usage environment.
size_t hidden_tile_size = 32;

BiQGEMM::BCQW bcq_weight = BiQGEMM::BCQW(
(uint32_t *)getData(), (float *)getScale(), height(), width(),
number_of_cluster, qbit_of_clusters, size_of_clusters, hidden_tile_size);

BiQGEMM::matrixDotMatrix(output.getData(), bcq_weight, input.getData(),
input.width());
return output;
}

void BCQTensor::copy(const Tensor &from) {
reshape(from.getDim());
copy(from.getData());
}

void BCQTensor::copyData(const Tensor &from) {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot copy.";

NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
<< "Size of tensor to copy must match";

/// @todo support copy from other data types
/// @todo check data type properly
switch (from.getDataType()) {
case ml::train::TensorDim::DataType::BCQ:
copy(from.getData());
default:
throw std::invalid_argument("Error: Unsupported data type");
break;
}
}

void BCQTensor::copy_with_stride(const Tensor &input, Tensor &output) {
for (unsigned int b = 0; b < output.batch(); ++b) {
for (unsigned int c = 0; c < output.channel(); ++c) {
for (unsigned int h = 0; h < output.height(); ++h) {
for (unsigned int w = 0; w < output.width() / 32; ++w) {
output.setValue(b, c, h, w, input.getValue<uint32_t>(b, c, h, w));
}
}
}
}
}

std::vector<unsigned int> BCQTensor::argmax() const {
std::vector<unsigned int> result;
const uint32_t *data = (uint32_t *)getData();
size_t batch_size = batch();
size_t feature_len = dim.getFeatureLen();

result.resize(batch_size);

for (unsigned int b = 0; b < batch_size; b++) {
auto max_iter =
std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
result[b] = std::distance(data, max_iter) - (b * feature_len);
}
return result;
}

// void BCQTensor::save_quant_bit(std::ostream &file) { return; }

// void BCQTensor::read_quant_bit(std::ifstream &file) {
// file.read((char *)&quantized_bit_size, sizeof(uint16_t));
// }

size_t BCQTensor::size() const {
return quantized_bit_size * dim.height() * ((dim.width() + 31) / 32);
}

float BCQTensor::max_abs() const { return maxValue(); }

float BCQTensor::maxValue() const {
const uint32_t *data = (uint32_t *)getData();
return *std::max_element(data, data + size());
}

float BCQTensor::minValue() const {
const uint32_t *data = (uint32_t *)getData();
return *std::min_element(data, data + size());
}

void BCQTensor::print(std::ostream &out) const {
const uint32_t *data = (uint32_t *)getData();
unsigned int len = size();
out << "data addr: " << reinterpret_cast<const float *>(data) << '\n';
out << dim;

if (len > 512) {
out << '[' << (int)data[0] << ' ' << (int)data[1] << ' ' << (int)data[2]
<< " ... " << (int)data[len - 3] << ' ' << (int)data[len - 2] << ' '
<< (int)data[len - 1] << ']' << std::endl;
printScales(out);
return;
}

std::ios init(NULL);
init.copyfmt(out);

size_t idx = 0;
for (unsigned int bit = 0; bit < quantized_bit_size; ++bit) {
for (unsigned int k = 0; k < batch(); k++) {
for (unsigned int l = 0; l < channel(); l++) {
for (unsigned int i = 0; i < height(); i++) {
for (unsigned int j = 0; j < (width() + 31) / 32; j++) {
out << data[idx++] << " ";
}
out << std::endl;
}
}
}
out << "-------" << std::endl;
}
printScales(out);
}

size_t BCQTensor::scale_size() const { return height() * quantized_bit_size; }

void BCQTensor::copy(const void *buf) {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
<< getName() << " is not contiguous, cannot copy.";

if (buf == getData()) {
return;
}

/// @todo need to optimize
for (unsigned int i = 0; i < size() + scale_size(); ++i) {
((uint32_t *)getData())[i] = ((uint32_t *)buf)[i];
}
}

std::string BCQTensor::getStringDataType() const { return "BCQ"; }

void BCQTensor::printScales(std::ostream &out) const {
const float *q_scales = (float *)getScale();
unsigned int len = scale_size();

if (len > 50) {
out << "Scale factors: [" << (int)q_scales[0] << ' ' << (int)q_scales[1]
<< ' ' << (int)q_scales[2] << " ... " << (int)q_scales[len - 3] << ' '
<< (int)q_scales[len - 2] << ' ' << (int)q_scales[len - 1] << ']'
<< std::endl;
return;
}

out << "Scale factors: ";
for (unsigned i = 0; i < scale_size(); ++i) {
out << q_scales[i] << " ";
}
out << std::endl;
}

} // namespace nntrainer
Loading

0 comments on commit 44ee96b

Please sign in to comment.