zzy34407230
/
mindspore2022

 
			
			   
				 
					
						
						
							
							/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZER_UTIL_H
#define MINDSPORE_LITE_TOOLS_CONVERTER_QUANTIZER_QUANTIZER_UTIL_H

#include <dirent.h>
#include <sys/stat.h>
#include <memory>
#include <string>
#include <cmath>
#include <array>
#include <vector>
#include <algorithm>
#include <limits>
#include <utility>
#include "tools/converter/quantizer/quantizer.h"
#include "src/ops/primitive_c.h"
#include "include/errorcode.h"
#include "ir/func_graph.h"
#include "ir/anf.h"
#include "include/model.h"
#include "base/base.h"
#include "ir/primitive.h"
#include "abstract/dshape.h"
#include "tools/converter/quantizer/huffman_encode.h"
#include "tools/converter/quantizer/bitpacking.h"
#include "src/lite_session.h"
#include "tools/converter/graphdef_transform.h"
#include "src/common/file_utils.h"

namespace mindspore::lite::quant {
static constexpr size_t UINT8_QUANTIZATION = 8;
static constexpr size_t WEIGHT_INDEX = 1;

const char kMethodMaxMin[] = "MAX_MIN";
const char kMethodKL[] = "KL";
const char kMethodOutlier[] = "RemovalOutlier";

struct PostQuantConfig {
  std::vector<std::string> image_paths;
  uint32_t batch_count{100};
  std::string method_x{kMethodKL};
  uint32_t thread_num{1};
  bool bias_correction{false};
  bool mixed{false};
  float mean_error_threshold{0.04};
  std::vector<std::vector<std::vector<int>>> input_shapes;  // different input
  bool inited{false};
};

struct SessionModel {
  session::LiteSession *session{nullptr};
  Model *model{nullptr};
};

/**
 * 1. when op's weight size > mWeightSize just skip
 * 2. only do conv/deconv/convdepthwise/deconvdepthwise/mul/matmul/batchmatmul quantization
 * 3. when conv/deconv/convdepthwise/deconvdepthwise ops' weight channel size > covWeightQuantChannelThreshold just skip
 * */
class QuantStrategy {
 public:
  explicit QuantStrategy(size_t weightSize, size_t covWeightQuantChannelThreshold = 16);

  ~QuantStrategy() = default;

  bool CanConvOpQuantized(const CNodePtr &node) const;
  bool CanMulOpQuantized(const CNodePtr &node) const;
  bool CanOpPostQuantized(AnfNodePtr &node) const;

  size_t mWeightSize;
  size_t mConvWeightQuantChannelThreshold;

 private:
  static const std::vector<schema::PrimitiveType> conv_types;
  static const std::vector<schema::PrimitiveType> mul_types;
};

constexpr float delta = 0.1;
constexpr float ratio = 10.0;
constexpr int percent = 10;
constexpr int quant_param_size = 32 * 8;

STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, double mMax, bool narrowRange, int quant_max,
                             int quant_min, int num_bits);

STATUS CalQuantizationParams(schema::QuantParamT *quantParam, double mMin, double mMax, bool narrowRange = false,
                             int numBits = UINT8_QUANTIZATION);

std::pair<float, float> OutlierMethod(std::vector<float> min_datas, std::vector<float> max_datas);

std::vector<int8_t> KMeans(float *data, size_t elem_count, size_t k, size_t epochs, schema::QuantParamT *quantParam);

STATUS UpdateTensorDataAndSize(ParamValueLitePtr weight, void *quant_datas, int new_size);

void GetMaxMinPerchannel(int channels, int one_filter_size, int i, int elem_count, float *raw_datas,
                         bool channel_at_first, float *desired_max, float *desired_min);

template <typename T>
T QuantizeData(const float originData, const schema::QuantParamT *quantParam) {
  MS_ASSERT(quantParam != nullptr);
  MS_ASSERT(quantParam->inited);
  const auto scale = quantParam->scale;
  const auto zeroPoint = quantParam->zeroPoint;
  const auto numBit = quantParam->numBits;
  const auto narrowRange = quantParam->narrowRange;
  double maxLimitTemp = static_cast<float>((1 << (unsigned int)numBit) - 1);
  const double maxLimit = static_cast<float>(maxLimitTemp - zeroPoint + std::numeric_limits<T>::min()) * scale;
  double minLimit;
  if (narrowRange) {
    minLimit = static_cast<float>(std::numeric_limits<T>::min() + 1 - zeroPoint) * scale;
  } else {
    minLimit = static_cast<float>(std::numeric_limits<T>::min() - zeroPoint) * scale;
  }

  return [maxLimit, minLimit, zeroPoint, scale, narrowRange, originData] {
    double tmp;
    if (originData > maxLimit) {
      tmp = maxLimit;
    } else if (originData < minLimit) {
      tmp = minLimit;
    } else {
      tmp = originData;
    }
    auto quantData = static_cast<T>(std::round(zeroPoint + tmp / scale));
    return quantData;
  }();
}

template <typename T>
T QuantizeData(float originData, const schema::QuantParamT &quantParam, int quant_max, int quant_min) {
  MS_ASSERT(quantParam != nullptr);
  MS_ASSERT(quantParam->inited);
  const auto scale = quantParam.scale;
  const int zeroPoint = quantParam.zeroPoint;
  const auto narrowRange = quantParam.narrowRange;
  const int maxLimit = quant_max;
  const int minLimit = quant_min;

  return [maxLimit, minLimit, zeroPoint, scale, narrowRange, originData] {
    auto quant_data = std::round(originData / scale + zeroPoint);
    if (quant_data > maxLimit) {
      quant_data = maxLimit;
    } else if (quant_data < minLimit) {
      quant_data = minLimit;
    }
    return static_cast<T>(quant_data);
  }();
}

template <typename T>
STATUS DoPerChannelQuant(const ParamValueLitePtr &weight, const QuantType &quant_type,
                         std::vector<schema::QuantParamT> *quant_params, const int &quant_max, const int &quant_min,
                         const size_t &bit_num, const bool &k_means, std::vector<T> *quant_datas,
                         std::vector<float> *dequant_datas, bool channel_at_first = true) {
  auto dims = weight->tensor_shape();
  size_t elem_count = weight->tensor_shape_size();
  auto *raw_datas = static_cast<float *>(weight->tensor_addr());
  auto channels = dims[0];
  if (!channel_at_first) {
    if (dims.size() != 2) {
      MS_LOG(ERROR) << "unexpected dims size: " << dims.size();
      channel_at_first = true;
    } else {
      channels = dims[1];
    }
  }
  if (channels == 0) {
    MS_LOG(ERROR) << "channels is zero";
    return RET_ERROR;
  }
  size_t one_filter_size = elem_count / channels;
  bool do_quant = quant_param_size / (sizeof(float) * 8 - bit_num) < one_filter_size;
  if (!do_quant && quant_type == QuantType_WeightQuant) {
    MS_LOG(INFO) << "too few elements in a filter, no need to quantize. " << one_filter_size;
    return RET_CONTINUE;
  }
  for (int i = 0; i < channels; i++) {
    float min = FLT_MAX;
    float max = -FLT_MAX;
    GetMaxMinPerchannel(channels, one_filter_size, i, elem_count, raw_datas, channel_at_first, &max, &min);
    schema::QuantParamT quant_param;
    STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bit_num);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
      return status;
    }
    // do quantization
    double average_dequant = 0;
    double average_raw = 0;
    for (uint32_t j = 0; j < one_filter_size; j++) {
      auto index = j + i * one_filter_size;
      if (!channel_at_first) {
        index = j * channels + i;
      }
      MS_ASSERT(index < elem_count);
      float raw_data = raw_datas[index];
      auto quant_data = QuantizeData<T>(raw_data, quant_param, quant_max, quant_min);
      (*quant_datas)[index] = quant_data;

      if (quant_type == QuantType_WeightQuant) {
        float dequant_data = quant_param.scale * (quant_data - quant_param.zeroPoint);
        (*dequant_datas)[index] = dequant_data;
        average_dequant += dequant_data;
        average_raw += raw_data;
      }
    }
    if (quant_type == QuantType_WeightQuant && !k_means) {
      // mean
      average_dequant = average_dequant / one_filter_size;
      average_raw = average_raw / one_filter_size;
      // std
      double variance_dequant = 0;
      double variance_raw = 0;
      for (uint32_t j = 0; j < one_filter_size; j++) {
        auto index = j + i * one_filter_size;
        if (!channel_at_first) {
          index = j * channels + i;
        }
        MS_ASSERT(index < elem_count);
        variance_dequant += std::pow((*dequant_datas)[index] - average_dequant, 2);
        variance_raw += std::pow(raw_datas[index] - average_raw, 2);
      }
      variance_dequant = std::sqrt(variance_dequant / one_filter_size);
      variance_raw = std::sqrt(variance_raw / one_filter_size);
      quant_param.varCorr = 1;
      if (variance_raw != 0 && variance_dequant != 0) {
        auto temp_var_corr = variance_raw / variance_dequant;
        if (temp_var_corr > 0 && temp_var_corr < 10) {
          quant_param.varCorr = temp_var_corr;
        } else {
          MS_LOG(WARNING) << "unexpected var_corr: " << temp_var_corr;
        }
      }
      quant_param.meanCorr = average_raw - average_dequant * quant_param.varCorr;
    }
    quant_params->emplace_back(quant_param);
  }
  auto status = UpdateTensorDataAndSize(weight, quant_datas->data(), quant_datas->size() * sizeof(T));
  if (status != RET_OK) {
    MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
    return RET_ERROR;
  }
  return RET_OK;
}

template <typename T>
STATUS DoPerLayerQuant(const ParamValueLitePtr &weight, const QuantType &quant_type,
                       std::vector<schema::QuantParamT> *quant_params, const int &quant_max, const int &quant_min,
                       const size_t &bit_num, const bool &k_means, std::vector<T> *quant_datas) {
  auto dims = weight->tensor_shape();
  size_t elem_count = weight->tensor_shape_size();
  auto *raw_datas = static_cast<float *>(weight->tensor_addr());
  float min = FLT_MAX;
  float max = -FLT_MIN;
  for (uint32_t i = 0; i < elem_count; i++) {
    // find max min
    min = std::min(min, raw_datas[i]);
    max = std::max(max, raw_datas[i]);
  }

  schema::QuantParamT quant_param;
  if (!k_means) {
    STATUS status = CalQuantizationParams(&quant_param, min, max, false, quant_max, quant_min, bit_num);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "CalQuantizationParams failed" << status;
      return status;
    }
  }
  quant_params->emplace_back(quant_param);
  // update data and datatype
  for (uint32_t i = 0; i < elem_count; i++) {
    float raw_data = raw_datas[i];
    if (!k_means) {
      auto quant_data = QuantizeData<T>(raw_data, quant_param, quant_max, quant_min);
      (*quant_datas)[i] = quant_data;
    }
  }
  auto status = UpdateTensorDataAndSize(weight, quant_datas->data(), quant_datas->size() * sizeof(T));
  if (status != RET_OK) {
    MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
    return RET_ERROR;
  }
  return RET_OK;
}
template <typename T>
STATUS DoBitPack(const ParamValueLitePtr &weight, const size_t &bit_num, const std::vector<T> &quant_datas) {
  if (bit_num != 8 && bit_num != 16) {
    std::vector<T> data{};
    for (size_t i = 0; i < quant_datas.size(); ++i) {
      data.emplace_back((static_cast<T>(quant_datas[i])));
    }
    if (bit_num > 0 && bit_num < 8) {
      std::vector<uint8_t> pack_data{};
      BitPack::BitPacking<T, uint8_t>(bit_num, data, &pack_data);
      auto status = UpdateTensorDataAndSize(weight, pack_data.data(), pack_data.size() * sizeof(uint8_t));
      if (status != RET_OK) {
        MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
        return RET_ERROR;
      }
    } else if (bit_num > 8 && bit_num < 16) {
      std::vector<uint16_t> pack_data{};
      BitPack::BitPacking<T, uint16_t>(bit_num, data, &pack_data);
      auto status = UpdateTensorDataAndSize(weight, pack_data.data(), pack_data.size() * sizeof(uint16_t));
      if (status != RET_OK) {
        MS_LOG(ERROR) << "UpdateTensorDataAndSize error";
        return RET_ERROR;
      }
    }
  }
  return RET_OK;
}

template <typename T>
STATUS QuantFilter(const ParamValueLitePtr &weight, const std::shared_ptr<PrimitiveC> &primitive_c,
                   QuantType quant_type, int quant_max, int quant_min, size_t bit_num, bool per_channel, int index = 1,
                   bool k_means = false) {
  MS_ASSERT(weight != nullptr);
  MS_ASSERT(primitive_c != nullptr);
  auto dims = weight->tensor_shape();
  if (per_channel) {
    if (dims.size() <= 1) {
      MS_LOG(WARNING) << "dims is " << dims.size() << " can not per_channel";
      per_channel = false;
    }
  }

  std::vector<schema::QuantParamT> quant_params;
  size_t elem_count = weight->tensor_shape_size();
  auto *raw_data = static_cast<float *>(weight->tensor_addr());
  if (raw_data == nullptr) {
    MS_LOG(ERROR) << "rawDatas is nullptr";
    return RET_ERROR;
  }

  std::vector<T> quant_data(elem_count);
  std::vector<float> dequant_datas(elem_count);
  int ret = RET_OK;
  if (per_channel) {
    bool channel_at_first = true;
    auto op_type = (schema::PrimitiveType)primitive_c->Type();
    if (op_type == schema::PrimitiveType_MatMul && weight->tensor_shape().size() == 2) {
      auto matmul_op = primitive_c->primitiveT()->value.AsMatMul();
      MS_ASSERT(matmul_op != nullptr);
      channel_at_first = !(index == 1 && !matmul_op->transposeB);
    }
    // channel at first
    ret = DoPerChannelQuant<T>(weight, quant_type, &quant_params, quant_max, quant_min, bit_num, k_means, &quant_data,
                               &dequant_datas, channel_at_first);
    if (ret == RET_CONTINUE) {
      return ret;
    } else if (ret != RET_OK) {
      MS_LOG(ERROR) << "Do per channel quant failed.";
      return ret;
    }
  } else {
    ret = DoPerLayerQuant<T>(weight, quant_type, &quant_params, quant_max, quant_min, bit_num, k_means, &quant_data);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Do per layer quant failed.";
      return ret;
    }
  }

#ifdef HUFFMAN_ENCODE
  auto huffman_encode = std::make_unique<lite::HuffmanEncode>();
  ret = huffman_encode->DoHuffmanEncode(weight, primitive_c, quant_datas.data(), bit_num);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Do huffman encode failed.";
    return ret;
  }
#else
  // do bit pack
  ret = DoBitPack(weight, bit_num, quant_data);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Do bit pack failed.";
    return ret;
  }
#endif

  if (quant_params.empty()) {
    MS_LOG(ERROR) << "quant_params empty";
    return RET_ERROR;
  }
  if (quant_type == QuantType_PostTraining) {
    primitive_c->AddInputQuantParam(quant_params);
  } else {
    primitive_c->set_input_quant_param(index, quant_params);
  }
  return ret;
}

// utils

schema::PrimitiveType NodePrimitiveType(const CNodePtr &cnode);

STATUS ParseConfigFile(std::string config_file, PostQuantConfig *post_quant_config);

SessionModel CreateSessionByFuncGraph(const FuncGraphPtr &func_graph, const converter::Flags &flags, int thread_num);

STATUS CollectCalibInputs(const std::vector<std::string> &input_dirs, size_t count_limited,
                          std::vector<std::vector<std::string>> *inputs);

STATUS CopyInputDataToTensor(size_t input_index, size_t image_index,
                             const std::vector<std::vector<std::string>> &images, mindspore::tensor::MSTensor *tensor);

FuncGraphPtr CopyFuncGraph(const FuncGraphPtr &);

void GetLiteParameter(const AnfNodePtr &node, ParameterPtr *param_node, ParamValueLitePtr *param_value);
}  // namespace mindspore::lite::quant
#endif