!15622 gpu matmul support bias && add gpu ci model

From: @yeyunpeng2020 Reviewed-by: @zhanghaibo5,@ddwsky Signed-off-by: @ddwsky
5 years ago · 45a17e3edf
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.h
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_executor.h
@@ -29,6 +29,8 @@ class OpenCLExecutor : public Executor {
 public:
  OpenCLExecutor() : Executor() { allocator_ = ocl_runtime.GetInstance()->GetAllocator(); }

  ~OpenCLExecutor() override = default;

  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override { return RET_OK; }

  int Run(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
--- a/mindspore/lite/src/runtime/infer_manager.h
+++ b/mindspore/lite/src/runtime/infer_manager.h
@@ -56,6 +56,7 @@ class RegistryInferShape {
  RegistryInferShape(int prim_type, InferShape func) {
    InferManager::GetInstance()->InsertInferShapeFunc(prim_type, func);
  }
  ~RegistryInferShape() = default;
 };

 #define REG_INFER_SHAPE(op, prim_type, func) static RegistryInferShape g_##op##InferShape(prim_type, func);
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@@ -3,7 +3,7 @@
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
                        int4 in_shape, int4 out_shape) {
                        __read_only image2d_t bias, int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidz = get_global_id(2);  // N
  int lidx = get_local_id(0);
@@ -28,12 +28,13 @@ __kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t outp
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
  }
 }

 __kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
                        int4 in_shape, int4 out_shape) {
                        __read_only image2d_t bias, int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidy = get_global_id(1);  // N * H * 4
  int gidz = get_global_id(2);  // W
@@ -62,12 +63,14 @@ __kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t outp
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
  }
 }

 __kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_only image2d_t output,
                                           __read_only image2d_t weight, int4 in_shape, int4 out_shape) {
                                           __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape,
                                           int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidy = get_global_id(1);  // N * H * 4
  int gidz = get_global_id(2);  // W
@@ -99,12 +102,14 @@ __kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
  }
 }

 __kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image2d_t output,
                                 __read_only image2d_t weight, int4 in_shape, int4 out_shape) {
                                 __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape,
                                 int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidy = get_global_id(1);  // N * H * 4
  int gidz = get_global_id(2);  // W
@@ -136,6 +141,7 @@ __kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -49,7 +49,7 @@ bool IsUseStrassenMatmul(const std::vector<lite::Tensor *> &in_tensors_) {
 }

 int MatMulOpenCLKernel::CheckSpecs() {
  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
  if (!(in_tensors_.size() == 2 || in_tensors_.size() == 3) || out_tensors_.size() != 1) {
    MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
    return RET_ERROR;
  }
@@ -180,6 +180,40 @@ int MatMulOpenCLKernel::InitWeights() {
  }

  allocator->UnmapBuffer(padWeight_);
  return InitBias();
 }

 int MatMulOpenCLKernel::InitBias() {
  // pad FC Bias
  CO_ = GpuTensorInfo(out_tensors_[0]).C;
  auto allocator = ocl_runtime_->GetAllocator();
  int co4 = UP_DIV(CO_, C4NUM);
  size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  size_t im_dst_x, im_dst_y;
  im_dst_x = co4;
  im_dst_y = 1;
  size_t img_dtype = CL_FLOAT;
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
  lite::opencl::ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
  bias_ = allocator->Malloc(img_size);
  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
  if (in_tensors_.size() == 3) {
    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
      for (int i = 0; i < CO_; i++) {
        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
      }
    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
      for (int i = 0; i < CO_; i++) {
        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
      }
    } else {
      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
    }
  }
  allocator->UnmapBuffer(bias_);
  return RET_OK;
 }

@@ -202,6 +236,7 @@ void MatMulOpenCLKernel::SetConstArgs() {
  } else {
    ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
  }
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@@ -37,6 +37,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
  void SetConstArgs() override;
  void SetGlobalLocal() override;
  int Tune() override { return lite::RET_OK; }
  int InitBias();

 protected:
  void *padWeight_{nullptr};
@@ -44,6 +45,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
  bool transposeA{false};
  bool transposeB{true};
  int dims{};
  void *bias_{nullptr};
  int CO_{1};
  static constexpr int MAX_DIMS{4};  // max supported matmul dims
  bool act_weight_{false};
  std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -405,6 +405,12 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_ten
      return nullptr;
    }

    // we don't need to restore tensor for copy data
    ret = CopyConstTensorData(in_tensors, op_parameter->type_);
    if (ret != RET_OK) {
      MS_LOG(DEBUG) << "CopyConstTensorsData failed: " << ret;
      return nullptr;
    }
    auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
    if (kernel != nullptr) {
      MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type);
--- a/mindspore/lite/test/models_gpu_fp32.cfg
+++ b/mindspore/lite/test/models_gpu_fp32.cfg
@@ -41,4 +41,19 @@ Q_new_detect.tflite
 Q_object_scene.tflite
 Q_pose.tflite
 matmul.pb
 add_uint8.tflite;2
 add_uint8.tflite;0.5;2
 mtk_face_features_v3.onnx
 hdc_Face_Landmark5_MTI_Aesthetic.onnx
 inception_v3.pb;0.5;1;1,299,299,3
 mobilenet_v1_0.25_128_frozen.pb;0.5;1;1,128,128,3
 mobilenet_v2_1.0_224_frozen.pb;0.5;1;1,224,224,3
 ml_face_openclose.pb;0.5;1;1,32,32,3
 hiai_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3
 hiai_model_0909_kd_rot_ps_softmax.pb;0.5;1;1,224,224,3
 model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3
 mtk_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3
 mtk_AADB_HADB_MBV3_model.pb;0.5;1;1,224,224,3
 mtk_model_face_dress.pb;0.5;1;1,128,128,3
 hiai_model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3
 hiai_label_and_video.pb;0.5;1;1,224,224,3
 tinyyolov2-8.onnx;0.5;1;1,416,416,3
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -2109,11 +2109,13 @@ function Run_gpu() {
          continue
        fi
        model_name=`echo ${line} | awk -F ';' '{print $1}'`
        input_num=`echo ${line} | awk -F ';' '{print $2}'`
        accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'`
        input_num=`echo ${line} | awk -F ';' '{print $3}'`
        input_shapes=`echo ${line} | awk -F ';' '{print $4}'`
        input_files=""
        data_path="/data/local/tmp/input_output/"
        output_file=${data_path}'output/'${model_name}'.ms.out'
        if [[ ${input_num} == "" ]]; then
        if [[ ${input_num} == "" || ${input_num} == 1 ]]; then
          input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
        else
          for i in $(seq 1 $input_num)
@@ -2123,8 +2125,15 @@ function Run_gpu() {
        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd  /data/local/tmp/benchmark_test' > adb_run_cmd.txt
        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
        if [[ $input_shapes == "" ]]; then
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
        else
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt

        fi

        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
        if [ $? = 0 ]; then
            run_result='arm64_gpu: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}