Browse Source

!15622 gpu matmul support bias && add gpu ci model

From: @yeyunpeng2020
Reviewed-by: @zhanghaibo5,@ddwsky
Signed-off-by: @ddwsky
pull/15622/MERGE
mindspore-ci-bot Gitee 5 years ago
parent
commit
45a17e3edf
8 changed files with 87 additions and 10 deletions
  1. +2
    -0
      mindspore/lite/src/runtime/gpu/opencl/opencl_executor.h
  2. +1
    -0
      mindspore/lite/src/runtime/infer_manager.h
  3. +10
    -4
      mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
  4. +36
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
  5. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
  6. +6
    -0
      mindspore/lite/src/scheduler.cc
  7. +16
    -1
      mindspore/lite/test/models_gpu_fp32.cfg
  8. +13
    -4
      mindspore/lite/test/run_benchmark_nets.sh

+ 2
- 0
mindspore/lite/src/runtime/gpu/opencl/opencl_executor.h View File

@@ -29,6 +29,8 @@ class OpenCLExecutor : public Executor {
public:
OpenCLExecutor() : Executor() { allocator_ = ocl_runtime.GetInstance()->GetAllocator(); }

~OpenCLExecutor() override = default;

int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override { return RET_OK; }

int Run(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,


+ 1
- 0
mindspore/lite/src/runtime/infer_manager.h View File

@@ -56,6 +56,7 @@ class RegistryInferShape {
RegistryInferShape(int prim_type, InferShape func) {
InferManager::GetInstance()->InsertInferShapeFunc(prim_type, func);
}
~RegistryInferShape() = default;
};

#define REG_INFER_SHAPE(op, prim_type, func) static RegistryInferShape g_##op##InferShape(prim_type, func);


+ 10
- 4
mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl View File

@@ -3,7 +3,7 @@
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
-                        int4 in_shape, int4 out_shape) {
+                        __read_only image2d_t bias, int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidz = get_global_id(2); // N
int lidx = get_local_id(0);
@@ -28,12 +28,13 @@ __kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t outp
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
WRITE_IMAGE(output, (int2)(gidx, gidz), result);
}
}

__kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
-                        int4 in_shape, int4 out_shape) {
+                        __read_only image2d_t bias, int4 in_shape, int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
@@ -62,12 +63,14 @@ __kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t outp
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
}
}

__kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_only image2d_t output,
-                                           __read_only image2d_t weight, int4 in_shape, int4 out_shape) {
+                                           __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape,
+                                           int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
@@ -99,12 +102,14 @@ __kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
}
}

__kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image2d_t output,
-                                 __read_only image2d_t weight, int4 in_shape, int4 out_shape) {
+                                 __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape,
+                                 int4 out_shape) {
int gidx = get_global_id(0); // CO4
int gidy = get_global_id(1); // N * H * 4
int gidz = get_global_id(2); // W
@@ -136,6 +141,7 @@ __kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image
result += temp[lidx][1];
result += temp[lidx][2];
result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
}
}

+ 36
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc View File

@@ -49,7 +49,7 @@ bool IsUseStrassenMatmul(const std::vector<lite::Tensor *> &in_tensors_) {
}

int MatMulOpenCLKernel::CheckSpecs() {
-  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
+  if (!(in_tensors_.size() == 2 || in_tensors_.size() == 3) || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
@@ -180,6 +180,40 @@ int MatMulOpenCLKernel::InitWeights() {
}

allocator->UnmapBuffer(padWeight_);
+  return InitBias();
+}
+
+int MatMulOpenCLKernel::InitBias() {
+  // pad FC Bias
+  CO_ = GpuTensorInfo(out_tensors_[0]).C;
+  auto allocator = ocl_runtime_->GetAllocator();
+  int co4 = UP_DIV(CO_, C4NUM);
+  size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
+  size_t im_dst_x, im_dst_y;
+  im_dst_x = co4;
+  im_dst_y = 1;
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  lite::opencl::ImageSize img_size{im_dst_x, im_dst_y, img_dtype};
+  bias_ = allocator->Malloc(img_size);
+  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
+  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
+  if (in_tensors_.size() == 3) {
+    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+      for (int i = 0; i < CO_; i++) {
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+      }
+    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+      for (int i = 0; i < CO_; i++) {
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+      }
+    } else {
+      memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size);
+    }
+  }
+  allocator->UnmapBuffer(bias_);
return RET_OK;
}

@@ -202,6 +236,7 @@ void MatMulOpenCLKernel::SetConstArgs() {
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
}
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
}


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h View File

@@ -37,6 +37,7 @@ class MatMulOpenCLKernel : public OpenCLKernel {
void SetConstArgs() override;
void SetGlobalLocal() override;
int Tune() override { return lite::RET_OK; }
+  int InitBias();

protected:
void *padWeight_{nullptr};
@@ -44,6 +45,8 @@ class MatMulOpenCLKernel : public OpenCLKernel {
bool transposeA{false};
bool transposeB{true};
int dims{};
+  void *bias_{nullptr};
+  int CO_{1};
static constexpr int MAX_DIMS{4}; // max supported matmul dims
bool act_weight_{false};
std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)};


+ 6
- 0
mindspore/lite/src/scheduler.cc View File

@@ -405,6 +405,12 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_ten
return nullptr;
}

+  // we don't need to restore tensor for copy data
+  ret = CopyConstTensorData(in_tensors, op_parameter->type_);
+  if (ret != RET_OK) {
+    MS_LOG(DEBUG) << "CopyConstTensorsData failed: " << ret;
+    return nullptr;
+  }
auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
if (kernel != nullptr) {
MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type);


+ 16
- 1
mindspore/lite/test/models_gpu_fp32.cfg View File

@@ -41,4 +41,19 @@ Q_new_detect.tflite
Q_object_scene.tflite
Q_pose.tflite
matmul.pb
-add_uint8.tflite;2
+add_uint8.tflite;0.5;2
+mtk_face_features_v3.onnx
+hdc_Face_Landmark5_MTI_Aesthetic.onnx
+inception_v3.pb;0.5;1;1,299,299,3
+mobilenet_v1_0.25_128_frozen.pb;0.5;1;1,128,128,3
+mobilenet_v2_1.0_224_frozen.pb;0.5;1;1,224,224,3
+ml_face_openclose.pb;0.5;1;1,32,32,3
+hiai_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3
+hiai_model_0909_kd_rot_ps_softmax.pb;0.5;1;1,224,224,3
+model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3
+mtk_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3
+mtk_AADB_HADB_MBV3_model.pb;0.5;1;1,224,224,3
+mtk_model_face_dress.pb;0.5;1;1,128,128,3
+hiai_model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3
+hiai_label_and_video.pb;0.5;1;1,224,224,3
+tinyyolov2-8.onnx;0.5;1;1,416,416,3

+ 13
- 4
mindspore/lite/test/run_benchmark_nets.sh View File

@@ -2109,11 +2109,13 @@ function Run_gpu() {
continue
fi
model_name=`echo ${line} | awk -F ';' '{print $1}'`
-        input_num=`echo ${line} | awk -F ';' '{print $2}'`
+        accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'`
+        input_num=`echo ${line} | awk -F ';' '{print $3}'`
+        input_shapes=`echo ${line} | awk -F ';' '{print $4}'`
input_files=""
data_path="/data/local/tmp/input_output/"
output_file=${data_path}'output/'${model_name}'.ms.out'
-        if [[ ${input_num} == "" ]]; then
+        if [[ ${input_num} == "" || ${input_num} == 1 ]]; then
input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
else
for i in $(seq 1 $input_num)
@@ -2123,8 +2125,15 @@ function Run_gpu() {
fi
echo ${model_name} >> "${run_gpu_log_file}"
echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        if [[ $input_shapes == "" ]]; then
+            echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+            echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        else
+            echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+            echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
+        fi

adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
if [ $? = 0 ]; then
run_result='arm64_gpu: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}


Loading…
Cancel
Save