Merge pull request !5898 from chenzupeng/master-litetags/v1.0.0
| @@ -1,57 +1,146 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #define C4NUM 4 | |||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { | |||
| int2 gid = (int2)(get_global_id(0), get_global_id(1)); | |||
| int2 lid = (int2)(get_local_id(0), get_local_id(1)); | |||
| __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.z; | |||
| bool inside = gidx < co4 && gidz < n; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| bool inside = gid.x < offset_co.y; | |||
| for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, 0)); | |||
| FLT16 w = weight[gid.x + i * offset_co.y]; | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(i, gidz)); | |||
| FLT16 w = weight[i * co4 + gidx]; | |||
| result.x += dot(v, w.s0123); | |||
| result.y += dot(v, w.s4567); | |||
| result.z += dot(v, w.s89ab); | |||
| result.w += dot(v, w.scdef); | |||
| } | |||
| __local FLT4 temp[64][4]; | |||
| temp[lid.x][lid.y] = result; | |||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lid.y == 0 && inside) { | |||
| result += temp[lid.x][1]; | |||
| result += temp[lid.x][2]; | |||
| result += temp[lid.x][3]; | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| if (has_bias != 0) { | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(gid.x, 0), result); | |||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { | |||
| int2 gid = (int2)(get_global_id(0), get_global_id(1)); | |||
| int2 lid = (int2)(get_local_id(0), get_local_id(1)); | |||
| __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.z; | |||
| bool inside = gidx < co4 && gidz < n; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| bool inside = gid.x < offset_co.y; | |||
| for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i)); | |||
| FLT16 w = weight[gid.x + i * offset_co.y]; | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0)); | |||
| FLT16 w = weight[i * co4 + gidx]; | |||
| result.x += dot(v, w.s0123); | |||
| result.y += dot(v, w.s4567); | |||
| result.z += dot(v, w.s89ab); | |||
| result.w += dot(v, w.scdef); | |||
| } | |||
| __local FLT4 temp[64][4]; | |||
| temp[lid.x][lid.y] = result; | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lid.y == 0 && inside) { | |||
| result += temp[lid.x][1]; | |||
| result += temp[lid.x][2]; | |||
| result += temp[lid.x][3]; | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| if (has_bias != 0) { | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(0, gid.x), result); | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.x; | |||
| int h = out_shape.y; | |||
| int w = out_shape.z; | |||
| int nh_index = gidy / 4; | |||
| bool inside = gidx < co4 && gidz < w && nh_index < n * h; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, nh_index)); | |||
| FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; | |||
| result.x += dot(v, weight_value.s0123); | |||
| result.y += dot(v, weight_value.s4567); | |||
| result.z += dot(v, weight_value.s89ab); | |||
| result.w += dot(v, weight_value.scdef); | |||
| } | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| if (has_bias != 0) { | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.x; | |||
| int h = out_shape.y; | |||
| int w = out_shape.z; | |||
| int nh_index = gidy / 4; | |||
| bool inside = gidx < co4 && gidz < w && nh_index < n * h; | |||
| int n_index = nh_index / h; | |||
| int h_index = nh_index % h; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index)); | |||
| FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; | |||
| result.x += dot(v, weight_value.s0123); | |||
| result.y += dot(v, weight_value.s4567); | |||
| result.z += dot(v, weight_value.s89ab); | |||
| result.w += dot(v, weight_value.scdef); | |||
| } | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| if (has_bias != 0) { | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); | |||
| } | |||
| } | |||
| @@ -0,0 +1,61 @@ | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { | |||
| int X = get_global_id(0); // C4 | |||
| if (X >= size.z) { | |||
| return; | |||
| } | |||
| FLT4 result = (FLT4)0.f; | |||
| for (int h = 0; h < size.x; h++) { | |||
| for (int w = 0; w < size.y; w++) { | |||
| result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); | |||
| } | |||
| } | |||
| result /= size.x * size.y; | |||
| WRITE_IMAGE(dst_data, (int2)(X, 0), result); | |||
| } | |||
| __kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { | |||
| int X = get_global_id(0); // C4 | |||
| if (X >= size.z) { | |||
| return; | |||
| } | |||
| FLT4 result = (FLT4)0.f; | |||
| for (int h = 0; h < size.x; h++) { | |||
| for (int w = 0; w < size.y; w++) { | |||
| result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); | |||
| } | |||
| } | |||
| result /= size.x * size.y; | |||
| WRITE_IMAGE(dst_data, (int2)(0, X), result); | |||
| } | |||
| __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { | |||
| int X = get_global_id(0); // C4 | |||
| if (X >= size.z) { | |||
| return; | |||
| } | |||
| FLT4 result = (FLT4)0.f; | |||
| for (int h = 0; h < size.x; h++) { | |||
| for (int w = 0; w < size.y; w++) { | |||
| result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); | |||
| } | |||
| } | |||
| WRITE_IMAGE(dst_data, (int2)(X, 0), result); | |||
| } | |||
| __kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { | |||
| int X = get_global_id(0); // C4 | |||
| if (X >= size.z) { | |||
| return; | |||
| } | |||
| FLT4 result = (FLT4)0.f; | |||
| for (int h = 0; h < size.x; h++) { | |||
| for (int w = 0; w < size.y; w++) { | |||
| result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); | |||
| } | |||
| } | |||
| WRITE_IMAGE(dst_data, (int2)(0, X), result); | |||
| } | |||
| @@ -45,7 +45,6 @@ class ActivationOpenClKernel : public OpenCLKernel { | |||
| cl::Kernel kernel_; | |||
| int type_; | |||
| float alpha_; | |||
| void *alpha_buff_; | |||
| int in_size_; | |||
| int out_size_; | |||
| size_t fp_size; | |||
| @@ -94,14 +94,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { | |||
| int ori_index = ((ci_offset * kh + kh_i) * kw + kw_i) * co + co_offset; | |||
| if (enable_fp16_) { | |||
| if (weight_dtype == kNumberTypeFloat32) { | |||
| reinterpret_cast<uint16_t *>(padWeight_)[index++] = | |||
| Float32ToShort(reinterpret_cast<float *>(origin_weight)[ori_index]); | |||
| reinterpret_cast<float16_t *>(padWeight_)[index++] = | |||
| reinterpret_cast<float *>(origin_weight)[ori_index]; | |||
| } else { | |||
| reinterpret_cast<uint16_t *>(padWeight_)[index++] = | |||
| reinterpret_cast<uint16_t *>(origin_weight)[ori_index]; | |||
| reinterpret_cast<float16_t *>(padWeight_)[index++] = | |||
| reinterpret_cast<float16_t *>(origin_weight)[ori_index]; | |||
| } | |||
| } else { | |||
| reinterpret_cast<float *>(padWeight_)[index++] = reinterpret_cast<float *>(origin_weight)[ori_index]; | |||
| if (weight_dtype == kNumberTypeFloat32) { | |||
| reinterpret_cast<float *>(padWeight_)[index++] = | |||
| reinterpret_cast<float *>(origin_weight)[ori_index]; | |||
| } else { | |||
| reinterpret_cast<float *>(padWeight_)[index++] = | |||
| reinterpret_cast<float16_t *>(origin_weight)[ori_index]; | |||
| } | |||
| } | |||
| } else { | |||
| index++; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019 Huawei Technologies n., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,10 +16,10 @@ | |||
| #include <set> | |||
| #include <string> | |||
| #include <map> | |||
| #include "nnacl/fp32/common_func.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "nnacl/fp32/matmul.h" | |||
| #include "src/runtime/kernel/opencl/kernel/matmul.h" | |||
| #ifndef PROGRAM_WITH_IL | |||
| #include "src/runtime/kernel/opencl/cl/matmul.cl.inc" | |||
| @@ -36,7 +36,26 @@ int MatMulOpenCLKernel::Init() { | |||
| std::string kernel_name = "MatMul"; | |||
| kernel_name += "_" + std::string(EnumNameFormat(op_format_)); | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| transposeA = param->a_transpose_; | |||
| if (transposeA) { | |||
| MS_LOG(ERROR) << "matmul only support a_transpose_=false yet."; | |||
| return RET_ERROR; | |||
| } | |||
| transposeB = param->b_transpose_; | |||
| enable_fp16_ = ocl_runtime->GetFp16Enable(); | |||
| if (in_tensors_[0]->shape().size() != out_tensors_[0]->shape().size() || | |||
| (in_tensors_[0]->shape().size() != 2 && in_tensors_[0]->shape().size() != 4)) { | |||
| MS_LOG(ERROR) << "matmul only support input shape size=2 or 4."; | |||
| return RET_ERROR; | |||
| } | |||
| dims = in_tensors_[0]->shape().size(); | |||
| for (int i = 0; i < dims; i++) { | |||
| inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i]; | |||
| outShape[MAX_DIMS - dims + i] = out_tensors_[0]->shape()[i]; | |||
| } | |||
| std::map<int, std::string> dims2str = {{2, "_2d"}, {4, "_4d"}}; | |||
| kernel_name += dims2str[dims]; | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| @@ -46,21 +65,7 @@ int MatMulOpenCLKernel::Init() { | |||
| ocl_runtime->LoadSource(program_name, source); | |||
| ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| int ci, co; | |||
| if (in_tensors_[1]->shape().size() != 2) { | |||
| MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_[1]->shape().size() == 2) { | |||
| ci = in_tensors_[1]->shape()[1]; | |||
| co = in_tensors_[1]->shape()[0]; | |||
| } else { | |||
| ci = in_tensors_[1]->shape()[3]; | |||
| co = in_tensors_[1]->shape()[0]; | |||
| } | |||
| sizeCI = {ci, UP_DIV(ci, C4NUM)}; | |||
| sizeCO = {co, UP_DIV(co, C4NUM)}; | |||
| PadWeight(); | |||
| in_ori_format_ = in_tensors_[0]->GetFormat(); | |||
| out_ori_format_ = out_tensors_[0]->GetFormat(); | |||
| @@ -73,51 +78,69 @@ int MatMulOpenCLKernel::Init() { | |||
| int MatMulOpenCLKernel::ReSize() { return RET_OK; } | |||
| void MatMulOpenCLKernel::PadWeight() { | |||
| // ABMCI @ ABCICO = ABMCO | |||
| auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | |||
| int ci = inShape[3]; | |||
| int ci4 = UP_DIV(ci, C4NUM); | |||
| int co = outShape[3]; | |||
| int co4 = UP_DIV(co, C4NUM); | |||
| int a = inShape[0]; | |||
| int b = inShape[1]; | |||
| size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float); | |||
| padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); | |||
| size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float); | |||
| padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); | |||
| padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); | |||
| memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size); | |||
| auto origin_weight = in_tensors_.at(kWeightIndex)->MutableData(); | |||
| int divCI = sizeCI.s[1]; | |||
| int divCO = sizeCO.s[1]; | |||
| int co = sizeCO.s[0]; | |||
| auto padWeightFp32 = reinterpret_cast<float *>(padWeight_); | |||
| auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_); | |||
| memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); | |||
| auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->MutableData()); | |||
| auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->MutableData()); | |||
| bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; | |||
| // pad weight | |||
| // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| int index = 0; | |||
| for (int i = 0; i < divCI; ++i) { | |||
| for (int j = 0; j < divCO; ++j) { | |||
| for (int k = 0; k < C4NUM; ++k) { | |||
| for (int l = 0; l < C4NUM; ++l) { | |||
| int src_x = i * C4NUM + l; | |||
| int src_y = j * C4NUM + k; | |||
| if (src_x < sizeCI.s[0] && src_y < sizeCO.s[0]) { | |||
| if (enable_fp16_) { | |||
| if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { | |||
| reinterpret_cast<uint16_t *>(padWeight_)[index++] = | |||
| Float32ToShort(reinterpret_cast<float *>(origin_weight)[src_y * sizeCI.s[0] + src_x]); | |||
| } else { | |||
| reinterpret_cast<uint16_t *>(padWeight_)[index++] = | |||
| reinterpret_cast<uint16_t *>(origin_weight)[src_y * sizeCI.s[0] + src_x]; | |||
| } | |||
| } else { | |||
| if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { | |||
| reinterpret_cast<float *>(padWeight_)[index++] = | |||
| ShortToFloat32(reinterpret_cast<uint16_t *>(origin_weight)[src_y * sizeCI.s[0] + src_x]); | |||
| for (int aa = 0; aa < a; aa++) { | |||
| for (int bb = 0; bb < b; bb++) { | |||
| int baseAB = (aa * b + bb) * ci * co; | |||
| for (int i = 0; i < ci4; ++i) { | |||
| for (int j = 0; j < co4; ++j) { | |||
| for (int k = 0; k < C4NUM; ++k) { | |||
| for (int l = 0; l < C4NUM; ++l) { | |||
| int src_ci = i * C4NUM + l; | |||
| int src_co = j * C4NUM + k; | |||
| if (src_ci < ci && src_co < co) { | |||
| int originId = baseAB + src_ci * co + src_co; | |||
| if (transposeB) { | |||
| originId = baseAB + src_co * ci + src_ci; | |||
| } | |||
| if (enable_fp16_) { | |||
| if (!isModelFp16) { | |||
| padWeightFp16[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| padWeightFp16[index++] = originWeightFp16[originId]; | |||
| } | |||
| } else { | |||
| if (!isModelFp16) { | |||
| padWeightFp32[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| padWeightFp32[index++] = originWeightFp16[originId]; | |||
| } | |||
| } | |||
| } else { | |||
| reinterpret_cast<float *>(padWeight_)[index++] = | |||
| reinterpret_cast<float *>(origin_weight)[src_y * sizeCI.s[0] + src_x]; | |||
| index++; | |||
| } | |||
| } | |||
| } else { | |||
| index++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // pad FC Bias | |||
| size_t im_dst_x, im_dst_y; | |||
| im_dst_x = divCO; | |||
| im_dst_x = co4; | |||
| im_dst_y = 1; | |||
| size_t img_dtype = CL_FLOAT; | |||
| if (enable_fp16_) { | |||
| @@ -126,13 +149,18 @@ void MatMulOpenCLKernel::PadWeight() { | |||
| std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype}; | |||
| bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); | |||
| bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); | |||
| memset(bias_, 0x00, divCO * C4NUM * dtype_size); | |||
| memset(bias_, 0x00, co4 * C4NUM * dtype_size); | |||
| if (in_tensors_.size() >= 3) { | |||
| if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { | |||
| auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData()); | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]); | |||
| } | |||
| } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { | |||
| auto fdata = reinterpret_cast<uint16_t *>(in_tensors_[2]->MutableData()); | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<float *>(bias_)[i] = ShortToFloat32(fdata[i]); | |||
| } | |||
| } else { | |||
| memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size); | |||
| } | |||
| @@ -142,12 +170,23 @@ void MatMulOpenCLKernel::PadWeight() { | |||
| int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t im_dst_x, im_dst_y; | |||
| if (op_format_ == schema::Format::Format_NHWC4) { | |||
| im_dst_x = sizeCO.s[1]; | |||
| im_dst_y = 1; | |||
| } else if (op_format_ == schema::Format::Format_NC4HW4) { | |||
| im_dst_x = 1; | |||
| im_dst_y = sizeCO.s[1]; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| int n = 1, h = 1, w = 1, c = 1; | |||
| if (dims == 2) { | |||
| n = out_shape[0]; | |||
| c = out_shape[1]; | |||
| } else if (dims == 4) { | |||
| n = out_shape[0]; | |||
| h = out_shape[1]; | |||
| w = out_shape[2]; | |||
| c = out_shape[3]; | |||
| } | |||
| if (op_format_ == schema::Format_NHWC4) { | |||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||
| im_dst_y = n * h; | |||
| } else if (op_format_ == schema::Format_NC4HW4) { | |||
| im_dst_x = w; | |||
| im_dst_y = n * UP_DIV(c, C4NUM) * h; | |||
| } else { | |||
| MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); | |||
| return RET_ERROR; | |||
| @@ -166,15 +205,19 @@ int MatMulOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| // local size should less than MAX_GROUP_SIZE | |||
| std::vector<size_t> local = {64, 4}; | |||
| std::vector<size_t> global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; | |||
| std::vector<size_t> local = {32, 4, 1}; | |||
| std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| int arg_count = 0; | |||
| cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; | |||
| cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->MutableData()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->MutableData()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, in_shape); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, out_shape); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0); | |||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| @@ -20,7 +20,7 @@ | |||
| #include <vector> | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| namespace mindspore::kernel { | |||
| @@ -29,7 +29,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, bool hasBias) | |||
| : OpenCLKernel(parameter, inputs, outputs) { | |||
| : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) { | |||
| hasBias_ = hasBias; | |||
| } | |||
| ~MatMulOpenCLKernel() override{}; | |||
| @@ -46,8 +46,12 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| void *bias_; | |||
| bool hasBias_{false}; | |||
| bool enable_fp16_{false}; | |||
| cl_int2 sizeCI; | |||
| cl_int2 sizeCO; | |||
| bool transposeA{false}; | |||
| bool transposeB{true}; | |||
| int dims; | |||
| static constexpr int MAX_DIMS = 4; // max supported matmul dims | |||
| std::vector<int> inShape; | |||
| std::vector<int> outShape; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,166 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <set> | |||
| #include <string> | |||
| #include <map> | |||
| #include "include/errorcode.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/kernel/reduce.h" | |||
| #include "src/runtime/kernel/opencl/cl/reduce.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_NULL_PTR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::RET_PARAM_INVALID; | |||
| using mindspore::schema::PrimitiveType_Mean; | |||
| using mindspore::schema::PrimitiveType_Reduce; | |||
| using mindspore::schema::ReduceMode; | |||
| using mindspore::schema::ReduceMode_ReduceMax; | |||
| using mindspore::schema::ReduceMode_ReduceMean; | |||
| using mindspore::schema::ReduceMode_ReduceMin; | |||
| using mindspore::schema::ReduceMode_ReduceProd; | |||
| using mindspore::schema::ReduceMode_ReduceSum; | |||
| using mindspore::schema::ReduceMode_ReduceSumSquare; | |||
| namespace mindspore::kernel { | |||
| int ReduceOpenCLKernel::Init() { | |||
| InitNHWCShape(); | |||
| auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); | |||
| if (reduce_param == nullptr) { | |||
| return RET_NULL_PTR; | |||
| } | |||
| std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}}; | |||
| if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) { | |||
| MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| if (reduce_param->num_axes_ != 2 || ((reduce_param->axes_[0] != 1 || reduce_param->axes_[1] != 2) && | |||
| (reduce_param->axes_[0] != 2 || reduce_param->axes_[1] != 1))) { | |||
| MS_LOG(ERROR) << "reduce op only support axes HW"; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| std::string kernel_name = reduce_type2str.at(reduce_param->mode_); | |||
| kernel_name += "_" + std::string(EnumNameFormat(op_format_)); | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| enable_fp16_ = ocl_runtime->GetFp16Enable(); | |||
| if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { | |||
| MS_LOG(ERROR) << "Reduce input channel " << in_tensors_[0]->shape().back() << " should equal output channel" | |||
| << out_tensors_[0]->shape().back(); | |||
| return RET_ERROR; | |||
| } | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| std::set<std::string> build_options; | |||
| std::string source = reduce_source; | |||
| ocl_runtime->LoadSource(kernel_name, source); | |||
| ocl_runtime->BuildKernel(kernel_, kernel_name, kernel_name, build_options); | |||
| #endif | |||
| in_ori_format_ = in_tensors_[0]->GetFormat(); | |||
| out_ori_format_ = out_tensors_[0]->GetFormat(); | |||
| in_tensors_[0]->SetFormat(op_format_); | |||
| out_tensors_[0]->SetFormat(op_format_); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| void ReduceOpenCLKernel::InitNHWCShape() { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| size_t n = 1, h = 1, w = 1, c = 1; | |||
| if (shapex.size() == 2) { | |||
| n = shapex[0]; | |||
| c = shapex[1]; | |||
| } else if (shapex.size() == 4) { | |||
| n = shapex[0]; | |||
| h = shapex[1]; | |||
| w = shapex[2]; | |||
| c = shapex[3]; | |||
| } | |||
| nhwc_shape_ = {n, h, w, c}; | |||
| } | |||
| int ReduceOpenCLKernel::ReSize() { return RET_OK; } | |||
| int ReduceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t im_dst_x, im_dst_y; | |||
| if (op_format_ == schema::Format_NHWC4) { | |||
| im_dst_x = nhwc_shape_[2] * UP_DIV(nhwc_shape_[3], C4NUM); | |||
| im_dst_y = nhwc_shape_[0] * nhwc_shape_[1]; | |||
| } else if (op_format_ == schema::Format_NC4HW4) { | |||
| im_dst_x = nhwc_shape_[2]; | |||
| im_dst_y = nhwc_shape_[0] * UP_DIV(nhwc_shape_[3], C4NUM) * nhwc_shape_[1]; | |||
| } else { | |||
| MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); | |||
| return RET_ERROR; | |||
| } | |||
| size_t img_dtype = CL_FLOAT; | |||
| if (enable_fp16_) { | |||
| img_dtype = CL_HALF_FLOAT; | |||
| } | |||
| img_size->clear(); | |||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||
| *img_size = vec; | |||
| return RET_OK; | |||
| } | |||
| int ReduceOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| std::vector<int> shapex = in_tensors_[0]->shape(); | |||
| int h = shapex[1]; | |||
| int w = shapex[2]; | |||
| int c = shapex[3]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| std::vector<size_t> local = {}; | |||
| std::vector<size_t> global = {static_cast<size_t>(c4)}; | |||
| cl_int4 size = {h, w, c4, 1}; | |||
| int arg_idx = 0; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->MutableData()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_idx++, size); | |||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::Context *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/reduce_parameter.h" | |||
| namespace mindspore::kernel { | |||
| class ReduceOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit ReduceOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ReduceOpenCLKernel() override{}; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||
| void InitNHWCShape(); | |||
| private: | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| std::vector<size_t> nhwc_shape_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ | |||
| @@ -73,18 +73,6 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| return RET_ERROR; | |||
| } | |||
| new_tensor->CopyTensor(*in_tensors[i]); | |||
| if ((dst_format == schema::Format::Format_NCHW || dst_format == schema::Format::Format_NC4HW4) && | |||
| (src_format == schema::Format::Format_NHWC || src_format == schema::Format::Format_NHWC4)) { | |||
| auto shape = new_tensor->shape(); | |||
| std::vector<int> dst_shape{shape[0], shape[3], shape[1], shape[2]}; | |||
| new_tensor->set_shape(shape); | |||
| } | |||
| if ((dst_format == schema::Format::Format_NHWC || dst_format == schema::Format::Format_NHWC4) && | |||
| (src_format == schema::Format::Format_NCHW || src_format == schema::Format::Format_NC4HW4)) { | |||
| auto shape = new_tensor->shape(); | |||
| std::vector<int> dst_shape{shape[0], shape[2], shape[3], shape[1]}; | |||
| new_tensor->set_shape(shape); | |||
| } | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| new_tensor->SetFormat(dst_format); | |||
| in_tensors[i]->SetFormat(src_format); | |||
| @@ -127,6 +127,7 @@ if (SUPPORT_GPU) | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reduce.cc | |||
| ) | |||
| endif() | |||
| ### minddata lite | |||
| @@ -315,6 +316,7 @@ if (SUPPORT_GPU) | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/reduce_tests.cc | |||
| ) | |||
| endif() | |||
| @@ -30,7 +30,7 @@ class TestMatMulOpenCL : public mindspore::CommonTest { | |||
| }; | |||
| void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *weight_data, void *output_data, | |||
| bool enable_fp16) { | |||
| bool enable_fp16, int dims) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| size_t dtype_size = sizeof(float); | |||
| @@ -39,20 +39,41 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||
| dtype_size = sizeof(int16_t); | |||
| } | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| int ci = shape[0]; | |||
| int co = shape[1]; | |||
| std::vector<int> input_shape = {1, ci}; | |||
| std::vector<int> input_shape, output_shape, weight_shape; | |||
| if (dims == 2) { | |||
| int ci = shape[0]; | |||
| int co = shape[1]; | |||
| input_shape = {1, ci}; | |||
| output_shape = {1, co}; | |||
| weight_shape = {co, ci}; | |||
| } else if (dims == 4) { | |||
| int a = shape[0]; | |||
| int b = shape[1]; | |||
| int m = shape[2]; | |||
| int ci = shape[3]; | |||
| int co = shape[4]; | |||
| input_shape = {a, b, m, ci}; | |||
| output_shape = {a, b, m, co}; | |||
| weight_shape = {a, b, co, ci}; | |||
| } | |||
| auto param_ptr = std::make_unique<MatMulParameter>(); | |||
| auto param = param_ptr.get(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "param_ptr create error."; | |||
| return; | |||
| } | |||
| param->a_transpose_ = false; | |||
| param->b_transpose_ = true; | |||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| input_shape, schema::Format_NC); | |||
| input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); | |||
| auto tensor_x = tensor_x_ptr.get(); | |||
| if (tensor_x == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_x create error."; | |||
| return; | |||
| } | |||
| std::vector<int> w_shape = {co, ci}; | |||
| auto tensor_w_ptr = | |||
| std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), w_shape); | |||
| auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| weight_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); | |||
| auto tensor_w = tensor_w_ptr.get(); | |||
| if (tensor_w == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_w create error."; | |||
| @@ -60,9 +81,9 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||
| } | |||
| tensor_w->SetData(weight_data); | |||
| std::vector<int> out_shape = {1, co}; | |||
| auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| out_shape, schema::Format_NC); | |||
| auto tensor_out_ptr = | |||
| std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), output_shape, | |||
| dims == 2 ? schema::Format_NC : schema::Format_NHWC); | |||
| auto tensor_out = tensor_out_ptr.get(); | |||
| if (tensor_out == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_out create error."; | |||
| @@ -70,7 +91,8 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto op_kernel_ptr = std::make_unique<kernel::MatMulOpenCLKernel>(nullptr, inputs, outputs, false); | |||
| auto op_kernel_ptr = | |||
| std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false); | |||
| auto op_kernel = op_kernel_ptr.get(); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| @@ -89,12 +111,13 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||
| return; | |||
| } | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->MutableData(), input_data, ci * dtype_size); | |||
| memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size); | |||
| pGraph->Run(); | |||
| if (enable_fp16) { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast<float16_t>(1e-3), 2e-2); | |||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3), | |||
| 2e-2); | |||
| } else { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, co, static_cast<float>(1e-5)); | |||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5)); | |||
| } | |||
| tensor_x->SetData(nullptr); | |||
| @@ -125,7 +148,7 @@ void RunTestCaseMatMul(const std::vector<int> shape, const std::vector<std::stri | |||
| MS_LOG(ERROR) << "output_data load error."; | |||
| return; | |||
| } | |||
| RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16); | |||
| RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16, 2); | |||
| } | |||
| TEST_F(TestMatMulOpenCL, MatMulFp32) { | |||
| @@ -156,7 +179,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_2) { | |||
| std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||
| std::vector<float> output_data = {10.f, 10.f, 10.f}; | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false); | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2); | |||
| } | |||
| TEST_F(TestMatMulOpenCL, MatMulFp16_2) { | |||
| @@ -167,6 +190,40 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) { | |||
| std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||
| std::vector<float16_t> output_data = {10.f, 10.f, 10.f}; | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true); | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2); | |||
| } | |||
| TEST_F(TestMatMulOpenCL, MatMulFp32_4D) { | |||
| int a = 1; | |||
| int b = 2; | |||
| int c = 2; | |||
| int ci = 5; | |||
| int co = 3; | |||
| std::vector<int> shape = {a, b, c, ci, co}; | |||
| std::vector<float> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||
| std::vector<float> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, | |||
| 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, | |||
| 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; | |||
| std::vector<float> output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f, | |||
| 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4); | |||
| } | |||
| TEST_F(TestMatMulOpenCL, MatMulFp16_4D) { | |||
| int a = 1; | |||
| int b = 2; | |||
| int c = 2; | |||
| int ci = 5; | |||
| int co = 3; | |||
| std::vector<int> shape = {a, b, c, ci, co}; | |||
| std::vector<float16_t> input_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||
| std::vector<float16_t> weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, | |||
| 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, | |||
| 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f}; | |||
| std::vector<float16_t> output_data = {15.0f, 40.0f, 65.0f, 15.0f, 40.0f, 65.0f, | |||
| 90.0f, 115.0f, 140.0f, 90.0f, 115.0f, 140.0f}; | |||
| RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 4); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,156 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h" | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| namespace mindspore { | |||
| class TestReduceOpenCL : public mindspore::CommonTest { | |||
| public: | |||
| TestReduceOpenCL() {} | |||
| }; | |||
| void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16, | |||
| int reduce_mode) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| size_t dtype_size = sizeof(float); | |||
| if (enable_fp16) { | |||
| ocl_runtime->SetFp16Enable(true); | |||
| dtype_size = sizeof(float16_t); | |||
| } | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| auto param_ptr = std::make_unique<ReduceParameter>(); | |||
| auto param = param_ptr.get(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "param_ptr create error."; | |||
| return; | |||
| } | |||
| param->axes_[0] = 1; | |||
| param->axes_[1] = 2; | |||
| param->num_axes_ = 2; | |||
| param->mode_ = reduce_mode; | |||
| int n = shape[0]; | |||
| int h = shape[1]; | |||
| int w = shape[2]; | |||
| int c = shape[3]; | |||
| std::vector<int> input_shape = {n, h, w, c}; | |||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| input_shape, schema::Format_NHWC); | |||
| auto tensor_x = tensor_x_ptr.get(); | |||
| if (tensor_x == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_x create error."; | |||
| return; | |||
| } | |||
| std::vector<int> out_shape = {n, c}; | |||
| auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| out_shape, schema::Format_NC); | |||
| auto tensor_out = tensor_out_ptr.get(); | |||
| if (tensor_out == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_out create error."; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.get(); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{arith_kernel}; | |||
| auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels); | |||
| auto pGraph = pGraph_ptr.get(); | |||
| if (pGraph == nullptr) { | |||
| MS_LOG(ERROR) << "pGraph create error."; | |||
| return; | |||
| } | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size); | |||
| pGraph->Run(); | |||
| if (enable_fp16) { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), | |||
| 2e-2); | |||
| } else { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5)); | |||
| } | |||
| inputs[0]->SetData(nullptr); | |||
| outputs[0]->SetData(nullptr); | |||
| MS_LOG(INFO) << "Test Reduce passed"; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| TEST_F(TestReduceOpenCL, ReduceMeanFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> shape = {n, h, w, c}; | |||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; | |||
| std::vector<float> output_data = {4.5f, 5.5f, 6.5f}; | |||
| RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean); | |||
| } | |||
| TEST_F(TestReduceOpenCL, ReduceMeanFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> shape = {n, h, w, c}; | |||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; | |||
| std::vector<float16_t> output_data = {4.5f, 5.5f, 6.5f}; | |||
| RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean); | |||
| } | |||
| TEST_F(TestReduceOpenCL, ReduceSumFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> shape = {n, h, w, c}; | |||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; | |||
| std::vector<float> output_data = {18.0f, 22.0f, 26.0f}; | |||
| RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum); | |||
| } | |||
| TEST_F(TestReduceOpenCL, ReduceSumFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> shape = {n, h, w, c}; | |||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; | |||
| std::vector<float16_t> output_data = {18.0f, 22.0f, 26.0f}; | |||
| RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum); | |||
| } | |||
| } // namespace mindspore | |||