| @@ -0,0 +1,106 @@ | |||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||||
| #define C4NUM 4 | |||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||||
| __kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int2 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | |||||
| int gidz = get_global_id(2); // N | |||||
| int lidx = get_local_id(0); | |||||
| int lidy = get_local_id(1); | |||||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||||
| int hwci4 = ci4 * in_shape.y * in_shape.z; | |||||
| int co4 = UP_DIV(out_shape.y, C4NUM); | |||||
| int n = out_shape.x; | |||||
| bool inside = gidx < co4 && gidz < n; | |||||
| FLT4 result = (FLT4)(0.0f); | |||||
| for (uint i = lidy; i < hwci4 && inside; i += 4) { | |||||
| int index_h = i / (ci4 * in_shape.z); | |||||
| int index_wci4 = i % (ci4 * in_shape.z); | |||||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h)); | |||||
| FLT16 w = weight[i * co4 + gidx]; | |||||
| result.x += dot(v, w.s0123); | |||||
| result.y += dot(v, w.s4567); | |||||
| result.z += dot(v, w.s89ab); | |||||
| result.w += dot(v, w.scdef); | |||||
| } | |||||
| __local FLT4 temp[32][4]; | |||||
| temp[lidx][lidy] = result; | |||||
| barrier(CLK_LOCAL_MEM_FENCE); | |||||
| if (lidy == 0 && inside) { | |||||
| result += temp[lidx][1]; | |||||
| result += temp[lidx][2]; | |||||
| result += temp[lidx][3]; | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||||
| } | |||||
| } | |||||
| __kernel void FullConnection_NHWC4_ReLU(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int2 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | |||||
| int gidz = get_global_id(2); // N | |||||
| int lidx = get_local_id(0); | |||||
| int lidy = get_local_id(1); | |||||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||||
| int hwci4 = ci4 * in_shape.y * in_shape.z; | |||||
| int co4 = UP_DIV(out_shape.y, C4NUM); | |||||
| int n = out_shape.x; | |||||
| bool inside = gidx < co4 && gidz < n; | |||||
| FLT4 result = (FLT4)(0.0f); | |||||
| for (uint i = lidy; i < hwci4 && inside; i += 4) { | |||||
| int index_h = i / (ci4 * in_shape.z); | |||||
| int index_wci4 = i % (ci4 * in_shape.z); | |||||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h)); | |||||
| FLT16 w = weight[i * co4 + gidx]; | |||||
| result.x += dot(v, w.s0123); | |||||
| result.y += dot(v, w.s4567); | |||||
| result.z += dot(v, w.s89ab); | |||||
| result.w += dot(v, w.scdef); | |||||
| } | |||||
| __local FLT4 temp[32][4]; | |||||
| temp[lidx][lidy] = result; | |||||
| barrier(CLK_LOCAL_MEM_FENCE); | |||||
| if (lidy == 0 && inside) { | |||||
| result += temp[lidx][1]; | |||||
| result += temp[lidx][2]; | |||||
| result += temp[lidx][3]; | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| result = max(result, (FLT4)(0.f)); | |||||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||||
| } | |||||
| } | |||||
| __kernel void FullConnection_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int2 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | |||||
| int gidz = get_global_id(2); // N | |||||
| int lidx = get_local_id(0); | |||||
| int lidy = get_local_id(1); | |||||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||||
| int hwci4 = ci4 * in_shape.y * in_shape.z; | |||||
| int co4 = UP_DIV(out_shape.y, C4NUM); | |||||
| int n = out_shape.x; | |||||
| bool inside = gidx < co4 && gidz < n; | |||||
| FLT4 result = (FLT4)(0.0f); | |||||
| for (uint i = lidy; i < hwci4 && inside; i += 4) { | |||||
| int index_ci4h = i / in_shape.z; | |||||
| int index_w = i % in_shape.z; | |||||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_w, gidz * in_shape.y * ci4 + index_ci4h)); | |||||
| FLT16 w = weight[i * co4 + gidx]; | |||||
| result.x += dot(v, w.s0123); | |||||
| result.y += dot(v, w.s4567); | |||||
| result.z += dot(v, w.s89ab); | |||||
| result.w += dot(v, w.scdef); | |||||
| } | |||||
| __local FLT4 temp[32][4]; | |||||
| temp[lidx][lidy] = result; | |||||
| barrier(CLK_LOCAL_MEM_FENCE); | |||||
| if (lidy == 0 && inside) { | |||||
| result += temp[lidx][1]; | |||||
| result += temp[lidx][2]; | |||||
| result += temp[lidx][3]; | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result); | |||||
| } | |||||
| } | |||||
| @@ -2,8 +2,8 @@ | |||||
| #define C4NUM 4 | #define C4NUM 4 | ||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | ||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||||
| __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||||
| int4 in_shape, int4 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | int gidx = get_global_id(0); // CO4 | ||||
| int gidz = get_global_id(2); // N | int gidz = get_global_id(2); // N | ||||
| int lidx = get_local_id(0); | int lidx = get_local_id(0); | ||||
| @@ -21,7 +21,6 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh | |||||
| result.z += dot(v, w.s89ab); | result.z += dot(v, w.s89ab); | ||||
| result.w += dot(v, w.scdef); | result.w += dot(v, w.scdef); | ||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||||
| __local FLT4 temp[32][4]; | __local FLT4 temp[32][4]; | ||||
| temp[lidx][lidy] = result; | temp[lidx][lidy] = result; | ||||
| barrier(CLK_LOCAL_MEM_FENCE); | barrier(CLK_LOCAL_MEM_FENCE); | ||||
| @@ -29,15 +28,12 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh | |||||
| result += temp[lidx][1]; | result += temp[lidx][1]; | ||||
| result += temp[lidx][2]; | result += temp[lidx][2]; | ||||
| result += temp[lidx][3]; | result += temp[lidx][3]; | ||||
| if (has_bias != 0) { | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | WRITE_IMAGE(output, (int2)(gidx, gidz), result); | ||||
| } | } | ||||
| } | } | ||||
| __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||||
| __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||||
| int4 in_shape, int4 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | int gidx = get_global_id(0); // CO4 | ||||
| int gidz = get_global_id(2); // N | int gidz = get_global_id(2); // N | ||||
| int lidx = get_local_id(0); | int lidx = get_local_id(0); | ||||
| @@ -62,15 +58,12 @@ __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weig | |||||
| result += temp[lidx][1]; | result += temp[lidx][1]; | ||||
| result += temp[lidx][2]; | result += temp[lidx][2]; | ||||
| result += temp[lidx][3]; | result += temp[lidx][3]; | ||||
| if (has_bias != 0) { | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result); | |||||
| WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result); | |||||
| } | } | ||||
| } | } | ||||
| __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||||
| __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||||
| int4 in_shape, int4 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | int gidx = get_global_id(0); // CO4 | ||||
| int gidy = get_global_id(1); // N * H * 4 | int gidy = get_global_id(1); // N * H * 4 | ||||
| int gidz = get_global_id(2); // W | int gidz = get_global_id(2); // W | ||||
| @@ -99,15 +92,12 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh | |||||
| result += temp[lidx][1]; | result += temp[lidx][1]; | ||||
| result += temp[lidx][2]; | result += temp[lidx][2]; | ||||
| result += temp[lidx][3]; | result += temp[lidx][3]; | ||||
| if (has_bias != 0) { | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | ||||
| } | } | ||||
| } | } | ||||
| __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||||
| __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) { | |||||
| __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||||
| int4 in_shape, int4 out_shape) { | |||||
| int gidx = get_global_id(0); // CO4 | int gidx = get_global_id(0); // CO4 | ||||
| int gidy = get_global_id(1); // N * H * 4 | int gidy = get_global_id(1); // N * H * 4 | ||||
| int gidz = get_global_id(2); // W | int gidz = get_global_id(2); // W | ||||
| @@ -138,9 +128,6 @@ __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weig | |||||
| result += temp[lidx][1]; | result += temp[lidx][1]; | ||||
| result += temp[lidx][2]; | result += temp[lidx][2]; | ||||
| result += temp[lidx][3]; | result += temp[lidx][3]; | ||||
| if (has_bias != 0) { | |||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); | WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); | ||||
| } | } | ||||
| } | } | ||||
| @@ -65,6 +65,35 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | ||||
| } | } | ||||
| __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||||
| const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| // boundary check | |||||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| FLT4 maximum = (FLT4)(-10000.0f); | |||||
| int xs = X * stride.x - padding.x; | |||||
| int ys = Y * stride.y - padding.y; | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||||
| maximum = max(src, maximum); | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | |||||
| } | |||||
| __kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, | __kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, | ||||
| const int4 input_shape, const int4 output_shape, const int2 stride, | const int4 input_shape, const int4 output_shape, const int2 stride, | ||||
| const int2 kernel_size, const int2 padding) { | const int2 kernel_size, const int2 padding) { | ||||
| @@ -2,27 +2,23 @@ | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) { | __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) { | |||||
| if (X >= size_out.x * size_out.y * size_out.z * size_out.w) { | |||||
| return; | return; | ||||
| } | } | ||||
| int out_index = X * size_out.y + Y; | |||||
| int ih = out_index / size.y; | |||||
| int iw = out_index % size.y; | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih))); | |||||
| int in_img_x = size.z * size.w; | |||||
| int out_img_x = size_out.z * size_out.w; | |||||
| WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x), | |||||
| READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x))); | |||||
| } | } | ||||
| __kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, | __kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, | ||||
| int4 size_out) { | int4 size_out) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) { | |||||
| if (X >= size_out.x * size_out.y * size_out.z * size_out.w) { | |||||
| return; | return; | ||||
| } | } | ||||
| int out_index = X * size_out.y + Y; | |||||
| int ih = out_index / size.y; | |||||
| int iw = out_index % size.y; | |||||
| WRITE_IMAGE(dst_data, (int2)(Y, Z * size_out.x + X), READ_IMAGE(src_data, smp_zero, (int2)(iw, Z * size.x + ih))); | |||||
| int in_img_x = size.z; | |||||
| int out_img_x = size_out.z; | |||||
| WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x), | |||||
| READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x))); | |||||
| } | } | ||||
| @@ -0,0 +1,254 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies n., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <set> | |||||
| #include <string> | |||||
| #include <map> | |||||
| #include "nnacl/fp32/common_func.h" | |||||
| #include "src/kernel_registry.h" | |||||
| #include "src/runtime/kernel/opencl/kernel/fullconnection.h" | |||||
| #ifndef PROGRAM_WITH_IL | |||||
| #include "src/runtime/kernel/opencl/cl/fullconnection.cl.inc" | |||||
| #endif | |||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::schema::PrimitiveType_FullConnection; | |||||
| namespace mindspore::kernel { | |||||
| int FullConnectionOpenCLKernel::Init() { | |||||
| std::string kernel_name = "FullConnection"; | |||||
| kernel_name += "_" + std::string(EnumNameFormat(op_format_)); | |||||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| transposeA = param->a_transpose_; | |||||
| if (transposeA) { | |||||
| MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| transposeB = param->b_transpose_; | |||||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||||
| if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) || | |||||
| out_tensors_[0]->shape().size() != 2) { | |||||
| MS_LOG(ERROR) << "fullconnection only support input shape size = 2 or 4"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (in_tensors_[0]->shape().size() == 4) { | |||||
| if (in_tensors_[0]->shape()[3] % C4NUM != 0) { | |||||
| MS_LOG(ERROR) << "fullconnection only support input shape channel % 4 = 0 if input shape size = 4"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], | |||||
| in_tensors_[0]->shape()[3]}; | |||||
| } else { | |||||
| inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]}; | |||||
| } | |||||
| outShape = out_tensors_[0]->shape(); | |||||
| switch (param->act_type_) { | |||||
| case ActType_No: | |||||
| break; | |||||
| case ActType_Relu: | |||||
| kernel_name += "_ReLU"; | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| #ifdef PROGRAM_WITH_IL | |||||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||||
| #else | |||||
| std::set<std::string> build_options; | |||||
| std::string source = fullconnection_source; | |||||
| std::string program_name = "FullConnection"; | |||||
| ocl_runtime_->LoadSource(program_name, source); | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||||
| #endif | |||||
| PadWeight(); | |||||
| in_ori_format_ = in_tensors_[0]->GetFormat(); | |||||
| out_ori_format_ = out_tensors_[0]->GetFormat(); | |||||
| in_tensors_[0]->SetFormat(op_format_); | |||||
| out_tensors_[0]->SetFormat(op_format_); | |||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||||
| return RET_OK; | |||||
| } | |||||
| int FullConnectionOpenCLKernel::ReSize() { return RET_OK; } | |||||
| void FullConnectionOpenCLKernel::PadWeight() { | |||||
| // ABMCI @ ABCICO = ABMCO | |||||
| auto allocator = ocl_runtime_->GetAllocator(); | |||||
| int ci = inShape[1] * inShape[2] * inShape[3]; | |||||
| int ci4 = UP_DIV(ci, C4NUM); | |||||
| int co = outShape[1]; | |||||
| int co4 = UP_DIV(co, C4NUM); | |||||
| int a = 1; | |||||
| int b = 1; | |||||
| size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float); | |||||
| padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); | |||||
| padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true); | |||||
| auto padWeightFp32 = reinterpret_cast<float *>(padWeight_); | |||||
| auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_); | |||||
| memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size); | |||||
| auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c()); | |||||
| auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c()); | |||||
| bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; | |||||
| // pad weight | |||||
| // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||||
| // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||||
| int index = 0; | |||||
| for (int aa = 0; aa < a; aa++) { | |||||
| for (int bb = 0; bb < b; bb++) { | |||||
| int baseAB = (aa * b + bb) * ci * co; | |||||
| for (int i = 0; i < ci4; ++i) { | |||||
| for (int j = 0; j < co4; ++j) { | |||||
| for (int k = 0; k < C4NUM; ++k) { | |||||
| for (int l = 0; l < C4NUM; ++l) { | |||||
| int src_ci = i * C4NUM + l; | |||||
| int src_co = j * C4NUM + k; | |||||
| if (src_ci < ci && src_co < co) { | |||||
| int originId = baseAB + src_ci * co + src_co; | |||||
| if (transposeB) { | |||||
| originId = baseAB + src_co * ci + src_ci; | |||||
| } | |||||
| if (enable_fp16_) { | |||||
| if (!isModelFp16) { | |||||
| padWeightFp16[index++] = originWeightFp32[originId]; | |||||
| } else { | |||||
| padWeightFp16[index++] = originWeightFp16[originId]; | |||||
| } | |||||
| } else { | |||||
| if (!isModelFp16) { | |||||
| padWeightFp32[index++] = originWeightFp32[originId]; | |||||
| } else { | |||||
| padWeightFp32[index++] = originWeightFp16[originId]; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| index++; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| allocator->UnmapBuffer(padWeight_); | |||||
| // pad FC Bias | |||||
| size_t im_dst_x, im_dst_y; | |||||
| im_dst_x = co4; | |||||
| im_dst_y = 1; | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| if (enable_fp16_) { | |||||
| img_dtype = CL_HALF_FLOAT; | |||||
| } | |||||
| std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype}; | |||||
| bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); | |||||
| bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); | |||||
| memset(bias_, 0x00, co4 * C4NUM * dtype_size); | |||||
| if (in_tensors_.size() >= 3) { | |||||
| if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { | |||||
| for (int i = 0; i < co; i++) { | |||||
| reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i]; | |||||
| } | |||||
| } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { | |||||
| for (int i = 0; i < co; i++) { | |||||
| reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i]; | |||||
| } | |||||
| } else { | |||||
| memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size); | |||||
| } | |||||
| } | |||||
| allocator->UnmapBuffer(bias_); | |||||
| } | |||||
| int FullConnectionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| size_t im_dst_x, im_dst_y; | |||||
| auto out_shape = out_tensors_[0]->shape(); | |||||
| int n = 1, h = 1, w = 1, c = 1; | |||||
| if (out_tensors_[0]->shape().size() == 2) { | |||||
| n = out_shape[0]; | |||||
| c = out_shape[1]; | |||||
| } else { | |||||
| n = out_shape[0]; | |||||
| h = out_shape[1]; | |||||
| w = out_shape[2]; | |||||
| c = out_shape[3]; | |||||
| } | |||||
| if (op_format_ == schema::Format_NHWC4) { | |||||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||||
| im_dst_y = n * h; | |||||
| } else if (op_format_ == schema::Format_NC4HW4) { | |||||
| im_dst_x = w; | |||||
| im_dst_y = n * UP_DIV(c, C4NUM) * h; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); | |||||
| return RET_ERROR; | |||||
| } | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| if (enable_fp16_) { | |||||
| img_dtype = CL_HALF_FLOAT; | |||||
| } | |||||
| img_size->clear(); | |||||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||||
| *img_size = vec; | |||||
| return RET_OK; | |||||
| } | |||||
| int FullConnectionOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||||
| // local size should less than MAX_GROUP_SIZE | |||||
| std::vector<size_t> local = {32, 4, 1}; | |||||
| std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[1]), C4NUM), 4, static_cast<size_t>(outShape[0])}; | |||||
| int arg_count = 0; | |||||
| cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; | |||||
| cl_int2 out_shape = {outShape[0], outShape[1]}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | |||||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||||
| return RET_OK; | |||||
| } | |||||
| kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, | |||||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||||
| const kernel::KernelKey &desc, | |||||
| const mindspore::lite::PrimitiveC *primitive) { | |||||
| auto *kernel = | |||||
| new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = kernel->Init(); | |||||
| if (ret != RET_OK) { | |||||
| delete kernel; | |||||
| return nullptr; | |||||
| } | |||||
| return kernel; | |||||
| } | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator) | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator) | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,52 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_ | |||||
| #include <vector> | |||||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||||
| #include "nnacl/matmul_parameter.h" | |||||
| namespace mindspore::kernel { | |||||
| class FullConnectionOpenCLKernel : public OpenCLKernel { | |||||
| public: | |||||
| explicit FullConnectionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs) | |||||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||||
| ~FullConnectionOpenCLKernel() override{}; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| void PadWeight(); | |||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||||
| private: | |||||
| cl::Kernel kernel_; | |||||
| void *padWeight_; | |||||
| void *bias_; | |||||
| bool enable_fp16_{false}; | |||||
| bool transposeA{false}; | |||||
| bool transposeB{true}; | |||||
| std::vector<int> inShape; | |||||
| std::vector<int> outShape; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_ | |||||
| @@ -26,7 +26,6 @@ | |||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::schema::PrimitiveType_FullConnection; | |||||
| using mindspore::schema::PrimitiveType_MatMul; | using mindspore::schema::PrimitiveType_MatMul; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| @@ -135,33 +134,7 @@ void MatMulOpenCLKernel::PadWeight() { | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| // pad FC Bias | |||||
| size_t im_dst_x, im_dst_y; | |||||
| im_dst_x = co4; | |||||
| im_dst_y = 1; | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| if (enable_fp16_) { | |||||
| img_dtype = CL_HALF_FLOAT; | |||||
| } | |||||
| std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype}; | |||||
| bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size); | |||||
| bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); | |||||
| memset(bias_, 0x00, co4 * C4NUM * dtype_size); | |||||
| if (in_tensors_.size() >= 3) { | |||||
| if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { | |||||
| for (int i = 0; i < co; i++) { | |||||
| reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i]; | |||||
| } | |||||
| } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { | |||||
| for (int i = 0; i < co; i++) { | |||||
| reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i]; | |||||
| } | |||||
| } else { | |||||
| memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size); | |||||
| } | |||||
| } | |||||
| allocator->UnmapBuffer(bias_); | |||||
| allocator->UnmapBuffer(padWeight_); | |||||
| } | } | ||||
| int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | ||||
| @@ -209,11 +182,9 @@ int MatMulOpenCLKernel::Run() { | |||||
| cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; | cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0); | |||||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -222,12 +193,7 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> | |||||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | ||||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | const lite::InnerContext *ctx, const kernel::KernelKey &desc, | ||||
| const mindspore::lite::PrimitiveC *primitive) { | const mindspore::lite::PrimitiveC *primitive) { | ||||
| bool hasBias = false; | |||||
| if (opParameter->type_ == PrimitiveType_FullConnection) { | |||||
| hasBias = (reinterpret_cast<MatMulParameter *>(opParameter))->has_bias_; | |||||
| } | |||||
| auto *kernel = | |||||
| new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, hasBias); | |||||
| auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -241,7 +207,5 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> | |||||
| } | } | ||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | ||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator) | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | ||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator) | |||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -27,10 +27,8 @@ namespace mindspore::kernel { | |||||
| class MatMulOpenCLKernel : public OpenCLKernel { | class MatMulOpenCLKernel : public OpenCLKernel { | ||||
| public: | public: | ||||
| explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, bool hasBias) | |||||
| : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) { | |||||
| hasBias_ = hasBias; | |||||
| } | |||||
| const std::vector<lite::Tensor *> &outputs) | |||||
| : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {} | |||||
| ~MatMulOpenCLKernel() override{}; | ~MatMulOpenCLKernel() override{}; | ||||
| int Init() override; | int Init() override; | ||||
| @@ -42,8 +40,6 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| void *padWeight_; | void *padWeight_; | ||||
| void *bias_; | |||||
| bool hasBias_{false}; | |||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| bool transposeA{false}; | bool transposeA{false}; | ||||
| bool transposeB{true}; | bool transposeB{true}; | ||||
| @@ -57,6 +57,16 @@ int PoolingOpenCLKernel::Init() { | |||||
| MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!"; | MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!"; | ||||
| return RET_INVALID_OP_NAME; | return RET_INVALID_OP_NAME; | ||||
| } | } | ||||
| switch (parameter_->act_type_) { | |||||
| case ActType_No: | |||||
| break; | |||||
| case ActType_Relu: | |||||
| kernel_name += "_ReLU"; | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | enable_fp16_ = ocl_runtime_->GetFp16Enable(); | ||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | ||||
| @@ -38,11 +38,24 @@ int ReshapeOpenCLKernel::Init() { | |||||
| MS_LOG(ERROR) << "Reshape output size should in 2,4"; | MS_LOG(ERROR) << "Reshape output size should in 2,4"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { | |||||
| MS_LOG(ERROR) << "Reshape input channel " << in_tensors_[0]->shape().back() << " should equal output channel" | |||||
| << out_tensors_[0]->shape().back(); | |||||
| if ((in_tensors_[0]->shape().back() % 4 != 0 || out_tensors_[0]->shape().back() % 4 != 0) && | |||||
| in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { | |||||
| MS_LOG(ERROR) << "Reshape input channel align 4 should equal output channel, cin:" << in_tensors_[0]->shape().back() | |||||
| << " cout:" << out_tensors_[0]->shape().back(); | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (in_tensors_[0]->shape().size() == 2) { | |||||
| inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]}; | |||||
| } else { | |||||
| inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], | |||||
| in_tensors_[0]->shape()[3]}; | |||||
| } | |||||
| if (out_tensors_[0]->shape().size() == 2) { | |||||
| outShape = {out_tensors_[0]->shape()[0], 1, 1, out_tensors_[0]->shape()[1]}; | |||||
| } else { | |||||
| outShape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], | |||||
| out_tensors_[0]->shape()[3]}; | |||||
| } | |||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | ||||
| #else | #else | ||||
| @@ -64,18 +77,10 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; } | |||||
| int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | ||||
| size_t im_dst_x, im_dst_y; | size_t im_dst_x, im_dst_y; | ||||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||||
| int n, h, w, c; | |||||
| if (shapex.size() == 2) { | |||||
| n = shapex[0]; | |||||
| h = w = 1; | |||||
| c = shapex[1]; | |||||
| } else { | |||||
| n = shapex[0]; | |||||
| h = shapex[1]; | |||||
| w = shapex[2]; | |||||
| c = shapex[3]; | |||||
| } | |||||
| int n = outShape[0]; | |||||
| int h = outShape[1]; | |||||
| int w = outShape[2]; | |||||
| int c = outShape[3]; | |||||
| if (op_format_ == schema::Format::Format_NHWC4) { | if (op_format_ == schema::Format::Format_NHWC4) { | ||||
| im_dst_x = w * UP_DIV(c, C4NUM); | im_dst_x = w * UP_DIV(c, C4NUM); | ||||
| im_dst_y = n * h; | im_dst_y = n * h; | ||||
| @@ -98,22 +103,12 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) | |||||
| int ReshapeOpenCLKernel::Run() { | int ReshapeOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | MS_LOG(DEBUG) << this->name() << " Running!"; | ||||
| std::vector<int> shapex = in_tensors_[0]->shape(); | |||||
| int h = shapex[1]; | |||||
| int w = shapex[2]; | |||||
| int c = shapex[3]; | |||||
| int c4 = UP_DIV(c, C4NUM); | |||||
| int oh, ow; | |||||
| if (out_tensors_[0]->shape().size() == 2) { | |||||
| oh = ow = 1; | |||||
| } else { | |||||
| oh = out_tensors_[0]->shape()[1]; | |||||
| ow = out_tensors_[0]->shape()[2]; | |||||
| } | |||||
| std::vector<size_t> local = {}; | std::vector<size_t> local = {}; | ||||
| std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4}; | |||||
| cl_int4 size = {h, w, c4, 1}; | |||||
| cl_int4 size_out = {oh, ow, c4, 1}; | |||||
| std::vector<size_t> global = { | |||||
| static_cast<size_t>(outShape[0] * outShape[1] * outShape[2] * UP_DIV(outShape[3], C4NUM))}; | |||||
| cl_int4 size = {inShape[0], inShape[1], inShape[2], UP_DIV(inShape[3], C4NUM)}; | |||||
| cl_int4 size_out = {outShape[0], outShape[1], outShape[2], UP_DIV(outShape[3], C4NUM)}; | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| @@ -38,6 +38,8 @@ class ReshapeOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| std::vector<int> inShape; | |||||
| std::vector<int> outShape; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -31,7 +31,6 @@ ml_hardware_liveness | |||||
| ml_liveness_detect_landmark | ml_liveness_detect_landmark | ||||
| ml_face_contour | ml_face_contour | ||||
| 2012_ATLANTA_1class_20190621_v4.x_nomean | 2012_ATLANTA_1class_20190621_v4.x_nomean | ||||
| ml_handpose | |||||
| ml_ocr_sfz_add_final_0325 | ml_ocr_sfz_add_final_0325 | ||||
| ml_hardware_pose | ml_hardware_pose | ||||
| ml_bank_recog | ml_bank_recog | ||||
| @@ -0,0 +1,196 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include "src/common/log_adapter.h" | |||||
| #include "common/common_test.h" | |||||
| #include "mindspore/lite/src/common/file_utils.h" | |||||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h" | |||||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||||
| namespace mindspore { | |||||
| class TestFullConnectionOpenCL : public mindspore::CommonTest { | |||||
| public: | |||||
| TestFullConnectionOpenCL() {} | |||||
| }; | |||||
| void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, void *weight_data, void *bias_data, | |||||
| void *output_data, bool enable_fp16, int dims) { | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||||
| ocl_runtime->Init(); | |||||
| size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); | |||||
| ocl_runtime->SetFp16Enable(enable_fp16); | |||||
| auto allocator = ocl_runtime->GetAllocator(); | |||||
| std::vector<int> input_shape, output_shape, weight_shape, bias_shape; | |||||
| if (dims == 2) { | |||||
| int ci = shape[0]; | |||||
| int co = shape[1]; | |||||
| input_shape = {1, ci}; | |||||
| output_shape = {1, co}; | |||||
| weight_shape = {co, ci}; | |||||
| bias_shape = {co}; | |||||
| } else if (dims == 4) { | |||||
| int n = shape[0]; | |||||
| int h = shape[1]; | |||||
| int w = shape[2]; | |||||
| int ci = shape[3]; | |||||
| int co = shape[4]; | |||||
| input_shape = {n, h, w, ci}; | |||||
| output_shape = {n, co}; | |||||
| weight_shape = {co, h * w * ci}; | |||||
| bias_shape = {co}; | |||||
| } | |||||
| auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter))); | |||||
| if (param == nullptr) { | |||||
| MS_LOG(ERROR) << "param_ptr create error."; | |||||
| return; | |||||
| } | |||||
| param->a_transpose_ = false; | |||||
| param->b_transpose_ = true; | |||||
| param->has_bias_ = true; | |||||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||||
| input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); | |||||
| auto tensor_x = tensor_x_ptr.get(); | |||||
| if (tensor_x == nullptr) { | |||||
| MS_LOG(ERROR) << "tensor_x create error."; | |||||
| return; | |||||
| } | |||||
| auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||||
| weight_shape, schema::Format_NC); | |||||
| auto tensor_w = tensor_w_ptr.get(); | |||||
| if (tensor_w == nullptr) { | |||||
| MS_LOG(ERROR) << "tensor_w create error."; | |||||
| return; | |||||
| } | |||||
| tensor_w->SetData(weight_data); | |||||
| auto tensor_bias_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||||
| bias_shape, schema::Format_NC); | |||||
| auto tensor_bias = tensor_bias_ptr.get(); | |||||
| if (tensor_bias == nullptr) { | |||||
| MS_LOG(ERROR) << "tensor_w create error."; | |||||
| return; | |||||
| } | |||||
| tensor_bias->SetData(bias_data); | |||||
| auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||||
| output_shape, schema::Format_NC); | |||||
| auto tensor_out = tensor_out_ptr.get(); | |||||
| if (tensor_out == nullptr) { | |||||
| MS_LOG(ERROR) << "tensor_out create error."; | |||||
| return; | |||||
| } | |||||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias}; | |||||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||||
| auto op_kernel_ptr = | |||||
| std::make_unique<kernel::FullConnectionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||||
| auto op_kernel = op_kernel_ptr.release(); | |||||
| if (op_kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "op_kernel create error."; | |||||
| return; | |||||
| } | |||||
| op_kernel->Init(); | |||||
| inputs[0]->MallocData(allocator); | |||||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||||
| std::vector<lite::Tensor *> inputs_g{tensor_x}; | |||||
| auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels); | |||||
| auto pGraph = pGraph_ptr.get(); | |||||
| if (pGraph == nullptr) { | |||||
| MS_LOG(ERROR) << "pGraph create error."; | |||||
| return; | |||||
| } | |||||
| pGraph->Init(); | |||||
| memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size); | |||||
| pGraph->Run(); | |||||
| if (enable_fp16) { | |||||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3), | |||||
| 2e-2); | |||||
| } else { | |||||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5)); | |||||
| } | |||||
| for (auto t : inputs) { | |||||
| t->SetData(nullptr); | |||||
| } | |||||
| for (auto t : outputs) { | |||||
| t->SetData(nullptr); | |||||
| } | |||||
| MS_LOG(INFO) << "TestFullConnection passed"; | |||||
| } | |||||
| TEST_F(TestFullConnectionOpenCL, FullConnection2DFp32) { | |||||
| int ci = 5; | |||||
| int co = 3; | |||||
| std::vector<int> shape = {ci, co}; | |||||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f}; | |||||
| std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float> bias_data = {1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float> output_data = {11.f, 11.f, 11.f}; | |||||
| RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false, | |||||
| 2); | |||||
| } | |||||
| TEST_F(TestFullConnectionOpenCL, FullConnection2DFp16) { | |||||
| int ci = 5; | |||||
| int co = 3; | |||||
| std::vector<int> shape = {ci, co}; | |||||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f}; | |||||
| std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float16_t> bias_data = {1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float16_t> output_data = {11.f, 11.f, 11.f}; | |||||
| RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true, | |||||
| 2); | |||||
| } | |||||
| TEST_F(TestFullConnectionOpenCL, FullConnection4DFp32) { | |||||
| int n = 1; | |||||
| int h = 2; | |||||
| int w = 1; | |||||
| int c = 4; | |||||
| int co = 2; | |||||
| std::vector<int> shape = {n, h, w, c, co}; | |||||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | |||||
| std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float> bias_data = {1.0f, 1.0f}; | |||||
| std::vector<float> output_data = {29.f, 29.f}; | |||||
| RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false, | |||||
| 4); | |||||
| } | |||||
| TEST_F(TestFullConnectionOpenCL, FullConnection4DFp16) { | |||||
| int n = 1; | |||||
| int h = 2; | |||||
| int w = 1; | |||||
| int c = 4; | |||||
| int co = 2; | |||||
| std::vector<int> shape = {n, h, w, c, co}; | |||||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | |||||
| std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, | |||||
| 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| std::vector<float16_t> bias_data = {1.0f, 1.0f}; | |||||
| std::vector<float16_t> output_data = {29.f, 29.f}; | |||||
| RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true, | |||||
| 4); | |||||
| } | |||||
| } // namespace mindspore | |||||
| @@ -88,7 +88,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; | std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; | ||||
| std::vector<lite::Tensor *> outputs{tensor_out}; | std::vector<lite::Tensor *> outputs{tensor_out}; | ||||
| auto op_kernel_ptr = | auto op_kernel_ptr = | ||||
| std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false); | |||||
| std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||||
| auto op_kernel = op_kernel_ptr.release(); | auto op_kernel = op_kernel_ptr.release(); | ||||
| if (op_kernel == nullptr) { | if (op_kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "op_kernel create error."; | MS_LOG(ERROR) << "op_kernel create error."; | ||||
| @@ -52,7 +52,7 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o | |||||
| } | } | ||||
| std::vector<int> out_shape = {n, oh, ow, c}; | std::vector<int> out_shape = {n, oh, ow, c}; | ||||
| if (is_output_2d) { | if (is_output_2d) { | ||||
| out_shape = {n, c}; | |||||
| out_shape = {n, h * w * c}; | |||||
| } | } | ||||
| auto tensor_out_ptr = | auto tensor_out_ptr = | ||||
| std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, | std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, | ||||
| @@ -156,4 +156,20 @@ TEST_F(TestReshapeOpenCL, Reshape4DFp16) { | |||||
| RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false); | RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false); | ||||
| } | } | ||||
| TEST_F(TestReshapeOpenCL, Reshape4D2DFp32) { | |||||
| int n = 1; | |||||
| int h = 2; | |||||
| int w = 2; | |||||
| int c = 4; | |||||
| int oh = 2; | |||||
| int ow = 2; | |||||
| std::vector<int> shape = {n, h, w, c, oh, ow}; | |||||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||||
| std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||||
| RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true); | |||||
| } | |||||
| } // namespace mindspore | } // namespace mindspore | ||||