| @@ -273,6 +273,7 @@ int LiteSession::Init(Context *context) { | |||
| if (context_->device_ctx_.type == DT_GPU) { | |||
| auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| opencl_runtime->Init(); | |||
| MS_LOG(INFO) << "Init OpenCL runtime."; | |||
| } | |||
| #endif | |||
| executor = new Executor(); | |||
| @@ -1,8 +1,9 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -27,13 +28,14 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res); | |||
| } | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -58,13 +60,14 @@ __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res); | |||
| } | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __write_only image2d_t dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -81,7 +84,6 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl | |||
| bool outside_x = x_c < 0 || x_c >= src_size.x; | |||
| if (!outside_x && !outside_y) { | |||
| FLT4 flt_p = filter[fx_c]; | |||
| // FLT4 src_p =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; | |||
| FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z)); | |||
| r += TO_FLT4(src_p * flt_p); | |||
| } | |||
| @@ -89,13 +91,13 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res); | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -120,13 +122,14 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res; | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -151,13 +154,14 @@ __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 * | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size) { | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -181,6 +185,6 @@ __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FL | |||
| } | |||
| FLT4 bias_p = bias[Z]; | |||
| FLT4 res = TO_FLT4(r) + bias_p; | |||
| res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip)); | |||
| res = clamp(res, (FLT)(relu_clip_min), (FLT)(relu_clip_max)); | |||
| dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; | |||
| } | |||
| @@ -186,8 +186,8 @@ int Conv2dTransposeOpenCLKernel::Run() { | |||
| cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1}; | |||
| int arg_cnt = 0; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride); | |||
| @@ -254,34 +254,34 @@ int ConvolutionOpenCLKernel::Run() { | |||
| arg_cn = 0; | |||
| cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES}; | |||
| cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES}; | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_); | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape); | |||
| ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape); | |||
| arg_cn = 0; | |||
| cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES}; | |||
| cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES}; | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape); | |||
| arg_cn = 0; | |||
| cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES}; | |||
| cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES}; | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape); | |||
| ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape); | |||
| } else { | |||
| arg_cn = 0; | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data(), lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data(), lite::opencl::MemType::IMG); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| } | |||
| if (use_winograd_) { | |||
| @@ -594,9 +594,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||
| auto param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| if (param->act_type_ == ActType_Relu) { | |||
| code += " acc = max(acc, (float4)(0.0f));\n"; | |||
| code += " acc = max(acc, (FLT4)(0.0f));\n"; | |||
| } else if (param->act_type_ == ActType_Relu6) { | |||
| code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n"; | |||
| code += " acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n"; | |||
| } | |||
| code += | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "src/runtime/kernel/opencl/kernel/depthwise_conv2d.h" | |||
| #include <float.h> | |||
| #include <string> | |||
| #include <set> | |||
| #include <utility> | |||
| @@ -180,7 +181,8 @@ int DepthwiseConv2dOpenCLKernel::Run() { | |||
| std::vector<size_t> local; | |||
| GetLocalSize(0, global, &local); | |||
| float relu_clip1 = 6.0; | |||
| std::map<ActType, std::pair<float, float>> relu_clips{ | |||
| {ActType_No, {-FLT_MAX, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}}; | |||
| cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_}; | |||
| cl_int2 stride = {parameter->stride_h_, parameter->stride_w_}; | |||
| cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_}; | |||
| @@ -189,17 +191,19 @@ int DepthwiseConv2dOpenCLKernel::Run() { | |||
| cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, | |||
| (cl_int)out_tensors_[0]->Batch()}; | |||
| ocl_runtime->SetKernelArg(kernel_, 1, packed_weight_); | |||
| ocl_runtime->SetKernelArg(kernel_, 2, bias_data_); | |||
| ocl_runtime->SetKernelArg(kernel_, 3, relu_clip1); | |||
| ocl_runtime->SetKernelArg(kernel_, 5, kernel_size); | |||
| ocl_runtime->SetKernelArg(kernel_, 6, stride); | |||
| ocl_runtime->SetKernelArg(kernel_, 7, padding); | |||
| ocl_runtime->SetKernelArg(kernel_, 8, dilation); | |||
| ocl_runtime->SetKernelArg(kernel_, 9, src_size); | |||
| ocl_runtime->SetKernelArg(kernel_, 10, dst_size); | |||
| ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, 4, out_tensors_[0]->Data()); | |||
| int arg_cnt = 0; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, kernel_size); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, stride); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, padding); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dilation); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, src_size); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, dst_size); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second); | |||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| } | |||
| @@ -163,8 +163,8 @@ int MatMulOpenCLKernel::Run() { | |||
| std::vector<size_t> global = {UP_ROUND(sizeCO.s[1], local[0]), 4}; | |||
| int arg_count = 0; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, bias_, lite::opencl::MemType::BUF); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCI); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_count++, sizeCO); | |||
| @@ -147,8 +147,10 @@ int ToFormatOpenCLKernel::Run() { | |||
| cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]}; | |||
| cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; | |||
| ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); | |||
| auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; | |||
| auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF; | |||
| ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data(), src_mem_type); | |||
| ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data(), dst_mem_type); | |||
| ocl_runtime->SetKernelArg(kernel_, 2, gsize); | |||
| ocl_runtime->SetKernelArg(kernel_, 3, shape); | |||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| @@ -275,20 +275,23 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na | |||
| // fp16 enable, kernel will use half and read_imageh and write_imageh. | |||
| build_options_str = | |||
| "-DFLT=half -DFLT4=half4 -DFLT16=half16 " | |||
| "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4"; | |||
| "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4 "; | |||
| } else { | |||
| // fp16 not enable, kernel will use float and read_imagef and write_imagef. | |||
| build_options_str = | |||
| "-DFLT=float -DFLT4=float4 -DFLT16=float16 " | |||
| "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4"; | |||
| "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4 "; | |||
| } | |||
| build_options_str = std::accumulate( | |||
| build_options.begin(), build_options.end(), build_options_str, | |||
| [](const std::string &options, const std::string &option) -> std::string { return options + " " + option; }); | |||
| auto build_options_ext = std::accumulate( | |||
| build_options.begin(), build_options.end(), std::string(""), | |||
| [](const std::string &options, const std::string &option) -> std::string { | |||
| auto res = options + " " + option; | |||
| return res; | |||
| }); | |||
| build_options_str += default_build_opts_; | |||
| // program identifier = program_name + build_options | |||
| std::string build_program_key = program_name + build_options_str; | |||
| std::string build_program_key = program_name + build_options_str + build_options_ext; | |||
| auto build_program_it = program_map_.find(build_program_key); | |||
| cl::Program program; | |||
| @@ -409,9 +412,7 @@ GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_ | |||
| bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) { | |||
| auto it_source = g_opencl_program_map.find(program_name); | |||
| if (it_source != g_opencl_program_map.end()) { | |||
| it_source->second = source; | |||
| } else { | |||
| if (it_source == g_opencl_program_map.end()) { | |||
| g_opencl_program_map.emplace(program_name, source); | |||
| } | |||
| return true; | |||
| @@ -16,6 +16,8 @@ | |||
| #include "src/scheduler.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "include/errorcode.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/common/graph_util.h" | |||
| @@ -140,7 +142,7 @@ int Scheduler::InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tenso | |||
| outputs.emplace_back(tensors->at(size_t(outIndexes->GetAs<uint32_t>(j)))); | |||
| } | |||
| auto *primitive = model->GetOp(cNode->name()->str()); | |||
| auto *kernel = this->ScheduleNode(inputs, outputs, primitive); | |||
| auto *kernel = this->ScheduleNode(inputs, outputs, primitive, cNode); | |||
| if (nullptr == kernel) { | |||
| MS_LOG(ERROR) << "ScheduleNode return nullptr, name: " << cNode->name()->str() | |||
| << ", type: " << schema::EnumNamePrimitiveType(cNode->primitive()->value_type()); | |||
| @@ -176,6 +178,7 @@ void Scheduler::ConstructSubgraphs(std::vector<kernel::LiteKernel *> *kernels) { | |||
| } | |||
| std::vector<kernel::LiteKernel *> subgraph_kernels; | |||
| size_t sub_cnt{0}; | |||
| for (auto temp_kernels : sub_kernels_list) { | |||
| kernel::KERNEL_ARCH arch = temp_kernels.front()->desc().arch; | |||
| if (arch == kernel::KERNEL_ARCH::kCPU) { | |||
| @@ -194,7 +197,10 @@ void Scheduler::ConstructSubgraphs(std::vector<kernel::LiteKernel *> *kernels) { | |||
| } else { | |||
| auto subgraph_kernel = CreateSubKernel(temp_kernels, arch); | |||
| subgraph_kernels.emplace_back(subgraph_kernel); | |||
| std::string arch_name = (arch == kernel::KERNEL_ARCH::kGPU) ? "GPU" : "NPU"; | |||
| MS_LOG(INFO) << arch_name << " subgraph id " << sub_cnt << " created."; | |||
| } | |||
| ++sub_cnt; | |||
| } | |||
| kernels->clear(); | |||
| kernels->insert(kernels->begin(), subgraph_kernels.begin(), subgraph_kernels.end()); | |||
| @@ -223,7 +229,7 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKer | |||
| kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors, | |||
| const std::vector<tensor::Tensor *> &out_tensors, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| const mindspore::lite::PrimitiveC *primitive, const schema::CNode *cnode) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| auto data_type = in_tensors.front()->data_type(); | |||
| kernel::KernelKey desc{kernel::KERNEL_ARCH::kCPU, data_type, static_cast<schema::PrimitiveType>(primitive->Type())}; | |||
| @@ -233,6 +239,10 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<tensor::Tensor *> | |||
| if (nullptr != kernel) { | |||
| kernel->set_desc(desc); | |||
| return kernel; | |||
| } else { | |||
| MS_LOG(ERROR) << "Not supported GPU Op " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(primitive->Type())) << " " | |||
| << (cnode->name()->str()); | |||
| } | |||
| } | |||
| @@ -35,7 +35,8 @@ class Scheduler { | |||
| protected: | |||
| kernel::LiteKernel *ScheduleNode(const std::vector<tensor::Tensor *> &in_tensors, | |||
| const std::vector<tensor::Tensor *> &out_tensors, | |||
| const mindspore::lite::PrimitiveC *primitive); | |||
| const mindspore::lite::PrimitiveC *primitive, | |||
| const schema::CNode *cnode); | |||
| private: | |||
| int InitOp2Kernel(const lite::Model *model, std::vector<tensor::Tensor *> *tensors, | |||
| @@ -207,7 +207,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) { | |||
| float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, | |||
| 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; | |||
| DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); | |||
| DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); | |||
| } | |||
| TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { | |||
| @@ -279,7 +279,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) { | |||
| 0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933, | |||
| 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; | |||
| DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); | |||
| DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NC4HW4); | |||
| } | |||
| TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { | |||
| @@ -324,7 +324,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) { | |||
| float gnd_data[] = {3.3848767, 1.4446403, 1.8428744, 1.3194335, 2.5873442, 2.1384869, 2.04022, 1.1872686, | |||
| 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; | |||
| DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4); | |||
| DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4); | |||
| // delete conv_param; | |||
| } | |||
| TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { | |||
| @@ -396,7 +397,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) { | |||
| 0.8749627, 0.8953936, 0.5093431, 1.5496738, 0.54936385, 0.7683113, 1.165742, 1.3682933, | |||
| 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; | |||
| DepthWiseTestMain<float, float>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4); | |||
| DepthWiseTestMain<float, float>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4); | |||
| } | |||
| TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) { | |||
| @@ -443,7 +444,7 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp16) { | |||
| 2.2294958, 1.6570128, 2.465089, 1.4294086, 2.7941442, 1.7871612, 2.188921, 1.0601988}; | |||
| lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true); | |||
| DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4, | |||
| DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4, | |||
| kNumberTypeFloat16, true, 1e-2); | |||
| } | |||
| @@ -517,7 +518,7 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp16) { | |||
| 1.0517888, 0.59817517, 0.75649744, 1.2075498, 0.38804203}; | |||
| lite::opencl::OpenCLRuntime::GetInstance()->SetFp16Enable(true); | |||
| DepthWiseTestMain<float16_t, float16_t>(conv_param.get(), input_data, weight_data, gnd_data, schema::Format_NHWC4, | |||
| DepthWiseTestMain<float16_t, float16_t>(conv_param.release(), input_data, weight_data, gnd_data, schema::Format_NHWC4, | |||
| kNumberTypeFloat16, true, 1e-2); | |||
| } | |||