Merge pull request !7384 from 王东旭/fix_conv_no_bias_bug
@@ -76,7 +76,9 @@ int ConcatOpenCLKernel::Init() {
   auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
   MS_LOG(DEBUG) << " concat at axis=: " << param->axis_;
-  param->axis_ = (param->axis_ == -1) ? (in_tensors_[0]->shape().size() - 1) : param->axis_;
+  if (param->axis_ < 0) {
+    param->axis_ += in_tensors_.front()->shape().size();
+  }
   if (param->axis_ < 0 || param->axis_ > 3) {
     MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 ";
     return RET_ERROR;
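The old line only remapped `axis == -1`; any other negative axis fell through to the range check below and was rejected even though it names a valid dimension. The new form folds every negative axis back into `[0, rank)`. A minimal standalone sketch of that normalization (the `NormalizeAxis` helper and the hard 4-D upper bound are illustrative, not part of the patch):

```cpp
#include <cassert>
#include <cstddef>

// Map a possibly negative concat axis into [0, rank), mirroring the patched
// logic: add the rank when axis < 0, then validate the supported range.
int NormalizeAxis(int axis, size_t rank) {
  if (axis < 0) {
    axis += static_cast<int>(rank);  // e.g. axis = -1, rank = 4 -> axis = 3
  }
  assert(axis >= 0 && axis <= 3 && "only axis in [0, 3] is supported");
  return axis;
}

int main() {
  assert(NormalizeAxis(-1, 4) == 3);
  assert(NormalizeAxis(-3, 4) == 1);  // rejected by the old '== -1' special case
  assert(NormalizeAxis(2, 4) == 2);
  return 0;
}
```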
@@ -65,6 +65,7 @@ int ConvolutionOpenCLKernel::Init() {
   CO_SLICES_ = UP_DIV(CO_, C4NUM);
   KH_ = param->kernel_h_;
   KW_ = param->kernel_w_;
+  has_bias_ = in_tensors_.size() == 3;
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
   TILES_X_ = UP_DIV(OW_, 4);
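The convolution node carries either two input tensors (data, weight) or three (data, weight, bias), so the tensor count is enough to tell whether a bias term exists; every later bias-dependent step keys off this flag. A tiny sketch of that convention (the `Tensor` stand-in and `HasBias` helper are illustrative):

```cpp
#include <cassert>
#include <vector>

struct Tensor {};  // stand-in for lite::Tensor

// Inputs are {data, weight} or {data, weight, bias}; a third tensor means bias.
bool HasBias(const std::vector<Tensor *> &in_tensors) { return in_tensors.size() == 3; }

int main() {
  Tensor data, weight, bias;
  std::vector<Tensor *> no_bias = {&data, &weight};
  std::vector<Tensor *> with_bias = {&data, &weight, &bias};
  assert(!HasBias(no_bias));
  assert(HasBias(with_bias));
  return 0;
}
```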
@@ -243,7 +244,9 @@ int ConvolutionOpenCLKernel::InitBias() {
 int ConvolutionOpenCLKernel::InitBuffer() {
   InitWeight();
-  InitBias();
+  if (has_bias_) {
+    InitBias();
+  }
   return RET_OK;
 }
@@ -298,7 +301,9 @@ int ConvolutionOpenCLKernel::Run() {
     cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
@@ -306,7 +311,9 @@ int ConvolutionOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     if (op_format_ == Format_NC4HW4) {
       cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
       cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
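Because `arg_cn` is a running index, the `SetKernelArg` call for `packed_bias_` has to be skipped under exactly the same condition that removes `__global FLT4 *bias` from the generated kernel signature; otherwise the shape arguments that follow would land one slot too late. A standalone sketch of that pairing, using a toy argument table instead of the OpenCL runtime (`KernelArgs` and `BuildArgs` are illustrative stand-ins, not the real API):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Toy stand-in for ocl_runtime_->SetKernelArg: records which value lands in
// which argument slot of the compiled kernel.
struct KernelArgs {
  std::vector<std::string> slots;
  void SetArg(int index, const std::string &value) {
    if (index >= static_cast<int>(slots.size())) slots.resize(index + 1);
    slots[index] = value;
  }
};

// Pack arguments in the same order the generated kernel declares them.
// When has_bias is false the bias parameter is absent from the signature,
// so the bias slot must be skipped here too, keeping the indices aligned.
KernelArgs BuildArgs(bool has_bias) {
  KernelArgs args;
  int arg_cn = 0;
  args.SetArg(arg_cn++, "input_image");
  args.SetArg(arg_cn++, "output_image");
  args.SetArg(arg_cn++, "packed_weight");
  if (has_bias) {
    args.SetArg(arg_cn++, "packed_bias");
  }
  args.SetArg(arg_cn++, "input_shape");
  args.SetArg(arg_cn++, "output_shape");
  return args;
}

int main() {
  assert(BuildArgs(true).slots.size() == 6);
  assert(BuildArgs(false).slots.size() == 5);
  assert(BuildArgs(false).slots[3] == "input_shape");  // no gap left by the missing bias
  return 0;
}
```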
@@ -372,10 +379,14 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   code +=
     "__kernel void Convolution(__read_only image2d_t input,\n"
-    " __write_only image2d_t output,\n"
-    " __global FLT4 *weight,\n"
-    " __global FLT4 *bias)"
-    "{\n";
+    " __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code +=
+      " __global FLT4 *weight,\n"
+      " __global FLT4 *bias) {\n";
+  } else {
+    code += " __global FLT4 *weight) {\n";
+  }
   code += " int n_oh = get_global_id(0); // [0, N*OH)\n";
   if (batch_size_ == 1) {
@@ -426,17 +437,20 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     " }\n"
     " }\n"
     " }\n\n";
-  code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out0_c4 = out0_c4 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
-    code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
+    code += " out0_c4 = max(out0_c4, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
-    code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
+    code += " out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
   if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
-    code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4_bias);// NHWC4: NH WC\n}";
+    code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}";
   } else {
-    code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4_bias);\n}";
+    code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}";
   }
   return code;
 }
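The kernel source is assembled as a C++ string, so dropping the bias simply means emitting neither the `__global FLT4 *bias` parameter nor the bias-add line; renaming `out0_c4_bias` back to `out0_c4` keeps the activation and `WRITE_IMAGE` lines identical in both variants. A trimmed sketch of the same string-assembly approach (the kernel body is abbreviated and purely illustrative):

```cpp
#include <iostream>
#include <string>

// Emit a minimal OpenCL kernel with or without a bias parameter, in the same
// style as CodeGenConvolutionNHWC4: the signature and the bias-add line are
// both guarded by the same flag; everything else is shared.
std::string GenKernel(bool has_bias) {
  std::string code = "__kernel void Convolution(__read_only image2d_t input,\n"
                     "                          __write_only image2d_t output,\n";
  if (has_bias) {
    code += "                          __global FLT4 *weight,\n"
            "                          __global FLT4 *bias) {\n";
  } else {
    code += "                          __global FLT4 *weight) {\n";
  }
  code += "  FLT4 out0_c4 = (FLT4)(0.0f);\n"
          "  /* ... accumulate over KH x KW x CI_SLICES ... */\n";
  if (has_bias) {
    code += "  out0_c4 = out0_c4 + bias[co_slice];\n";
  }
  code += "  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);\n"
          "}\n";
  return code;
}

int main() {
  std::cout << GenKernel(false);  // bias-free variant compiles with three parameters
  return 0;
}
```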
@@ -460,8 +474,11 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     "\n"
     "__kernel void Convolution(__read_only image2d_t input,\n"
     " __write_only image2d_t output,\n"
-    " __global FLT4 *weight,\n"
-    " __global FLT4 *bias,\n"
+    " __global FLT4 *weight,\n";
+  if (has_bias_) {
+    code += " __global FLT4 *bias,\n";
+  }
+  code +=
     " const int4 input_shape,\n"
     " const int4 output_shape)\n"
     "{\n";
@@ -578,7 +595,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     " }\n"
     " }\n\n";
-  code += " out0 = out0 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out0 = out0 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
     code += " out0 = max(out0, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -591,7 +611,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
       " if (last_is_double)"
       " {\n";
   }
-  code += " out1 = out1 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out1 = out1 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
     code += " out1 = max(out1, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -788,8 +810,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "};\n"
     "\n"
     "__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
-    " __write_only image2d_t output,\n"
-    " __global FLT4 *bias,\n"
+    " __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code += " __global FLT4 *bias,\n";
+  }
+  code +=
     " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n"
     " int4 output_shape) // N H W CO_SLICES\n"
     "{\n"
@@ -824,9 +849,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     " for (int y = 0; y < 6; y++)\n"
     " {\n"
     " acc += AtM_row[y] * At[x * 6 + y];\n"
-    " }\n"
-    " acc += bias[slice];\n"
-    "\n";
+    " }\n";
+  if (has_bias_) {
+    code += " acc += bias[slice];\n";
+  }
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->act_type_ == ActType_Relu) {
@@ -35,44 +35,15 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int Init() override;
   int Run() override;
-  int InitBuffer();
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  private:
-  bool use_fp16_ = false;
-  int batch_size_{};
-  int CI_{};
-  int IH_{};
-  int IW_{};
-  int CO_{};
-  int OH_{};
-  int OW_{};
-  int CI_SLICES_{};
-  int CO_SLICES_{};
-  int KH_{};
-  int KW_{};
-  void *packed_weight_ = nullptr;
-  void *packed_bias_ = nullptr;
-  bool use_winograd_ = false;
-  int TILES_X_{};
-  int TILES_Y_{};
-  int TILES_XY_{};
-  void *winograd_mem0_ = nullptr;
-  void *winograd_mem1_ = nullptr;
-  cl::Kernel kernel_4x4to36_;
-  cl::Kernel kernel_conv_;
-  cl::Kernel kernel_36to4x4_;
+  int InitBuffer();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
   std::string CodeGenWinograd4x4To36();
   std::string CodeGenWinogradConvolution();
   std::string CodeGenWinograd36To4x4();
@@ -110,6 +81,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   param->pad_r_,
   param->dilation_h_,
   param->dilation_w_,
+  has_bias_,
   use_fp16_,
   op_format_,
   param->act_type_};
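The code-id aggregates every parameter that shapes the generated source, and `has_bias_` now changes the kernel signature, so it has to be part of that id; if the runtime uses the id to name or cache built programs (an assumption here, not spelled out in the diff), the biased and bias-free variants then stay distinct. A rough sketch of that idea with an ordinary map as the cache (the tuple fields, `GetOrBuildProgram`, and the cache itself are illustrative):

```cpp
#include <cassert>
#include <map>
#include <string>
#include <tuple>

// Everything that changes the generated OpenCL source must be part of the
// cache key; has_bias alters the kernel signature, so it belongs here.
using CodeId = std::tuple<int /*KH*/, int /*KW*/, bool /*has_bias*/, bool /*use_fp16*/>;

std::map<CodeId, std::string> g_program_cache;

const std::string &GetOrBuildProgram(const CodeId &id, const std::string &source) {
  auto it = g_program_cache.find(id);
  if (it == g_program_cache.end()) {
    it = g_program_cache.emplace(id, source).first;  // "compile" once and cache
  }
  return it->second;
}

int main() {
  CodeId with_bias{3, 3, true, false};
  CodeId without_bias{3, 3, false, false};
  GetOrBuildProgram(with_bias, "kernel with bias arg");
  GetOrBuildProgram(without_bias, "kernel without bias arg");
  assert(g_program_cache.size() == 2);  // distinct keys -> distinct programs
  return 0;
}
```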
@@ -119,6 +91,34 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     }
     return code_id;
   }
+  bool use_fp16_ = false;
+  int batch_size_{};
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
+  void *packed_weight_ = nullptr;
+  void *packed_bias_ = nullptr;
+  bool has_bias_ = false;
+  bool use_winograd_ = false;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
+  void *winograd_mem0_ = nullptr;
+  void *winograd_mem1_ = nullptr;
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
 };
 } // namespace mindspore::kernel
@@ -34,134 +34,71 @@ namespace mindspore::kernel {
 int ToFormatOpenCLKernel::Init() {
   auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
   out_mem_type_ = parameter->out_mem_type;
-  std::string program_name = "to_format";
-  std::map<schema::Format, std::string> format_str{
-    {schema::Format::Format_NCHW, "NCHW"}, {schema::Format::Format_NHWC, "NHWC"},
-    {schema::Format::Format_NC4HW4, "NC4HW4"}, {schema::Format::Format_NC4, "NHWC4"},
-    {schema::Format::Format_NC, "NHWC"}, {schema::Format::Format_NHWC4, "NHWC4"}};
-  std::string kernel_name =
-    "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
-  std::map<TypeId, std::string> dtype_str{
-    {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt8, "Int8"}};
-  if (out_mem_type_ == OpenCLMemType::IMG) {
-    kernel_name += "_IMG_" + dtype_str[in_tensors_[0]->data_type()];
+  std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}};
+  std::string kernel_name;
+  if (parameter->out_mem_type == OpenCLMemType::IMG) {
+    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()];
   } else {
-    kernel_name += "_BUF_" + dtype_str[out_tensors_[0]->data_type()];
+    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()];
   }
   this->set_name(kernel_name);
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
+  std::string program_name = "to_format";
   std::set<std::string> build_options;
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  InitNHWCShape();
+  InitNHWC();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
-int ToFormatOpenCLKernel::InitNHWCShape() {
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  size_t n, h, w, c;
-  if (shapex.size() == 2) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
+int ToFormatOpenCLKernel::InitNHWC() {
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  if (out_shape.size() == 1) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = 1;
+  } else if (out_shape.size() == 2) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = out_shape[1];
+  } else if (out_shape.size() == 3) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = out_shape[1];
+    C_ = out_shape[2];
+  } else if (out_shape.size() == 4) {
+    N_ = out_shape[0];
+    H_ = out_shape[1];
+    W_ = out_shape[2];
+    C_ = out_shape[3];
   }
-  if (shapex.size() == 3) {
-    n = 1;
-    h = 1;
-    w = 1;
-    c = 1;
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
-  }
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC) {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NCHW) {
-    n = shapex[0];
-    h = shapex[2];
-    w = shapex[3];
-    c = shapex[1];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4 ||
-             out_tensors_[0]->GetFormat() == schema::Format::Format_NC) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-  } else {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  }
-  nhwc_shape_ = {n, h, w, c};
-  return RET_OK;
-}
-int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
-int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
-  std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
-  *global_size = std::move(vec);
-  return RET_OK;
-}
-int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
-                                       std::vector<size_t> *local_size) {
   return RET_OK;
 }
 int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
-  size_t im_dst_x, im_dst_y;
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4) {
-    int c = nhwc_shape_[3];
-    int h = nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    im_dst_y = nhwc_shape_[0] * h * UP_DIV(c, C4NUM);
-    im_dst_x = w;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    int h = nhwc_shape_[0] * nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    int c = nhwc_shape_[3];
-    im_dst_x = w * UP_DIV(c, C4NUM);
-    im_dst_y = h;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
-    int c = nhwc_shape_[3];
-    im_dst_x = UP_DIV(c, C4NUM);
-    im_dst_y = 1;
-  } else {
-    MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
-    return RET_ERROR;
-  }
-  img_size->clear();
-  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
-  size_t img_dtype = CL_FLOAT;
-  if (enable_fp16_) {
-    img_dtype = CL_HALF_FLOAT;
-  }
-  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
-  *img_size = vec;
+  size_t img_height = N_ * H_;
+  size_t img_width = W_ * UP_DIV(C_, C4NUM);
+  size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
+  *img_size = {img_width, img_height, img_dtype};
   return RET_OK;
 }
 int ToFormatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<size_t> local = {};
-  std::vector<size_t> global;
-  GetGlobalSize(0, &global);
-  cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
+  std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
+  std::vector<size_t> local = {16, 8, 1};
+  cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
   cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
   auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
   auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
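InitNHWC pads lower-rank shapes out to N, H, W, C, and GetImageSize then derives the image2d extent directly: each pixel holds one FLT4, so the width is W * ceil(C/4) and the height is N * H; Run() reuses the same dimensions for the global work size {N*H, W, ceil(C/4)}. A self-contained sketch of that arithmetic (UP_DIV, the NHWC struct, and the helper names are written out here purely for illustration):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

constexpr size_t C4NUM = 4;
inline size_t UP_DIV(size_t x, size_t y) { return (x + y - 1) / y; }

struct NHWC {
  size_t N{1}, H{1}, W{1}, C{1};
};

// Pad a 1-D/2-D/3-D/4-D tensor shape out to NHWC, matching InitNHWC's rank handling.
NHWC ToNHWC(const std::vector<int> &shape) {
  NHWC s;
  if (shape.size() == 1) { s.N = shape[0]; }
  else if (shape.size() == 2) { s.N = shape[0]; s.C = shape[1]; }
  else if (shape.size() == 3) { s.N = shape[0]; s.W = shape[1]; s.C = shape[2]; }
  else if (shape.size() == 4) { s.N = shape[0]; s.H = shape[1]; s.W = shape[2]; s.C = shape[3]; }
  return s;
}

// NHWC4 image layout: one FLT4 pixel per channel slice, rows indexed by N*H.
void ImageSize(const NHWC &s, size_t *width, size_t *height) {
  *width = s.W * UP_DIV(s.C, C4NUM);
  *height = s.N * s.H;
}

int main() {
  size_t w = 0, h = 0;
  ImageSize(ToNHWC({1, 16, 16, 7}), &w, &h);
  assert(w == 16 * 2 && h == 16);  // 7 channels -> 2 slices per row
  ImageSize(ToNHWC({1, 10}), &w, &h);
  assert(w == UP_DIV(10, C4NUM) && h == 1);
  return 0;
}
```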
@@ -31,16 +31,18 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
   ~ToFormatOpenCLKernel() override{};
   int Init() override;
-  int ReSize() override;
+  int ReSize() override { return RET_OK; };
   int Run() override;
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
-  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
-  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
-  int InitNHWCShape();
  private:
+  int InitNHWC();
   cl::Kernel kernel_;
-  std::vector<size_t> nhwc_shape_;
+  size_t N_{1};
+  size_t H_{1};
+  size_t W_{1};
+  size_t C_{1};
 };
 } // namespace mindspore::kernel
@@ -237,33 +237,33 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   }
   auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
   auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
   auto allocator = runtime->GetAllocator();
   auto origin_data = tensor->data_c();
-  allocator->MapBuffer(origin_data, CL_MAP_READ | CL_MAP_WRITE, nullptr, true);
-  tensor->SetData(origin_data);
-  auto Batch = tensor->Batch();
-  auto Height = tensor->shape().size() == 4 ? tensor->Height() : 1;
-  auto Width = tensor->shape().size() == 4 ? tensor->Width() : 1;
-  auto SLICES = UP_DIV(tensor->Channel(), C4NUM);
+  runtime->SyncCommandQueue();
+  allocator->MapBuffer(origin_data, CL_MAP_READ, nullptr, true);
+  auto shape = tensor->shape();
+  auto N = shape.size() > 0 ? shape[0] : 1;
+  auto H = shape.size() > 1 ? shape[1] : 1;
+  auto W = shape.size() > 2 ? shape[2] : 1;
+  auto C = shape.size() > 3 ? shape[3] : 1;
+  auto SLICES = UP_DIV(C, C4NUM);
+  auto ElementsC4Num = N * H * W * UP_ROUND(C, C4NUM);
   auto alignment = runtime->GetImagePitchAlignment();
-  auto dtype_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
-  auto row_pitch = (Width * SLICES + alignment - 1) / alignment * alignment * dtype_size;
-  auto row_size = Width * SLICES * dtype_size;
-  std::vector<char> data(tensor->Size());
-  for (int i = 0; i < Batch * Height; ++i) {
+  auto FLT4_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
+  auto row_pitch = (W * SLICES + alignment - 1) / alignment * alignment * FLT4_size;
+  auto row_size = W * SLICES * FLT4_size;
+  std::vector<char> data(N * H * row_size);
+  for (int i = 0; i < N * H; ++i) {
     memcpy(static_cast<char *>(data.data()) + i * row_size, static_cast<char *>(origin_data) + i * row_pitch, row_size);
   }
   std::cout << "shape=(";
-  for (auto x : tensor->shape()) {
+  for (auto x : shape) {
     printf("%3d,", x);
   }
   printf("): ");
-  for (size_t i = 0; i < num && i < tensor->ElementsNum(); ++i) {
+  for (size_t i = 0; i < num && i < ElementsC4Num; ++i) {
     if (tensor->data_type() == kNumberTypeFloat16)
       printf("%zu %6.3f | ", i, (reinterpret_cast<float16_t *>(data.data()))[i]);
     else
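The mapped image memory is pitched: each row of W * SLICES FLT4 pixels is padded up to the device's image pitch alignment, so the copy walks the buffer in row_pitch strides while packing only row_size bytes per row, and the packed buffer is sized N * H * row_size instead of tensor->Size(). A minimal sketch of that de-pitching copy with made-up dimensions (the alignment value, sizes, and marker data are illustrative):

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Copy a pitched image buffer (rows padded to 'row_pitch' bytes) into a
// tightly packed buffer, the same per-row memcpy PrintTensor performs.
std::vector<char> Unpitch(const std::vector<char> &pitched, size_t rows, size_t row_size,
                          size_t row_pitch) {
  std::vector<char> packed(rows * row_size);
  for (size_t i = 0; i < rows; ++i) {
    std::memcpy(packed.data() + i * row_size, pitched.data() + i * row_pitch, row_size);
  }
  return packed;
}

int main() {
  const size_t pixel_size = 16;      // sizeof(cl_float4)
  const size_t pixels_per_row = 6;   // W * SLICES
  const size_t alignment = 8;        // device image pitch alignment, in pixels
  const size_t row_size = pixels_per_row * pixel_size;
  const size_t row_pitch = (pixels_per_row + alignment - 1) / alignment * alignment * pixel_size;
  const size_t rows = 4;             // N * H

  std::vector<char> pitched(rows * row_pitch, 0);
  for (size_t i = 0; i < rows; ++i) pitched[i * row_pitch] = static_cast<char>(i + 1);

  auto packed = Unpitch(pitched, rows, row_size, row_pitch);
  assert(packed.size() == rows * row_size);
  assert(packed[2 * row_size] == 3);  // row 2 starts with marker 3; padding is dropped
  return 0;
}
```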
@@ -272,7 +272,7 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   printf("\n");
   if (!out_file.empty()) {
-    Write2File(data.data(), out_file, tensor->Size());
+    Write2File(data.data(), out_file, data.size());
   }
   allocator->UnmapBuffer(origin_data);
 }