Merge pull request !7384 from 王东旭/fix_conv_no_bias_bugtags/v1.1.0
| @@ -76,7 +76,9 @@ int ConcatOpenCLKernel::Init() { | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| MS_LOG(DEBUG) << " concat at axis=: " << param->axis_; | |||
| param->axis_ = (param->axis_ == -1) ? (in_tensors_[0]->shape().size() - 1) : param->axis_; | |||
| if (param->axis_ < 0) { | |||
| param->axis_ += in_tensors_.front()->shape().size(); | |||
| } | |||
| if (param->axis_ < 0 || param->axis_ > 3) { | |||
| MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 "; | |||
| return RET_ERROR; | |||
| @@ -65,6 +65,7 @@ int ConvolutionOpenCLKernel::Init() { | |||
| CO_SLICES_ = UP_DIV(CO_, C4NUM); | |||
| KH_ = param->kernel_h_; | |||
| KW_ = param->kernel_w_; | |||
| has_bias_ = in_tensors_.size() == 3; | |||
| // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true | |||
| TILES_X_ = UP_DIV(OW_, 4); | |||
| @@ -243,7 +244,9 @@ int ConvolutionOpenCLKernel::InitBias() { | |||
| int ConvolutionOpenCLKernel::InitBuffer() { | |||
| InitWeight(); | |||
| InitBias(); | |||
| if (has_bias_) { | |||
| InitBias(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -298,7 +301,9 @@ int ConvolutionOpenCLKernel::Run() { | |||
| cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_}; | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG); | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG); | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| if (has_bias_) { | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape); | |||
| } else { | |||
| @@ -306,7 +311,9 @@ int ConvolutionOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| if (has_bias_) { | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| } | |||
| if (op_format_ == Format_NC4HW4) { | |||
| cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_}; | |||
| cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_}; | |||
| @@ -372,10 +379,14 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { | |||
| code += | |||
| "__kernel void Convolution(__read_only image2d_t input,\n" | |||
| " __write_only image2d_t output,\n" | |||
| " __global FLT4 *weight,\n" | |||
| " __global FLT4 *bias)" | |||
| "{\n"; | |||
| " __write_only image2d_t output,\n"; | |||
| if (has_bias_) { | |||
| code += | |||
| " __global FLT4 *weight,\n" | |||
| " __global FLT4 *bias) {\n"; | |||
| } else { | |||
| code += " __global FLT4 *weight) {\n"; | |||
| } | |||
| code += " int n_oh = get_global_id(0); // [0, N*OH)\n"; | |||
| if (batch_size_ == 1) { | |||
| @@ -426,17 +437,20 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { | |||
| " }\n" | |||
| " }\n" | |||
| " }\n\n"; | |||
| code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; | |||
| if (has_bias_) { | |||
| code += " out0_c4 = out0_c4 + bias[co_slice];\n"; | |||
| } | |||
| if (param->act_type_ == ActType_Relu) { | |||
| code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; | |||
| code += " out0_c4 = max(out0_c4, (FLT4)(0.0f));\n"; | |||
| } else if (param->act_type_ == ActType_Relu6) { | |||
| code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; | |||
| code += " out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n"; | |||
| } | |||
| if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) { | |||
| code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4_bias);// NHWC4: NH WC\n}"; | |||
| code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}"; | |||
| } else { | |||
| code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4_bias);\n}"; | |||
| code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}"; | |||
| } | |||
| return code; | |||
| } | |||
| @@ -460,8 +474,11 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { | |||
| "\n" | |||
| "__kernel void Convolution(__read_only image2d_t input,\n" | |||
| " __write_only image2d_t output,\n" | |||
| " __global FLT4 *weight,\n" | |||
| " __global FLT4 *bias,\n" | |||
| " __global FLT4 *weight,\n"; | |||
| if (has_bias_) { | |||
| code += " __global FLT4 *bias,\n"; | |||
| } | |||
| code += | |||
| " const int4 input_shape,\n" | |||
| " const int4 output_shape)\n" | |||
| "{\n"; | |||
| @@ -578,7 +595,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { | |||
| " }\n" | |||
| " }\n\n"; | |||
| code += " out0 = out0 + bias[co_slice];\n"; | |||
| if (has_bias_) { | |||
| code += " out0 = out0 + bias[co_slice];\n"; | |||
| } | |||
| if (param->act_type_ == ActType_Relu) { | |||
| code += " out0 = max(out0, (FLT4)(0.0f));\n"; | |||
| } else if (param->act_type_ == ActType_Relu6) { | |||
| @@ -591,7 +611,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { | |||
| " if (last_is_double)" | |||
| " {\n"; | |||
| } | |||
| code += " out1 = out1 + bias[co_slice];\n"; | |||
| if (has_bias_) { | |||
| code += " out1 = out1 + bias[co_slice];\n"; | |||
| } | |||
| if (param->act_type_ == ActType_Relu) { | |||
| code += " out1 = max(out1, (FLT4)(0.0f));\n"; | |||
| } else if (param->act_type_ == ActType_Relu6) { | |||
| @@ -788,8 +810,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||
| "};\n" | |||
| "\n" | |||
| "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" | |||
| " __write_only image2d_t output,\n" | |||
| " __global FLT4 *bias,\n" | |||
| " __write_only image2d_t output,\n"; | |||
| if (has_bias_) { | |||
| code += " __global FLT4 *bias,\n"; | |||
| } | |||
| code += | |||
| " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" | |||
| " int4 output_shape) // N H W CO_SLICES\n" | |||
| "{\n" | |||
| @@ -824,9 +849,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||
| " for (int y = 0; y < 6; y++)\n" | |||
| " {\n" | |||
| " acc += AtM_row[y] * At[x * 6 + y];\n" | |||
| " }\n" | |||
| " acc += bias[slice];\n" | |||
| "\n"; | |||
| " }\n"; | |||
| if (has_bias_) { | |||
| code += " acc += bias[slice];\n"; | |||
| } | |||
| auto param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| if (param->act_type_ == ActType_Relu) { | |||
| @@ -35,44 +35,15 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer(); | |||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||
| private: | |||
| bool use_fp16_ = false; | |||
| int batch_size_{}; | |||
| int CI_{}; | |||
| int IH_{}; | |||
| int IW_{}; | |||
| int CO_{}; | |||
| int OH_{}; | |||
| int OW_{}; | |||
| int CI_SLICES_{}; | |||
| int CO_SLICES_{}; | |||
| int KH_{}; | |||
| int KW_{}; | |||
| void *packed_weight_ = nullptr; | |||
| void *packed_bias_ = nullptr; | |||
| bool use_winograd_ = false; | |||
| int TILES_X_{}; | |||
| int TILES_Y_{}; | |||
| int TILES_XY_{}; | |||
| void *winograd_mem0_ = nullptr; | |||
| void *winograd_mem1_ = nullptr; | |||
| cl::Kernel kernel_4x4to36_; | |||
| cl::Kernel kernel_conv_; | |||
| cl::Kernel kernel_36to4x4_; | |||
| int InitBuffer(); | |||
| int InitWeight(); | |||
| int InitBias(); | |||
| int GenerateWinogradWeight(); | |||
| std::string CodeGenConvolutionNHWC4(); | |||
| std::string CodeGenConvolutionNC4HW4(); | |||
| std::string CodeGenWinograd4x4To36(); | |||
| std::string CodeGenWinogradConvolution(); | |||
| std::string CodeGenWinograd36To4x4(); | |||
| @@ -110,6 +81,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||
| param->pad_r_, | |||
| param->dilation_h_, | |||
| param->dilation_w_, | |||
| has_bias_, | |||
| use_fp16_, | |||
| op_format_, | |||
| param->act_type_}; | |||
| @@ -119,6 +91,34 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||
| } | |||
| return code_id; | |||
| } | |||
| bool use_fp16_ = false; | |||
| int batch_size_{}; | |||
| int CI_{}; | |||
| int IH_{}; | |||
| int IW_{}; | |||
| int CO_{}; | |||
| int OH_{}; | |||
| int OW_{}; | |||
| int CI_SLICES_{}; | |||
| int CO_SLICES_{}; | |||
| int KH_{}; | |||
| int KW_{}; | |||
| void *packed_weight_ = nullptr; | |||
| void *packed_bias_ = nullptr; | |||
| bool has_bias_ = false; | |||
| bool use_winograd_ = false; | |||
| int TILES_X_{}; | |||
| int TILES_Y_{}; | |||
| int TILES_XY_{}; | |||
| void *winograd_mem0_ = nullptr; | |||
| void *winograd_mem1_ = nullptr; | |||
| cl::Kernel kernel_4x4to36_; | |||
| cl::Kernel kernel_conv_; | |||
| cl::Kernel kernel_36to4x4_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -34,134 +34,71 @@ namespace mindspore::kernel { | |||
| int ToFormatOpenCLKernel::Init() { | |||
| auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_); | |||
| out_mem_type_ = parameter->out_mem_type; | |||
| std::string program_name = "to_format"; | |||
| std::map<schema::Format, std::string> format_str{ | |||
| {schema::Format::Format_NCHW, "NCHW"}, {schema::Format::Format_NHWC, "NHWC"}, | |||
| {schema::Format::Format_NC4HW4, "NC4HW4"}, {schema::Format::Format_NC4, "NHWC4"}, | |||
| {schema::Format::Format_NC, "NHWC"}, {schema::Format::Format_NHWC4, "NHWC4"}}; | |||
| std::string kernel_name = | |||
| "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()]; | |||
| std::map<TypeId, std::string> dtype_str{ | |||
| {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt8, "Int8"}}; | |||
| if (out_mem_type_ == OpenCLMemType::IMG) { | |||
| kernel_name += "_IMG_" + dtype_str[in_tensors_[0]->data_type()]; | |||
| std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}}; | |||
| std::string kernel_name; | |||
| if (parameter->out_mem_type == OpenCLMemType::IMG) { | |||
| kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()]; | |||
| } else { | |||
| kernel_name += "_BUF_" + dtype_str[out_tensors_[0]->data_type()]; | |||
| kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()]; | |||
| } | |||
| this->set_name(kernel_name); | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| std::string program_name = "to_format"; | |||
| std::set<std::string> build_options; | |||
| std::string source = to_format_source; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| InitNHWCShape(); | |||
| InitNHWC(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| int ToFormatOpenCLKernel::InitNHWCShape() { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| size_t n, h, w, c; | |||
| if (shapex.size() == 2) { | |||
| n = shapex[0]; | |||
| h = 1; | |||
| w = 1; | |||
| c = shapex[1]; | |||
| nhwc_shape_ = {n, h, w, c}; | |||
| return RET_OK; | |||
| int ToFormatOpenCLKernel::InitNHWC() { | |||
| std::vector<int> out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 1) { | |||
| N_ = out_shape[0]; | |||
| H_ = 1; | |||
| W_ = 1; | |||
| C_ = 1; | |||
| } else if (out_shape.size() == 2) { | |||
| N_ = out_shape[0]; | |||
| H_ = 1; | |||
| W_ = 1; | |||
| C_ = out_shape[1]; | |||
| } else if (out_shape.size() == 3) { | |||
| N_ = out_shape[0]; | |||
| H_ = 1; | |||
| W_ = out_shape[1]; | |||
| C_ = out_shape[2]; | |||
| } else if (out_shape.size() == 4) { | |||
| N_ = out_shape[0]; | |||
| H_ = out_shape[1]; | |||
| W_ = out_shape[2]; | |||
| C_ = out_shape[3]; | |||
| } | |||
| if (shapex.size() == 3) { | |||
| n = 1; | |||
| h = 1; | |||
| w = 1; | |||
| c = 1; | |||
| nhwc_shape_ = {n, h, w, c}; | |||
| return RET_OK; | |||
| } | |||
| if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4 || | |||
| out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4 || | |||
| out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC) { | |||
| n = shapex[0]; | |||
| h = shapex[1]; | |||
| w = shapex[2]; | |||
| c = shapex[3]; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NCHW) { | |||
| n = shapex[0]; | |||
| h = shapex[2]; | |||
| w = shapex[3]; | |||
| c = shapex[1]; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4 || | |||
| out_tensors_[0]->GetFormat() == schema::Format::Format_NC) { | |||
| n = shapex[0]; | |||
| h = 1; | |||
| w = 1; | |||
| c = shapex[1]; | |||
| } else { | |||
| n = shapex[0]; | |||
| h = shapex[1]; | |||
| w = shapex[2]; | |||
| c = shapex[3]; | |||
| } | |||
| nhwc_shape_ = {n, h, w, c}; | |||
| return RET_OK; | |||
| } | |||
| int ToFormatOpenCLKernel::ReSize() { return RET_OK; } | |||
| int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { | |||
| std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; | |||
| *global_size = std::move(vec); | |||
| return RET_OK; | |||
| } | |||
| int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size, | |||
| std::vector<size_t> *local_size) { | |||
| return RET_OK; | |||
| } | |||
| int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t im_dst_x, im_dst_y; | |||
| if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4) { | |||
| int c = nhwc_shape_[3]; | |||
| int h = nhwc_shape_[1]; | |||
| int w = nhwc_shape_[2]; | |||
| im_dst_y = nhwc_shape_[0] * h * UP_DIV(c, C4NUM); | |||
| im_dst_x = w; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) { | |||
| int h = nhwc_shape_[0] * nhwc_shape_[1]; | |||
| int w = nhwc_shape_[2]; | |||
| int c = nhwc_shape_[3]; | |||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||
| im_dst_y = h; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) { | |||
| int c = nhwc_shape_[3]; | |||
| im_dst_x = UP_DIV(c, C4NUM); | |||
| im_dst_y = 1; | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); | |||
| return RET_ERROR; | |||
| } | |||
| img_size->clear(); | |||
| auto enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| size_t img_dtype = CL_FLOAT; | |||
| if (enable_fp16_) { | |||
| img_dtype = CL_HALF_FLOAT; | |||
| } | |||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||
| *img_size = vec; | |||
| size_t img_height = N_ * H_; | |||
| size_t img_width = W_ * UP_DIV(C_, C4NUM); | |||
| size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT; | |||
| *img_size = {img_width, img_height, img_dtype}; | |||
| return RET_OK; | |||
| } | |||
| int ToFormatOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| std::vector<size_t> local = {}; | |||
| std::vector<size_t> global; | |||
| GetGlobalSize(0, &global); | |||
| cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]}; | |||
| std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||
| std::vector<size_t> local = {16, 8, 1}; | |||
| cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; | |||
| cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; | |||
| auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; | |||
| auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type); | |||
| @@ -31,16 +31,18 @@ class ToFormatOpenCLKernel : public OpenCLKernel { | |||
| ~ToFormatOpenCLKernel() override{}; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int ReSize() override { return RET_OK; }; | |||
| int Run() override; | |||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||
| int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override; | |||
| int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override; | |||
| int InitNHWCShape(); | |||
| private: | |||
| int InitNHWC(); | |||
| cl::Kernel kernel_; | |||
| std::vector<size_t> nhwc_shape_; | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| size_t W_{1}; | |||
| size_t C_{1}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -237,33 +237,33 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) { | |||
| } | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| runtime->SyncCommandQueue(); | |||
| auto allocator = runtime->GetAllocator(); | |||
| auto origin_data = tensor->data_c(); | |||
| allocator->MapBuffer(origin_data, CL_MAP_READ | CL_MAP_WRITE, nullptr, true); | |||
| tensor->SetData(origin_data); | |||
| runtime->SyncCommandQueue(); | |||
| allocator->MapBuffer(origin_data, CL_MAP_READ, nullptr, true); | |||
| auto Batch = tensor->Batch(); | |||
| auto Height = tensor->shape().size() == 4 ? tensor->Height() : 1; | |||
| auto Width = tensor->shape().size() == 4 ? tensor->Width() : 1; | |||
| auto SLICES = UP_DIV(tensor->Channel(), C4NUM); | |||
| auto shape = tensor->shape(); | |||
| auto N = shape.size() > 0 ? shape[0] : 1; | |||
| auto H = shape.size() > 1 ? shape[1] : 1; | |||
| auto W = shape.size() > 2 ? shape[2] : 1; | |||
| auto C = shape.size() > 3 ? shape[3] : 1; | |||
| auto SLICES = UP_DIV(C, C4NUM); | |||
| auto ElementsC4Num = N * H * W * UP_ROUND(C, C4NUM); | |||
| auto alignment = runtime->GetImagePitchAlignment(); | |||
| auto dtype_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4); | |||
| auto row_pitch = (Width * SLICES + alignment - 1) / alignment * alignment * dtype_size; | |||
| auto row_size = Width * SLICES * dtype_size; | |||
| std::vector<char> data(tensor->Size()); | |||
| for (int i = 0; i < Batch * Height; ++i) { | |||
| auto FLT4_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4); | |||
| auto row_pitch = (W * SLICES + alignment - 1) / alignment * alignment * FLT4_size; | |||
| auto row_size = W * SLICES * FLT4_size; | |||
| std::vector<char> data(N * H * row_size); | |||
| for (int i = 0; i < N * H; ++i) { | |||
| memcpy(static_cast<char *>(data.data()) + i * row_size, static_cast<char *>(origin_data) + i * row_pitch, row_size); | |||
| } | |||
| std::cout << "shape=("; | |||
| for (auto x : tensor->shape()) { | |||
| for (auto x : shape) { | |||
| printf("%3d,", x); | |||
| } | |||
| printf("): "); | |||
| for (size_t i = 0; i < num && i < tensor->ElementsNum(); ++i) { | |||
| for (size_t i = 0; i < num && i < ElementsC4Num; ++i) { | |||
| if (tensor->data_type() == kNumberTypeFloat16) | |||
| printf("%zu %6.3f | ", i, (reinterpret_cast<float16_t *>(data.data()))[i]); | |||
| else | |||
| @@ -272,7 +272,7 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) { | |||
| printf("\n"); | |||
| if (!out_file.empty()) { | |||
| Write2File(data.data(), out_file, tensor->Size()); | |||
| Write2File(data.data(), out_file, data.size()); | |||
| } | |||
| allocator->UnmapBuffer(origin_data); | |||
| } | |||