| @@ -0,0 +1,46 @@ | |||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||||
| __kernel void Cast_Fp32ToFp16_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { | |||||
| int X = get_global_id(0); // N*H | |||||
| int Y = get_global_id(1); // W | |||||
| int Z = get_global_id(2); // c/4 | |||||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| half4 result = convert_half4(READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)))); | |||||
| write_imageh(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||||
| } | |||||
| __kernel void Cast_Fp32ToFp16_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { | |||||
| int X = get_global_id(0); // N*H | |||||
| int Y = get_global_id(1); // W | |||||
| int Z = get_global_id(2); // c/4 | |||||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| half4 result = convert_half4(READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)))); | |||||
| write_imageh(output, (int2)((Y), (Z * output_shape.y + X)), result); | |||||
| } | |||||
| __kernel void Cast_Fp16ToFp32_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { | |||||
| int X = get_global_id(0); // N*H | |||||
| int Y = get_global_id(1); // W | |||||
| int Z = get_global_id(2); // c/4 | |||||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| float4 result = convert_float4(READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)))); | |||||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||||
| } | |||||
| __kernel void Cast_Fp16ToFp32_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { | |||||
| int X = get_global_id(0); // N*H | |||||
| int Y = get_global_id(1); // W | |||||
| int Z = get_global_id(2); // c/4 | |||||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||||
| return; | |||||
| } | |||||
| float4 result = convert_float4(READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)))); | |||||
| WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); | |||||
| } | |||||
| @@ -112,12 +112,12 @@ int BatchNormOpenCLKernel::Run() { | |||||
| std::vector<size_t> global = {OH, OW, OC}; | std::vector<size_t> global = {OH, OW, OC}; | ||||
| BatchNormGetWorkGroup(global, &local, max_global[0]); | BatchNormGetWorkGroup(global, &local, max_global[0]); | ||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); // input tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); // scale | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); // offest | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->MutableData()); // mean | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->MutableData()); // variance | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); // out tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); // offest | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->epsilon_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->epsilon_); | ||||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | ocl_runtime->RunKernel(kernel_, global, local, nullptr); | ||||
| @@ -0,0 +1,152 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <cstring> | |||||
| #include <algorithm> | |||||
| #include <set> | |||||
| #include<string> | |||||
| #include "src/kernel_registry.h" | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | |||||
| #include "src/runtime/kernel/opencl/kernel/cast.h" | |||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/cast.cl.inc" | |||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::schema::PrimitiveType_Cast; | |||||
| namespace mindspore::kernel { | |||||
| int CastOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||||
| size_t im_dst_x, im_dst_y; | |||||
| if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) { | |||||
| im_dst_x = out_tensors_[0]->Width() * CO4; | |||||
| im_dst_y = out_tensors_[0]->Height(); | |||||
| } else { | |||||
| im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4; | |||||
| im_dst_x = out_tensors_[0]->Width(); | |||||
| } | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| auto enable_fp16_ = ocl_runtime->GetFp16Enable(); | |||||
| if (enable_fp16_) { | |||||
| img_dtype = CL_HALF_FLOAT; | |||||
| } | |||||
| img_size->clear(); | |||||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||||
| *img_size = vec; | |||||
| return RET_OK; | |||||
| } | |||||
| void CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *param) { | |||||
| if (param->src_type_ == kNumberTypeFloat32 && param->dst_type_ == kNumberTypeFloat16) { | |||||
| kernel_name[0] += "_Fp32ToFp16"; | |||||
| } else if (param->src_type_ == kNumberTypeFloat16 && param->dst_type_ == kNumberTypeFloat32) { | |||||
| kernel_name[0] += "_Fp16ToFp32"; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "unsupported convert format from : " << param->src_type_ << "to " << param->dst_type_; | |||||
| } | |||||
| } | |||||
| int CastOpenCLKernel::Init() { | |||||
| auto param = reinterpret_cast<CastParameter *>(this->op_parameter_); | |||||
| auto in_format = op_format_; | |||||
| if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | |||||
| MS_LOG(ERROR) << "input format(" << in_format << ") " | |||||
| << "format not support!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| in_ori_format_ = in_tensors_[0]->GetFormat(); | |||||
| in_tensors_[0]->SetFormat(op_format_); | |||||
| out_ori_format_ = out_tensors_[0]->GetFormat(); | |||||
| out_tensors_[0]->SetFormat(op_format_); | |||||
| std::string kernel_name = "Cast"; | |||||
| GetKernelName(&kernel_name, param); | |||||
| if (in_format == schema::Format_NC4HW4) { | |||||
| kernel_name += "_NC4HW4"; | |||||
| } else if (in_format == schema::Format_NHWC4) { | |||||
| kernel_name += "_NHWC4"; | |||||
| } | |||||
| std::set<std::string> build_options; | |||||
| std::string source = cast_source; | |||||
| std::string program_name = "cast"; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| ocl_runtime->LoadSource(program_name, source); | |||||
| ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||||
| return RET_OK; | |||||
| } | |||||
| int CastOpenCLKernel::ReSize() { return RET_OK; } | |||||
| void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||||
| const int max_divider = 8; | |||||
| const int max_x = 4, max_y = 8; | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||||
| local->clear(); | |||||
| local->push_back(x); | |||||
| local->push_back(y); | |||||
| local->push_back(z); | |||||
| } | |||||
| int CastOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| auto input_shape = in_tensors_[0]->shape(); | |||||
| cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)}; | |||||
| uint32_t OH = input_shape[1]; | |||||
| uint32_t OW = input_shape[2]; | |||||
| uint32_t OC = UP_DIV(input_shape[3], C4NUM); | |||||
| const std::vector<size_t> &max_global = ocl_runtime->GetWorkItemSize(); | |||||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||||
| std::vector<size_t> global = {OH, OW, OC}; | |||||
| CastGetWorkGroup(global, &local, max_global[0]); | |||||
| int arg_cn = 0; | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); | |||||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||||
| return RET_OK; | |||||
| } | |||||
| kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||||
| const lite::Context *ctx, const kernel::KernelKey &desc, | |||||
| const mindspore::lite::PrimitiveC *primitive) { | |||||
| auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs); | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << " new CastOpenCLKernel failed "; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = kernel->Init(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << " Init kernel failed, name: Cast "; | |||||
| delete kernel; | |||||
| return nullptr; | |||||
| } | |||||
| return kernel; | |||||
| } | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator); | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator); | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,52 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAST_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAST_H_ | |||||
| #include <vector> | |||||
| #include<string> | |||||
| #include "ir/anf.h" | |||||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | |||||
| #include "nnacl/fp32/cast.h" | |||||
| namespace mindspore::kernel { | |||||
| class CastOpenCLKernel : public OpenCLKernel { | |||||
| public: | |||||
| explicit CastOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs) | |||||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||||
| ~CastOpenCLKernel() override{}; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| void GetKernelName(std::string *kernel_name, CastParameter *param); | |||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||||
| private: | |||||
| cl::Kernel kernel_; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif | |||||
| @@ -55,11 +55,11 @@ int ConcatOpenCLKernel::RunAxis0() { | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| auto allocator_ = ocl_runtime->GetAllocator(); | auto allocator_ = ocl_runtime->GetAllocator(); | ||||
| std::vector<size_t> img_size; | std::vector<size_t> img_size; | ||||
| auto dst_data = out_tensors_[0]->MutableData(); | |||||
| auto dst_data = out_tensors_[0]->data_c(); | |||||
| auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | ||||
| cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data)); | cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data)); | ||||
| for (int i = 0; i < in_tensors_.size(); i++) { | for (int i = 0; i < in_tensors_.size(); i++) { | ||||
| auto src_data = in_tensors_[i]->MutableData(); | |||||
| auto src_data = in_tensors_[i]->data_c(); | |||||
| allocator_->GetImageSize(src_data, &img_size); | allocator_->GetImageSize(src_data, &img_size); | ||||
| auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | ||||
| auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | ||||
| @@ -176,9 +176,9 @@ int ConcatOpenCLKernel::Run() { | |||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| if (in_tensors_.size() == 2) { | if (in_tensors_.size() == 2) { | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | ||||
| @@ -187,10 +187,10 @@ int ConcatOpenCLKernel::Run() { | |||||
| auto input3_shape = in_tensors_[2]->shape(); | auto input3_shape = in_tensors_[2]->shape(); | ||||
| cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; | cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); | ||||
| @@ -202,11 +202,11 @@ int ConcatOpenCLKernel::Run() { | |||||
| cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; | cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; | ||||
| cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)}; | cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)}; | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); | ||||
| @@ -110,8 +110,8 @@ int SliceOpenCLKernel::Run() { | |||||
| std::vector<size_t> global = {1, OH, OW}; | std::vector<size_t> global = {1, OH, OW}; | ||||
| SlcieGetWorkGroup(global, &local, max_global[0]); | SlcieGetWorkGroup(global, &local, max_global[0]); | ||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); // input tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); // out tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, size_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, size_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, begin_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, begin_); | ||||
| @@ -130,15 +130,15 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||||
| } | } | ||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " init tensors "; | MS_LOG(INFO) << " init tensors "; | ||||
| memcpy(inputs[0]->MutableData(), input_data, input_size); | |||||
| memcpy(inputs[1]->MutableData(), scale_data, scale_size); | |||||
| memcpy(inputs[2]->MutableData(), offset_data, offset_size); | |||||
| memcpy(inputs[3]->MutableData(), mean_data, mean_size); | |||||
| memcpy(inputs[4]->MutableData(), var_data, var_size); | |||||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||||
| memcpy(inputs[1]->data_c(), scale_data, scale_size); | |||||
| memcpy(inputs[2]->data_c(), offset_data, offset_size); | |||||
| memcpy(inputs[3]->data_c(), mean_data, mean_size); | |||||
| memcpy(inputs[4]->data_c(), var_data, var_size); | |||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||||
| CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01); | CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||
| @@ -247,15 +247,15 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||||
| } | } | ||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " init tensors "; | MS_LOG(INFO) << " init tensors "; | ||||
| memcpy(inputs[0]->MutableData(), input_data, input_size); | |||||
| memcpy(inputs[1]->MutableData(), scale_data, scale_size); | |||||
| memcpy(inputs[2]->MutableData(), offset_data, offset_size); | |||||
| memcpy(inputs[3]->MutableData(), mean_data, mean_size); | |||||
| memcpy(inputs[4]->MutableData(), var_data, var_size); | |||||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||||
| memcpy(inputs[1]->data_c(), scale_data, scale_size); | |||||
| memcpy(inputs[2]->data_c(), offset_data, offset_size); | |||||
| memcpy(inputs[3]->data_c(), mean_data, mean_size); | |||||
| memcpy(inputs[4]->data_c(), var_data, var_size); | |||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||||
| CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||
| @@ -0,0 +1,212 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include "utils/log_adapter.h" | |||||
| #include "common/common_test.h" | |||||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||||
| #include "mindspore/lite/src/common/file_utils.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h" | |||||
| namespace mindspore { | |||||
| class TestCastSelfOpenCL : public mindspore::CommonTest { | |||||
| public: | |||||
| TestCastSelfOpenCL() {} | |||||
| }; | |||||
| template <typename T> | |||||
| void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) { | |||||
| for (size_t i = 0; i < size; i++) { | |||||
| T abs = fabs(output_data[i] - correct_data[i]); | |||||
| ASSERT_LE(abs, err_bound); | |||||
| } | |||||
| } | |||||
| TEST_F(TestCastSelfOpenCL, Castfp32tofp16) { | |||||
| MS_LOG(INFO) << " begin test "; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| ocl_runtime->Init(); | |||||
| auto allocator = ocl_runtime->GetAllocator(); | |||||
| // get the input from .bin | |||||
| size_t input1_size, output_size; | |||||
| std::string input1Ppath = "./test_data/in_castfp32.bin"; | |||||
| std::string correctOutputPath = "./test_data/out_castfp16.bin"; | |||||
| MS_LOG(INFO) << " initialize param "; | |||||
| auto param = new (std::nothrow) CastParameter(); | |||||
| if (param == nullptr) { | |||||
| MS_LOG(INFO) << " new CastParameter failed "; | |||||
| return; | |||||
| } | |||||
| param->src_type_ = kNumberTypeFloat32; | |||||
| param->dst_type_ = kNumberTypeFloat16; | |||||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); | |||||
| auto correctOutput = | |||||
| reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||||
| MS_LOG(INFO) << " init tensors "; | |||||
| std::vector<int> shape = {1, 23, 39, 47}; | |||||
| auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); | |||||
| auto *input_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape, schema::Format_NHWC, tensor_type); | |||||
| auto *output_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat16, shape, schema::Format_NHWC, tensor_type); | |||||
| if (input_tensor == nullptr || output_tensor == nullptr) { | |||||
| MS_LOG(INFO) << " new input_tensor or output_tensor failed "; | |||||
| return; | |||||
| } | |||||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||||
| auto *cast_kernel = | |||||
| new (std::nothrow) kernel::CastOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||||
| if (cast_kernel == nullptr) { | |||||
| MS_LOG(INFO) << " new kernel::CastOpenCLKernel failed "; | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| return; | |||||
| } | |||||
| cast_kernel->SetFormatType(schema::Format_NC4HW4); | |||||
| cast_kernel->Init(); | |||||
| // to do allocate memory for inputs and outputs | |||||
| for (auto &input_tensor : inputs) { | |||||
| input_tensor->MallocData(allocator); | |||||
| } | |||||
| MS_LOG(INFO) << " initialize sub_graph "; | |||||
| std::vector<kernel::LiteKernel *> kernels{cast_kernel}; | |||||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||||
| if (sub_graph == nullptr) { | |||||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| delete cast_kernel; | |||||
| return; | |||||
| } | |||||
| sub_graph->Init(); | |||||
| MS_LOG(INFO) << " initialize input data "; | |||||
| memcpy(inputs[0]->data_c(), input_data, input1_size); | |||||
| std::cout << "==================output data================" << std::endl; | |||||
| sub_graph->Run(); | |||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| delete cast_kernel; | |||||
| delete sub_graph; | |||||
| } | |||||
| TEST_F(TestCastSelfOpenCL, Castfp16tofp32) { | |||||
| MS_LOG(INFO) << " begin test "; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| ocl_runtime->Init(); | |||||
| auto allocator = ocl_runtime->GetAllocator(); | |||||
| // get the input from .bin | |||||
| size_t input1_size, output_size; | |||||
| std::string input1Ppath = "./test_data/in_castfp16.bin"; | |||||
| std::string correctOutputPath = "./test_data/out_castfp32.bin"; | |||||
| MS_LOG(INFO) << " initialize param "; | |||||
| auto param = new (std::nothrow) CastParameter(); | |||||
| if (param == nullptr) { | |||||
| MS_LOG(INFO) << " new CastParameter failed "; | |||||
| return; | |||||
| } | |||||
| param->src_type_ = kNumberTypeFloat16; | |||||
| param->dst_type_ = kNumberTypeFloat32; | |||||
| auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); | |||||
| auto correctOutput = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||||
| MS_LOG(INFO) << " init tensors "; | |||||
| std::vector<int> shape = {1, 23, 39, 47}; | |||||
| auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); | |||||
| auto *input_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat16, shape, schema::Format_NHWC, tensor_type); | |||||
| auto *output_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape, schema::Format_NHWC, tensor_type); | |||||
| if (input_tensor == nullptr || output_tensor == nullptr) { | |||||
| MS_LOG(INFO) << " new input_tensor or output_tensor failed "; | |||||
| return; | |||||
| } | |||||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||||
| auto *cast_kernel = | |||||
| new (std::nothrow) kernel::CastOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||||
| if (cast_kernel == nullptr) { | |||||
| MS_LOG(INFO) << " new kernel::CastOpenCLKernel failed "; | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| return; | |||||
| } | |||||
| cast_kernel->SetFormatType(schema::Format_NC4HW4); | |||||
| cast_kernel->Init(); | |||||
| // to do allocate memory for inputs and outputs | |||||
| for (auto &input_tensor : inputs) { | |||||
| input_tensor->MallocData(allocator); | |||||
| } | |||||
| MS_LOG(INFO) << " initialize sub_graph "; | |||||
| std::vector<kernel::LiteKernel *> kernels{cast_kernel}; | |||||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||||
| if (sub_graph == nullptr) { | |||||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| delete cast_kernel; | |||||
| return; | |||||
| } | |||||
| sub_graph->Init(); | |||||
| MS_LOG(INFO) << " initialize input data "; | |||||
| memcpy(inputs[0]->data_c(), input_data, input1_size); | |||||
| std::cout << "==================output data================" << std::endl; | |||||
| sub_graph->Run(); | |||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||||
| for (auto tensor : inputs) { | |||||
| delete tensor; | |||||
| } | |||||
| for (auto tensor : outputs) { | |||||
| delete tensor; | |||||
| } | |||||
| delete param; | |||||
| delete cast_kernel; | |||||
| delete sub_graph; | |||||
| } | |||||
| } // namespace mindspore | |||||
| @@ -138,24 +138,24 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " initialize input data "; | MS_LOG(INFO) << " initialize input data "; | ||||
| if (inputs.size() == 2) { | if (inputs.size() == 2) { | ||||
| memcpy(inputs[0]->MutableData(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->MutableData(), input_data2, input2_size); | |||||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||||
| } else if (inputs.size() == 3) { | } else if (inputs.size() == 3) { | ||||
| memcpy(inputs[0]->MutableData(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->MutableData(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->MutableData(), input_data3, input3_size); | |||||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->data_c(), input_data3, input3_size); | |||||
| } else if (inputs.size() == 4) { | } else if (inputs.size() == 4) { | ||||
| memcpy(inputs[0]->MutableData(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->MutableData(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->MutableData(), input_data3, input3_size); | |||||
| memcpy(inputs[3]->MutableData(), input_data4, input4_size); | |||||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->data_c(), input_data3, input3_size); | |||||
| memcpy(inputs[3]->data_c(), input_data4, input4_size); | |||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << " input size must be 2 or 3 or 4"; | MS_LOG(ERROR) << " input size must be 2 or 3 or 4"; | ||||
| } | } | ||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||
| @@ -263,19 +263,19 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " initialize input data "; | MS_LOG(INFO) << " initialize input data "; | ||||
| if (inputs.size() == 2) { | if (inputs.size() == 2) { | ||||
| memcpy(inputs[0]->MutableData(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->MutableData(), input_data2, input2_size); | |||||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||||
| } else if (inputs.size() == 3) { | } else if (inputs.size() == 3) { | ||||
| memcpy(inputs[0]->MutableData(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->MutableData(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->MutableData(), input_data3, input3_size); | |||||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||||
| memcpy(inputs[2]->data_c(), input_data3, input3_size); | |||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << " input size must be 2 or 3 "; | MS_LOG(ERROR) << " input size must be 2 or 3 "; | ||||
| } | } | ||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||
| @@ -130,12 +130,12 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " init tensors "; | MS_LOG(INFO) << " init tensors "; | ||||
| memcpy(inputs[0]->MutableData(), input_data, input_size); | |||||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||
| @@ -238,12 +238,12 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||||
| sub_graph->Init(); | sub_graph->Init(); | ||||
| MS_LOG(INFO) << " init tensors "; | MS_LOG(INFO) << " init tensors "; | ||||
| memcpy(inputs[0]->MutableData(), input_data, input_size); | |||||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||||
| std::cout << "==================output data================" << std::endl; | std::cout << "==================output data================" << std::endl; | ||||
| sub_graph->Run(); | sub_graph->Run(); | ||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | ||||
| for (auto tensor : inputs) { | for (auto tensor : inputs) { | ||||
| delete tensor; | delete tensor; | ||||