Merge pull request !4138 from liuchao/arith_imagetags/v0.7.0-beta
| @@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __glo | |||
| output[idx] = input_a[idx] * input_b[idx]; | |||
| } | |||
| __kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) { | |||
| __kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output, | |||
| const unsigned int n) { | |||
| int idx = get_global_id(0); | |||
| if (idx >= n) return; | |||
| output[idx] = input_a[idx] + input_b; | |||
| } | |||
| __kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) { | |||
| int idx = get_global_id(0); | |||
| if (idx >= n) return; | |||
| output[idx] = input_a[idx] - input_b; | |||
| } | |||
| __kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) { | |||
| int idx = get_global_id(0); | |||
| if (idx >= n) return; | |||
| output[idx] = input_a[idx] * input_b; | |||
| } | |||
| __kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) { | |||
| int idx = get_global_id(0); | |||
| if (idx >= n) return; | |||
| output[idx] = input_a[idx] * input_b; | |||
| output[idx] = weight * input_a[idx] + bias; | |||
| } | |||
| @@ -1,15 +1,65 @@ | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output, | |||
| const int4 output_shape) { | |||
| __kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return; | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| return; | |||
| } | |||
| if (idx >= n) return; | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z)); | |||
| src = a + b; | |||
| write_imagef(output, (int2)(0, 0), src); | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | |||
| write_imagef(output, (int2)(X, Y), a + b); | |||
| } | |||
| __kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| return; | |||
| } | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | |||
| write_imagef(output, (int2)(X, Y), a - b); | |||
| } | |||
| __kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| return; | |||
| } | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | |||
| write_imagef(output, (int2)(X, Y), a * b); | |||
| } | |||
| __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| return; | |||
| } | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | |||
| write_imagef(output, (int2)(X, Y), a / b); | |||
| } | |||
| __kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| return; | |||
| } | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| write_imagef(output, (int2)(X, Y), weight * a + bias); | |||
| } | |||
| @@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const { | |||
| } | |||
| void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | |||
| global_size_ = InitGlobalSize(); | |||
| int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())()); | |||
| local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); | |||
| global_size_ = GetCommonGlobalSize(local_size_, global_size_); | |||
| size_t H = outputs_[0]->Batch() * outputs_[0]->Height(); | |||
| size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| local_size_ = {16, 16}; | |||
| global_size_ = {H, W}; | |||
| } | |||
| void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { | |||
| @@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { | |||
| global_size_ = {element_num}; | |||
| } | |||
| int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) { | |||
| size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| int H = outputs_[0]->Batch() * outputs_[0]->Height(); | |||
| int W = outputs_[0]->Width() * CO4; | |||
| size_t im_dst_x, im_dst_y; | |||
| if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { | |||
| im_dst_x = W; | |||
| im_dst_y = H; | |||
| } else { | |||
| im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4; | |||
| im_dst_x = outputs_[0]->Width(); | |||
| } | |||
| #ifdef ENABLE_FP16 | |||
| size_t img_dtype = CL_HALF_FLOAT; | |||
| #else | |||
| size_t img_dtype = CL_FLOAT; | |||
| #endif | |||
| img_size->clear(); | |||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||
| *img_size = vec; | |||
| return 0; | |||
| } | |||
| int ArithmeticOpenCLKernel::Init() { | |||
| runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| std::string element_name; | |||
| std::string boardcast_name; | |||
| std::string kernel_name; | |||
| if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) { | |||
| element_flag_ = false; | |||
| kernel_name = "BoardcastArith"; | |||
| } else { | |||
| element_flag_ = true; | |||
| switch (opParameter->type_) { | |||
| case PrimitiveType_Mul: | |||
| kernel_name = "ElementMul"; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| kernel_name = "ElementAdd"; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| kernel_name = "ElementSub"; | |||
| break; | |||
| case PrimitiveType_Div: | |||
| kernel_name = "ElementDiv"; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; | |||
| break; | |||
| } | |||
| } | |||
| switch (opParameter->type_) { | |||
| case PrimitiveType_Mul: | |||
| element_name = "ElementMul"; | |||
| boardcast_name = "BoardcastMul"; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| element_name = "ElementAdd"; | |||
| boardcast_name = "BoardcastAdd"; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| element_name = "ElementSub"; | |||
| boardcast_name = "BoardcastSub"; | |||
| break; | |||
| case PrimitiveType_Div: | |||
| element_name = "ElementDiv"; | |||
| boardcast_name = "BoardcastDiv"; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; | |||
| break; | |||
| } | |||
| #ifdef PROGRAM_WITH_IL | |||
| runtime_->CreateKernelFromIL(kernel_(), kernel_name); | |||
| #else | |||
| std::string program_name = "Arithmetic"; | |||
| std::set<std::string> build_options; | |||
| std::string source = arithmetic_buffer_source_fp32; | |||
| std::string source = arithmetic_image2d_source_fp32; | |||
| runtime_->LoadSource(program_name, source); | |||
| if (element_flag_) { | |||
| runtime_->BuildKernel(kernel_, program_name, element_name, build_options); | |||
| MS_LOG(DEBUG) << element_name << " Init Done!"; | |||
| } else { | |||
| runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options); | |||
| MS_LOG(DEBUG) << boardcast_name << " Init Done!"; | |||
| } | |||
| runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| outputs_[0]->SetFormat(schema::Format_NHWC4); | |||
| Image2dGetWorkGroupSize(); | |||
| return 0; | |||
| } | |||
| int ArithmeticOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->Name() << " Running!"; | |||
| auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| BufferGetWorkGroupSize(); | |||
| int arg_idx = 0; | |||
| uint32_t element_num = outputs_[0]->ElementsC4Num(); | |||
| @@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() { | |||
| if (element_flag_) { | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data()); | |||
| } else { | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]); | |||
| float value = static_cast<float *>(inputs_[1]->Data())[0]; | |||
| switch (opParameter->type_) { | |||
| case PrimitiveType_Mul: | |||
| weight_ = value; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| bias_ = value; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| bias_ = -1 * value; | |||
| break; | |||
| case PrimitiveType_Div: | |||
| bias_ = 1 / value; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; | |||
| break; | |||
| } | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, weight_); | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, bias_); | |||
| MS_LOG(DEBUG) << arg_idx-2 << " " << weight_; | |||
| MS_LOG(DEBUG) << arg_idx-1 << " " << bias_; | |||
| } | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, element_num); | |||
| int H = outputs_[0]->Batch() * outputs_[0]->Height(); | |||
| int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| cl_int2 output_shape{H, W}; | |||
| runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||
| runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return 0; | |||
| } | |||
| @@ -24,15 +24,16 @@ | |||
| namespace mindspore::kernel { | |||
| class ArithmeticOpenCLKernel : public ArithmeticCPUKernel { | |||
| class ArithmeticOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx) | |||
| : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ArithmeticOpenCLKernel() override{}; | |||
| int Init() override; | |||
| int Run() override; | |||
| int GetImageSize(size_t idx, std::vector<size_t>* img_size) override; | |||
| private: | |||
| std::vector<size_t> InitGlobalSize() const; | |||
| @@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel { | |||
| cl::Kernel kernel_; | |||
| lite::opencl::OpenCLRuntime *runtime_; | |||
| bool element_flag_{true}; | |||
| float weight_{1.f}; | |||
| float bias_{.0f}; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| @@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) { | |||
| } | |||
| void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) { | |||
| std::cout << "TestCase" << std::endl; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| bool is_bias_add = shape_b.empty(); | |||
| auto tensorType = schema::NodeType_ValueNode; | |||
| std::cout << "TestCase tensor" << std::endl; | |||
| lite::tensor::Tensor *tensor_a = | |||
| new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType); | |||
| lite::tensor::Tensor *tensor_b = | |||
| @@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) | |||
| int64_t element_num = tensor_a->ElementsC4Num(); | |||
| int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num(); | |||
| std::cout << "TestCase new data" << std::endl; | |||
| float *data_a = new float[element_num]; | |||
| float *data_b = new float[element_num_b]; | |||
| float *data_c_cpu = new float[element_num]; | |||
| @@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) | |||
| InitData(data_b, element_num_b); | |||
| memset(data_c_ocl, 0, sizeof(float) * element_num); | |||
| std::cout << "TestCase run cpu" << std::endl; | |||
| if (is_bias_add) { | |||
| BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num); | |||
| } else { | |||
| ElementAdd(data_a, data_b, data_c_cpu, element_num); | |||
| } | |||
| std::cout << "TestCase set data" << std::endl; | |||
| std::vector<lite::tensor::Tensor *> inputs = {tensor_a}; | |||
| if (!is_bias_add) { | |||
| inputs.push_back(tensor_b); | |||
| @@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) | |||
| new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx); | |||
| arith_kernel->Init(); | |||
| tensor_a->MallocData(allocator); | |||
| tensor_b->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{arith_kernel}; | |||
| auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| std::cout << "TestCase Init" << std::endl; | |||
| kernel->Init(); | |||
| memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num); | |||
| @@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) | |||
| memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b); | |||
| } | |||
| std::cout << "TestCase Run" << std::endl; | |||
| kernel->Run(); | |||
| memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num); | |||
| @@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) | |||
| LogData(outputs[0]->Data(), 10, "OpenCL compute : "); | |||
| bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num); | |||
| MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); | |||
| std::cout << "TestCase End" << std::endl; | |||
| // free | |||
| delete[] data_a; | |||
| @@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common { | |||
| }; | |||
| TEST_F(TestArithmeticOpenCL, AddElementwiseTest) { | |||
| const std::vector<int> &shape_a = {1, 32, 32, 4}; | |||
| const std::vector<int> &shape_b = {1, 32, 32, 4}; | |||
| const std::vector<int> &shape_a = {1, 1024, 1024, 4}; | |||
| const std::vector<int> &shape_b = {1, 1024, 1024, 4}; | |||
| TestCase(shape_a, shape_b); | |||
| } | |||
| // TEST_F(TestOpenCLKernel, AddBoardcaseTest) { | |||
| // const std::vector<int> &shape_a = {1, 4, 128, 128}; | |||
| // const std::vector<int> &shape_b = {}; | |||
| // TestCase(shape_a, shape_b); | |||
| //} | |||
| TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) { | |||
| const std::vector<int> &shape_a = {1, 128, 128, 4}; | |||
| const std::vector<int> &shape_b = {}; | |||
| TestCase(shape_a, shape_b); | |||
| } | |||
| } // namespace mindspore | |||