Merge pull request !6454 from liuchao/mastertags/v1.1.0
| @@ -116,7 +116,7 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), floor(a / b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b))); | |||
| } | |||
| __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -155,7 +155,7 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a == b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -168,7 +168,7 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a != b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -181,7 +181,7 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a < b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -194,7 +194,7 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a <= b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -207,7 +207,7 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a > b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||
| @@ -220,7 +220,7 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a >= b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -327,7 +327,7 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b))); | |||
| } | |||
| __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -363,7 +363,7 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a == (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -375,7 +375,7 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a != (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -387,7 +387,7 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a < (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -399,7 +399,7 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -411,7 +411,7 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a > (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||
| @@ -423,7 +423,7 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, | |||
| } | |||
| FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f); | |||
| } | |||
| __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, | |||
| @@ -311,26 +311,19 @@ int ScaleOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset); | |||
| } else { | |||
| if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||
| if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { | |||
| float scale = static_cast<float *>(in_tensors_[1]->data_c())[0]; | |||
| float offset = static_cast<float *>(in_tensors_[2]->data_c())[0]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset); | |||
| } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { | |||
| if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { | |||
| float scale = static_cast<float *>(in_tensors_[1]->data_c())[0]; | |||
| float offset = static_cast<float *>(in_tensors_[2]->data_c())[0]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); | |||
| } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { | |||
| float16_t scale = static_cast<float16_t *>(in_tensors_[1]->data_c())[0]; | |||
| float16_t offset = static_cast<float16_t *>(in_tensors_[2]->data_c())[0]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type(); | |||
| return RET_ERROR; | |||
| } | |||
| } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { | |||
| float16_t scale = static_cast<float16_t *>(in_tensors_[1]->data_c())[0]; | |||
| float16_t offset = static_cast<float16_t *>(in_tensors_[2]->data_c())[0]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(scale)); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float>(offset)); | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type(); | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| @@ -31,12 +31,10 @@ cp -fr $TEST_DATA_DIR/testPK ./data | |||
| # for GPU OpenCL | |||
| ./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*" | |||
| ./lite-test --gtest_filter="TestArithmeticSelfOpenCLCI.ArithmeticSelfRound*" | |||
| ./lite-test --gtest_filter="TestConcatOpenCLCI.ConcatFp32_2inputforCI*" | |||
| ./lite-test --gtest_filter="TestSliceOpenCLfp32.Slicefp32CI*" | |||
| ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*" | |||
| ./lite-test --gtest_filter="TestAvgPoolingOpenCL*" | |||
| ./lite-test --gtest_filter="TestConv2dTransposeOpenCL*" | |||
| ./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*" | |||
| @@ -46,3 +44,5 @@ cp -fr $TEST_DATA_DIR/testPK ./data | |||
| ./lite-test --gtest_filter="TestReshapeOpenCL*" | |||
| ./lite-test --gtest_filter="TestSoftmaxOpenCL*" | |||
| ./lite-test --gtest_filter="TestTransposeOpenCL*" | |||
| ./lite-test --gtest_filter="TestArithmeticOpenCL*" | |||
| ./lite-test --gtest_filter="TestScaleOpenCL*" | |||
| @@ -67,6 +67,7 @@ static void LogData(void *data, const int size, const std::string prefix) { | |||
| template <class T> | |||
| static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) { | |||
| bool is_log_data = false; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| @@ -126,7 +127,7 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| } | |||
| std::vector<lite::Tensor *> outputs = {tensor_c}; | |||
| ArithmeticParameter *param = new (std::nothrow) ArithmeticParameter(); | |||
| ArithmeticParameter *param = static_cast<ArithmeticParameter *>(malloc(sizeof(ArithmeticParameter))); | |||
| param->broadcasting_ = is_bias_add; | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "Create parameter failed!"; | |||
| @@ -156,7 +157,7 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| delete[] data_b; | |||
| delete[] data_c_cpu; | |||
| delete[] data_c_ocl; | |||
| delete param; | |||
| free(param); | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| @@ -188,10 +189,12 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| memcpy(data_c_ocl, outputs[0]->data_c(), sizeof(T) * element_num); | |||
| LogData<T>(data_a, 10, "Data A : "); | |||
| LogData<T>(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : "); | |||
| LogData<T>(data_c_cpu, 10, "Expect compute : "); | |||
| LogData<T>(outputs[0]->data_c(), 10, "OpenCL compute : "); | |||
| if (is_log_data) { | |||
| LogData<T>(data_a, 10, "Data A : "); | |||
| LogData<T>(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : "); | |||
| LogData<T>(data_c_cpu, 10, "Expect compute : "); | |||
| LogData<T>(outputs[0]->data_c(), 10, "OpenCL compute : "); | |||
| } | |||
| bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num); | |||
| MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); | |||
| EXPECT_EQ(true, cmp); | |||
| @@ -203,7 +206,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| delete[] data_c_ocl; | |||
| delete kernel; | |||
| delete param; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -67,6 +67,7 @@ static void LogData(void *data, const int size, const std::string prefix) { | |||
| template <class T> | |||
| static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) { | |||
| bool is_log_data = false; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| @@ -137,7 +138,7 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| } | |||
| std::vector<lite::Tensor *> outputs = {tensor_out}; | |||
| ScaleParameter *param = new (std::nothrow) ScaleParameter(); | |||
| ScaleParameter *param = static_cast<ScaleParameter *>(malloc(sizeof(ScaleParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "Create parameter failed!"; | |||
| delete tensor_in; | |||
| @@ -170,7 +171,7 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| delete[] data_offset; | |||
| delete[] data_out_cpu; | |||
| delete[] data_out_ocl; | |||
| delete param; | |||
| free(param); | |||
| return; | |||
| } | |||
| scale_kernel->Init(); | |||
| @@ -206,11 +207,13 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| memcpy(data_out_ocl, outputs[0]->data_c(), sizeof(T) * element_num); | |||
| LogData<T>(data_in, 10, "Data input : "); | |||
| LogData<T>(data_scale, tensor_scale->shape().empty() ? 1 : 10, "Data scale : "); | |||
| LogData<T>(data_offset, tensor_offset->shape().empty() ? 1 : 10, "Data offset : "); | |||
| LogData<T>(data_out_cpu, 10, "Expect compute : "); | |||
| LogData<T>(outputs[0]->data_c(), 10, "OpenCL compute : "); | |||
| if (is_log_data) { | |||
| LogData<T>(data_in, 10, "Data input : "); | |||
| LogData<T>(data_scale, tensor_scale->shape().empty() ? 1 : 10, "Data scale : "); | |||
| LogData<T>(data_offset, tensor_offset->shape().empty() ? 1 : 10, "Data offset : "); | |||
| LogData<T>(data_out_cpu, 10, "Expect compute : "); | |||
| LogData<T>(outputs[0]->data_c(), 10, "OpenCL compute : "); | |||
| } | |||
| bool cmp = DataCompare(data_out_cpu, data_out_ocl, element_num); | |||
| MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); | |||
| EXPECT_EQ(true, cmp); | |||
| @@ -223,7 +226,6 @@ static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &sh | |||
| delete[] data_out_ocl; | |||
| delete kernel; | |||
| delete param; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||