| @@ -1,8 +1,9 @@ | |||||
| #define FLT half | #define FLT half | ||||
| #define FLT4 half4 | #define FLT4 half4 | ||||
| #define READ_IMAGE read_imageh | #define READ_IMAGE read_imageh | ||||
| #define WRITE_IMAGE write_imageh | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) { | |||||
| __kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= HW.y || Y >= C.y) { | if (X >= HW.y || Y >= C.y) { | ||||
| @@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat | |||||
| result[3].z = x2.w; | result[3].z = x2.w; | ||||
| result[3].w = x3.w; | result[3].w = x3.w; | ||||
| dst_data[4 * Y * HW.y + X] = result[0]; | |||||
| dst_data[(4 * Y + 1) * HW.y + X] = result[1]; | |||||
| dst_data[(4 * Y + 2) * HW.y + X] = result[2]; | |||||
| dst_data[(4 * Y + 3) * HW.y + X] = result[3]; | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); | |||||
| } | } | ||||
| @@ -1,8 +1,9 @@ | |||||
| #define FLT float | #define FLT float | ||||
| #define FLT4 float4 | #define FLT4 float4 | ||||
| #define READ_IMAGE read_imagef | #define READ_IMAGE read_imagef | ||||
| #define WRITE_IMAGE write_imagef | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) { | |||||
| __kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= HW.y || Y >= C.y) { | if (X >= HW.y || Y >= C.y) { | ||||
| @@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat | |||||
| result[3].z = x2.w; | result[3].z = x2.w; | ||||
| result[3].w = x3.w; | result[3].w = x3.w; | ||||
| dst_data[4 * Y * HW.y + X] = result[0]; | |||||
| dst_data[(4 * Y + 1) * HW.y + X] = result[1]; | |||||
| dst_data[(4 * Y + 2) * HW.y + X] = result[2]; | |||||
| dst_data[(4 * Y + 3) * HW.y + X] = result[3]; | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); | |||||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); | |||||
| } | } | ||||
| @@ -43,7 +43,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | |||||
| size_t H = outputs_[0]->Batch() * outputs_[0]->Height(); | size_t H = outputs_[0]->Batch() * outputs_[0]->Height(); | ||||
| size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | ||||
| local_size_ = {16, 16}; | local_size_ = {16, 16}; | ||||
| global_size_ = {H, W}; | |||||
| global_size_ = {W, H}; | |||||
| } | } | ||||
| void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { | void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() { | ||||
| @@ -140,7 +140,7 @@ int ArithmeticOpenCLKernel::Run() { | |||||
| bias_ = -1 * value; | bias_ = -1 * value; | ||||
| break; | break; | ||||
| case PrimitiveType_Div: | case PrimitiveType_Div: | ||||
| bias_ = 1 / value; | |||||
| weight_ = 1 / value; | |||||
| break; | break; | ||||
| default: | default: | ||||
| MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; | MS_LOG(ERROR) << "Error Operator type " << opParameter->type_; | ||||
| @@ -152,7 +152,7 @@ int ArithmeticOpenCLKernel::Run() { | |||||
| runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); | runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); | ||||
| int H = outputs_[0]->Batch() * outputs_[0]->Height(); | int H = outputs_[0]->Batch() * outputs_[0]->Height(); | ||||
| int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM); | ||||
| cl_int2 output_shape{H, W}; | |||||
| cl_int2 output_shape{W, H}; | |||||
| runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | ||||
| runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | ||||
| return 0; | return 0; | ||||
| @@ -67,6 +67,21 @@ int TransposeOpenCLKernel::Init() { | |||||
| int TransposeOpenCLKernel::ReSize() { return 0; } | int TransposeOpenCLKernel::ReSize() { return 0; } | ||||
| int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| size_t im_dst_x, im_dst_y; | |||||
| im_dst_x = UP_DIV(outputs_[0]->Height() * outputs_[0]->Width(), C4NUM); | |||||
| im_dst_y = outputs_[0]->Channel(); | |||||
| #ifdef ENABLE_FP16 | |||||
| size_t img_dtype = CL_HALF_FLOAT; | |||||
| #else | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| #endif | |||||
| img_size->clear(); | |||||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||||
| *img_size = vec; | |||||
| return RET_OK; | |||||
| } | |||||
| int TransposeOpenCLKernel::Run() { | int TransposeOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->Name() << " Running!"; | MS_LOG(DEBUG) << this->Name() << " Running!"; | ||||
| std::vector<int> shapex = inputs_[0]->shape(); | std::vector<int> shapex = inputs_[0]->shape(); | ||||
| @@ -35,7 +35,7 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| }; | }; | ||||
| @@ -50,6 +50,15 @@ auto status = RunBenchmark(5, argv); | |||||
| ASSERT_EQ(status, RET_OK); | ASSERT_EQ(status, RET_OK); | ||||
| } | } | ||||
| TEST_F(BenchmarkTest, Test_MV2_GPU) { | |||||
| const char *argv[] = {"./benchmark", "--modelPath=./hiai/mobilenet_v2.ms", | |||||
| "--inDataPath=./hiai/mobilenet_v2_in.bin", | |||||
| "--calibDataPath=./hiai/mobilenet_v2_out.bin", | |||||
| "--device=GPU"}; | |||||
| auto status = RunBenchmark(5, argv); | |||||
| ASSERT_EQ(status, RET_OK); | |||||
| } | |||||
| TEST_F(BenchmarkTest, TestHebing) { | TEST_F(BenchmarkTest, TestHebing) { | ||||
| const char *argv[] = {"./benchmark", "--modelPath=./hiai/model_hebing_3branch.ms", | const char *argv[] = {"./benchmark", "--modelPath=./hiai/model_hebing_3branch.ms", | ||||
| "--inDataPath=./hiai/model_hebing_3branch.bin", | "--inDataPath=./hiai/model_hebing_3branch.bin", | ||||
| @@ -66,6 +66,9 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) { | |||||
| size_n = size_n > 100 ? 100 : size_n; | size_n = size_n > 100 ? 100 : size_n; | ||||
| for (int i = 0; i < size_n; i++) { | for (int i = 0; i < size_n; i++) { | ||||
| std::cout << output_data[i] << " "; | std::cout << output_data[i] << " "; | ||||
| if ((i + 1) % c == 0) { | |||||
| std::cout << std::endl; | |||||
| } | |||||
| } | } | ||||
| std::cout << std::endl; | std::cout << std::endl; | ||||