Merge pull request !6203 from chenzupeng/master-litetags/v1.0.0
| @@ -35,6 +35,32 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp | |||
| } | |||
| } | |||
| __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||
| int X = get_global_id(0); // H | |||
| int Y = get_global_id(1); // W | |||
| int H = input_shape.x; | |||
| int W = input_shape.y; | |||
| int C = input_shape.z; | |||
| int S = input_shape.w; | |||
| if (X >= H || Y >= W) return; | |||
| FLT sum = 0.0f; | |||
| for (int d = 0; d < S; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); | |||
| sum += exp(t.x); | |||
| if (d * 4 + 1 < C) sum += exp(t.y); | |||
| if (d * 4 + 2 < C) sum += exp(t.z); | |||
| if (d * 4 + 3 < C) sum += exp(t.w); | |||
| } | |||
| for (int d = 0; d < S; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); | |||
| t = exp(t) / sum; | |||
| WRITE_IMAGE(output, (int2)(Y * S + d, X), t); | |||
| } | |||
| } | |||
| __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) { | |||
| int X = get_global_id(0); // H | |||
| int Y = get_global_id(1); // W | |||
| @@ -66,44 +92,45 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out | |||
| } | |||
| } | |||
| __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= input_shape.x || Y >= input_shape.y) return; | |||
| __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { | |||
| int X = get_global_id(0); // H | |||
| int Y = get_global_id(1); // W | |||
| int H = input_shape.x; | |||
| int W = input_shape.y; | |||
| int C = input_shape.z; | |||
| int S = input_shape.w; | |||
| if (X >= H || Y >= W) return; | |||
| FLT sum = 0.0f; | |||
| for (int d = 0; d < input_shape.w; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X)); | |||
| for (int d = 0; d < S; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); | |||
| sum += exp(t.x); | |||
| if (d * 4 + 1 < input_shape.z) sum += exp(t.y); | |||
| if (d * 4 + 2 < input_shape.z) sum += exp(t.z); | |||
| if (d * 4 + 3 < input_shape.z) sum += exp(t.w); | |||
| if (d * 4 + 1 < C) sum += exp(t.y); | |||
| if (d * 4 + 2 < C) sum += exp(t.z); | |||
| if (d * 4 + 3 < C) sum += exp(t.w); | |||
| } | |||
| for (int d = 0; d < input_shape.w; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X)); | |||
| t = divide_no_check(exp(t), sum); | |||
| FLT4 result = TO_FLT4(t); | |||
| WRITE_IMAGE(output, (int2)(Y * input_shape.w + d, X), result); | |||
| for (int d = 0; d < S; ++d) { | |||
| FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); | |||
| t = exp(t) / sum; | |||
| WRITE_IMAGE(output, (int2)(Y, d * H + X), t); | |||
| } | |||
| } | |||
| __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t output, const FLT4 mask, | |||
| const int slices, const int slices_x32) { | |||
| __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, | |||
| const int slices, const int slices_x32) { | |||
| int tid = get_local_id(0); | |||
| int slices_count = 0; | |||
| int offset = 0; | |||
| FLT sum = 0.0f; | |||
| do { | |||
| int z = offset + tid; | |||
| if (z < slices) { | |||
| FLT4 mask_temp = z == slices - 1 ? mask : (FLT4)(1.0f); | |||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(0, 0)); | |||
| sum += dot(mask_temp, exp(src)); | |||
| offset += 32; | |||
| } | |||
| slices_count++; | |||
| } while (slices_count < slices_x32); | |||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0)); | |||
| sum += dot((FLT4)(1.0f), exp(src)); | |||
| } | |||
| if ((slices - 1) % 32 == tid) { | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); | |||
| sum += dot(TO_FLT4(mask), exp(src)); | |||
| } | |||
| __local FLT4 tmp[8]; | |||
| __local FLT *tmpx1 = (__local FLT *)tmp; | |||
| @@ -122,21 +149,31 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t | |||
| } | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| sum = tmpx1[0]; | |||
| offset = 0; | |||
| slices_count = 0; | |||
| do { | |||
| int z = offset + tid; | |||
| if (z < slices) { | |||
| FLT4 res = TO_FLT4(exp(READ_IMAGE(input, smp_none, (int2)(0, 0))) * sum); | |||
| WRITE_IMAGE(output, (int2)(0, 0), res); | |||
| offset += 32; | |||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||
| FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); | |||
| result = exp(result) * sum; | |||
| output[i] = result; | |||
| } | |||
| if ((slices - 1) % 32 == tid) { | |||
| FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); | |||
| result = exp(result) * sum; | |||
| __global FLT4 *remain_ptr4 = output; | |||
| remain_ptr4 += slices - 1; | |||
| __global FLT *remain_ptr = (__global FLT *)remain_ptr4; | |||
| remain_ptr[0] = result.x; | |||
| if (mask.y > 0.f) { | |||
| remain_ptr[1] = result.y; | |||
| } | |||
| if (mask.z > 0.f) { | |||
| remain_ptr[2] = result.z; | |||
| } | |||
| if (mask.w > 0.f) { | |||
| remain_ptr[3] = result.w; | |||
| } | |||
| slices_count++; | |||
| } while (slices_count < slices_x32); | |||
| } | |||
| } | |||
| __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, | |||
| __kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | |||
| const int slices, const int slices_x32) { | |||
| int tid = get_local_id(0); | |||
| FLT sum = 0.0f; | |||
| @@ -167,27 +204,10 @@ __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *o | |||
| } | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| sum = tmpx1[0]; | |||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||
| for (size_t i = tid; i < slices; i += 32) { | |||
| FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); | |||
| result = exp(result) * sum; | |||
| output[i] = result; | |||
| } | |||
| if ((slices - 1) % 32 == tid) { | |||
| FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); | |||
| result = exp(result) * sum; | |||
| __global FLT4 *remain_ptr4 = output; | |||
| remain_ptr4 += slices - 1; | |||
| __global FLT *remain_ptr = (__global FLT *)remain_ptr4; | |||
| remain_ptr[0] = result.x; | |||
| if (mask.y > 0.f) { | |||
| remain_ptr[1] = result.y; | |||
| } | |||
| if (mask.z > 0.f) { | |||
| remain_ptr[2] = result.z; | |||
| } | |||
| if (mask.w > 0.f) { | |||
| remain_ptr[3] = result.w; | |||
| } | |||
| WRITE_IMAGE(output, (int2)(i, 0), result); | |||
| } | |||
| } | |||
| @@ -245,3 +265,41 @@ __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 * | |||
| } | |||
| } | |||
| } | |||
| __kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | |||
| const int slices, const int slices_x32) { | |||
| int tid = get_local_id(0); | |||
| FLT sum = 0.0f; | |||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); | |||
| sum += dot((FLT4)(1.0f), exp(src)); | |||
| } | |||
| if ((slices - 1) % 32 == tid) { | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); | |||
| sum += dot(TO_FLT4(mask), exp(src)); | |||
| } | |||
| __local FLT4 tmp[8]; | |||
| __local FLT *tmpx1 = (__local FLT *)tmp; | |||
| tmpx1[tid] = sum; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (tid == 0) { | |||
| sum = dot((FLT4)(1.0f), tmp[0]); | |||
| sum += dot((FLT4)(1.0f), tmp[1]); | |||
| sum += dot((FLT4)(1.0f), tmp[2]); | |||
| sum += dot((FLT4)(1.0f), tmp[3]); | |||
| sum += dot((FLT4)(1.0f), tmp[4]); | |||
| sum += dot((FLT4)(1.0f), tmp[5]); | |||
| sum += dot((FLT4)(1.0f), tmp[6]); | |||
| sum += dot((FLT4)(1.0f), tmp[7]); | |||
| tmpx1[0] = divide_no_check(1.0f, sum); | |||
| } | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| sum = tmpx1[0]; | |||
| for (size_t i = tid; i < slices; i += 32) { | |||
| FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); | |||
| result = exp(result) * sum; | |||
| WRITE_IMAGE(output, (int2)(0, i), result); | |||
| } | |||
| } | |||
| @@ -43,7 +43,7 @@ __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d | |||
| WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); | |||
| } | |||
| __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W, | |||
| __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W, | |||
| int H) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| @@ -89,13 +89,13 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d | |||
| result[3].z = x2.w; | |||
| result[3].w = x3.w; | |||
| if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0]; | |||
| if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1]; | |||
| if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2]; | |||
| if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3]; | |||
| if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]); | |||
| if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]); | |||
| if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]); | |||
| if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]); | |||
| } | |||
| __kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W, | |||
| __kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W, | |||
| int H) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| @@ -131,8 +131,8 @@ __kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 * | |||
| result[3].z = x2.w; | |||
| result[3].w = x3.w; | |||
| if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0]; | |||
| if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1]; | |||
| if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2]; | |||
| if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3]; | |||
| if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]); | |||
| if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]); | |||
| if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]); | |||
| if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]); | |||
| } | |||
| @@ -100,7 +100,6 @@ int ActivationOpenClKernel::Run() { | |||
| ocl_runtime->SetKernelArg(kernel_, arg_idx++, alpha_); | |||
| } | |||
| std::vector<size_t> local = {}; | |||
| std::cout << img2d_shape.s[1] << " " << img2d_shape.s[2] << std::endl; | |||
| std::vector<size_t> global = {static_cast<size_t>(img2d_shape.s[1]), static_cast<size_t>(img2d_shape.s[2])}; | |||
| auto ret = ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| if (ret != RET_OK) { | |||
| @@ -132,12 +132,15 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { | |||
| bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * data_size, img_size); | |||
| bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); | |||
| memset(bias_, 0x00, div_co * C4NUM * data_size); | |||
| auto bias_dtype = in_tensors_[2]->data_type(); | |||
| if (in_tensors_.size() >= 3) { | |||
| auto bias_dtype = in_tensors_[2]->data_type(); | |||
| if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) { | |||
| auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData()); | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]); | |||
| reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->MutableData())[i]; | |||
| } | |||
| } else if (bias_dtype == kNumberTypeFloat16 && !enable_fp16_) { | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->MutableData())[i]; | |||
| } | |||
| } else { | |||
| memcpy(bias_, in_tensors_[2]->MutableData(), co * data_size); | |||
| @@ -152,14 +152,12 @@ void MatMulOpenCLKernel::PadWeight() { | |||
| memset(bias_, 0x00, co4 * C4NUM * dtype_size); | |||
| if (in_tensors_.size() >= 3) { | |||
| if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { | |||
| auto fdata = reinterpret_cast<float *>(in_tensors_[2]->MutableData()); | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<uint16_t *>(bias_)[i] = Float32ToShort(fdata[i]); | |||
| reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->MutableData())[i]; | |||
| } | |||
| } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { | |||
| auto fdata = reinterpret_cast<uint16_t *>(in_tensors_[2]->MutableData()); | |||
| for (int i = 0; i < co; i++) { | |||
| reinterpret_cast<float *>(bias_)[i] = ShortToFloat32(fdata[i]); | |||
| reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->MutableData())[i]; | |||
| } | |||
| } else { | |||
| memcpy(bias_, in_tensors_[2]->MutableData(), co * dtype_size); | |||
| @@ -65,13 +65,26 @@ int SoftmaxOpenCLKernel::SetWorkGroupSize1x1() { | |||
| int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t im_dst_x, im_dst_y; | |||
| if (onexone_flag_) { | |||
| im_dst_x = UP_DIV(in_tensors_[0]->shape()[1], C4NUM); | |||
| im_dst_y = 1; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| int n = 1, h = 1, w = 1, c = 1; | |||
| if (out_shape.size() == 2) { | |||
| n = out_shape[0]; | |||
| c = out_shape[1]; | |||
| } else if (out_shape.size() == 4) { | |||
| n = out_shape[0]; | |||
| h = out_shape[1]; | |||
| w = out_shape[2]; | |||
| c = out_shape[3]; | |||
| } | |||
| if (op_format_ == schema::Format_NHWC4) { | |||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||
| im_dst_y = n * h; | |||
| } else if (op_format_ == schema::Format_NC4HW4) { | |||
| im_dst_x = w; | |||
| im_dst_y = n * UP_DIV(c, C4NUM) * h; | |||
| } else { | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||
| im_dst_x = out_tensors_[0]->Width() * CO4; | |||
| im_dst_y = out_tensors_[0]->Height(); | |||
| MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); | |||
| return RET_ERROR; | |||
| } | |||
| size_t img_dtype = CL_FLOAT; | |||
| if (enable_fp16_) { | |||
| @@ -110,8 +123,7 @@ int SoftmaxOpenCLKernel::Init() { | |||
| if (!is_image_out_) { | |||
| out_mem_type_ = OpenCLMemType::BUF; | |||
| } else { | |||
| MS_LOG(ERROR) << "image2d output not support yet."; | |||
| return RET_ERROR; | |||
| out_mem_type_ = OpenCLMemType::IMG; | |||
| } | |||
| if (out_mem_type_ == OpenCLMemType::BUF) { | |||
| kernel_name += "_BUF"; | |||
| @@ -51,7 +51,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||
| bool onexone_flag_{false}; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| bool is_image_out_{false}; | |||
| bool is_image_out_{true}; | |||
| bool enable_fp16_{false}; | |||
| }; | |||
| @@ -36,6 +36,14 @@ int TransposeOpenCLKernel::Init() { | |||
| std::string kernel_name = "transpose"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| enable_fp16_ = ocl_runtime->GetFp16Enable(); | |||
| auto param = reinterpret_cast<TransposeParameter *>(op_parameter_); | |||
| if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | |||
| param->perm_[3] == 2) { | |||
| type = TransposeType::NHWC2NCHW; | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported transpose axes."; | |||
| return RET_ERROR; | |||
| } | |||
| out_mem_type_ = OpenCLMemType::BUF; | |||
| kernel_name += "_" + std::string(EnumNameFormat(op_format_)); | |||
| if (out_mem_type_ == OpenCLMemType::BUF) { | |||
| @@ -146,4 +154,5 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::Tensor | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -20,10 +20,14 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/transpose.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| namespace mindspore::kernel { | |||
| enum class TransposeType { NHWC2NCHW, NCHW2NHWC }; | |||
| class TransposeOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit TransposeOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| @@ -39,6 +43,7 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| TransposeType type{TransposeType::NHWC2NCHW}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -37,6 +37,17 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out | |||
| ocl_runtime->SetFp16Enable(true); | |||
| dtype_size = sizeof(float16_t); | |||
| } | |||
| auto param_ptr = std::make_unique<TransposeParameter>(); | |||
| auto param = param_ptr.get(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "param_ptr create error."; | |||
| return; | |||
| } | |||
| param->num_axes_ = 4; | |||
| param->perm_[0] = 0; | |||
| param->perm_[1] = 3; | |||
| param->perm_[2] = 1; | |||
| param->perm_[3] = 2; | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| int h = shape[0]; | |||
| int w = shape[1]; | |||
| @@ -59,7 +70,8 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = std::make_unique<kernel::TransposeOpenCLKernel>(nullptr, inputs, outputs); | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::TransposeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.get(); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| @@ -94,26 +106,14 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out | |||
| } | |||
| TEST_F(TestTransposeOpenCL, TransposeFp32) { | |||
| int h = 1; | |||
| int w = 64; | |||
| int c = 7360; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> shape = {h, w, c}; | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| if (input_data == nullptr) { | |||
| MS_LOG(ERROR) << "input_data load error."; | |||
| return; | |||
| } | |||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; | |||
| std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f}; | |||
| size_t output_size; | |||
| std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; | |||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| if (correct_data == nullptr) { | |||
| MS_LOG(ERROR) << "correct_data create error."; | |||
| return; | |||
| } | |||
| RunTestTranspose(shape, input_data, correct_data, false); | |||
| RunTestTranspose(shape, input_data.data(), output_data.data(), false); | |||
| } | |||
| TEST_F(TestTransposeOpenCL, TransposeFp16) { | |||