
transpose: support NCHW2NHWC

chenzupeng committed 5 years ago (tags/v1.1.0)
parent commit 1f5c6efb5d
13 changed files with 325 additions and 388 deletions
1. mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl (+173, -120)
2. mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc (+38, -49)
3. mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h (+2, -2)
4. mindspore/lite/test/run_test.sh (+9, -0)
5. mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc (+12, -12)
6. mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc (+10, -79)
7. mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc (+15, -61)
8. mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc (+11, -12)
9. mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc (+11, -12)
10. mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc (+11, -11)
11. mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc (+10, -10)
12. mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc (+10, -10)
13. mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h (+13, -10)
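For orientation before the diffs: perm {0,3,1,2} converts NHWC to NCHW (the kernels suffixed _0312 below), and perm {0,2,3,1} converts NCHW back to NHWC (_0231), which is the direction this commit adds. A minimal host-side reference of the two permutations, independent of the patch (all names here are illustrative):

#include <vector>

// NHWC -> NCHW (perm 0,3,1,2); batch n = 1, as the OpenCL kernels assume.
std::vector<float> TransposeNHWC2NCHW(const std::vector<float> &src, int h, int w, int c) {
  std::vector<float> dst(src.size());
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x)
      for (int k = 0; k < c; ++k)
        dst[(k * h + y) * w + x] = src[(y * w + x) * c + k];
  return dst;
}

// NCHW -> NHWC (perm 0,2,3,1), the direction added by this commit.
std::vector<float> TransposeNCHW2NHWC(const std::vector<float> &src, int c, int h, int w) {
  std::vector<float> dst(src.size());
  for (int k = 0; k < c; ++k)
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        dst[(y * w + x) * c + k] = src[(k * h + y) * w + x];
  return dst;
}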

mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl (+173, -120)

@@ -1,138 +1,191 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
__kernel void transpose_0312_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) {
int X = get_global_id(0); // H4, C4 for src
int Y = get_global_id(1); // W, H for src
int Z = get_global_id(2); // C4, W4 for src
if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;

result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;

result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;

result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;

WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
int H4 = UP_DIV(shape.y, 4);
int C4 = UP_DIV(shape.w, 4);
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z * H4 + X, Y));
FLT4 src1 = (FLT4)0.f;
if (4 * Z + 1 < shape.w) {
src1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 1) * H4 + X, Y));
}
FLT4 src2 = (FLT4)0.f;
if (4 * Z + 2 < shape.w) {
src2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 2) * H4 + X, Y));
}
FLT4 src3 = (FLT4)0.f;
if (4 * Z + 3 < shape.w) {
src3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 3) * H4 + X, Y));
}
FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x);
FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y);
FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z);
FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w);
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X), dst0);
if (4 * X + 1 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 1), dst1);
}
if (4 * X + 2 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 2), dst2);
}
if (4 * X + 3 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 3), dst3);
}
}
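Each work-item in the kernel above performs a 4x4 micro-transpose in registers: the four FLT4 reads (src0..src3) are re-scattered so that each output vector packs one component from every read, i.e. dst_i[j] = src_j[i]. A scalar sketch of that shuffle (illustrative, not part of the patch):

// dst[i][j] = src[j][i]; src[j] models one FLT4 read, dst[i] one FLT4 write.
void Transpose4x4(const float src[4][4], float dst[4][4]) {
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      dst[i][j] = src[j][i];
}

The 4 * Z + k < shape.w and 4 * X + k < shape.y guards skip the lanes that fall into the zero-filled padding when a channel or height count is not a multiple of 4.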


__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W,
int H) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
__kernel void transpose_0312_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) {
int X = get_global_id(0); // H4, C4 for src
int Y = get_global_id(1); // W, H for src
int Z = get_global_id(2); // C4, W4 for src
if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
bool over_size = W * C.y > 65535;
FLT4 x0, x1, x2, x3;
if (over_size) {
x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
} else {
x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z, X * shape.z + Y));
FLT4 src1 = (FLT4)0.f;
if (4 * Z + 1 < shape.w) {
src1 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 1, X * shape.z + Y));
}
FLT4 src2 = (FLT4)0.f;
if (4 * Z + 2 < shape.w) {
src2 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 2, X * shape.z + Y));
}
FLT4 src3 = (FLT4)0.f;
if (4 * Z + 3 < shape.w) {
src3 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 3, X * shape.z + Y));
}
FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x);
FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y);
FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z);
FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w);
WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X), dst0);
if (4 * X + 1 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 1), dst1);
}
if (4 * X + 2 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 2), dst2);
}
if (4 * X + 3 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 3), dst3);
}

result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;

result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;

result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;

result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;

if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]);
if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]);
if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]);
if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]);
}


__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W,
int H) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
__kernel void transpose_0312_oversize_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
int4 shape) {
int X = get_global_id(0); // H4, C4 for src
int Y = get_global_id(1); // W, H for src
int Z = get_global_id(2); // C4, W4 for src
if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W, Y * H + (4 * X) / W));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W, Y * H + (4 * X + 1) / W));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W, Y * H + (4 * X + 2) / W));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W, Y * H + (4 * X + 3) / W));
result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;

result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;

result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;
int H4 = UP_DIV(shape.y, 4);
int C4 = UP_DIV(shape.w, 4);
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z));
FLT4 src1 = (FLT4)0.f;
if (4 * Z + 1 < shape.w) {
src1 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 1));
}
FLT4 src2 = (FLT4)0.f;
if (4 * Z + 2 < shape.w) {
src2 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 2));
}
FLT4 src3 = (FLT4)0.f;
if (4 * Z + 3 < shape.w) {
src3 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 3));
}
FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x);
FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y);
FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z);
FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w);
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X), dst0);
if (4 * X + 1 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 1), dst1);
}
if (4 * X + 2 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 2), dst2);
}
if (4 * X + 3 < shape.y) {
WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 3), dst3);
}
}


result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;
__kernel void transpose_0231_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) {
int X = get_global_id(0); // H, W for src
int Y = get_global_id(1); // W4, C4 for src
int Z = get_global_id(2); // C4, H4 for src
if (X >= shape.y || 4 * Y >= shape.z || 4 * Z >= shape.w) {
return;
}
int W4 = UP_DIV(shape.y, 4);
int C4 = UP_DIV(shape.w, 4);
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z));
FLT4 src1 = (FLT4)0.f;
if (4 * Z + 1 < shape.w) {
src1 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 1));
}
FLT4 src2 = (FLT4)0.f;
if (4 * Z + 2 < shape.w) {
src2 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 2));
}
FLT4 src3 = (FLT4)0.f;
if (4 * Z + 3 < shape.w) {
src3 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 3));
}
FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x);
FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y);
FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z);
FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w);
WRITE_IMAGE(dst_data, (int2)(4 * Y * C4 + Z, X), dst0);
if (4 * Y + 1 < shape.z) {
WRITE_IMAGE(dst_data, (int2)((4 * Y + 1) * C4 + Z, X), dst1);
}
if (4 * Y + 2 < shape.z) {
WRITE_IMAGE(dst_data, (int2)((4 * Y + 2) * C4 + Z, X), dst2);
}
if (4 * Y + 3 < shape.z) {
WRITE_IMAGE(dst_data, (int2)((4 * Y + 3) * C4 + Z, X), dst3);
}
}


if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]);
if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]);
if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]);
if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]);
__kernel void transpose_0231_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) {
int X = get_global_id(0); // H, W for src
int Y = get_global_id(1); // W4, C4 for src
int Z = get_global_id(2); // C4, H4 for src
if (X >= shape.y || 4 * Y >= shape.z || 4 * Z >= shape.w) {
return;
}
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z));
FLT4 src1 = (FLT4)0.f;
if (4 * Z + 1 < shape.w) {
src1 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 1));
}
FLT4 src2 = (FLT4)0.f;
if (4 * Z + 2 < shape.w) {
src2 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 2));
}
FLT4 src3 = (FLT4)0.f;
if (4 * Z + 3 < shape.w) {
src3 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 3));
}
FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x);
FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y);
FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z);
FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w);
WRITE_IMAGE(dst_data, (int2)(4 * Y, Z * shape.y + X), dst0);
if (4 * Y + 1 < shape.z) {
WRITE_IMAGE(dst_data, (int2)(4 * Y + 1, Z * shape.y + X), dst1);
}
if (4 * Y + 2 < shape.z) {
WRITE_IMAGE(dst_data, (int2)(4 * Y + 2, Z * shape.y + X), dst2);
}
if (4 * Y + 3 < shape.z) {
WRITE_IMAGE(dst_data, (int2)(4 * Y + 3, Z * shape.y + X), dst3);
}
}
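All four new kernels bound their extra reads and writes with 4 * i + k < dim checks, so the padded tail of a dimension that is not a multiple of 4 is neither read nor written. The slice arithmetic comes from UP_DIV at the top of the file; a standalone check (C++ sketch, not part of the patch):

constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }

static_assert(UpDiv(5, 4) == 2, "5 channels occupy 2 C4 slices, the second half-empty");
static_assert(UpDiv(8, 4) == 2, "exact multiples need no padding slice");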

mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc (+38, -49)

@@ -35,20 +35,27 @@ int TransposeOpenCLKernel::Init() {
std::string kernel_name = "transpose";
enable_fp16_ = ocl_runtime_->GetFp16Enable();
auto param = reinterpret_cast<TransposeParameter *>(op_parameter_);
if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet.";
return RET_ERROR;
}
if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 &&
param->perm_[3] == 2) {
type = TransposeType::NHWC2NCHW;
kernel_name += "_0312";
type = TransposeType::AXIS0312;
} else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 &&
param->perm_[3] == 1) {
kernel_name += "_0231";
type = TransposeType::AXIS0231;
} else {
MS_LOG(ERROR) << "unsupported transpose axes.";
return RET_ERROR;
}
out_mem_type_ = OpenCLMemType::BUF;
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
if (out_mem_type_ == OpenCLMemType::BUF) {
kernel_name += "_BUF";
} else {
kernel_name += "_IMG";
if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) {
// just for input
kernel_name += "_oversize";
}
kernel_name += "_" + std::string(EnumNameFormat(op_format_));
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
@@ -58,18 +65,10 @@ int TransposeOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) {
MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
return RET_ERROR;
}
in_ori_format_ = in_tensors_[0]->GetFormat();
out_ori_format_ = out_tensors_[0]->GetFormat();
in_tensors_[0]->SetFormat(op_format_);
out_tensors_[0]->SetFormat(op_format_);
if (out_mem_type_ == OpenCLMemType::BUF) {
out_ori_format_ = schema::Format::Format_NCHW;
out_tensors_[0]->SetFormat(schema::Format::Format_NCHW);
}


MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
@@ -78,20 +77,14 @@ int TransposeOpenCLKernel::Init() {
int TransposeOpenCLKernel::ReSize() { return RET_OK; }


int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y;
int n = out_tensors_[0]->shape()[0];
int h = out_tensors_[0]->shape()[1];
int w = out_tensors_[0]->shape()[2];
int c = out_tensors_[0]->shape()[3];
if (op_format_ == schema::Format::Format_NHWC4) {
im_dst_x = w * UP_DIV(c, C4NUM);
im_dst_y = n * h;
} else if (op_format_ == schema::Format::Format_NC4HW4) {
im_dst_x = w;
im_dst_y = n * UP_DIV(c, C4NUM) * h;
} else {
MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
return RET_ERROR;
size_t im_dst_x = 1, im_dst_y = 1;
auto out_shape = out_tensors_[0]->shape();
if (op_format_ == schema::Format_NHWC4) {
im_dst_x = out_shape[2] * UP_DIV(out_shape[3], C4NUM); // W * C4
im_dst_y = out_shape[0] * out_shape[1]; // N * H
} else if (op_format_ == schema::Format_NC4HW4) {
im_dst_x = out_shape[2]; // W
im_dst_y = out_shape[0] * UP_DIV(out_shape[3], C4NUM) * out_shape[1]; // N * C4 * H
}
size_t img_dtype = CL_FLOAT;
if (enable_fp16_) {
@@ -104,30 +97,26 @@ int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
}
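The image footprints computed above follow directly from the packed layouts: NHWC4 stores W * C4 texels per (n, h) row, while NC4HW4 stores W texels per (n, c4, h) row. A standalone illustration (assumed example shape, not part of the patch):

#include <cstdio>

constexpr int C4NUM = 4;
constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
  int n = 1, h = 2, w = 2, c = 5;  // example output shape
  int c4 = UpDiv(c, C4NUM);        // 2 channel slices
  std::printf("NHWC4 image:  %d x %d\n", w * c4, n * h);  // 4 x 2
  std::printf("NC4HW4 image: %d x %d\n", w, n * c4 * h);  // 2 x 4
  return 0;
}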


int TransposeOpenCLKernel::Run() {
// notice: input image2d size = {c/4, h * w}
MS_LOG(DEBUG) << this->name() << " Running!";
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int c4 = UP_DIV(c, 4);
int hw4 = UP_DIV(h * w, 4);
std::vector<size_t> local = {16, 16};
std::vector<size_t> global = {UP_ROUND(hw4, local[0]), UP_ROUND(c4, local[1])};
std::vector<int> shapex = out_tensors_[0]->shape();
size_t n = shapex[0]; // n=1
size_t h = shapex[1];
size_t w = shapex[2];
size_t c = shapex[3];
size_t c4 = UP_DIV(c, 4);
std::vector<size_t> local = {};
std::vector<size_t> global;
if (type == TransposeType::AXIS0312) {
global = {UP_DIV(h, C4NUM), w, c4};
} else if (type == TransposeType::AXIS0231) {
global = {h, UP_DIV(w, C4NUM), c4};
}


cl_int2 HW = {h * w, hw4};
cl_int2 C = {c, c4};
cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)};
int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
if (out_mem_type_ == OpenCLMemType::BUF) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, HW);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, C);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, w);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, h);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}
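The NDRange set in Run() mirrors the per-work-item tiling: one work-item covers four H values for AXIS0312 and four W values for AXIS0231, so the corresponding axis is divided by 4. A hedged sketch of the dispatch-shape choice (illustrative helper, not in the patch):

#include <array>
#include <cstddef>

constexpr size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

std::array<size_t, 3> GlobalSize(bool axis0312, size_t h, size_t w, size_t c4) {
  // Matches the global sizes chosen in Run() above; the local size is left
  // empty so the OpenCL runtime picks a work-group shape.
  return axis0312 ? std::array<size_t, 3>{UpDiv(h, 4), w, c4}
                  : std::array<size_t, 3>{h, UpDiv(w, 4), c4};
}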


mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h (+2, -2)

@@ -25,7 +25,7 @@


namespace mindspore::kernel {


enum class TransposeType { NHWC2NCHW, NCHW2NHWC };
enum class TransposeType { AXIS0312, AXIS0231 };


class TransposeOpenCLKernel : public OpenCLKernel {
public:
@@ -42,7 +42,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
private:
cl::Kernel kernel_;
bool enable_fp16_{false};
TransposeType type{TransposeType::NHWC2NCHW};
TransposeType type{TransposeType::AXIS0312};
};
} // namespace mindspore::kernel
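The renamed enum records the permutation digits instead of a fixed layout pair, since the same permutation now serves both NHWC4 and NC4HW4 sources. The mapping, for reference (hypothetical helper, not in the patch):

// perm {0,3,1,2} -> AXIS0312 (e.g. NHWC -> NCHW); perm {0,2,3,1} -> AXIS0231 (e.g. NCHW -> NHWC).
const char *TransposeTypeName(TransposeType t) {
  return t == TransposeType::AXIS0312 ? "AXIS0312" : "AXIS0231";
}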




mindspore/lite/test/run_test.sh (+9, -0)

@@ -37,3 +37,12 @@ cp -fr $TEST_DATA_DIR/testPK ./data
./lite-test --gtest_filter="TestSliceOpenCLfp32.Slicefp32CI*" ./lite-test --gtest_filter="TestSliceOpenCLfp32.Slicefp32CI*"
./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*" ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*"


./lite-test --gtest_filter="TestAvgPoolingOpenCL*"
./lite-test --gtest_filter="TestConv2dTransposeOpenCL*"
./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*"
./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*"
./lite-test --gtest_filter="TestMaxPoolingOpenCL*"
./lite-test --gtest_filter="TestReduceOpenCL*"
./lite-test --gtest_filter="TestReshapeOpenCL*"
./lite-test --gtest_filter="TestSoftmaxOpenCL*"
./lite-test --gtest_filter="TestTransposeOpenCL*"

mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc (+12, -12)

@@ -55,11 +55,8 @@ void InitAvgPoolingParam(PoolingParameter *param) {
void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
int n = shape[0];
int h = shape[1];
@@ -67,8 +64,7 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
int c = shape[3];
int oh = shape[4];
int ow = shape[5];
auto param_ptr = std::make_unique<PoolingParameter>();
auto param = param_ptr.get();
auto param = static_cast<PoolingParameter *>(malloc(sizeof(PoolingParameter)));
if (param == nullptr) {
MS_LOG(ERROR) << "param create error.";
return;
@@ -94,7 +90,7 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr =
std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -115,13 +111,17 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
pGraph->Run();


if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);

for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test AvgPool2d passed";
lite::opencl::OpenCLRuntime::DeleteInstance();
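A note on the recurring test changes in this and the following files: parameters and kernels switch from std::make_unique/.get() to malloc and unique_ptr::release() because ownership passes to the lite kernel/graph, which frees them itself; keeping a live unique_ptr alongside would double-free. A minimal illustration of the hand-off (illustrative types, not the real API):

#include <cstdlib>

struct Graph {
  void *owned = nullptr;
  ~Graph() { std::free(owned); }  // the consumer frees what it was given
};

int main() {
  auto *param = static_cast<int *>(std::malloc(sizeof(int)));  // raw, as in the tests
  Graph g;
  g.owned = param;  // ownership transferred; no smart pointer left to free it again
  return 0;
}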


mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc (+10, -79)

@@ -34,11 +34,8 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data,
void *output_data, bool enable_fp16) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
int pad = shape[0];
int n = shape[1];
@@ -89,8 +86,7 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data,
}
std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
std::vector<lite::Tensor *> outputs{tensor_out};
auto opParameter_ptr = std::make_unique<ConvParameter>();
auto opParameter = opParameter_ptr.get();
auto opParameter = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
if (opParameter == nullptr) {
MS_LOG(ERROR) << "opParameter create error.";
return;
@@ -105,7 +101,7 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data,
opParameter->output_channel_ = co;
auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>(
reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
auto op_kernel = op_kernel_ptr.get();
auto op_kernel = op_kernel_ptr.release();
if (op_kernel == nullptr) {
MS_LOG(ERROR) << "op_kernel create error.";
return;
@@ -132,81 +128,16 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data,
CompareOutput(outputs[0]->data_c(), output_data, n * oh * ow * co, static_cast<float>(1e-5));
}


inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
lite::opencl::OpenCLRuntime::DeleteInstance();
}

void RunTestCaseConv2dTranspose(const std::vector<int> shape, const std::vector<std::string> file_path,
bool enable_fp16) {
size_t input_size;
std::string input_path = file_path[0];
auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size);
if (input_data == nullptr) {
MS_LOG(ERROR) << "input_data load error.";
return;
for (auto t : inputs) {
t->SetData(nullptr);
}

size_t weight_size;
std::string weight_path = file_path[1];
auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size);
if (weight_data == nullptr) {
MS_LOG(ERROR) << "weight_data load error.";
return;
for (auto t : outputs) {
t->SetData(nullptr);
}

size_t bias_size;
std::string bias_path = file_path[2];
auto bias_data = mindspore::lite::ReadFile(bias_path.c_str(), &bias_size);
if (bias_data == nullptr) {
MS_LOG(ERROR) << "bias_data load error.";
return;
}
size_t output_size;
std::string output_path = file_path[3];
auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size);
if (output_data == nullptr) {
MS_LOG(ERROR) << "output_data load error.";
return;
}
RunTestCaseConv2dTranspose(shape, input_data, weight_data, bias_data, output_data, enable_fp16);
lite::opencl::OpenCLRuntime::DeleteInstance();
}


TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) {
int pad = 0;
int n = 1;
int h = 240;
int w = 240;
int kh = 2;
int kw = 2;
int ci = 128;
int co = 128;
std::vector<int> shape = {pad, n, h, w, kh, kw, ci, co};
std::vector<std::string> file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp32_input.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp32_weight.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp32_bias.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp32_output.bin"};
RunTestCaseConv2dTranspose(shape, file_path, false);
}

TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) {
int pad = 0;
int n = 1;
int h = 240;
int w = 240;
int kh = 2;
int kw = 2;
int ci = 128;
int co = 128;
std::vector<int> shape = {pad, n, h, w, kh, kw, ci, co};
std::vector<std::string> file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp16_input.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp16_weight.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp16_bias.bin",
"./test_data/conv2d_transpose/conv2d_transpose_fp16_output.bin"};
RunTestCaseConv2dTranspose(shape, file_path, true);
}

TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32_2) {
int pad = 0;
int n = 1;
int h = 2;
@@ -224,7 +155,7 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32_2) {
RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false);
}


TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16_2) {
TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) {
int pad = 0;
int n = 1;
int h = 2;


mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc (+15, -61)

@@ -33,11 +33,8 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
bool enable_fp16, int dims) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(int16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
std::vector<int> input_shape, output_shape, weight_shape;
if (dims == 2) {
@@ -56,8 +53,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
output_shape = {a, b, m, co};
weight_shape = {a, b, co, ci};
}
auto param_ptr = std::make_unique<MatMulParameter>();
auto param = param_ptr.get();
auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
if (param == nullptr) {
MS_LOG(ERROR) << "param_ptr create error.";
return;
@@ -93,7 +89,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
std::vector<lite::Tensor *> outputs{tensor_out};
auto op_kernel_ptr =
std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false);
auto op_kernel = op_kernel_ptr.get();
auto op_kernel = op_kernel_ptr.release();
if (op_kernel == nullptr) {
MS_LOG(ERROR) << "op_kernel create error.";
return;
@@ -114,64 +110,22 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
memcpy(inputs[0]->data_c(), input_data, tensor_x->ElementsNum() * dtype_size);
pGraph->Run();
if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
}


tensor_x->SetData(nullptr);
tensor_out->SetData(nullptr);
MS_LOG(INFO) << "TestMatMulFp32 passed";
lite::opencl::OpenCLRuntime::DeleteInstance();
}

void RunTestCaseMatMul(const std::vector<int> shape, const std::vector<std::string> file_path, bool enable_fp16) {
size_t input_size;
std::string input_path = file_path[0];
auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size);
if (input_data == nullptr) {
MS_LOG(ERROR) << "input_data load error.";
return;
}
size_t weight_size;
std::string weight_path = file_path[1];
auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size);
if (weight_data == nullptr) {
MS_LOG(ERROR) << "weight_data load error.";
return;
for (auto t : inputs) {
t->SetData(nullptr);
}
size_t output_size;
std::string output_path = file_path[2];
auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size);
if (output_data == nullptr) {
MS_LOG(ERROR) << "output_data load error.";
return;
for (auto t : outputs) {
t->SetData(nullptr);
}
RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16, 2);
}

TEST_F(TestMatMulOpenCL, MatMulFp32) {
int ci = 1280;
int co = 1001;
std::vector<int> shape = {ci, co};
std::vector<std::string> file_path = {"./test_data/matmul/matmul_fp32_input.bin",
"./test_data/matmul/matmul_fp32_weight.bin",
"./test_data/matmul/matmul_fp32_output.bin"};
RunTestCaseMatMul(shape, file_path, false);
}

TEST_F(TestMatMulOpenCL, MatMulFp16) {
int ci = 1280;
int co = 1001;
std::vector<int> shape = {ci, co};
std::vector<std::string> file_path = {"./test_data/matmul/matmul_fp16_input.bin",
"./test_data/matmul/matmul_fp16_weight.bin",
"./test_data/matmul/matmul_fp16_output.bin"};
RunTestCaseMatMul(shape, file_path, true);
MS_LOG(INFO) << "TestMatMul passed";
lite::opencl::OpenCLRuntime::DeleteInstance();
}


TEST_F(TestMatMulOpenCL, MatMulFp32_2) {
TEST_F(TestMatMulOpenCL, MatMul2DFp32) {
int ci = 5;
int co = 3;
std::vector<int> shape = {ci, co};
@@ -182,7 +136,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_2) {
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2);
}


TEST_F(TestMatMulOpenCL, MatMulFp16_2) {
TEST_F(TestMatMulOpenCL, MatMul2DFp16) {
int ci = 5;
int co = 3;
std::vector<int> shape = {ci, co};
@@ -193,7 +147,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) {
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2);
}


TEST_F(TestMatMulOpenCL, MatMulFp32_4D) {
TEST_F(TestMatMulOpenCL, MatMul4DFp32) {
int a = 1;
int b = 2;
int c = 2;
@@ -210,7 +164,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_4D) {
RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4);
}


TEST_F(TestMatMulOpenCL, MatMulFp16_4D) {
TEST_F(TestMatMulOpenCL, MatMul4DFp16) {
int a = 1;
int b = 2;
int c = 2;


mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc (+11, -12)

@@ -55,11 +55,8 @@ void InitMaxPoolingParam(PoolingParameter *param) {
void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
int n = shape[0];
int h = shape[1];
@@ -67,8 +64,7 @@ void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void
int c = shape[3];
int oh = shape[4];
int ow = shape[5];
auto param_ptr = std::make_unique<PoolingParameter>();
auto param = param_ptr.get();
auto param = static_cast<PoolingParameter *>(malloc(sizeof(PoolingParameter)));
if (param == nullptr) {
MS_LOG(ERROR) << "param create error.";
return;
@@ -94,7 +90,7 @@ void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr =
std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -115,13 +111,16 @@ void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void
pGraph->Run();


if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test MaxPool2d passed";
lite::opencl::OpenCLRuntime::DeleteInstance();


mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc (+11, -12)

@@ -33,14 +33,10 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
int reduce_mode) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
auto param_ptr = std::make_unique<ReduceParameter>();
auto param = param_ptr.get();
auto param = static_cast<ReduceParameter *>(malloc(sizeof(ReduceParameter)));
if (param == nullptr) {
MS_LOG(ERROR) << "param_ptr create error.";
return;
@@ -73,7 +69,7 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr =
std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -94,13 +90,16 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
pGraph->Run();


if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test Reduce passed";
lite::opencl::OpenCLRuntime::DeleteInstance();


mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc (+11, -11)

@@ -33,11 +33,8 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
bool is_output_2d) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
int n = shape[0];
int h = shape[1];
@@ -55,7 +52,7 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
}
std::vector<int> out_shape = {n, oh, ow, c};
if (is_output_2d) {
std::vector<int> out_shape = {n, c};
out_shape = {n, c};
}
auto tensor_out_ptr =
std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape,
@@ -68,7 +65,7 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
std::vector<lite::Tensor *> inputs{tensor_x};
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr = std::make_unique<kernel::ReshapeOpenCLKernel>(nullptr, inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -89,13 +86,16 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
pGraph->Run();


if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test Reshape passed";
lite::opencl::OpenCLRuntime::DeleteInstance();


mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc (+10, -10)

@@ -32,11 +32,8 @@ class TestSoftmaxOpenCL : public mindspore::CommonTest {
void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto allocator = ocl_runtime->GetAllocator();
int n, h, w, c;
bool is_2d = false;
@@ -72,7 +69,7 @@ void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *o
std::vector<lite::Tensor *> inputs{tensor_x};
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr = std::make_unique<kernel::SoftmaxOpenCLKernel>(nullptr, inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -93,13 +90,16 @@ void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *o
pGraph->Run();


if (enable_fp16) {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
2e-2);
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), 2e-2);
} else {
CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
}
inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test Softmax passed";
lite::opencl::OpenCLRuntime::DeleteInstance();


mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc (+10, -10)

@@ -32,13 +32,9 @@ class TestTransposeOpenCL : public mindspore::CommonTest {
void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
size_t dtype_size = sizeof(float);
if (enable_fp16) {
ocl_runtime->SetFp16Enable(true);
dtype_size = sizeof(float16_t);
}
auto param_ptr = std::make_unique<TransposeParameter>();
auto param = param_ptr.get();
size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
ocl_runtime->SetFp16Enable(enable_fp16);
auto param = static_cast<TransposeParameter *>(malloc(sizeof(TransposeParameter)));
if (param == nullptr) {
MS_LOG(ERROR) << "param_ptr create error.";
return;
@@ -73,7 +69,7 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
std::vector<lite::Tensor *> outputs{tensor_out};
auto arith_kernel_ptr =
std::make_unique<kernel::TransposeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
auto arith_kernel = arith_kernel_ptr.get();
auto arith_kernel = arith_kernel_ptr.release();
if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "arith_kernel create error.";
return;
@@ -99,8 +95,12 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
CompareOutput(outputs[0]->data_c(), output_data, h * w * c, static_cast<float>(1e-5));
}


inputs[0]->SetData(nullptr);
outputs[0]->SetData(nullptr);
for (auto t : inputs) {
t->SetData(nullptr);
}
for (auto t : outputs) {
t->SetData(nullptr);
}


MS_LOG(INFO) << "Test TransposeFp32 passed";
lite::opencl::OpenCLRuntime::DeleteInstance();


mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h (+13, -10)

@@ -34,23 +34,26 @@ void CompareOutput(void *output, void *expect, size_t elem_num, T atol, float rt
T *output_data = reinterpret_cast<T *>(output);
T *expect_data = reinterpret_cast<T *>(expect);


printf("output[0:12]:");
std::cout << std::setprecision(5) << std::setiosflags(std::ios::fixed) << std::setw(7);
std::cout << "output[0:12]:";
for (int i = 0; i < 12 && i < elem_num; i++) {
printf("[%d]:%.3f ", i, output_data[i]);
std::cout << output_data[i] << " ";
}
printf("\n");
printf("expect[0:12]:");
std::cout << std::endl;
std::cout << "expect[0:12]:";
for (int i = 0; i < 12 && i < elem_num; i++) {
printf("[%d]:%.3f ", i, expect_data[i]);
std::cout << expect_data[i] << " ";
}
printf("\n");
std::cout << std::endl;
for (int i = 0; i < elem_num; ++i) {
if (std::fabs(output_data[i] - expect_data[i]) > atol + rtol * std::fabs(expect_data[i])) {
printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]);
return;
auto left = static_cast<float>(std::fabs(output_data[i] - expect_data[i]));
auto right = static_cast<float>(atol + rtol * std::fabs(expect_data[i]));
if (left > right) {
std::cout << "error at idx[" << i << "] expect=" << expect_data[i] << " output=" << output_data[i] << std::endl;
}
ASSERT_LE(left, right);
}
printf("compare success!\n");
std::cout << "compare success!" << std::endl;
}
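CompareOutput now uses the standard mixed tolerance |output - expect| <= atol + rtol * |expect| and fails the test through ASSERT_LE instead of silently returning on the first mismatch. A minimal reproduction of the criterion (not part of the patch):

#include <cassert>
#include <cmath>

bool Close(float out, float exp, float atol, float rtol) {
  return std::fabs(out - exp) <= atol + rtol * std::fabs(exp);
}

int main() {
  assert(Close(1.0005f, 1.0f, 1e-3f, 2e-2f));  // within atol + rtol band
  assert(!Close(1.5f, 1.0f, 1e-3f, 2e-2f));    // 0.5 > 1e-3 + 2e-2 * 1.0
  return 0;
}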


template <typename T>

