Browse Source

transpose output set to buffer

tags/v0.7.0-beta
chenzupeng 5 years ago
parent
commit
4d3be49a66
14 changed files with 134 additions and 13 deletions
  1. +42
    -1
      mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl
  2. +10
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
  3. +1
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h
  4. +4
    -4
      mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
  5. +2
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc
  6. +5
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
  7. +8
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
  8. +9
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
  9. +8
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
  10. +9
    -6
      mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
  11. +9
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
  12. +9
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
  13. +9
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc
  14. +9
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc

+ 42
- 1
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/transpose.cl View File

@@ -3,7 +3,7 @@
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
@@ -43,3 +43,44 @@ __kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t d
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
}

__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= HW.y || Y >= C.y) {
return;
}
FLT4 result[4];
result[0] = (FLT4)(0.0f);
result[1] = (FLT4)(0.0f);
result[2] = (FLT4)(0.0f);
result[3] = (FLT4)(0.0f);
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X));
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1));
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2));
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3));
result[0].x = x0.x;
result[0].y = x1.x;
result[0].z = x2.x;
result[0].w = x3.x;

result[1].x = x0.y;
result[1].y = x1.y;
result[1].z = x2.y;
result[1].w = x3.y;

result[2].x = x0.z;
result[2].y = x1.z;
result[2].z = x2.z;
result[2].w = x3.z;

result[3].x = x0.w;
result[3].y = x1.w;
result[3].z = x2.w;
result[3].w = x3.w;

dst_data[4 * Y * HW.y + X] = result[0];
dst_data[(4 * Y + 1) * HW.y + X] = result[1];
dst_data[(4 * Y + 2) * HW.y + X] = result[2];
dst_data[(4 * Y + 3) * HW.y + X] = result[3];
}

+ 10
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc View File

@@ -36,7 +36,11 @@ namespace mindspore::kernel {
int TransposeOpenCLKernel::Init() {
std::string kernel_name = "transpose";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

if (!is_image_out_) {
kernel_name += "_BUF";
} else {
kernel_name += "_IMG";
}
#ifdef PROGRAM_WITH_IL
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
#else
@@ -60,8 +64,12 @@ int TransposeOpenCLKernel::Init() {
MS_LOG(ERROR) << "input H * W % 4 != 0 not support!";
return RET_ERROR;
}
ori_format_ = out_tensors_[0]->GetFormat();
// Transpose::InferShape just set output->SetFormat(input->GetFormat()); -^-!
ori_format_ = schema::Format_NCHW;
out_tensors_[0]->SetFormat(schema::Format_NCHW);
if (!is_image_out_) {
out_mem_type_ = OpenCLMemType::BUF;
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}


+ 1
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h View File

@@ -38,6 +38,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {

private:
cl::Kernel kernel_;
bool is_image_out_ = false;
};
} // namespace mindspore::kernel



+ 4
- 4
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc View File

@@ -34,13 +34,13 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *
out_parameters->clear();
out_convert_ops->clear();
for (size_t i = 0; i < in_tensors.size(); ++i) {
OpenCLKernel* cur_opencl_op = reinterpret_cast<OpenCLKernel*>(in_kernels[i]);
OpenCLKernel *cur_opencl_op = reinterpret_cast<OpenCLKernel *>(in_kernels[i]);
schema::Format ori_format = cur_opencl_op->GetOriFormat();
if (mem_type == cur_opencl_op->GetMemType() && in_tensors[i]->GetFormat() == ori_format) {
if (mem_type == OpenCLMemType::BUF && mem_type == cur_opencl_op->GetMemType() &&
in_tensors[i]->GetFormat() == ori_format) {
continue;
}
auto dst_format =
(mem_type == OpenCLMemType::IMG) ? in_kernels[i]->out_tensors()[0]->GetFormat() : ori_format;
auto dst_format = (mem_type == OpenCLMemType::IMG) ? in_kernels[i]->out_tensors()[0]->GetFormat() : ori_format;
auto src_format =
(mem_type == OpenCLMemType::IMG) ? in_tensors[i]->GetFormat() : in_kernels[i]->out_tensors()[0]->GetFormat();
lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor();


+ 2
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc View File

@@ -125,6 +125,7 @@ int RunSubGraphOpenCLKernel(const std::vector<lite::tensor::Tensor *> &inputs,
MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error.";
return RET_ERROR;
}
delete sub_graph;
return RET_OK;
}

@@ -180,6 +181,7 @@ TEST_F(TestActivationOpenCL, LeakyReluFp32_dim4) {

delete input_tensor;
delete output_tensor;
lite::opencl::OpenCLRuntime::DeleteInstance();
return;
}
} // namespace mindspore

+ 5
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc View File

@@ -119,6 +119,11 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) {
}
printf("test all close OK!\n");
lite::CompareOutputData(output_data, expect, 4);
delete tensor_in;
delete tensor_out;
delete pooling_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}

} // namespace mindspore

+ 8
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc View File

@@ -175,6 +175,14 @@ TEST_F(TestConcatOpenCL, ConcatFp32_2input_dim4_axis3) {
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data());
CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete concat_kernel;
delete sub_graph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

+ 9
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc View File

@@ -108,5 +108,14 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) {
CompareOutputData(output_data, correct_data, oh * ow * co, 0.00001);

MS_LOG(INFO) << "Test Conv2dTransposeFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

+ 8
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc View File

@@ -120,6 +120,14 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri
MyCompareOutput(output_tensor, expect_file);
// lite::CompareOutput(reinterpret_cast<float *>(output_tensor->Data()), expect_file);

for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete conv_kernel;
delete sub_graph;
mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
}



+ 9
- 6
mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc View File

@@ -75,12 +75,15 @@ TEST_F(TestMatMulOpenCL, MatMulFp32) {
// compare
CompareOutputData(output_data, correct_data, co, 0.00001);

delete input_data;
delete weight_data;
delete tensor_x;
delete tensor_w;
delete tensor_out;
delete correct_data;
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

+ 9
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc View File

@@ -92,6 +92,15 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {
MS_LOG(INFO) << "compare result";
std::cout << "compare result" << std::endl;
CompareOutput(output_tensor, expect_file);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete pooling_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}

} // namespace mindspore

+ 9
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc View File

@@ -77,6 +77,15 @@ void RunTestCase(std::vector<int> input_shape, std::vector<int> output_shape, st
MS_LOG(INFO) << "compare result";
std::cout << "compare result" << std::endl;
CompareOutput(output_tensor, expect_file);
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}

TEST_F(TestSoftmaxOpenCL, Softmax_1) {


+ 9
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc View File

@@ -75,5 +75,14 @@ TEST_F(TestToFormatOpenCL, TransposeFp32) {
// compare
CompareOutputData(output_data, correct_data, h * w * c, 0.00001);
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

+ 9
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc View File

@@ -75,5 +75,14 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) {
// compare
CompareOutputData(output_data, correct_data, h * w * c, 0.00001);
MS_LOG(INFO) << "TestMatMulFp32 passed";
for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete arith_kernel;
delete pGraph;
lite::opencl::OpenCLRuntime::DeleteInstance();
}
} // namespace mindspore

Loading…
Cancel
Save