Browse Source

Add Scale op and test cases for OpenCL

tags/v1.0.0
Corleone 5 years ago
parent
commit
e2d56df80f
7 changed files with 754 additions and 4 deletions
  1. +42
    -0
      mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
  2. +384
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
  3. +56
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
  4. +2
    -1
      mindspore/lite/src/runtime/opencl/opencl_allocator.cc
  5. +2
    -0
      mindspore/lite/test/CMakeLists.txt
  6. +3
    -3
      mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
  7. +265
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc

+ 42
- 0
mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl View File

@@ -0,0 +1,42 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

// Element-wise scale: output = input * scale + offset.
// All four images share the same 2D layout; output_shape is {W, H} in image coords.
__kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
                        __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  // Use READ_IMAGE (paired with WRITE_IMAGE/FLT4 from the build options) instead of
  // read_imagef: read_imagef always yields float4, which mismatches FLT4 (= half4)
  // when the kernel is compiled with fp16 enabled.
  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
}

// Broadcast scale: a single scalar scale and offset applied to every element.
// NOTE(review): scale/offset arrive as float even in fp16 builds — the host side
// must set these args with a type matching this signature; verify against Run().
__kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float offset, __write_only image2d_t output,
                                 const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  // READ_IMAGE keeps the element type consistent with FLT4 under fp16 builds;
  // read_imagef would force float4.
  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset);
}

// Per-channel scale: scale/offset are 1-D images of C channel slices, indexed by
// the channel-slice component of X (NHWC4 layout packs W*C4 into the image X axis).
__kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
                          __write_only image2d_t output, const int2 output_shape, const int C) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  // READ_IMAGE keeps the element type consistent with FLT4 under fp16 builds;
  // read_imagef would force float4.
  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
}

+ 384
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc View File

@@ -0,0 +1,384 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/opencl/kernel/scale.h"
#include <set>
#include <vector>
#include <string>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "nnacl/fp32/common_func.h"
#include "src/runtime/kernel/opencl/utils.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/scale.cl.inc"
#endif

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::schema::PrimitiveType_Scale;

namespace mindspore::kernel {

// Releases the packed scale/offset device images allocated by InitBuffer(), if any.
ScaleOpenCLKernel::~ScaleOpenCLKernel() {
  auto *allocator = ocl_runtime_->GetAllocator();
  for (void **owned : {&scale_ptr_, &offset_ptr_}) {
    if (*owned != nullptr) {
      allocator->Free(*owned);
      *owned = nullptr;
    }
  }
}

std::vector<size_t> ScaleOpenCLKernel::InitGlobalSize() const {
const size_t global_x = out_tensors_[0]->Width();
const size_t global_y = out_tensors_[0]->Height();
const size_t global_z = UP_ROUND_DIV(out_tensors_[0]->Channel(), C4NUM);
std::vector<size_t> global = {global_x, global_y, global_z};
return global;
}

void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
local_size_ = {16, 16};
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t W = out_tensors_[0]->Width();
global_size_ = {W, H};
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
size_t W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
global_size_ = {W, H};
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
size_t H = out_tensors_[0]->Batch();
size_t W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
global_size_ = {W, H};
} else {
MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
}
}

// 1D global size for the buffer kernel: one work-item per C4-padded element.
void ScaleOpenCLKernel::BufferGetWorkGroupSize() {
  const uint32_t c4_element_count = out_tensors_[0]->ElementsC4Num();
  global_size_ = {c4_element_count};
}

// Reports the {width, height, channel dtype} of the output image for allocator use.
// @param idx       tensor index (unused: all outputs share the same layout here).
// @param img_size  out-param filled with {im_dst_x, im_dst_y, img_dtype}.
// @return RET_OK on success, RET_ERROR for unsupported format or data type.
int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t im_dst_x, im_dst_y;
  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
    // Channel slices are stacked along the image height.
    im_dst_x = out_tensors_[0]->Width();
    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
    // Channel slices are packed along the image width.
    im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
    im_dst_y = out_tensors_[0]->Batch();
    im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else {
    MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
    return RET_ERROR;
  }

  size_t img_dtype = CL_FLOAT;
  if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
    img_dtype = CL_HALF_FLOAT;
  } else if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
    img_dtype = CL_FLOAT;
  } else {
    MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[0]->data_type();
    // Fix: previously fell through and returned RET_OK with a default CL_FLOAT dtype.
    return RET_ERROR;
  }
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
  return RET_OK;
}

// Packs constant (ValueNode) scale/offset tensors into device images, converting
// layout (NHWC -> NC4HW4/NHWC4) and dtype (fp32 -> fp16) as needed. No-op for the
// broadcast case or when scale/offset are runtime inputs.
// @return RET_OK on success; RET_ERROR on unsupported format/dtype combinations.
int ScaleOpenCLKernel::InitBuffer() {
  // Broadcast (scalar) scale needs no packed images; values are passed as kernel args.
  if (!element_flag_) {
    return RET_OK;
  }
  // Only pre-pack when the weights are compile-time constants with host data.
  if (in_tensors_[1]->TensorType() == schema::NodeType_ValueNode && in_tensors_[1]->Data() != nullptr) {
    auto allocator = ocl_runtime_->GetAllocator();
    std::vector<size_t> img_size;
    // Start from the output image geometry; overridden below for the 1-D case.
    GetImageSize(0, &img_size);
    if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) {
      // 1-D per-channel weights: upload as a 1 x ceil(C/4) image for Scale_C.
      // NOTE(review): Scale_C_IMG samples (X % C, 0), i.e. varies the X coordinate —
      // confirm CreateImageFromHost's {x, y} convention matches this {1, C4} size.
      img_size[0] = 1;
      img_size[1] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM);
      scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->Data(), in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(in_tensors_[2]->Data(), in_tensors_[2]->ElementsNum(), img_size);
      return RET_OK;
    }
    int pack_weight_size = in_tensors_[1]->ElementsC4Num();
    int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width();
    int channel = in_tensors_[1]->Channel();
    int batch = in_tensors_[1]->Batch();
    if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) {
      // Same layout: upload directly, but only when dtypes also match.
      if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) {
        scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->Data(), in_tensors_[1]->ElementsNum(), img_size);
        offset_ptr_ = allocator->CreateImageFromHost(in_tensors_[2]->Data(), in_tensors_[2]->ElementsNum(), img_size);
      } else {
        MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                      << in_tensors_[0]->data_type();
        return RET_ERROR;
      }
    } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
      // Repack NHWC host weights into NC4HW4 before upload.
      if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
        if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
          float *scale = new (std::nothrow) float[pack_weight_size];
          if (scale == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            return RET_ERROR;
          }
          float *offset = new (std::nothrow) float[pack_weight_size];
          if (offset == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            delete[] scale;
            return RET_ERROR;
          }
          // Identity conversion: layout changes, dtype stays fp32.
          std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
          PackNHWCToNC4HW4<float, float>(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype);
          PackNHWCToNC4HW4<float, float>(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype);
          scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
          offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
          // Device now owns a copy; free the host-side staging buffers.
          delete[] scale;
          delete[] offset;
        } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
          // fp16 values are carried as int16_t bit patterns.
          int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
          if (scale == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            return RET_ERROR;
          }
          int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
          if (offset == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            delete[] scale;
            return RET_ERROR;
          }
          // Convert fp32 host weights to fp16 while repacking.
          std::function<int16_t(float)> to_dtype = Float32ToShort;
          PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype);
          PackNHWCToNC4HW4<float, int16_t>(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype);
          scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
          offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
          delete[] scale;
          delete[] offset;
        } else {
          MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                        << in_tensors_[0]->data_type();
          return RET_ERROR;
        }
      } else {
        MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to "
                      << in_tensors_[0]->GetFormat();
        return RET_ERROR;
      }
    } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
      // Same as above but repacking into NHWC4.
      if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
        if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
          float *scale = new (std::nothrow) float[pack_weight_size];
          if (scale == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            return RET_ERROR;
          }
          float *offset = new (std::nothrow) float[pack_weight_size];
          if (offset == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            delete[] scale;
            return RET_ERROR;
          }
          std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
          PackNHWCToNHWC4<float, float>(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype);
          PackNHWCToNHWC4<float, float>(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype);
          scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
          offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
          delete[] scale;
          delete[] offset;
        } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
          int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
          if (scale == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            return RET_ERROR;
          }
          int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
          if (offset == nullptr) {
            MS_LOG(ERROR) << "Malloc buffer failed!";
            delete[] scale;
            return RET_ERROR;
          }
          std::function<int16_t(float)> to_dtype = Float32ToShort;
          PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype);
          PackNHWCToNHWC4<float, int16_t>(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype);
          scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
          offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
          delete[] scale;
          delete[] offset;
        } else {
          MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                        << in_tensors_[0]->data_type();
          return RET_ERROR;
        }
      } else {
        MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to "
                      << in_tensors_[0]->GetFormat();
        return RET_ERROR;
      }
    }
    return RET_OK;
  }
  // Runtime (non-constant) weights are bound directly in Run(); nothing to pack.
  return RET_OK;
}

int ScaleOpenCLKernel::Init() {
ocl_runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
std::string kernel_name;

const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
auto in_tensor = in_tensors_.at(0);
auto in_shape = in_tensor->shape();
auto scale_tensor = in_tensors_.at(1);
auto scale_shape = scale_tensor->shape();
axis_ = scale_param->axis_;
if (axis_ < 0) {
axis_ = axis_ + in_shape.size();
}
if (scale_shape.size() != in_shape.size()) {
if (scale_tensor->ElementsNum() == 1) {
element_flag_ = false;
kernel_name = "BoardcastScale";
} else if (axis_ == 3 && scale_shape.size() == 1) {
element_flag_ = true;
kernel_name = "Scale_C";
}
} else {
element_flag_ = true;
kernel_name = "Scale";
}
lite::STATUS error_code = RET_OK;
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
if (out_mem_type_ == OpenCLMemType::IMG) {
kernel_name += "_IMG";
} else {
kernel_name += "_BUF";
}
std::string program_name = "Scale";
std::set<std::string> build_options;
std::string source = scale_source;
ocl_runtime_->LoadSource(program_name, source);
error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
if (error_code != RET_OK) {
return error_code;
}

auto format = op_format_;
if (out_tensors_[0]->shape().size() == 2) {
format = schema::Format_NC4;
}
in_ori_format_ = in_tensors_[0]->GetFormat();
out_ori_format_ = out_tensors_[0]->GetFormat();
in_tensors_[0]->SetFormat(format);
if (element_flag_ && in_tensors_[1]->TensorType() != schema::NodeType_ValueNode) {
in_tensors_[1]->SetFormat(format);
in_tensors_[2]->SetFormat(format);
}
out_tensors_[0]->SetFormat(format);
Image2dGetWorkGroupSize();
InitBuffer();
return RET_OK;
}

// Binds inputs/weights/output and the output shape, then enqueues the kernel.
// @return RET_OK on success; RET_ERROR for unsupported dtype/format combinations.
int ScaleOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";

  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data());
  if (element_flag_) {
    // Prefer the images pre-packed by InitBuffer(); fall back to raw tensor data
    // for runtime (non-constant) weights.
    void *scale = scale_ptr_ == nullptr ? in_tensors_[1]->Data() : scale_ptr_;
    void *offset = offset_ptr_ == nullptr ? in_tensors_[2]->Data() : offset_ptr_;
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
  } else {
    // Broadcast path: scale/offset are scalars passed by value.
    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
      float scale = static_cast<float *>(in_tensors_[1]->Data())[0];
      float offset = static_cast<float *>(in_tensors_[2]->Data())[0];
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
      ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
        float scale = static_cast<float *>(in_tensors_[1]->Data())[0];
        float offset = static_cast<float *>(in_tensors_[2]->Data())[0];
        ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
        ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
      } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
        // Fix: the data is already an fp16 bit pattern; applying Float32ToShort a
        // second time (as the original code did) mangles the value. Pass it through.
        int16_t scale = static_cast<int16_t *>(in_tensors_[1]->Data())[0];
        int16_t offset = static_cast<int16_t *>(in_tensors_[2]->Data())[0];
        ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);
        ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset);
      } else {
        MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type();
        return RET_ERROR;
      }
    }
  }
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data());
  // Output shape {W, H} in image coordinates, matching Image2dGetWorkGroupSize().
  int H = 0;
  int W = 0;
  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
    H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    W = out_tensors_[0]->Width();
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
    H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
    W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
    H = out_tensors_[0]->Batch();
    W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else {
    MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat();
    return RET_ERROR;
  }
  cl_int2 output_shape{W, H};
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  // Scale_C additionally needs the channel-slice count for its X % C indexing.
  if (element_flag_ && axis_ == 3) {
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
  }
  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return RET_OK;
}

// Factory registered for PrimitiveType_Scale on GPU: creates and initializes a
// ScaleOpenCLKernel, returning nullptr (and cleaning up) on any failure.
kernel::LiteKernel *OpenCLScaleKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                             const std::vector<lite::tensor::Tensor *> &outputs,
                                             OpParameter *opParameter, const lite::Context *ctx,
                                             const kernel::KernelKey &desc,
                                             const mindspore::lite::PrimitiveC *primitive) {
  // opParameter is already an OpParameter*; the original reinterpret_cast was redundant.
  auto *kernel = new (std::nothrow) ScaleOpenCLKernel(opParameter, inputs, outputs, ctx);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "Create OpenCL Scale kernel failed!";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: Scale";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Scale, OpenCLScaleKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Scale, OpenCLScaleKernelCreator)
} // namespace mindspore::kernel

+ 56
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h View File

@@ -0,0 +1,56 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_

#include <vector>
#include "nnacl/scale.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"

namespace mindspore::kernel {

// OpenCL implementation of the Scale op: out = in * scale + offset, supporting
// element-wise, scalar-broadcast, and per-channel (axis == 3, 1-D weights) modes.
class ScaleOpenCLKernel : public OpenCLKernel {
 public:
  explicit ScaleOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                             const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
      : OpenCLKernel(parameter, inputs, outputs) {}
  ~ScaleOpenCLKernel() override;

  int Init() override;
  int Run() override;
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;

 private:
  // Global work size as {W, H, C/4} for 3-D dispatch.
  std::vector<size_t> InitGlobalSize() const;
  // Fills global_size_ for the 2-D image kernels based on the output format.
  void Image2dGetWorkGroupSize();
  // Fills global_size_ for the 1-D buffer kernel.
  void BufferGetWorkGroupSize();
  // Packs constant scale/offset tensors into device images (layout/dtype convert).
  int InitBuffer();

  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *ocl_runtime_;
  // False when scale/offset are scalars broadcast over the input.
  bool element_flag_{true};
  // Device images created by InitBuffer(); freed in the destructor.
  void *scale_ptr_{nullptr};
  void *offset_ptr_{nullptr};
  // Scale axis, normalized to be non-negative in Init().
  int axis_{0};

  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_

+ 2
- 1
mindspore/lite/src/runtime/opencl/opencl_allocator.cc View File

@@ -288,13 +288,14 @@ void OpenCLAllocator::Clear() {
MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_; MS_LOG(DEBUG) << "OpenCL free svm buffer : " << it->second->host_ptr_;
} else { } else {
cl::Buffer *buffer = static_cast<cl::Buffer *>(it->second->device_ptr_); cl::Buffer *buffer = static_cast<cl::Buffer *>(it->second->device_ptr_);
MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer;
if (buffer != nullptr) { if (buffer != nullptr) {
MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer;
delete buffer; delete buffer;
it->second->device_ptr_ = nullptr; it->second->device_ptr_ = nullptr;
} }
cl::Image *image = static_cast<cl::Image *>(it->second->image_ptr_); cl::Image *image = static_cast<cl::Image *>(it->second->image_ptr_);
if (image != nullptr) { if (image != nullptr) {
MS_LOG(DEBUG) << "OpenCL free image : " << image;
delete image; delete image;
it->second->image_ptr_ = nullptr; it->second->image_ptr_ = nullptr;
} }


+ 2
- 0
mindspore/lite/test/CMakeLists.txt View File

@@ -161,6 +161,7 @@ if (SUPPORT_GPU)
${LITE_DIR}/src/runtime/kernel/opencl/kernel/prelu.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/prelu.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc
) )
endif() endif()
### minddata lite ### minddata lite
@@ -349,6 +350,7 @@ if (SUPPORT_GPU)
${TEST_DIR}/ut/src/runtime/kernel/opencl/prelu_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/prelu_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc
) )
endif() endif()




+ 3
- 3
mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc View File

@@ -136,8 +136,8 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)


std::vector<lite::tensor::Tensor *> arithmetic_inputs = {tensor_a, tensor_b}; std::vector<lite::tensor::Tensor *> arithmetic_inputs = {tensor_a, tensor_b};
lite::Context ctx; lite::Context ctx;
auto *arith_kernel =
new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
auto *arith_kernel = new (std::nothrow)
kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
if (arith_kernel == nullptr) { if (arith_kernel == nullptr) {
MS_LOG(ERROR) << "Create ArithmeticOpenCLKernel failed!"; MS_LOG(ERROR) << "Create ArithmeticOpenCLKernel failed!";
delete tensor_a; delete tensor_a;
@@ -216,7 +216,7 @@ TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
TestCase(shape_a, shape_b); TestCase(shape_a, shape_b);
} }


TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
TEST_F(TestArithmeticOpenCL, AddBroadcastTest) {
const std::vector<int> &shape_a = {1, 128, 128, 4}; const std::vector<int> &shape_a = {1, 128, 128, 4};
const std::vector<int> &shape_b = {}; const std::vector<int> &shape_b = {};
TestCase(shape_a, shape_b); TestCase(shape_a, shape_b);


+ 265
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc View File

@@ -0,0 +1,265 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h"

namespace mindspore {

// CPU reference for the broadcast case: out[i] = in[i] * scale + offset.
template <class T>
static void BoardcaseScale(const T *in, const T scale, const T offset, T *out, const int size) {
  for (int idx = 0; idx < size; ++idx) {
    out[idx] = offset + in[idx] * scale;
  }
}

// CPU reference for the element-wise case: out[i] = in[i] * scale[i] + offset[i].
// Fix: offset is read-only, so take it as const T* (consistent with in/scale).
template <class T>
static void Scale(const T *in, const T *scale, const T *offset, T *out, const int size) {
  for (int i = 0; i < size; i++) {
    out[i] = in[i] * scale[i] + offset[i];
  }
}

template <class T>
static bool DataCompare(const T *a, const T *b, const int size, const T accuracy = 1e-4) {
for (int i = 0; i < size; i++) {
auto diff = fabs(a[i] - b[i]);
if (diff > accuracy) {
MS_LOG(ERROR) << "compare failed at " << i << " exp " << a[i] << " bug got " << b[i];
return false;
}
}
return true;
}

// Fills `size` elements of `data` (interpreted as T) with pseudo-random values in
// [0, 100). The seed is static, so successive calls continue one sequence.
template <class T>
static void InitData(void *data, const int size) {
  T *typed = reinterpret_cast<T *>(data);
  static unsigned int seed = 123;
  for (int idx = 0; idx < size; ++idx) {
    typed[idx] = static_cast<int>(rand_r(&seed)) % 100;
  }
}

// Dumps `size` elements of `data` (interpreted as T) to stdout, comma-separated,
// preceded by `prefix`. Debug aid only.
template <class T>
static void LogData(void *data, const int size, const std::string prefix) {
  std::cout << prefix;
  const T *typed = reinterpret_cast<const T *>(data);
  for (int idx = 0; idx < size; ++idx) {
    std::cout << typed[idx] << ",";
  }
  std::cout << std::endl;
}

// End-to-end Scale test: runs the OpenCL kernel on random data and compares the
// result against the CPU reference. An empty shape_b selects the scalar-broadcast
// path; otherwise scale/offset share shape_a.
template <class T>
static void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();

  bool is_broadcast = shape_b.empty();
  auto tensorType = schema::NodeType_ValueNode;
  auto format = schema::Format_NHWC4;

  auto data_type = kNumberTypeFloat32;
  if (sizeof(T) == 2) {
    data_type = kNumberTypeFloat16;
    ocl_runtime->SetFp16Enable(true);
  }
  lite::tensor::Tensor *tensor_in = new (std::nothrow) lite::tensor::Tensor(data_type, shape_a, format, tensorType);
  lite::tensor::Tensor *tensor_scale = new (std::nothrow) lite::tensor::Tensor(data_type, shape_b, format, tensorType);
  lite::tensor::Tensor *tensor_offset = new (std::nothrow) lite::tensor::Tensor(data_type, shape_b, format, tensorType);
  lite::tensor::Tensor *tensor_out = new (std::nothrow) lite::tensor::Tensor(data_type, shape_a, format, tensorType);
  // Fix: the original check omitted tensor_out.
  if (tensor_in == nullptr || tensor_scale == nullptr || tensor_offset == nullptr || tensor_out == nullptr) {
    MS_LOG(ERROR) << "Create tensor failed!";
    delete tensor_in;
    delete tensor_scale;
    delete tensor_offset;
    delete tensor_out;
    return;
  }

  int64_t element_num = tensor_in->ElementsC4Num();
  int64_t element_num_b = is_broadcast ? 1 : tensor_scale->ElementsC4Num();

  T *data_in = new (std::nothrow) T[element_num];
  T *data_scale = new (std::nothrow) T[element_num_b];
  T *data_offset = new (std::nothrow) T[element_num_b];
  T *data_out_cpu = new (std::nothrow) T[element_num];
  T *data_out_ocl = new (std::nothrow) T[element_num];
  // Fix: the original check omitted data_offset.
  if (data_in == nullptr || data_scale == nullptr || data_offset == nullptr || data_out_cpu == nullptr ||
      data_out_ocl == nullptr) {
    MS_LOG(ERROR) << "Create buffer failed!";
    delete tensor_in;
    delete tensor_scale;
    delete tensor_offset;
    delete tensor_out;
    delete[] data_in;
    delete[] data_scale;
    delete[] data_offset;
    delete[] data_out_cpu;
    delete[] data_out_ocl;
    return;
  }

  InitData<T>(data_in, element_num);
  InitData<T>(data_scale, element_num_b);
  InitData<T>(data_offset, element_num_b);
  memset(data_out_ocl, 0, sizeof(T) * element_num);

  // Compute the CPU reference result.
  if (is_broadcast) {
    BoardcaseScale(data_in, static_cast<T *>(data_scale)[0], static_cast<T *>(data_offset)[0], data_out_cpu,
                   element_num);
  } else {
    Scale(data_in, data_scale, data_offset, data_out_cpu, element_num);
  }

  // In the broadcast case scale/offset are op weights, not graph inputs.
  std::vector<lite::tensor::Tensor *> inputs = {tensor_in};
  if (!is_broadcast) {
    inputs.push_back(tensor_scale);
    inputs.push_back(tensor_offset);
  } else {
    tensor_scale->MallocData();
    tensor_offset->MallocData();
    // Single scalar each: copy exactly one element.
    memcpy(tensor_scale->Data(), data_scale, sizeof(T));
    memcpy(tensor_offset->Data(), data_offset, sizeof(T));
  }
  std::vector<lite::tensor::Tensor *> outputs = {tensor_out};

  ScaleParameter *param = new (std::nothrow) ScaleParameter();
  if (param == nullptr) {
    MS_LOG(ERROR) << "Create parameter failed!";
    delete tensor_in;
    delete tensor_scale;
    delete tensor_offset;
    delete tensor_out;
    delete[] data_in;
    delete[] data_scale;
    delete[] data_offset;
    delete[] data_out_cpu;
    delete[] data_out_ocl;
    return;
  }
  param->axis_ = 0;
  param->op_parameter_.type_ = schema::PrimitiveType_Scale;

  std::vector<lite::tensor::Tensor *> scale_inputs = {tensor_in, tensor_scale, tensor_offset};
  lite::Context ctx;
  auto *scale_kernel =
    new (std::nothrow) kernel::ScaleOpenCLKernel(reinterpret_cast<OpParameter *>(param), scale_inputs, outputs, &ctx);
  if (scale_kernel == nullptr) {
    MS_LOG(ERROR) << "Create ScaleOpenCLKernel failed!";
    delete tensor_in;
    delete tensor_scale;
    delete tensor_offset;
    delete tensor_out;
    delete[] data_in;
    delete[] data_scale;
    delete[] data_offset;
    delete[] data_out_cpu;
    delete[] data_out_ocl;
    delete param;
    return;
  }
  scale_kernel->Init();

  tensor_in->MallocData(allocator);
  tensor_scale->MallocData(allocator);
  tensor_offset->MallocData(allocator);
  std::vector<kernel::LiteKernel *> kernels{scale_kernel};
  auto *kernel = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  // Fix: the original re-checked scale_kernel here instead of kernel, and leaked param.
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "Create SubGraphOpenCLKernel failed!";
    delete tensor_in;
    delete tensor_scale;
    delete tensor_offset;
    delete tensor_out;
    delete[] data_in;
    delete[] data_scale;
    delete[] data_offset;
    delete[] data_out_cpu;
    delete[] data_out_ocl;
    delete scale_kernel;
    delete param;
    return;
  }
  kernel->Init();

  memcpy(inputs[0]->Data(), data_in, sizeof(T) * element_num);
  if (!is_broadcast) {
    memcpy(inputs[1]->Data(), data_scale, sizeof(T) * element_num_b);
    memcpy(inputs[2]->Data(), data_offset, sizeof(T) * element_num_b);
  }

  kernel->Run();

  memcpy(data_out_ocl, outputs[0]->Data(), sizeof(T) * element_num);

  LogData<T>(data_in, 10, "Data input : ");
  LogData<T>(data_scale, tensor_scale->shape().empty() ? 1 : 10, "Data scale : ");
  LogData<T>(data_offset, tensor_offset->shape().empty() ? 1 : 10, "Data offset : ");
  LogData<T>(data_out_cpu, 10, "Expect compute : ");
  LogData<T>(outputs[0]->Data(), 10, "OpenCL compute : ");
  bool cmp = DataCompare(data_out_cpu, data_out_ocl, element_num);
  MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
  EXPECT_EQ(true, cmp);

  // free
  delete[] data_in;
  delete[] data_scale;
  delete[] data_offset;
  delete[] data_out_cpu;
  delete[] data_out_ocl;

  delete kernel;
  delete scale_kernel;
  delete param;
  for (auto tensor : inputs) {
    delete tensor;
  }
  for (auto tensor : outputs) {
    delete tensor;
  }
  lite::opencl::OpenCLRuntime::DeleteInstance();
}

// gtest fixture for the OpenCL Scale kernel tests below.
class TestScaleOpenCL : public mindspore::CommonTest {
 public:
  TestScaleOpenCL() {}
};

// Element-wise scale, fp32: scale/offset share the input's NHWC shape.
TEST_F(TestScaleOpenCL, ElementFP32) { TestCase<float>({1, 1024, 1024, 4}, {1, 1024, 1024, 4}); }

// Broadcast scale, fp32: empty weight shape selects the scalar path.
TEST_F(TestScaleOpenCL, BroadcastFP32) { TestCase<float>({1, 128, 128, 4}, {}); }

// Element-wise scale, fp16.
TEST_F(TestScaleOpenCL, ElementFP16) { TestCase<float16_t>({1, 1024, 1024, 4}, {1, 1024, 1024, 4}); }

// Broadcast scale, fp16.
TEST_F(TestScaleOpenCL, BroadcastFP16) { TestCase<float16_t>({1, 128, 128, 4}, {}); }
} // namespace mindspore

Loading…
Cancel
Save