diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index bab94aee26..15a44efa2d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -38,7 +38,7 @@ int ConcatOpenCLKernel::RunAxis0() { auto dst_origin = cl::array{0, 0, 0}; cl::Image2D *out_image = reinterpret_cast(allocator_->GetImage(dst_data)); for (int i = 0; i < in_tensors_.size(); i++) { - auto src_data = in_tensors_[i]->data_c(); + auto src_data = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i); allocator_->GetImageSize(src_data, &img_size); auto src_origin = cl::array{0, 0, 0}; auto region = cl::array{img_size[0], img_size[1], 1}; @@ -160,10 +160,76 @@ void ConcatOpenCLKernel::SetGlobalLocal() { OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } +/* Appends exactly one entry to *inputs_weight_ptrs for every element of in_tensors. A tensor for which IsConst() is false gets nullptr. A const tensor is packed with PackNHWCToNHWC4 into a zero-filled heap buffer of img_size[0] * img_size[1] * C4NUM elements (float when fp16_enable is false, float16_t when it is true), the buffer is passed to the runtime allocator's Malloc together with img_size, and the pointer Malloc returns is stored. data_size is used only as the per-element byte count of the memset, so it has to equal the element size of the buffer type chosen by fp16_enable. Returns RET_ERROR when the heap buffer cannot be allocated, RET_OK otherwise. NOTE(review): a const tensor whose data_type() is neither kNumberTypeFloat32 nor kNumberTypeFloat16 is never packed, so the all-zero buffer is handed to Malloc unchanged; confirm whether that should be an error instead. NOTE(review): the pointer returned by Malloc is stored without a null check. */ int ConcatOpenCLKernel::ConvertWeightToTensor(const std::vector &in_tensors, + std::vector *inputs_weight_ptrs, bool fp16_enable, + size_t data_size) { + for (auto in_tensor_ : in_tensors) { + auto nhwc_shape = GetNHWCShape(in_tensor_->shape()); + if (!in_tensor_->IsConst()) { + /* nullptr placeholder keeps this vector index-aligned with in_tensors; readers that find nullptr use the tensor's data_c() instead */ (*inputs_weight_ptrs).push_back(nullptr); + } else { + auto allocator = ocl_runtime_->GetAllocator(); + std::vector img_size = GetImage2dShapeFromNHWC(nhwc_shape, schema::Format_NHWC4); + int pack_weight_size = img_size[0] * img_size[1] * C4NUM; + int plane = nhwc_shape[1] * nhwc_shape[2]; + int channel = nhwc_shape[3]; + int batch = nhwc_shape[0]; + img_size.push_back(fp16_enable ? 
CL_HALF_FLOAT : CL_FLOAT); + /* fp32 path: the staging buffer holds float and the source data is converted to float while packing */ if (!fp16_enable) { + float *weight = new (std::nothrow) float[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + memset(weight, 0x00, pack_weight_size * data_size); + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float { return x; }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->Malloc(pack_weight_size, img_size, weight); + (*inputs_weight_ptrs).push_back(weight_ptr_); + delete[] weight; + } else { /* fp16 path: same steps as the fp32 path with a float16_t staging buffer */ + float16_t *weight = new (std::nothrow) float16_t[pack_weight_size]; + if (weight == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + memset(weight, 0x00, pack_weight_size * data_size); + if (in_tensor_->data_type() == kNumberTypeFloat32) { + std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } else if (in_tensor_->data_type() == kNumberTypeFloat16) { + std::function to_dtype = [](float16_t x) -> float16_t { return x; }; + PackNHWCToNHWC4(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype); + } + if (batch * plane * channel == 1) { + // scalar + weight[3] = weight[2] = weight[1] = weight[0]; + } + auto weight_ptr_ = allocator->Malloc(pack_weight_size, img_size, weight); + (*inputs_weight_ptrs).push_back(weight_ptr_); + delete[] weight; + } + } + } + return RET_OK; +} + int ConcatOpenCLKernel::Prepare() { + enable_fp16_ = ocl_runtime_->GetFp16Enable(); + auto data_size = enable_fp16_ ? 
sizeof(float16_t) : sizeof(float); + ConvertWeightToTensor(in_tensors_, &inputs_weight_ptrs_, enable_fp16_, data_size); if (axis_ == 0) { for (int i = 0; i < in_tensors_.size(); ++i) { - if (in_tensors_.at(0)->shape().size() != 1) { + if (in_tensors_.at(i)->shape().size() != 1) { return RET_OK; } } @@ -175,7 +241,7 @@ int ConcatOpenCLKernel::Prepare() { Align_ = false; } } - enable_fp16_ = ocl_runtime_->GetFp16Enable(); + std::string kernel_name = "Concat"; if (axis_ == 3 && !Align_) { kernel_name += "Input" + std::to_string(in_tensors_.size()) + "UnAlign"; @@ -202,7 +268,8 @@ int ConcatOpenCLKernel::Run() { } int arg_cn = 0; for (int i = 0; i < in_tensors_.size(); ++i) { - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c()); + auto input_ptr = inputs_weight_ptrs_.at(i) == nullptr ? in_tensors_[i]->data_c() : inputs_weight_ptrs_.at(i); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_ptr); } if (axis_ == 3 && !Align_) { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h index dd5960d4ac..9660d6cadc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h @@ -43,6 +43,7 @@ class ConcatOpenCLKernel : public OpenCLKernel { uint32_t OC = {1}; std::vector global; bool Align_{true}; + std::vector inputs_weight_ptrs_; bool enable_fp16_{false}; cl_int stride_w{1}; cl_int4 in_shape_{}; @@ -51,6 +52,8 @@ class ConcatOpenCLKernel : public OpenCLKernel { private: int RunAxis0(); + int ConvertWeightToTensor(const std::vector &in_tensors, std::vector *inputs_weight_ptrs, + bool fp16_enable, size_t data_size); }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc index 
5be0ad12a6..24d01091df 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc @@ -60,7 +60,7 @@ void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) { img_size.push_back(UP_DIV(NumA, C4NUM)); img_size.push_back(NumA); size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; - size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT); + size_t dtype_size = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float); img_size.push_back(img_dtype); auto allocator = ocl_runtime_->GetAllocator(); size_t memA = NumA * NumA; @@ -178,29 +178,6 @@ void StrassenOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset); } -// OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N -void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) { - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); - auto runtime = runtime_wrapper.GetInstance(); - runtime->SyncCommandQueue(); - MS_ASSERT(alignment); - size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM; - size_t OriginSize = size * size * typesize; - std::vector data(OriginSize); - auto row_size = width * typesize * C4NUM; - - for (int i = 0; i < size; ++i) { - memcpy(reinterpret_cast(data.data()) + i * row_size, static_cast(IMGData) + i * row_pitch, - row_size); - } - for (int i = 0; i < size * size; ++i) { - if ((i + 1) % size == 0) { - std::cout << std::endl; - } - } -} - void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset, lite::opencl::MemType mem_type) { if (input == nullptr || output == nullptr) { @@ -344,7 +321,7 @@ void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, co int StrassenOpenCLKernel::Run() { 
MS_LOG(DEBUG) << this->name() << " Running!"; - int threshold = 0; + int threshold; const int up_bound = 1024; const int down_bound = 256; if (in_tensors_.at(0)->shape()[0] >= up_bound) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h index db7432d100..0aa9893f2e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.h @@ -48,7 +48,6 @@ class StrassenOpenCLKernel : public MatMulOpenCLKernel { void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5, void *input6, void *input7, void *output, const int size); void StrassenRunMmatmul(void *input, void *weight, void *output, const int size); - void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size); cl::Kernel kernel_IMG_add_sub_2; cl::Kernel MatMul_StrassenBUFFilled; cl::Kernel MatMul_StrassenIMGFilled;