Merge pull request !4169 from wandongdong/mastertags/v0.7.0-beta
| @@ -1,4 +1,4 @@ | |||||
| __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape, | |||||
| __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | ||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| @@ -31,7 +31,7 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||||
| __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | const int4 output_shape, const int2 stride, const int2 kernel_size, | ||||
| const int2 padding) { | const int2 padding) { | ||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| @@ -148,8 +148,6 @@ int ArithmeticOpenCLKernel::Run() { | |||||
| } | } | ||||
| runtime_->SetKernelArg(kernel_, arg_idx++, weight_); | runtime_->SetKernelArg(kernel_, arg_idx++, weight_); | ||||
| runtime_->SetKernelArg(kernel_, arg_idx++, bias_); | runtime_->SetKernelArg(kernel_, arg_idx++, bias_); | ||||
| MS_LOG(DEBUG) << arg_idx-2 << " " << weight_; | |||||
| MS_LOG(DEBUG) << arg_idx-1 << " " << bias_; | |||||
| } | } | ||||
| runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); | runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data()); | ||||
| int H = outputs_[0]->Batch() * outputs_[0]->Height(); | int H = outputs_[0]->Batch() * outputs_[0]->Height(); | ||||
| @@ -147,21 +147,18 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> * | |||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int ConcatOpenCLKernel::Run() { | int ConcatOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->Name() << " Running!"; | |||||
| auto param = reinterpret_cast<ConcatParameter *>(this->opParameter); | auto param = reinterpret_cast<ConcatParameter *>(this->opParameter); | ||||
| if (param->axis_ == 0) { | if (param->axis_ == 0) { | ||||
| return Run_axis0(); | return Run_axis0(); | ||||
| } | } | ||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| MS_LOG(INFO) << " judge the numbers of input vector"; | |||||
| auto input0_shape = inputs_[0]->shape(); | auto input0_shape = inputs_[0]->shape(); | ||||
| auto input1_shape = inputs_[1]->shape(); | auto input1_shape = inputs_[1]->shape(); | ||||
| auto input2_shape = inputs_[2]->shape(); | |||||
| auto output_shape = outputs_[0]->shape(); | auto output_shape = outputs_[0]->shape(); | ||||
| cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)}; // change | cl_int2 input0_shape2_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4)}; // change | ||||
| cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4), | |||||
| DivideRoundUp(input2_shape[3], 4)}; | |||||
| cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)}; | cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], DivideRoundUp(output_shape[3], 4)}; | ||||
| uint32_t OH = output_shape[0] * output_shape[1]; // N*H | uint32_t OH = output_shape[0] * output_shape[1]; // N*H | ||||
| @@ -173,14 +170,15 @@ int ConcatOpenCLKernel::Run() { | |||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| if (inputs_.size() == 2) { | if (inputs_.size() == 2) { | ||||
| MS_LOG(INFO) << " SetKernelArg"; | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | ||||
| } else if (inputs_.size() == 3) { | } else if (inputs_.size() == 3) { | ||||
| MS_LOG(INFO) << " SetKernelArg"; | |||||
| auto input2_shape = inputs_[2]->shape(); | |||||
| cl_int3 input0_shape3_ = {DivideRoundUp(input0_shape[3], 4), DivideRoundUp(input1_shape[3], 4), | |||||
| DivideRoundUp(input2_shape[3], 4)}; | |||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); | ||||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); | ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[1]->Data()); | ||||
| @@ -228,6 +228,9 @@ static int GetBiggestDivider(int x, int y) { | |||||
| int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) { | int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector<size_t> *global, std::vector<size_t> *local) { | ||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| auto param = reinterpret_cast<ConvParameter *>(opParameter); | auto param = reinterpret_cast<ConvParameter *>(opParameter); | ||||
| param->output_h_ = outputs_[0]->Height(); | |||||
| param->output_w_ = outputs_[0]->Width(); | |||||
| param->output_channel_ = outputs_[0]->Channel(); | |||||
| constexpr size_t work_group_size[] = {4, 4, 1}; | constexpr size_t work_group_size[] = {4, 4, 1}; | ||||
| auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); | auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); | ||||
| @@ -287,7 +290,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s | |||||
| } | } | ||||
| int ConvolutionOpenCLKernel::Run() { | int ConvolutionOpenCLKernel::Run() { | ||||
| std::cout << "ConvolutionOpenCLKernel::Run()\n"; | |||||
| MS_LOG(DEBUG) << this->Name() << " Running!"; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| @@ -24,6 +24,7 @@ SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } | |||||
| int SubGraphOpenCLKernel::Init() { | int SubGraphOpenCLKernel::Init() { | ||||
| allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | ||||
| MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); | |||||
| for (const auto tensor : inputs_) { | for (const auto tensor : inputs_) { | ||||
| tensor->set_allocator(allocator_); | tensor->set_allocator(allocator_); | ||||
| } | } | ||||
| @@ -38,8 +39,7 @@ int SubGraphOpenCLKernel::Init() { | |||||
| data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true); | data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true); | ||||
| tensor->SetData(data); | tensor->SetData(data); | ||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << "OpenCL kernel must use GPU buffer pointer, " | |||||
| << "please make sure that this buffer allocate by OpenCLAllocator!"; | |||||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; | |||||
| } | } | ||||
| } | } | ||||
| return 0; | return 0; | ||||
| @@ -109,7 +109,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size) | |||||
| auto mem_buf = iter->second; | auto mem_buf = iter->second; | ||||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | bool is_match{mem_buf->img_size.size() == img_size.size()}; | ||||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | for (int i = 0; i < img_size.size() && is_match; ++i) { | ||||
| is_match = img_size[i] == mem_buf->img_size[i]; | |||||
| is_match &= img_size[i] == mem_buf->img_size[i]; | |||||
| } | } | ||||
| if (is_match) { | if (is_match) { | ||||
| free_list_.erase(iter); | free_list_.erase(iter); | ||||
| @@ -166,7 +166,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||||
| auto mem_buf = iter->second; | auto mem_buf = iter->second; | ||||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | bool is_match{mem_buf->img_size.size() == img_size.size()}; | ||||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | for (int i = 0; i < img_size.size() && is_match; ++i) { | ||||
| is_match = img_size[i] == mem_buf->img_size[i]; | |||||
| is_match &= img_size[i] == mem_buf->img_size[i]; | |||||
| } | } | ||||
| if (is_match) { | if (is_match) { | ||||
| free_list_.erase(iter); | free_list_.erase(iter); | ||||
| @@ -168,6 +168,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema | |||||
| int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format) { | schema::Format dst_format) { | ||||
| if (dst_format == schema::Format_NHWC4) { | if (dst_format == schema::Format_NHWC4) { | ||||
| tensor->SetFormat(schema::Format_NHWC4); | |||||
| // convert to nhwc4 | // convert to nhwc4 | ||||
| auto *src_data = tensor->Data(); | auto *src_data = tensor->Data(); | ||||
| auto *dst_data{src_data}; | auto *dst_data{src_data}; | ||||
| @@ -190,7 +191,6 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema: | |||||
| dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size); | dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size); | ||||
| tensor->SetData(dst_data); | tensor->SetData(dst_data); | ||||
| allocator_->Free(src_data); | allocator_->Free(src_data); | ||||
| tensor->SetFormat(schema::Format_NHWC4); | |||||
| return RET_OK; | return RET_OK; | ||||
| } else { | } else { | ||||
| MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " | MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " | ||||
| @@ -142,10 +142,26 @@ kernel::LiteKernel *Scheduler::CreateSubKernel(const std::vector<kernel::LiteKer | |||||
| kernel::LiteKernel *sub_kernel = nullptr; | kernel::LiteKernel *sub_kernel = nullptr; | ||||
| #if SUPPORT_GPU | #if SUPPORT_GPU | ||||
| if (arch == kernel::KERNEL_ARCH::kGPU) { | if (arch == kernel::KERNEL_ARCH::kGPU) { | ||||
| std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels); | |||||
| std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels); | |||||
| std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels); | |||||
| std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels); | |||||
| auto head_kernel = kernels.front(); | |||||
| auto tail_kernel = kernels.back(); | |||||
| std::vector<kernel::LiteKernel *> input_kernels{head_kernel}; | |||||
| std::vector<kernel::LiteKernel *> output_kernels{tail_kernel}; | |||||
| std::vector<tensor::Tensor *> input_tensors; | |||||
| std::vector<tensor::Tensor *> output_tensors; | |||||
| for (auto tensor : head_kernel->GetInputs()) { | |||||
| if (tensor->Data() == nullptr) { | |||||
| input_tensors.emplace_back(tensor); | |||||
| } | |||||
| } | |||||
| for (auto tensor : tail_kernel->GetInputs()) { | |||||
| if (tensor->Data() == nullptr) { | |||||
| output_tensors.emplace_back(tensor); | |||||
| } | |||||
| } | |||||
| // std::vector<tensor::Tensor *> input_tensors = kernel::LiteKernelUtil::SubgraphInputTensors(kernels); | |||||
| // std::vector<tensor::Tensor *> output_tensors = kernel::LiteKernelUtil::SubgraphOutputTensors(kernels); | |||||
| // std::vector<kernel::LiteKernel *> input_kernels = kernel::LiteKernelUtil::SubgraphInputKernels(kernels); | |||||
| // std::vector<kernel::LiteKernel *> output_kernels = kernel::LiteKernelUtil::SubgraphOutputKernels(kernels); | |||||
| sub_kernel = | sub_kernel = | ||||
| new kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels); | new kernel::SubGraphOpenCLKernel(input_tensors, output_tensors, input_kernels, output_kernels, kernels); | ||||
| sub_kernel->Init(); | sub_kernel->Init(); | ||||