diff --git a/src/gpu.cpp b/src/gpu.cpp index 6dbed566c..6f65e5b3b 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -503,7 +503,7 @@ int create_gpu_instance() gpu_info.host_visible_memory_index = find_host_visible_memory(physicalDeviceMemoryProperties); // treat as unified memory architecture if memory heap is the same - if (gpu_info.unified_memory_index != -1) + if (gpu_info.unified_memory_index != (uint32_t)-1) { int unified_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.unified_memory_index].heapIndex; int device_local_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.device_local_memory_index].heapIndex; diff --git a/src/layer/flatten.cpp b/src/layer/flatten.cpp index 17baaad3b..f3f40ccb5 100644 --- a/src/layer/flatten.cpp +++ b/src/layer/flatten.cpp @@ -25,6 +25,7 @@ Flatten::Flatten() support_vulkan = true; #if NCNN_VULKAN + pipeline_flatten = 0; pipeline_flatten_pack4 = 0; #endif // NCNN_VULKAN } @@ -61,6 +62,13 @@ int Flatten::create_pipeline() { std::vector specializations; + // pack1 + { + pipeline_flatten = new Pipeline(vkdev); + pipeline_flatten->set_optimal_local_size_xyz(); + pipeline_flatten->create("flatten", specializations, 2, 10); + } + // pack4 { pipeline_flatten_pack4 = new Pipeline(vkdev); @@ -73,6 +81,9 @@ int Flatten::create_pipeline() int Flatten::destroy_pipeline() { + delete pipeline_flatten; + pipeline_flatten = 0; + delete pipeline_flatten_pack4; pipeline_flatten_pack4 = 0; @@ -89,73 +100,55 @@ int Flatten::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, return 0; } - if (dims == 2) - { - top_blob = bottom_blob; - top_blob.dims = 1; - top_blob.w = bottom_blob.w * bottom_blob.h; - top_blob.h = 1; - return 0; - } - int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; size_t elemsize = bottom_blob.elemsize; int packing = bottom_blob.packing; - int out_packing = (w * h * channels * packing) % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / packing * out_packing; + int total = w * h * channels * packing; - top_blob.create(w * h * channels * packing / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); - if (top_blob.empty()) - return -100; + int out_packing = total % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / packing * out_packing; - if (packing == 4 && out_packing == 4) + if (dims == 2 && packing == 1) { - std::vector bindings(2); - bindings[0] = bottom_blob; - bindings[1] = top_blob; - - std::vector constants(10); - constants[0].i = bottom_blob.dims; - constants[1].i = bottom_blob.w; - constants[2].i = bottom_blob.h; - constants[3].i = bottom_blob.c; - constants[4].i = bottom_blob.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = top_blob.cstep; - - // record - cmd.record_prepare_compute_barrier(bottom_blob); - cmd.record_prepare_compute_barrier(top_blob); - cmd.record_pipeline(pipeline_flatten_pack4, bindings, constants, top_blob); - + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_packing; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.packing = out_packing; return 0; } - std::vector regions(channels); - - int srcOffset = 0; - int dstOffset = 0; - for (int q=0; q bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = (packing == 1 && out_packing == 4) ? top_blob.w * out_packing : top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten; + + // record + cmd.record_prepare_compute_barrier(bottom_blob); + cmd.record_prepare_compute_barrier(top_blob); + cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; } diff --git a/src/layer/flatten.h b/src/layer/flatten.h index 41f7dbb16..60ecadd1b 100644 --- a/src/layer/flatten.h +++ b/src/layer/flatten.h @@ -35,6 +35,7 @@ public: public: #if NCNN_VULKAN + Pipeline* pipeline_flatten; Pipeline* pipeline_flatten_pack4; #endif // NCNN_VULKAN }; diff --git a/src/layer/packing.cpp b/src/layer/packing.cpp index 0b62544f9..ec5941325 100644 --- a/src/layer/packing.cpp +++ b/src/layer/packing.cpp @@ -76,6 +76,14 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c if (dims == 1) { + if (out_packing == 1) + { + top_blob = bottom_blob; + top_blob.w = w * packing; + top_blob.elemsize = elemsize / packing; + return 0; + } + int outw = (w * packing + out_packing - 1) / out_packing; size_t out_elemsize = elemsize / packing * out_packing; diff --git a/src/layer/shader/flatten.comp b/src/layer/shader/flatten.comp new file mode 100644 index 000000000..b35c1b95a --- /dev/null +++ b/src/layer/shader/flatten.comp @@ -0,0 +1,57 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= 1 || gz >= 1) + return; + + int size = p.w * p.h; + + int z = gx / size; + int y = gx % size / p.w; + int x = gx % size % p.w; + + int v_offset = z * p.cstep + y * p.w + x; + + top_blob_data[gx] = bottom_blob_data[v_offset]; +} diff --git a/src/layer/shader/flatten_pack4.comp b/src/layer/shader/flatten_pack4.comp index 114c78da2..216b63057 100644 --- a/src/layer/shader/flatten_pack4.comp +++ b/src/layer/shader/flatten_pack4.comp @@ -45,20 +45,32 @@ void main() if (gx >= p.outw || gy >= 1 || gz >= 1) return; - const int size = p.w * p.h; + ivec4 v_offset; - const int z = (gx / size) * 4; - const int i = (gx % size) * 4; + if (p.dims == 2) + { + int y = gx / p.w; + ivec4 i4 = (gx % p.w) * 4 + ivec4(0, 1, 2, 3); - ivec4 i4 = ivec4(i, i+1, i+2, i+3); + ivec4 k4 = i4 / p.w; + ivec4 si4 = i4 % p.w; - ivec4 k4 = i4 / size; - ivec4 si4 = i4 % size; + v_offset = ivec4(y * 4 * p.w) + si4 * 4 + k4; + } + else // if (p.dims == 3) + { + int size = p.w * p.h; - ivec4 v_offset = ivec4(z * p.cstep) + si4 * 4 + k4; + int z = gx / size; + ivec4 i4 = (gx % size) * 4 + ivec4(0, 1, 2, 3); - vec4 v; + ivec4 k4 = i4 / size; + ivec4 si4 = i4 % size; + + v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4; + } + vec4 v; v.r = bottom_blob_data[v_offset.r]; v.g = bottom_blob_data[v_offset.g]; v.b = bottom_blob_data[v_offset.b]; diff --git a/src/mat.h b/src/mat.h index f9b2e2188..a820d1cfd 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1412,6 +1412,9 @@ inline void VkMat::prepare_staging_buffer() inline void VkMat::discard_staging_buffer() { + if (allocator->mappable) + return; + if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1) { if (staging_allocator && staging_data)