flatten image

7 years ago · 5646b7d2c2
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -503,7 +503,7 @@ int create_gpu_instance()
        gpu_info.host_visible_memory_index = find_host_visible_memory(physicalDeviceMemoryProperties);

        // treat as unified memory architecture if memory heap is the same
        if (gpu_info.unified_memory_index != -1)
        if (gpu_info.unified_memory_index != (uint32_t)-1)
        {
            int unified_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.unified_memory_index].heapIndex;
            int device_local_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.device_local_memory_index].heapIndex;
--- a/src/layer/flatten.cpp
+++ b/src/layer/flatten.cpp
@@ -25,6 +25,7 @@ Flatten::Flatten()
    support_vulkan = true;

 #if NCNN_VULKAN
    pipeline_flatten = 0;
    pipeline_flatten_pack4 = 0;
 #endif // NCNN_VULKAN
 }
@@ -61,6 +62,13 @@ int Flatten::create_pipeline()
 {
    std::vector<vk_specialization_type> specializations;

    // pack1
    {
        pipeline_flatten = new Pipeline(vkdev);
        pipeline_flatten->set_optimal_local_size_xyz();
        pipeline_flatten->create("flatten", specializations, 2, 10);
    }

    // pack4
    {
        pipeline_flatten_pack4 = new Pipeline(vkdev);
@@ -73,6 +81,9 @@ int Flatten::create_pipeline()

 int Flatten::destroy_pipeline()
 {
    delete pipeline_flatten;
    pipeline_flatten = 0;

    delete pipeline_flatten_pack4;
    pipeline_flatten_pack4 = 0;

@@ -89,73 +100,55 @@ int Flatten::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd,
        return 0;
    }

    if (dims == 2)
    {
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = bottom_blob.w * bottom_blob.h;
        top_blob.h = 1;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int packing = bottom_blob.packing;

    int out_packing = (w * h * channels * packing) % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;
    int total = w * h * channels * packing;

    top_blob.create(w * h * channels * packing / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;
    int out_packing = total % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (packing == 4 && out_packing == 4)
    if (dims == 2 && packing == 1)
    {
        std::vector<VkMat> bindings(2);
        bindings[0] = bottom_blob;
        bindings[1] = top_blob;

        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h;
        constants[3].i = bottom_blob.c;
        constants[4].i = bottom_blob.cstep;
        constants[5].i = top_blob.dims;
        constants[6].i = top_blob.w;
        constants[7].i = top_blob.h;
        constants[8].i = top_blob.c;
        constants[9].i = top_blob.cstep;

        // record
        cmd.record_prepare_compute_barrier(bottom_blob);
        cmd.record_prepare_compute_barrier(top_blob);
        cmd.record_pipeline(pipeline_flatten_pack4, bindings, constants, top_blob);

        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_packing;
        top_blob.h = 1;
        top_blob.cstep = top_blob.w;
        top_blob.elemsize = out_elemsize;
        top_blob.packing = out_packing;
        return 0;
    }

    std::vector<VkBufferCopy> regions(channels);

    int srcOffset = 0;
    int dstOffset = 0;
    for (int q=0; q<channels; q++)
    {
        int size = w * h * elemsize;

        regions[q].srcOffset = bottom_blob.buffer_offset() + srcOffset;
        regions[q].dstOffset = top_blob.buffer_offset() + dstOffset;
        regions[q].size = size;

        srcOffset += bottom_blob.cstep * elemsize;
        dstOffset += size;
    }
    top_blob.create(total / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;

    cmd.record_prepare_transfer_barrier(bottom_blob);
    cmd.record_prepare_transfer_barrier(top_blob);
    cmd.record_copy_regions(bottom_blob, top_blob, regions);
    std::vector<VkMat> bindings(2);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob;

    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob.dims;
    constants[1].i = bottom_blob.w;
    constants[2].i = bottom_blob.h;
    constants[3].i = bottom_blob.c;
    constants[4].i = bottom_blob.cstep;
    constants[5].i = top_blob.dims;
    constants[6].i = (packing == 1 && out_packing == 4) ? top_blob.w * out_packing : top_blob.w;
    constants[7].i = top_blob.h;
    constants[8].i = top_blob.c;
    constants[9].i = top_blob.cstep;

    const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten;

    // record
    cmd.record_prepare_compute_barrier(bottom_blob);
    cmd.record_prepare_compute_barrier(top_blob);
    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
 }
--- a/src/layer/flatten.h
+++ b/src/layer/flatten.h
@@ -35,6 +35,7 @@ public:

 public:
 #if NCNN_VULKAN
    Pipeline* pipeline_flatten;
    Pipeline* pipeline_flatten_pack4;
 #endif // NCNN_VULKAN
 };
--- a/src/layer/packing.cpp
+++ b/src/layer/packing.cpp
@@ -76,6 +76,14 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c

    if (dims == 1)
    {
        if (out_packing == 1)
        {
            top_blob = bottom_blob;
            top_blob.w = w * packing;
            top_blob.elemsize = elemsize / packing;
            return 0;
        }

        int outw = (w * packing + out_packing - 1) / out_packing;
        size_t out_elemsize = elemsize / packing * out_packing;

--- a/src/layer/shader/flatten.comp
+++ b/src/layer/shader/flatten.comp
@@ -0,0 +1,57 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // workgroup size on each axis is supplied through specialization constants 233, 234 and 235
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: source data, only read; binding 1: destination data, only written
 layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };

 // ten integer push constants: five describing the input, then five describing the output
 layout (push_constant) uniform parameter
 {
    // input description; main() reads w, h and cstep
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // output description; main() reads only outw, as the upper bound on the x invocation index
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Each invocation copies one float: the element at flat output index gx is
 // fetched from the input at (channel, row, column), with channels addressed
 // at a stride of p.cstep elements.
 void main()
 {
    const int out_index = int(gl_GlobalInvocationID.x);
    const int inv_y = int(gl_GlobalInvocationID.y);
    const int inv_z = int(gl_GlobalInvocationID.z);

    // the output is indexed along x only; invocations outside [0, outw) x {0} x {0} do nothing
    if (out_index >= p.outw || inv_y >= 1 || inv_z >= 1)
        return;

    const int plane = p.w * p.h;

    // split the flat output index into channel, then row and column inside that channel
    const int channel = out_index / plane;
    const int in_plane = out_index % plane;
    const int row = in_plane / p.w;
    const int col = in_plane % p.w;

    top_blob_data[out_index] = bottom_blob_data[channel * p.cstep + row * p.w + col];
 }
--- a/src/layer/shader/flatten_pack4.comp
+++ b/src/layer/shader/flatten_pack4.comp
@@ -45,20 +45,32 @@ void main()
    if (gx >= p.outw || gy >= 1 || gz >= 1)
        return;

    const int size = p.w * p.h;
    ivec4 v_offset;

    const int z = (gx / size) * 4;
    const int i = (gx % size) * 4;
    if (p.dims == 2)
    {
        int y = gx / p.w;
        ivec4 i4 = (gx % p.w) * 4 + ivec4(0, 1, 2, 3);

    ivec4 i4 = ivec4(i, i+1, i+2, i+3);
        ivec4 k4 = i4 / p.w;
        ivec4 si4 = i4 % p.w;

    ivec4 k4 = i4 / size;
    ivec4 si4 = i4 % size;
        v_offset = ivec4(y * 4 * p.w) + si4 * 4 + k4;
    }
    else // if (p.dims == 3)
    {
        int size = p.w * p.h;

    ivec4 v_offset = ivec4(z * p.cstep) + si4 * 4 + k4;
        int z = gx / size;
        ivec4 i4 = (gx % size) * 4 + ivec4(0, 1, 2, 3);

    vec4 v;
        ivec4 k4 = i4 / size;
        ivec4 si4 = i4 % size;

        v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4;
    }

    vec4 v;
    v.r = bottom_blob_data[v_offset.r];
    v.g = bottom_blob_data[v_offset.g];
    v.b = bottom_blob_data[v_offset.b];
--- a/src/mat.h
+++ b/src/mat.h
@@ -1412,6 +1412,9 @@ inline void VkMat::prepare_staging_buffer()

 inline void VkMat::discard_staging_buffer()
 {
    if (allocator->mappable)
        return;

    if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
    {
        if (staging_allocator && staging_data)