Browse Source

flatten image

tags/20190320
nihuini 7 years ago
parent
commit
5646b7d2c2
7 changed files with 137 additions and 63 deletions
  1. +1
    -1
      src/gpu.cpp
  2. +47
    -54
      src/layer/flatten.cpp
  3. +1
    -0
      src/layer/flatten.h
  4. +8
    -0
      src/layer/packing.cpp
  5. +57
    -0
      src/layer/shader/flatten.comp
  6. +20
    -8
      src/layer/shader/flatten_pack4.comp
  7. +3
    -0
      src/mat.h

+ 1
- 1
src/gpu.cpp View File

@@ -503,7 +503,7 @@ int create_gpu_instance()
gpu_info.host_visible_memory_index = find_host_visible_memory(physicalDeviceMemoryProperties);

// treat as unified memory architecture if memory heap is the same
if (gpu_info.unified_memory_index != -1)
if (gpu_info.unified_memory_index != (uint32_t)-1)
{
int unified_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.unified_memory_index].heapIndex;
int device_local_memory_heap_index = physicalDeviceMemoryProperties.memoryTypes[gpu_info.device_local_memory_index].heapIndex;


+ 47
- 54
src/layer/flatten.cpp View File

@@ -25,6 +25,7 @@ Flatten::Flatten()
support_vulkan = true;

#if NCNN_VULKAN
pipeline_flatten = 0;
pipeline_flatten_pack4 = 0;
#endif // NCNN_VULKAN
}
@@ -61,6 +62,13 @@ int Flatten::create_pipeline()
{
std::vector<vk_specialization_type> specializations;

// pack1
{
pipeline_flatten = new Pipeline(vkdev);
pipeline_flatten->set_optimal_local_size_xyz();
pipeline_flatten->create("flatten", specializations, 2, 10);
}

// pack4
{
pipeline_flatten_pack4 = new Pipeline(vkdev);
@@ -73,6 +81,9 @@ int Flatten::create_pipeline()

int Flatten::destroy_pipeline()
{
delete pipeline_flatten;
pipeline_flatten = 0;

delete pipeline_flatten_pack4;
pipeline_flatten_pack4 = 0;

@@ -89,73 +100,55 @@ int Flatten::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd,
return 0;
}

if (dims == 2)
{
top_blob = bottom_blob;
top_blob.dims = 1;
top_blob.w = bottom_blob.w * bottom_blob.h;
top_blob.h = 1;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int packing = bottom_blob.packing;

int out_packing = (w * h * channels * packing) % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;
int total = w * h * channels * packing;

top_blob.create(w * h * channels * packing / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;
int out_packing = total % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (packing == 4 && out_packing == 4)
if (dims == 2 && packing == 1)
{
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_prepare_compute_barrier(top_blob);
cmd.record_pipeline(pipeline_flatten_pack4, bindings, constants, top_blob);

top_blob = bottom_blob;
top_blob.dims = 1;
top_blob.w = total / out_packing;
top_blob.h = 1;
top_blob.cstep = top_blob.w;
top_blob.elemsize = out_elemsize;
top_blob.packing = out_packing;
return 0;
}

std::vector<VkBufferCopy> regions(channels);

int srcOffset = 0;
int dstOffset = 0;
for (int q=0; q<channels; q++)
{
int size = w * h * elemsize;

regions[q].srcOffset = bottom_blob.buffer_offset() + srcOffset;
regions[q].dstOffset = top_blob.buffer_offset() + dstOffset;
regions[q].size = size;

srcOffset += bottom_blob.cstep * elemsize;
dstOffset += size;
}
top_blob.create(total / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_transfer_barrier(bottom_blob);
cmd.record_prepare_transfer_barrier(top_blob);
cmd.record_copy_regions(bottom_blob, top_blob, regions);
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = (packing == 1 && out_packing == 4) ? top_blob.w * out_packing : top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten;

// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_prepare_compute_barrier(top_blob);
cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}


+ 1
- 0
src/layer/flatten.h View File

@@ -35,6 +35,7 @@ public:

public:
#if NCNN_VULKAN
Pipeline* pipeline_flatten;
Pipeline* pipeline_flatten_pack4;
#endif // NCNN_VULKAN
};


+ 8
- 0
src/layer/packing.cpp View File

@@ -76,6 +76,14 @@ int Packing::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c

if (dims == 1)
{
if (out_packing == 1)
{
top_blob = bottom_blob;
top_blob.w = w * packing;
top_blob.elemsize = elemsize / packing;
return 0;
}

int outw = (w * packing + out_packing - 1) / out_packing;
size_t out_elemsize = elemsize / packing * out_packing;



+ 57
- 0
src/layer/shader/flatten.comp View File

@@ -0,0 +1,57 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };

// Push constants consumed as `p`: five ints for the input buffer followed by
// five ints for the output buffer. Field order is part of the layout, so it
// must stay in sync with whatever the host pushes.
layout (push_constant) uniform parameter
{
int dims; // not read by main() in this shader
int w; // input row length; used to split an in-slice index into (y, x)
int h; // together with w gives the element count of one input slice (w * h)
int c; // not read by main() in this shader
int cstep; // element offset between consecutive input slices, as applied in main()

int outdims; // not read by main() in this shader
int outw; // number of output elements to write; invocations with x >= outw exit early
int outh; // not read by main() in this shader
int outc; // not read by main() in this shader
int outcstep; // not read by main() in this shader
} p;

// Copies one float per invocation from the strided input buffer into the
// densely indexed output buffer. Only the x dimension of the dispatch does
// work; any invocation with y or z above zero returns immediately.
void main()
{
    const ivec3 gid = ivec3(gl_GlobalInvocationID);

    // Discard padding invocations beyond the requested output length.
    if (gid.x >= p.outw || gid.y >= 1 || gid.z >= 1)
        return;

    // Elements in a single input slice.
    const int plane = p.w * p.h;

    // Decompose the linear output index into slice / row / column.
    const int slice = gid.x / plane;
    const int in_plane = gid.x % plane;
    const int row = in_plane / p.w;
    const int col = in_plane % p.w;

    // Input slices are p.cstep elements apart, rows are p.w elements apart.
    const int src_index = slice * p.cstep + row * p.w + col;

    top_blob_data[gid.x] = bottom_blob_data[src_index];
}

+ 20
- 8
src/layer/shader/flatten_pack4.comp View File

@@ -45,20 +45,32 @@ void main()
if (gx >= p.outw || gy >= 1 || gz >= 1)
return;

const int size = p.w * p.h;
ivec4 v_offset;

const int z = (gx / size) * 4;
const int i = (gx % size) * 4;
if (p.dims == 2)
{
int y = gx / p.w;
ivec4 i4 = (gx % p.w) * 4 + ivec4(0, 1, 2, 3);

ivec4 i4 = ivec4(i, i+1, i+2, i+3);
ivec4 k4 = i4 / p.w;
ivec4 si4 = i4 % p.w;

ivec4 k4 = i4 / size;
ivec4 si4 = i4 % size;
v_offset = ivec4(y * 4 * p.w) + si4 * 4 + k4;
}
else // if (p.dims == 3)
{
int size = p.w * p.h;

ivec4 v_offset = ivec4(z * p.cstep) + si4 * 4 + k4;
int z = gx / size;
ivec4 i4 = (gx % size) * 4 + ivec4(0, 1, 2, 3);

vec4 v;
ivec4 k4 = i4 / size;
ivec4 si4 = i4 % size;

v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4;
}

vec4 v;
v.r = bottom_blob_data[v_offset.r];
v.g = bottom_blob_data[v_offset.g];
v.b = bottom_blob_data[v_offset.b];


+ 3
- 0
src/mat.h View File

@@ -1412,6 +1412,9 @@ inline void VkMat::prepare_staging_buffer()

inline void VkMat::discard_staging_buffer()
{
if (allocator->mappable)
return;

if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
{
if (staging_allocator && staging_data)


Loading…
Cancel
Save