Browse Source

copy buffer has offset alignment limit, re-implement concat as compute pipeline

tags/20190320
nihui 7 years ago
parent
commit
81ee56b209
5 changed files with 613 additions and 64 deletions
  1. +339
    -64
      src/layer/concat.cpp
  2. +10
    -0
      src/layer/concat.h
  3. +87
    -0
      src/layer/shader/concat.comp
  4. +87
    -0
      src/layer/shader/concat_pack4.comp
  5. +90
    -0
      src/layer/shader/concat_pack4to1.comp

+ 339
- 64
src/layer/concat.cpp View File

@@ -23,6 +23,12 @@ Concat::Concat()
one_blob_only = false;
support_inplace = false;
support_vulkan = true;

#if NCNN_VULKAN
pipeline_concat = 0;
pipeline_concat_pack4 = 0;
pipeline_concat_pack4to1 = 0;
#endif // NCNN_VULKAN
}

int Concat::load_param(const ParamDict& pd)
@@ -261,46 +267,126 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
}

#if NCNN_VULKAN
int Concat::create_pipeline()
{
std::vector<vk_specialization_type> specializations(1);
specializations[0].i = axis;

// pack1
{
pipeline_concat = new Pipeline(vkdev);
pipeline_concat->set_optimal_local_size_xyz();
pipeline_concat->create("concat", specializations, 2, 11);
}

// pack4
{
pipeline_concat_pack4 = new Pipeline(vkdev);
pipeline_concat_pack4->set_optimal_local_size_xyz();
pipeline_concat_pack4->create("concat_pack4", specializations, 2, 11);
}

// pack4to1
{
pipeline_concat_pack4to1 = new Pipeline(vkdev);
pipeline_concat_pack4to1->set_optimal_local_size_xyz();
pipeline_concat_pack4to1->create("concat_pack4to1", specializations, 2, 11);
}

return 0;
}

int Concat::destroy_pipeline()
{
delete pipeline_concat;
pipeline_concat = 0;

delete pipeline_concat_pack4;
pipeline_concat_pack4 = 0;

delete pipeline_concat_pack4to1;
pipeline_concat_pack4to1 = 0;

return 0;
}

int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
int dims = bottom_blobs[0].dims;
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;

if (dims == 1) // axis == 0
{
// concat vector
// total length
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;
int top_w = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];
top_w += bottom_blob.w;
elemsize = std::min(elemsize, bottom_blob.elemsize);
packing = std::min(packing, bottom_blob.packing);
top_w += bottom_blob.w * bottom_blob.packing;
}

int out_packing = top_w % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

// TODO pack1to4 and pack4to1to4 make sense ?
if (packing == 1)
{
out_packing = 1;
out_elemsize = elemsize / packing;
}

VkMat& top_blob = top_blobs[0];
top_blob.create(top_w, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(top_w / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_transfer_barrier(top_blob);
cmd.record_prepare_compute_barrier(top_blob);

int dstOffset = 0;
int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

int size = bottom_blob.w * bottom_blob.elemsize;

VkBufferCopy region;
region.srcOffset = bottom_blob.buffer_offset();
region.dstOffset = top_blob.buffer_offset() + dstOffset;
region.size = size;
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = 0;
if (packing == 1 && out_packing == 1)
{
pipeline = pipeline_concat;
}
else if (packing == 4 && out_packing == 4)
{
pipeline = pipeline_concat_pack4;
}
else if (packing == 4 && out_packing == 1)
{
pipeline = pipeline_concat_pack4to1;
}

cmd.record_prepare_transfer_barrier(bottom_blob);
cmd.record_copy_region(bottom_blob, top_blob, region);
// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

dstOffset += size;
woffset += bottom_blob.w * bottom_blob.packing / out_packing;
}

return 0;
@@ -312,36 +398,75 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>&
int w = bottom_blobs[0].w;

// total height
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;
int top_h = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];
top_h += bottom_blob.h;
elemsize = std::min(elemsize, bottom_blob.elemsize);
packing = std::min(packing, bottom_blob.packing);
top_h += bottom_blob.h * bottom_blob.packing;
}

int out_packing = top_h % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

// TODO pack1to4 and pack4to1to4 make sense ?
if (packing == 1)
{
out_packing = 1;
out_elemsize = elemsize / packing;
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, top_h, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, top_h / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_transfer_barrier(top_blob);
cmd.record_prepare_compute_barrier(top_blob);

int dstOffset = 0;
int hoffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

int size = w * bottom_blob.h * bottom_blob.elemsize;

VkBufferCopy region;
region.srcOffset = bottom_blob.buffer_offset();
region.dstOffset = top_blob.buffer_offset() + dstOffset;
region.size = size;
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = hoffset;

const Pipeline* pipeline = 0;
if (packing == 1 && out_packing == 1)
{
pipeline = pipeline_concat;
}
else if (packing == 4 && out_packing == 4)
{
pipeline = pipeline_concat_pack4;
}
else if (packing == 4 && out_packing == 1)
{
pipeline = pipeline_concat_pack4to1;
}

cmd.record_prepare_transfer_barrier(bottom_blob);
cmd.record_copy_region(bottom_blob, top_blob, region);
// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

dstOffset += size;
hoffset += bottom_blob.h * bottom_blob.packing / out_packing;
}

return 0;
@@ -351,6 +476,8 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>&
{
// interleave image row
int h = bottom_blobs[0].h;
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;

// total width
int top_w = 0;
@@ -365,32 +492,37 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>&
if (top_blob.empty())
return -100;

cmd.record_prepare_transfer_barrier(top_blob);

int dstOffset_0 = 0;
cmd.record_prepare_compute_barrier(top_blob);

int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

int size = bottom_blob.w * bottom_blob.elemsize;

int dstOffset = dstOffset_0;

std::vector<VkBufferCopy> regions(h);
for (int i=0; i<h; i++)
{
regions[i].srcOffset = bottom_blob.buffer_offset();
regions[i].dstOffset = top_blob.buffer_offset() + dstOffset;
regions[i].size = size;

dstOffset += top_blob.w * top_blob.elemsize;
}

cmd.record_prepare_transfer_barrier(bottom_blob);
cmd.record_copy_regions(bottom_blob, top_blob, regions);

dstOffset_0 += size;
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = packing == 4 ? pipeline_concat_pack4 : pipeline_concat;

// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

woffset += bottom_blob.w;
}

return 0;
@@ -403,36 +535,75 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>&
int h = bottom_blobs[0].h;

// total channels
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;
int top_channels = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];
top_channels += bottom_blob.c;
elemsize = std::min(elemsize, bottom_blob.elemsize);
packing = std::min(packing, bottom_blob.packing);
top_channels += bottom_blob.c * bottom_blob.packing;
}

int out_packing = top_channels % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

// TODO pack1to4 and pack4to1to4 make sense ?
if (packing == 1)
{
out_packing = 1;
out_elemsize = elemsize / packing;
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, h, top_channels, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(w, h, top_channels / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_transfer_barrier(top_blob);
cmd.record_prepare_compute_barrier(top_blob);

int dstOffset = 0;
int coffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

int size = bottom_blob.total() * bottom_blob.elemsize;

VkBufferCopy region;
region.srcOffset = bottom_blob.buffer_offset();
region.dstOffset = top_blob.buffer_offset() + dstOffset;
region.size = size;
std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = coffset;

const Pipeline* pipeline = 0;
if (packing == 1 && out_packing == 1)
{
pipeline = pipeline_concat;
}
else if (packing == 4 && out_packing == 4)
{
pipeline = pipeline_concat_pack4;
}
else if (packing == 4 && out_packing == 1)
{
pipeline = pipeline_concat_pack4to1;
}

cmd.record_prepare_transfer_barrier(bottom_blob);
cmd.record_copy_region(bottom_blob, top_blob, region);
// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

dstOffset += size;
coffset += bottom_blob.c * bottom_blob.packing / out_packing;
}

return 0;
@@ -440,12 +611,116 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>&

if (dims == 3 && axis == 1)
{
// TODO
// interleave dim height
int w = bottom_blobs[0].w;
int channels = bottom_blobs[0].c;
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;

// total height
int top_h = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];
top_h += bottom_blob.h;
}

VkMat& top_blob = top_blobs[0];
top_blob.create(w, top_h, channels, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_compute_barrier(top_blob);

int hoffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = hoffset;

const Pipeline* pipeline = packing == 4 ? pipeline_concat_pack4 : pipeline_concat;

// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

hoffset += bottom_blob.h;
}

return 0;
}

if (dims == 3 && axis == 2)
{
// TODO
// interleave dim width
int h = bottom_blobs[0].h;
int channels = bottom_blobs[0].c;
size_t elemsize = bottom_blobs[0].elemsize;
int packing = bottom_blobs[0].packing;

// total width
int top_w = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];
top_w += bottom_blob.w;
}

VkMat& top_blob = top_blobs[0];
top_blob.create(top_w, h, channels, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

cmd.record_prepare_compute_barrier(top_blob);

int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkMat& bottom_blob = bottom_blobs[b];

std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = packing == 4 ? pipeline_concat_pack4 : pipeline_concat;

// record
cmd.record_prepare_compute_barrier(bottom_blob);
cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

woffset += bottom_blob.w;
}

return 0;
}

return 0;


+ 10
- 0
src/layer/concat.h View File

@@ -29,11 +29,21 @@ public:
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

#if NCNN_VULKAN
virtual int create_pipeline();
virtual int destroy_pipeline();

virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

public:
int axis;

#if NCNN_VULKAN
Pipeline* pipeline_concat;
Pipeline* pipeline_concat_pack4;
Pipeline* pipeline_concat_pack4to1;
#endif // NCNN_VULKAN

};

} // namespace ncnn


+ 87
- 0
src/layer/shader/concat.comp View File

@@ -0,0 +1,87 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int axis = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;

int offset;
} p;

// Copies one input blob into its slice of the concatenated output blob.
//
// Each invocation handles the single float at input coordinate (gx, gy, gz)
// and stores it at the matching output coordinate. p.offset is added to the
// coordinate of the concat axis, which is selected by p.dims together with
// the `axis` specialization constant. Input rows/channels are addressed with
// p.w / p.cstep, output rows/channels with p.outw / p.outcstep.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the input extent have nothing to copy
    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

    int v_offset;
    float v;

    if (p.dims == 1) // axis == 0
    {
        v_offset = gx + p.offset;
        v = bottom_blob_data[gx];
    }
    else if (p.dims == 2 && axis == 0)
    {
        // concat along rows
        v_offset = (gy + p.offset) * p.outw + gx;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 2 && axis == 1)
    {
        // concat along columns
        v_offset = gy * p.outw + gx + p.offset;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 0)
    {
        // concat along channels
        v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 1)
    {
        // concat along rows inside each channel
        v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 2)
    {
        // concat along columns inside each channel
        v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else
    {
        // unsupported dims/axis combination: v_offset and v were never
        // assigned, so storing would write an undefined value to an
        // undefined location
        return;
    }

    top_blob_data[v_offset] = v;
}

+ 87
- 0
src/layer/shader/concat_pack4.comp View File

@@ -0,0 +1,87 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int axis = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;

int offset;
} p;

// Copies one input blob into its slice of the concatenated output blob,
// where both buffers are declared as arrays of vec4.
//
// Each invocation handles the single vec4 at input coordinate (gx, gy, gz)
// and stores it unchanged at the matching output coordinate. p.offset is
// added to the coordinate of the concat axis, which is selected by p.dims
// together with the `axis` specialization constant. All indices and strides
// (p.w, p.cstep, p.outw, p.outcstep, p.offset) count vec4 elements.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the input extent have nothing to copy
    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

    int v_offset;
    vec4 v;

    if (p.dims == 1) // axis == 0
    {
        v_offset = gx + p.offset;
        v = bottom_blob_data[gx];
    }
    else if (p.dims == 2 && axis == 0)
    {
        // concat along rows
        v_offset = (gy + p.offset) * p.outw + gx;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 2 && axis == 1)
    {
        // concat along columns
        v_offset = gy * p.outw + gx + p.offset;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 0)
    {
        // concat along channels
        v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 1)
    {
        // concat along rows inside each channel
        v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 2)
    {
        // concat along columns inside each channel
        v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else
    {
        // unsupported dims/axis combination: v_offset and v were never
        // assigned, so storing would write an undefined value to an
        // undefined location
        return;
    }

    top_blob_data[v_offset] = v;
}

+ 90
- 0
src/layer/shader/concat_pack4to1.comp View File

@@ -0,0 +1,90 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int axis = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;

int offset;
} p;

// Copies one vec4-packed input blob into its slice of a float (unpacked)
// output blob.
//
// Each invocation reads the vec4 at input coordinate (gx, gy, gz) and
// scatters its four components to four output floats. The components map to
// four consecutive indices along the outermost input axis (x for 1D, y for
// 2D, z for 3D), i.e. packed index i expands to 4*i + {0,1,2,3}. p.offset is
// added to the coordinate of the concat axis, in units of output floats.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the input extent have nothing to copy
    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

    ivec4 v_offset;
    vec4 v;

    if (p.dims == 1) // axis == 0
    {
        v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3);
        v = bottom_blob_data[gx];
    }
    else if (p.dims == 2 && axis == 0)
    {
        // the packed axis is the row axis, so the unpacked rows come from gy
        // (the original used gx here, unlike the 3D branches which use gz)
        v_offset = (ivec4(gy * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 2 && axis == 1)
    {
        // rows are still unpacked from gy; only the column is shifted
        v_offset = (ivec4(gy * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset;
        v = bottom_blob_data[gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 0)
    {
        v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 1)
    {
        v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else if (p.dims == 3 && axis == 2)
    {
        v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx + p.offset;
        v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
    }
    else
    {
        // unsupported dims/axis combination: v_offset and v were never
        // assigned, so storing would write undefined values to undefined
        // locations
        return;
    }

    top_blob_data[v_offset.r] = v.r;
    top_blob_data[v_offset.g] = v.g;
    top_blob_data[v_offset.b] = v.b;
    top_blob_data[v_offset.a] = v.a;
}

Loading…
Cancel
Save