diff --git a/src/layer/vulkan/crop_vulkan.cpp b/src/layer/vulkan/crop_vulkan.cpp index 6bb16b5cd..7111e79fa 100644 --- a/src/layer/vulkan/crop_vulkan.cpp +++ b/src/layer/vulkan/crop_vulkan.cpp @@ -25,6 +25,8 @@ Crop_vulkan::Crop_vulkan() pipeline_crop = 0; pipeline_crop_pack4 = 0; + pipeline_crop_pack1to4 = 0; + pipeline_crop_pack4to1 = 0; } int Crop_vulkan::create_pipeline(const Option& opt) @@ -45,6 +47,20 @@ int Crop_vulkan::create_pipeline(const Option& opt) pipeline_crop_pack4->create("crop_pack4", opt, specializations, 2, 13); } + // pack1to4 + { + pipeline_crop_pack1to4 = new Pipeline(vkdev); + pipeline_crop_pack1to4->set_optimal_local_size_xyz(); + pipeline_crop_pack1to4->create("crop_pack1to4", opt, specializations, 2, 13); + } + + // pack4to1 + { + pipeline_crop_pack4to1 = new Pipeline(vkdev); + pipeline_crop_pack4to1->set_optimal_local_size_xyz(); + pipeline_crop_pack4to1->create("crop_pack4to1", opt, specializations, 2, 13); + } + return 0; } @@ -56,6 +72,12 @@ int Crop_vulkan::destroy_pipeline(const Option& opt) delete pipeline_crop_pack4; pipeline_crop_pack4 = 0; + delete pipeline_crop_pack1to4; + pipeline_crop_pack1to4 = 0; + + delete pipeline_crop_pack4to1; + pipeline_crop_pack4to1 = 0; + return 0; } @@ -167,19 +189,17 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c } else if (elempack == 4 && out_elempack == 4) { - constants[12].i = _coffset / 4; + constants[12].i = _coffset / 4;// TODO pack4to1to4 pipeline = pipeline_crop_pack4; } else if (elempack == 1 && out_elempack == 4) { - // TODO - return -1; + pipeline = pipeline_crop_pack1to4; } else if (elempack == 4 && out_elempack == 1) { - // TODO - return -1; + pipeline = pipeline_crop_pack4to1; } cmd.record_pipeline(pipeline, bindings, constants, top_blob); @@ -266,19 +286,17 @@ int Crop_vulkan::forward(const std::vector& bottom_blobs, std::vector= p.outw || gy >= p.outh || gz >= p.outc) + return; + + int gi = gz * p.outcstep + gy * p.outw + gx; + + int x = gx + 
p.woffset; + int y = gy + p.hoffset; + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3) + p.coffset; + ivec4 v_offset = z4 * p.cstep + y * p.w + x; + +#if NCNN_fp16_packed + vec2 v0 = vec2(bottom_blob_data[v_offset.r], bottom_blob_data[v_offset.g]); + vec2 v1 = vec2(bottom_blob_data[v_offset.b], bottom_blob_data[v_offset.a]); + + top_blob_data[gi] = uvec2(packHalf2x16(v0), packHalf2x16(v1)); +#else + top_blob_data[gi].r = bottom_blob_data[v_offset.r]; + top_blob_data[gi].g = bottom_blob_data[v_offset.g]; + top_blob_data[gi].b = bottom_blob_data[v_offset.b]; + top_blob_data[gi].a = bottom_blob_data[v_offset.a]; +#endif +} diff --git a/src/layer/vulkan/shader/crop_pack4to1.comp b/src/layer/vulkan/shader/crop_pack4to1.comp new file mode 100644 index 000000000..f17168e7c --- /dev/null +++ b/src/layer/vulkan/shader/crop_pack4to1.comp @@ -0,0 +1,82 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_AMD_gpu_shader_half_float: require
+#endif
+
+// crop: reads a 4-component packed input buffer and writes one scalar output element per invocation
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int woffset;
+    int hoffset;
+    int coffset; // added before the /4 and %4 split in main(), so it is in unpacked z units
+} p;
+
+void main()
+{
+    ivec3 dst = ivec3(gl_GlobalInvocationID);
+    if (any(greaterThanEqual(dst, ivec3(p.outw, p.outh, p.outc))))
+        return;
+
+    // shift the output coordinate by the crop offsets to get the input coordinate
+    ivec3 src = dst + ivec3(p.woffset, p.hoffset, p.coffset);
+    int out_index = dst.z * p.outcstep + dst.y * p.outw + dst.x;
+    int in_index = (src.z / 4) * p.cstep + src.y * p.w + src.x;
+    int lane = src.z % 4;
+
+#if NCNN_fp16_packed
+    vec4 unpacked = sfp2afpvec4(bottom_blob_data[in_index]);
+    // default handles every lane value other than 0, 1 and 2
+    switch (lane)
+    {
+    case 0: top_blob_data[out_index] = unpacked.r; break;
+    case 1: top_blob_data[out_index] = unpacked.g; break;
+    case 2: top_blob_data[out_index] = unpacked.b; break;
+    default: top_blob_data[out_index] = unpacked.a; break;
+    }
+#else
+    switch (lane)
+    {
+    case 0: top_blob_data[out_index] = bottom_blob_data[in_index].r; break;
+    case 1: top_blob_data[out_index] = bottom_blob_data[in_index].g; break;
+    case 2: top_blob_data[out_index] = bottom_blob_data[in_index].b; break;
+    default: top_blob_data[out_index] = bottom_blob_data[in_index].a; break;
+    }
+#endif
+}