|
- // Tencent is pleased to support the open source community by making ncnn available.
- //
- // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #version 450
-
- #if NCNN_fp16_storage
- #extension GL_EXT_shader_16bit_storage: require
- #endif
- #if NCNN_fp16_arithmetic
- #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
- #endif
-
- layout (constant_id = 0) const int type = 1; // padding mode; main() has separate paths for 0 (fill), 1 (clamp index) and 2 (mirror index)
- layout (constant_id = 1) const float value = 0; // fill value stored by the type == 0 path for out-of-range elements
- layout (constant_id = 2) const int per_channel_pad = 0; // when 1, the 3-D type == 0 path fills from per_channel_pad_blob instead of value
- // Shape constants start at id 3, directly after the three option constants above.
- #define shape_constant_id_offset 3
- layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; // input shape: dims, w, h, c, cstep (all default to 0)
- layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
- layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
- layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
- layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
- // Output shape: outdims, outw, outh, outc, outcstep (all default to 0).
- layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
- layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
- layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
- layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
- layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
-
- #if NCNN_image_shader
- layout (binding = 0) uniform unfp sampler3D bottom_blob; // input, read by main()
- layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; // output, written by main()
- layout (binding = 2) uniform unfp sampler3D per_channel_pad_blob; // per-channel fill values, read only by the 3-D type == 0 path when per_channel_pad == 1
- #else
- layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; // input, read by main()
- layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // output, written by main()
- layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; }; // per-channel fill values, indexed by gz in main()
- #endif
-
- layout (push_constant) uniform parameter // runtime shape and pad offsets, accessed as p.* in main()
- { // NOTE(review): presumably psc(x) falls back to p.x when the matching specialization constant is 0 — confirm against the psc macro
- int dims; // input shape: dims, w, h, c, cstep
- int w;
- int h;
- int c;
- int cstep;
- // Output shape: outdims, outw, outh, outc, outcstep.
- int outdims;
- int outw;
- int outh;
- int outc;
- int outcstep;
- // Leading pad offsets; main() subtracts them from the output coordinates to get source coordinates.
- int left;
- int top;
- int front;
- } p;
-
- void main() // Each invocation writes one 4-component output element at (gx, gy, gz); the path taken depends on dims (1/2/3) and type (0/1/2).
- { // NOTE(review): psc(), afpvec4 and the buffer_*/image3d_* helpers are not defined in this file — presumably supplied by the shader preamble; confirm there.
- int gx = int(gl_GlobalInvocationID.x);
- int gy = int(gl_GlobalInvocationID.y);
- int gz = int(gl_GlobalInvocationID.z);
- // Invocations outside the output extent have nothing to write.
- if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
- return;
- // 1-D blob: x is the only axis and its pad offset is applied as p.left / 4.
- if (psc(dims) == 1)
- {
- int x = gx - p.left / 4; // NOTE(review): integer division — presumably the host only uses this shader when the pad is a multiple of 4; confirm
- // type 0: copy the source element when x is in range, otherwise store the fill value.
- if (type == 0)
- {
- if (x >= 0 && x < psc(w))
- {
- #if NCNN_image_shader
- image3d_cp4(top_blob, ivec3(gx, 0, 0), bottom_blob, ivec3(x, 0, 0));
- #else
- buffer_cp4(top_blob_data, gx, bottom_blob_data, x);
- #endif
- }
- else
- {
- afpvec4 v = afpvec4(value);
- #if NCNN_image_shader
- image3d_st4(top_blob, ivec3(gx, 0, 0), v);
- #else
- buffer_st4(top_blob_data, gx, v);
- #endif
- }
- }
- if (type == 1) // type 1: clamp the source index into [0, w - 1]
- {
- x = clamp(x, 0, psc(w) - 1);
- // Load the clamped element; before the source range broadcast its .r component, after it broadcast its .a component.
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, 0, 0));
-
- if (gx < p.left / 4)
- v = afpvec4(v.r);
- else if (gx >= psc(w) + p.left / 4)
- v = afpvec4(v.a);
-
- image3d_st4(top_blob, ivec3(gx, 0, 0), v);
- #else
- afpvec4 v = buffer_ld4(bottom_blob_data, x);
-
- if (gx < p.left / 4)
- v = afpvec4(v.r);
- else if (gx >= psc(w) + p.left / 4)
- v = afpvec4(v.a);
-
- buffer_st4(top_blob_data, gx, v);
- #endif
- }
- if (type == 2) // type 2: mirror the source index about 0 and about w - 1
- {
- x = abs(x);
- // NOTE psc(X) get zeros on nvidia, so the push constant p.w is read directly below
- // TODO only enable this workaround for some nvidia driver
- x = (p.w - 1) - abs(x - (p.w - 1));
- // x = (psc(w) - 1) - abs(x - (psc(w) - 1));
- // Load the mirrored element; in the padded region rebuild it from this element and its neighbour (x - 1 before the source, x + 1 after it).
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, 0, 0));
-
- if (gx < p.left / 4)
- {
- afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x - 1, 0, 0));
- v = afpvec4(v.r, v0.a, v0.b, v0.g); // this element's .r, then the previous element's .a, .b, .g
- }
- else if (gx >= psc(w) + p.left / 4)
- {
- afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x + 1, 0, 0));
- v = afpvec4(v1.b, v1.g, v1.r, v.a); // the next element's .b, .g, .r, then this element's .a
- }
-
- image3d_st4(top_blob, ivec3(gx, 0, 0), v);
- #else
- afpvec4 v = buffer_ld4(bottom_blob_data, x);
-
- if (gx < p.left / 4)
- {
- afpvec4 v0 = buffer_ld4(bottom_blob_data, x - 1);
- v = afpvec4(v.r, v0.a, v0.b, v0.g); // this element's .r, then the previous element's .a, .b, .g
- }
- else if (gx >= psc(w) + p.left / 4)
- {
- afpvec4 v1 = buffer_ld4(bottom_blob_data, x + 1);
- v = afpvec4(v1.b, v1.g, v1.r, v.a); // the next element's .b, .g, .r, then this element's .a
- }
-
- buffer_st4(top_blob_data, gx, v);
- #endif
- }
- }
- else if (psc(dims) == 2) // 2-D blob: x is offset by p.left as-is, y by p.top / 4
- {
- const int gi = gy * psc(outw) + gx; // linear output index used by the buffer path
- // Source coordinates for this output element.
- int x = gx - p.left;
- int y = gy - p.top / 4;
- // type 0: copy in-range elements, otherwise store the fill value.
- if (type == 0)
- {
- if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h))
- {
- #if NCNN_image_shader
- image3d_cp4(top_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(x, y, 0));
- #else
- int v_offset = y * psc(w) + x;
- buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
- #endif
- }
- else
- {
- afpvec4 v = afpvec4(value);
- #if NCNN_image_shader
- image3d_st4(top_blob, ivec3(gx, gy, 0), v);
- #else
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- }
- if (type == 1) // type 1: clamp both source coordinates into range
- {
- x = clamp(x, 0, psc(w) - 1);
- y = clamp(y, 0, psc(h) - 1);
- // Only the y axis gets the component broadcast (.r above the source rows, .a below them).
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, 0));
-
- if (gy < p.top / 4)
- v = afpvec4(v.r);
- else if (gy >= psc(h) + p.top / 4)
- v = afpvec4(v.a);
-
- image3d_st4(top_blob, ivec3(gx, gy, 0), v);
- #else
- int v_offset = y * psc(w) + x;
-
- afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);
-
- if (gy < p.top / 4)
- v = afpvec4(v.r);
- else if (gy >= psc(h) + p.top / 4)
- v = afpvec4(v.a);
-
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- if (type == 2) // type 2: mirror both source coordinates about 0 and about the last index
- {
- x = abs(x);
- y = abs(y);
- // NOTE psc(X) get zeros on nvidia, so the push constants p.w / p.h are read directly below
- // TODO only enable this workaround for some nvidia driver
- x = (p.w - 1) - abs(x - (p.w - 1));
- y = (p.h - 1) - abs(y - (p.h - 1));
- // x = (psc(w) - 1) - abs(x - (psc(w) - 1));
- // y = (psc(h) - 1) - abs(y - (psc(h) - 1));
- // Only the y axis gets the neighbour recombination (row y - 1 above the source, row y + 1 below it).
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, 0));
-
- if (gy < p.top / 4)
- {
- afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y - 1, 0));
- v = afpvec4(v.r, v0.a, v0.b, v0.g);
- }
- else if (gy >= psc(h) + p.top / 4)
- {
- afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y + 1, 0));
- v = afpvec4(v1.b, v1.g, v1.r, v.a);
- }
-
- image3d_st4(top_blob, ivec3(gx, gy, 0), v);
- #else
- int v_offset = y * psc(w) + x;
-
- afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);
-
- if (gy < p.top / 4)
- {
- afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset - psc(w)); // one row back in the linear buffer
- v = afpvec4(v.r, v0.a, v0.b, v0.g);
- }
- else if (gy >= psc(h) + p.top / 4)
- {
- afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + psc(w)); // one row forward in the linear buffer
- v = afpvec4(v1.b, v1.g, v1.r, v.a);
- }
-
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- }
- else // if (psc(dims) == 3): x and y are offset by p.left / p.top as-is, z by p.front / 4
- {
- const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; // linear output index used by the buffer path
- // Source coordinates for this output element.
- int x = gx - p.left;
- int y = gy - p.top;
- int z = gz - p.front / 4;
- // type 0: copy in-range elements; otherwise fill per channel or with the constant value.
- if (type == 0)
- {
- if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h) && z >= 0 && z < psc(c))
- {
- #if NCNN_image_shader
- image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z));
- #else
- int v_offset = z * psc(cstep) + y * psc(w) + x;
- buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset);
- #endif
- }
- else if (per_channel_pad == 1) // out of range: take the fill from per_channel_pad_blob at the output channel index gz
- {
- #if NCNN_image_shader
- image3d_cp4(top_blob, ivec3(gx, gy, gz), per_channel_pad_blob, ivec3(gz, 0, 0));
- #else
- buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz);
- #endif
- }
- else // out of range: constant fill
- {
- afpvec4 v = afpvec4(value);
- #if NCNN_image_shader
- image3d_st4(top_blob, ivec3(gx, gy, gz), v);
- #else
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- }
- if (type == 1) // type 1: clamp all three source coordinates into range
- {
- x = clamp(x, 0, psc(w) - 1);
- y = clamp(y, 0, psc(h) - 1);
- z = clamp(z, 0, psc(c) - 1);
- // Only the z axis gets the component broadcast (.r in front of the source channels, .a behind them).
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z));
-
- if (gz < p.front / 4)
- v = afpvec4(v.r);
- else if (gz >= psc(c) + p.front / 4)
- v = afpvec4(v.a);
-
- image3d_st4(top_blob, ivec3(gx, gy, gz), v);
- #else
- int v_offset = z * psc(cstep) + y * psc(w) + x;
-
- afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);
-
- if (gz < p.front / 4)
- v = afpvec4(v.r);
- else if (gz >= psc(c) + p.front / 4)
- v = afpvec4(v.a);
-
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- if (type == 2) // type 2: mirror all three source coordinates about 0 and about the last index
- {
- x = abs(x);
- y = abs(y);
- z = abs(z);
- // NOTE psc(X) get zeros on nvidia, so the push constants p.w / p.h / p.c are read directly below
- // TODO only enable this workaround for some nvidia driver
- x = (p.w - 1) - abs(x - (p.w - 1));
- y = (p.h - 1) - abs(y - (p.h - 1));
- z = (p.c - 1) - abs(z - (p.c - 1));
- // x = (psc(w) - 1) - abs(x - (psc(w) - 1));
- // y = (psc(h) - 1) - abs(y - (psc(h) - 1));
- // z = (psc(c) - 1) - abs(z - (psc(c) - 1));
- // Only the z axis gets the neighbour recombination (slice z - 1 in front of the source, slice z + 1 behind it).
- #if NCNN_image_shader
- afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z));
-
- if (gz < p.front / 4)
- {
- afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z - 1));
- v = afpvec4(v.r, v0.a, v0.b, v0.g);
- }
- else if (gz >= psc(c) + p.front / 4)
- {
- afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z + 1));
- v = afpvec4(v1.b, v1.g, v1.r, v.a);
- }
-
- image3d_st4(top_blob, ivec3(gx, gy, gz), v);
- #else
- int v_offset = z * psc(cstep) + y * psc(w) + x;
-
- afpvec4 v = buffer_ld4(bottom_blob_data, v_offset);
-
- if (gz < p.front / 4)
- {
- afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset - psc(cstep)); // one channel step back in the linear buffer
- v = afpvec4(v.r, v0.a, v0.b, v0.g);
- }
- else if (gz >= psc(c) + p.front / 4)
- {
- afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + psc(cstep)); // one channel step forward in the linear buffer
- v = afpvec4(v1.b, v1.g, v1.r, v.a);
- }
-
- buffer_st4(top_blob_data, gi, v);
- #endif
- }
- }
- }
|