Browse Source

adreno image shader + fp16 + fp16a (#1714)

* wip

* wip

* fix

* image and imageview cannot be destroyed until command execution ends

* fast copy path for tightly packed data

* wip

* texture load works

* 1d 3d image

* record clone image, multiple commands share one image reference

* upload download image

* layer forward accept vkimagemat

* vkimagemat graph works

* staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader

* vkimagemat elemsize

* convolution test pass

* conv1x1s1 image shader

* fast staging image allocator from host memory, pooling image shader

* convolutiondepthwise image shader

* innerproduct image shader

* packing image shader

* crop deconvolution image shader

* resolve spirv binding types

* image fp16 and fp16a, cast image shader

* eltwise image shader

* wip

* absval image shader

* deconvolutiondepthwise image shader

* concat image shader, squeezenet works

* noop split image shader

* uniform precision hint

* layer support_image_storage

* wip

* vulkan device utility operator

* command is storage and packing option aware

* fallback to cpu on image allocation failed, mobilenetssd works

* flatten image shader, enable more test

* ci test

* check imgfp32 imgfp16 imgfp16a features

* fix ci test

* fix ci test

* upgrade swiftshader

* wip

* opt aggressive

* imgfp16p

* opt none

* convolution winograd image shader

* fix flush range, fast copy path for continuous buffer

* minor fix

* fix innerproduct

* wip ...

* wip

* cast fix

* packing test

* wip

* image fp16p is fp16p

* wip

* silence

* more line info

* code clean

* softmax image shader
tags/20200616
nihui GitHub 6 years ago
parent
commit
62da1228e1
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
100 changed files with 9480 additions and 1114 deletions
  1. +3
    -3
      .github/workflows/linux-x64-gpu-clang.yml
  2. +3
    -3
      .github/workflows/linux-x64-gpu-gcc.yml
  3. +8
    -13
      .github/workflows/macos-x64-gpu.yml
  4. +3
    -3
      .github/workflows/test-coverage.yml
  5. +3
    -3
      .github/workflows/windows-x64-gpu-vs2019.yml
  6. +7
    -2
      benchmark/benchncnn.cpp
  7. +298
    -0
      cmake/ncnn_generate_shader_spv_header.cmake
  8. +4
    -4
      docs/developer-guide/low-level-operation-api.md
  9. +12
    -0
      src/CMakeLists.txt
  10. +927
    -360
      src/allocator.cpp
  11. +74
    -63
      src/allocator.h
  12. +1907
    -438
      src/command.cpp
  13. +40
    -1
      src/command.h
  14. +35
    -6
      src/convert_ycbcr.comp
  15. +469
    -34
      src/gpu.cpp
  16. +57
    -1
      src/gpu.h
  17. +35
    -0
      src/layer.cpp
  18. +13
    -0
      src/layer.h
  19. +16
    -1
      src/layer/input.cpp
  20. +5
    -0
      src/layer/input.h
  21. +6
    -0
      src/layer/noop.cpp
  22. +1
    -0
      src/layer/noop.h
  23. +12
    -2
      src/layer/split.cpp
  24. +1
    -0
      src/layer/split.h
  25. +38
    -1
      src/layer/vulkan/absval_vulkan.cpp
  26. +1
    -0
      src/layer/vulkan/absval_vulkan.h
  27. +115
    -1
      src/layer/vulkan/cast_vulkan.cpp
  28. +1
    -0
      src/layer/vulkan/cast_vulkan.h
  29. +493
    -1
      src/layer/vulkan/concat_vulkan.cpp
  30. +1
    -0
      src/layer/vulkan/concat_vulkan.h
  31. +600
    -93
      src/layer/vulkan/convolution_vulkan.cpp
  32. +6
    -0
      src/layer/vulkan/convolution_vulkan.h
  33. +306
    -5
      src/layer/vulkan/convolutiondepthwise_vulkan.cpp
  34. +4
    -0
      src/layer/vulkan/convolutiondepthwise_vulkan.h
  35. +287
    -2
      src/layer/vulkan/crop_vulkan.cpp
  36. +4
    -0
      src/layer/vulkan/crop_vulkan.h
  37. +245
    -3
      src/layer/vulkan/deconvolution_vulkan.cpp
  38. +4
    -0
      src/layer/vulkan/deconvolution_vulkan.h
  39. +457
    -5
      src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
  40. +4
    -0
      src/layer/vulkan/deconvolutiondepthwise_vulkan.h
  41. +76
    -1
      src/layer/vulkan/eltwise_vulkan.cpp
  42. +1
    -0
      src/layer/vulkan/eltwise_vulkan.h
  43. +96
    -1
      src/layer/vulkan/flatten_vulkan.cpp
  44. +1
    -0
      src/layer/vulkan/flatten_vulkan.h
  45. +121
    -3
      src/layer/vulkan/innerproduct_vulkan.cpp
  46. +4
    -0
      src/layer/vulkan/innerproduct_vulkan.h
  47. +142
    -1
      src/layer/vulkan/packing_vulkan.cpp
  48. +1
    -0
      src/layer/vulkan/packing_vulkan.h
  49. +152
    -2
      src/layer/vulkan/padding_vulkan.cpp
  50. +5
    -0
      src/layer/vulkan/padding_vulkan.h
  51. +197
    -1
      src/layer/vulkan/pooling_vulkan.cpp
  52. +3
    -0
      src/layer/vulkan/pooling_vulkan.h
  53. +40
    -0
      src/layer/vulkan/shader/absval.comp
  54. +40
    -0
      src/layer/vulkan/shader/absval_pack4.comp
  55. +40
    -0
      src/layer/vulkan/shader/absval_pack8.comp
  56. +24
    -0
      src/layer/vulkan/shader/cast_fp16_to_fp32.comp
  57. +24
    -0
      src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp
  58. +24
    -0
      src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp
  59. +24
    -0
      src/layer/vulkan/shader/cast_fp32_to_fp16.comp
  60. +24
    -0
      src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp
  61. +24
    -0
      src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp
  62. +27
    -0
      src/layer/vulkan/shader/concat.comp
  63. +27
    -0
      src/layer/vulkan/shader/concat_pack4.comp
  64. +78
    -0
      src/layer/vulkan/shader/concat_pack4to1.comp
  65. +27
    -0
      src/layer/vulkan/shader/concat_pack8.comp
  66. +102
    -0
      src/layer/vulkan/shader/concat_pack8to1.comp
  67. +67
    -0
      src/layer/vulkan/shader/concat_pack8to4.comp
  68. +38
    -0
      src/layer/vulkan/shader/convolution.comp
  69. +69
    -20
      src/layer/vulkan/shader/convolution_1x1s1d1.comp
  70. +42
    -0
      src/layer/vulkan/shader/convolution_pack1to4.comp
  71. +44
    -0
      src/layer/vulkan/shader/convolution_pack1to8.comp
  72. +47
    -0
      src/layer/vulkan/shader/convolution_pack4.comp
  73. +75
    -16
      src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
  74. +37
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp
  75. +51
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp
  76. +41
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp
  77. +42
    -0
      src/layer/vulkan/shader/convolution_pack4to1.comp
  78. +57
    -0
      src/layer/vulkan/shader/convolution_pack4to8.comp
  79. +57
    -0
      src/layer/vulkan/shader/convolution_pack8.comp
  80. +108
    -15
      src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
  81. +71
    -0
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp
  82. +51
    -0
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp
  83. +41
    -0
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp
  84. +43
    -0
      src/layer/vulkan/shader/convolution_pack8to1.comp
  85. +49
    -0
      src/layer/vulkan/shader/convolution_pack8to4.comp
  86. +35
    -0
      src/layer/vulkan/shader/convolutiondepthwise.comp
  87. +42
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group.comp
  88. +46
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp
  89. +48
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp
  90. +51
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp
  91. +46
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp
  92. +61
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp
  93. +61
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp
  94. +47
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp
  95. +53
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp
  96. +39
    -0
      src/layer/vulkan/shader/convolutiondepthwise_pack4.comp
  97. +41
    -0
      src/layer/vulkan/shader/convolutiondepthwise_pack8.comp
  98. +11
    -2
      src/layer/vulkan/shader/crop.comp
  99. +18
    -2
      src/layer/vulkan/shader/crop_pack1to4.comp
  100. +22
    -2
      src/layer/vulkan/shader/crop_pack1to8.comp

+ 3
- 3
.github/workflows/linux-x64-gpu-clang.yml View File

@@ -27,14 +27,14 @@ jobs:
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install
key: swiftshader-linux-install-20200426-3
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 59465799210b3f4962af1a9dc44a4ffecb422c10
ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
@@ -45,7 +45,7 @@ jobs:
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 ..
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install


+ 3
- 3
.github/workflows/linux-x64-gpu-gcc.yml View File

@@ -27,14 +27,14 @@ jobs:
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install
key: swiftshader-linux-install-20200426-3
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 59465799210b3f4962af1a9dc44a4ffecb422c10
ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
@@ -45,7 +45,7 @@ jobs:
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 ..
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install


+ 8
- 13
.github/workflows/macos-x64-gpu.yml View File

@@ -25,14 +25,14 @@ jobs:
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-macos-install
key: swiftshader-macos-install-20200426-3
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 59465799210b3f4962af1a9dc44a4ffecb422c10
ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
@@ -43,7 +43,7 @@ jobs:
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 ..
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install
@@ -51,16 +51,11 @@ jobs:
run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
# - name: test
# run: |
# find "swiftshader-install/"
# find "vulkansdk-macos-1.1.114.0/"
# export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
# export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
# ./vulkansdk-macos-1.1.114.0/macOS/bin/vulkaninfo
# cd build && ctest --output-on-failure -j 2
# export VK_ICD_FILENAMES="vulkansdk-macos-1.1.114.0/macOS/etc/vulkan/icd.d/MoltenVK_icd.json"
# cd build && ctest --output-on-failure -j 2
- name: test
run: |
export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

macos-clang-gpu-nostdio:
runs-on: macos-latest


+ 3
- 3
.github/workflows/test-coverage.yml View File

@@ -25,14 +25,14 @@ jobs:
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install
key: swiftshader-linux-install-20200426-3
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 59465799210b3f4962af1a9dc44a4ffecb422c10
ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
@@ -43,7 +43,7 @@ jobs:
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 ..
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install


+ 3
- 3
.github/workflows/windows-x64-gpu-vs2019.yml View File

@@ -37,14 +37,14 @@ jobs:
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-windows-install
key: swiftshader-windows-install-20200426-3
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 59465799210b3f4962af1a9dc44a4ffecb422c10
ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
@@ -55,7 +55,7 @@ jobs:
run: |
cd swiftshader
mkdir build-vs2019; cd build-vs2019
cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 ..
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release ..
cmake --build . --config Release -j 2
mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"


+ 7
- 2
benchmark/benchncnn.cpp View File

@@ -188,8 +188,8 @@ int main(int argc, char** argv)

g_vkdev = ncnn::get_gpu_device(gpu_device);

g_blob_vkallocator = new ncnn::VkBlobBufferAllocator(g_vkdev);
g_staging_vkallocator = new ncnn::VkStagingBufferAllocator(g_vkdev);
g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev);
g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev);
}
#endif // NCNN_VULKAN

@@ -214,6 +214,11 @@ int main(int argc, char** argv)
opt.use_int8_storage = true;
opt.use_int8_arithmetic = true;
opt.use_packing_layout = true;
opt.use_shader_pack8 = false;
opt.use_image_storage = false;
opt.use_image_fp16_packed = true;
opt.use_image_fp16_storage = true;
opt.use_image_fp16_arithmetic = true;

ncnn::set_cpu_powersave(powersave);



+ 298
- 0
cmake/ncnn_generate_shader_spv_header.cmake View File

@@ -184,6 +184,296 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER
)
set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)

# image + fp32
set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image")

set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h)
add_custom_command(
OUTPUT ${SHADER_image_SPV_HEX_FILE}
COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4
-Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4

-Dimfmtc1=r32f -Dimfmtc4=rgba32f
-Dunfp=highp

"-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))"
"-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
"-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
"-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}"
"-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
"-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
"-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}"
"-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
"-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"

"-D buffer_ld1(buf,i)=buf[i]"
"-D buffer_st1(buf,i,v)={buf[i]=v;}"
"-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
"-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
"-D buffer_ld2(buf,i)=buf[i]"
"-D buffer_st2(buf,i,v)={buf[i]=v;}"
"-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_ld4(buf,i)=buf[i]"
"-D buffer_st4(buf,i,v)={buf[i]=v;}"
"-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"
"-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
"-D buffer_ld8(buf,i)=buf[i]"
"-D buffer_st8(buf,i,v)={buf[i]=v;}"
"-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
"-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"

"-D sfp2afpmat4(v)=v"
"-D afp2sfpmat4(v)=v"
"-D psc(x)=(x==0?p.x:x)"
-DNCNN_image_shader=1
-V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC}
DEPENDS ${SHADER_SRC}
COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv"
VERBATIM
)
set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)

# image + fp16p
set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p")

set(SHADER_image_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h)
# Compile ${SHADER_SRC} into the image-storage + packed-fp16 SPIR-V hex header.
# Storage types are packed: sfpvec2 is a uint (two halves via packHalf2x16),
# sfpvec4 a uvec2 and sfpvec8 a uvec4; arithmetic types (afp*) remain fp32.
# Every statement-like macro (*_st*, *_cp*) expands to a brace-enclosed block
# whose inner statement must be terminated with ';' so the expansion is valid GLSL.
add_custom_command(
OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE}
COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
-Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4

-Dimfmtc1=r32f -Dimfmtc4=rgba16f
-Dunfp=mediump

"-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))"
"-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
"-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
"-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}"
"-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
"-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
"-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}"
"-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
"-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"

"-D buffer_ld1(buf,i)=buf[i]"
"-D buffer_st1(buf,i,v)={buf[i]=v;}"
"-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"
"-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"
"-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])"
"-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v);}"
"-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"
"-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"
"-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
"-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
"-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"
"-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"
"-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
"-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"

"-D psc(x)=(x==0?p.x:x)"
-DNCNN_image_shader=1 -DNCNN_fp16_packed=1
-V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC}
DEPENDS ${SHADER_SRC}
COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv"
VERBATIM
)
set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)

# image + fp16s
set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s")

set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h)
add_custom_command(
OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE}
COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4
-Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4

-Dimfmtc1=r16f -Dimfmtc4=rgba16f
-Dunfp=mediump

"-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
"-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
"-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
"-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
"-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))"
"-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
"-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
"-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}"
"-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
"-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
"-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}"
"-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
"-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"

"-D buffer_ld1(buf,i)=float(buf[i])"
"-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}"
"-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"
"-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"
"-D buffer_ld2(buf,i)=vec2(buf[i])"
"-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}"
"-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_ld4(buf,i)=vec4(buf[i])"
"-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}"
"-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
"-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"
"-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"
"-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"
"-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"
"-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"
"-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"

"-D sfp2afpmat4(v)=v"
"-D afp2sfpmat4(v)=v"
"-D psc(x)=(x==0?p.x:x)"
-DNCNN_image_shader=1 -DNCNN_fp16_storage=1
-V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
DEPENDS ${SHADER_SRC}
COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv"
VERBATIM
)
set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)

# image + fp16a
set(SHADER_image_fp16a_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16a")

set(SHADER_image_fp16a_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h)
add_custom_command(
OUTPUT ${SHADER_image_fp16a_SPV_HEX_FILE}
COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4
-Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4

-Dimfmtc1=r16f -Dimfmtc4=rgba16f
-Dunfp=mediump

"-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
"-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
"-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
"-D image1d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"
"-D image2d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"
"-D image3d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}"
"-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
"-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
"-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
"-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
"-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
"-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
"-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
"-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"

"-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))"
"-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
"-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
"-D image1d_st8(img,p,v)={imageStore(img,p*2,vec4(v[0]));imageStore(img,p*2+1,vec4(v[1]));}"
"-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"
"-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"
"-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}"
"-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
"-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"

"-D buffer_ld1(buf,i)=buf[i]"
"-D buffer_st1(buf,i,v)={buf[i]=v;}"
"-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
"-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
"-D buffer_ld2(buf,i)=buf[i]"
"-D buffer_st2(buf,i,v)={buf[i]=v;}"
"-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_ld4(buf,i)=buf[i]"
"-D buffer_st4(buf,i,v)={buf[i]=v;}"
"-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
"-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
"-D buffer_ld8(buf,i)=buf[i]"
"-D buffer_st8(buf,i,v)={buf[i]=v;}"
"-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
"-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
"-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
"-D sfp2afpmat4(v)=v"
"-D afp2sfpmat4(v)=v"

"-D psc(x)=(x==0?p.x:x)"
-DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1
-V -s -x -o ${SHADER_image_fp16a_SPV_HEX_FILE} ${SHADER_SRC}
DEPENDS ${SHADER_SRC}
COMMENT "Building SPIR-V module ${SHADER_image_fp16a_SRC_NAME_WE}.spv"
VERBATIM
)
set_source_files_properties(${SHADER_image_fp16a_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)

set(LOCAL_SHADER_SPV_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h)

file(WRITE ${LOCAL_SHADER_SPV_HEADER}
@@ -192,6 +482,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER
"static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
"static const uint32_t ${SHADER_image_fp16a_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h\"\n};\n"
)

set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE)
@@ -202,6 +496,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER
${SHADER_fp16pa_SPV_HEX_FILE}
${SHADER_fp16s_SPV_HEX_FILE}
${SHADER_fp16sa_SPV_HEX_FILE}
${SHADER_image_SPV_HEX_FILE}
${SHADER_image_fp16p_SPV_HEX_FILE}
${SHADER_image_fp16s_SPV_HEX_FILE}
${SHADER_image_fp16a_SPV_HEX_FILE}
)

set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE)


+ 4
- 4
docs/developer-guide/low-level-operation-api.md View File

@@ -141,10 +141,10 @@ ncnn::create_gpu_instance();
{
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

ncnn::VkWeightBufferAllocator g_weight_vkallocator(vkdev);
ncnn::VkBlobBufferAllocator g_blob_vkallocator(vkdev);
ncnn::VkStagingBufferAllocator g_staging_vkallocator(vkdev);
ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev);
ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
ncnn::VkBlobAllocator g_blob_vkallocator(vkdev);
ncnn::VkStagingAllocator g_staging_vkallocator(vkdev);
ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);

// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");


+ 12
- 0
src/CMakeLists.txt View File

@@ -60,6 +60,10 @@ macro(ncnn_add_shader SHADER_SRC)
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16a_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16a_spv_data)},\n")

list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})
@@ -75,6 +79,14 @@ macro(ncnn_add_shader SHADER_SRC)
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16a = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
endmacro()

macro(ncnn_add_layer class)


+ 927
- 360
src/allocator.cpp
File diff suppressed because it is too large
View File


+ 74
- 63
src/allocator.h View File

@@ -199,17 +199,54 @@ public:
int refcount;
};

// Record describing one Vulkan image: the image and image view handles, the
// image description, the device memory it is bound to, and the state that
// command recording and mat reference counting keep track of.
class VkImageMemory
{
public:
// image handle and image view handle
VkImage image;
VkImageView imageview;

// underlying info assigned by allocator
VkImageType image_type;
VkImageViewType imageview_type;
int width;
int height;
int depth;
VkFormat format;

// device memory handle for this image
VkDeviceMemory memory;
// NOTE(review): presumably the host address of the mapped memory when the
// memory is host visible - confirm against the allocator implementation
void* mapped_ptr;

// the base offset assigned by allocator
size_t bind_offset;
// NOTE(review): presumably the size of the bound range - confirm against the allocator implementation
size_t bind_capacity;

// image state, modified by command functions internally
// (mutable so it can be updated through a const VkImageMemory)
mutable VkAccessFlags access_flags;
mutable VkImageLayout image_layout;
mutable VkPipelineStageFlags stage_flags;

// in-execution state, modified by command functions internally
mutable int command_refcount;

// initialized and modified by mat
int refcount;
};

class VkAllocator
{
public:
VkAllocator(const VulkanDevice* _vkdev);
virtual ~VkAllocator() { clear(); }
virtual void clear() {}

virtual VkBufferMemory* fastMalloc(size_t size) = 0;
virtual void fastFree(VkBufferMemory* ptr) = 0;
virtual int flush(VkBufferMemory* ptr);
virtual int invalidate(VkBufferMemory* ptr);

virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) = 0;
virtual void fastFree(VkImageMemory* ptr) = 0;

public:
const VulkanDevice* vkdev;
uint32_t memory_type_index;
@@ -219,14 +256,17 @@ public:
protected:
VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
VkDeviceMemory allocate_memory(size_t size);
VkDeviceMemory allocate_dedicated_memory(size_t size, VkBuffer buffer);
VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image, VkBuffer buffer);

VkImage create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
VkImageView create_imageview(VkImageViewType type, VkImage image, VkFormat format);
};

class VkBlobBufferAllocator : public VkAllocator
class VkBlobAllocator : public VkAllocator
{
public:
VkBlobBufferAllocator(const VulkanDevice* vkdev);
virtual ~VkBlobBufferAllocator();
VkBlobAllocator(const VulkanDevice* vkdev);
virtual ~VkBlobAllocator();

public:
// release all budgets immediately
@@ -234,19 +274,24 @@ public:

virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);

private:
size_t block_size;
size_t buffer_offset_alignment;
std::vector< std::list< std::pair<size_t, size_t> > > budgets;
size_t bind_memory_offset_alignment;
std::vector< std::list< std::pair<size_t, size_t> > > buffer_budgets;
std::vector<VkBufferMemory*> buffer_blocks;
std::vector< std::list< std::pair<size_t, size_t> > > image_memory_budgets;
std::vector<VkDeviceMemory> image_memory_blocks;
};

class VkWeightBufferAllocator : public VkAllocator
class VkWeightAllocator : public VkAllocator
{
public:
VkWeightBufferAllocator(const VulkanDevice* vkdev);
virtual ~VkWeightBufferAllocator();
VkWeightAllocator(const VulkanDevice* vkdev);
virtual ~VkWeightAllocator();

public:
// release all blocks immediately
@@ -255,20 +300,26 @@ public:
public:
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);

private:
size_t block_size;
size_t buffer_offset_alignment;
size_t bind_memory_offset_alignment;
std::vector<size_t> buffer_block_free_spaces;
std::vector<VkBufferMemory*> buffer_blocks;
std::vector<VkBufferMemory*> dedicated_buffer_blocks;
std::vector<size_t> image_memory_block_free_spaces;
std::vector<VkDeviceMemory> image_memory_blocks;
std::vector<VkDeviceMemory> dedicated_image_memory_blocks;
};

class VkStagingBufferAllocator : public VkAllocator
class VkStagingAllocator : public VkAllocator
{
public:
VkStagingBufferAllocator(const VulkanDevice* vkdev);
virtual ~VkStagingBufferAllocator();
VkStagingAllocator(const VulkanDevice* vkdev);
virtual ~VkStagingAllocator();

public:
// ratio range 0 ~ 1
@@ -280,82 +331,42 @@ public:

virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);

private:
unsigned int size_compare_ratio;// 0~256
std::list<VkBufferMemory*> budgets;
std::list<VkBufferMemory*> buffer_budgets;
};

class VkWeightStagingBufferAllocator : public VkAllocator
class VkWeightStagingAllocator : public VkAllocator
{
public:
VkWeightStagingBufferAllocator(const VulkanDevice* vkdev);
virtual ~VkWeightStagingBufferAllocator();
VkWeightStagingAllocator(const VulkanDevice* vkdev);
virtual ~VkWeightStagingAllocator();

public:
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; }
virtual void fastFree(VkImageMemory* /*ptr*/) {}

private:
};

class VkImageMemory
{
public:
VkImage image;
VkImageView imageview;

VkDeviceMemory memory;

// image state, modified by command functions internally
mutable VkAccessFlags access_flags;
mutable VkPipelineStageFlags stage_flags;

// initialize and modified by mat
int refcount;
};

class VkImageAllocator : public VkAllocator
{
public:
VkImageAllocator(const VulkanDevice* _vkdev);
virtual ~VkImageAllocator() { clear(); }
virtual void clear() {}
virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format) = 0;
virtual void fastFree(VkImageMemory* ptr) = 0;

protected:
virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; }
virtual void fastFree(VkBufferMemory* /*ptr*/) {}

protected:
VkImage create_image(int width, int height, VkFormat format, VkImageUsageFlags usage);
VkImageView create_imageview(VkImage image, VkFormat format);
VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image);
};

class VkSimpleImageAllocator : public VkImageAllocator
{
public:
VkSimpleImageAllocator(const VulkanDevice* vkdev);
virtual ~VkSimpleImageAllocator();

public:
virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format);
virtual void fastFree(VkImageMemory* ptr);
};

#if __ANDROID_API__ >= 26
class ImportAndroidHardwareBufferPipeline;
class VkAndroidHardwareBufferImageAllocator : public VkImageAllocator
class VkAndroidHardwareBufferImageAllocator : public VkAllocator
{
public:
VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
virtual ~VkAndroidHardwareBufferImageAllocator();

public:
virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format);
virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; }
virtual void fastFree(VkBufferMemory* /*ptr*/) {}

public:
int init();


+ 1907
- 438
src/command.cpp
File diff suppressed because it is too large
View File


+ 40
- 1
src/command.h View File

@@ -35,18 +35,44 @@ public:
public:
void record_upload(const Mat& src, VkMat& dst, const Option& opt);

void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);

void record_download(const VkMat& src, Mat& dst, const Option& opt);

void record_download(const VkImageMat& src, Mat& dst, const Option& opt);

void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);

void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);

void record_clone(const Mat& src, VkMat& dst, const Option& opt);

void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);

void record_clone(const VkMat& src, Mat& dst, const Option& opt);

void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);

void record_clone(const VkMat& src, VkMat& dst, const Option& opt);

void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);

void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);

void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);

void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);

void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);

#if NCNN_BENCHMARK
void record_write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK

#if __ANDROID_API__ >= 26
void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);

void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
#endif // __ANDROID_API__ >= 26

int submit_and_wait();
@@ -75,8 +101,11 @@ protected:

std::vector<VkMat> upload_staging_buffers;
std::vector<VkMat> download_post_buffers;
std::vector<Mat> download_post_mats_fp16;
std::vector<Mat> download_post_mats;

std::vector<VkImageMemory*> image_blocks_to_destroy;

// the good-old path for device without VK_KHR_push_descriptor
std::vector<VkDescriptorPool> descriptor_pools;
std::vector<VkDescriptorSet> descriptorsets;
@@ -86,6 +115,9 @@ protected:
enum
{
TYPE_copy_buffer,
TYPE_copy_image,
TYPE_copy_buffer_to_image,
TYPE_copy_image_to_buffer,
TYPE_bind_pipeline,
TYPE_bind_descriptorsets,
TYPE_push_constants,
@@ -99,6 +131,7 @@ protected:
#endif // NCNN_BENCHMARK

TYPE_post_download,
TYPE_post_cast_float16_to_float32,
};

int type;
@@ -107,6 +140,9 @@ protected:
union
{
struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer;
struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image;
struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image;
struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer;

struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline;
struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets;
@@ -122,7 +158,8 @@ protected:
struct { uint32_t query; } write_timestamp;
#endif // NCNN_BENCHMARK

struct { uint32_t download_post_buffer_mat_offset; } post_download;
struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download;
struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32;
};
};

@@ -143,6 +180,8 @@ public:
public:
void record_upload(const Mat& src, VkMat& dst, const Option& opt);

void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);

int submit_and_wait();

protected:


+ 35
- 6
src/convert_ycbcr.comp View File

@@ -34,8 +34,13 @@ layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) uniform sampler2D android_hardware_buffer_image;
#if NCNN_image_shader
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D vkmat_blob;
layout (binding = 2, imfmtc4) writeonly uniform unfp image3D vkmat_pack4_blob;
#else
layout (binding = 1) writeonly buffer vkmat_blob { sfp vkmat_blob_data[]; };
layout (binding = 2) writeonly buffer vkmat_pack4_blob { sfpvec4 vkmat_pack4_blob_data[]; };
#endif

void main()
{
@@ -108,51 +113,75 @@ void main()

if (type_to == 1) // PIXEL_RGB
{
#if NCNN_image_shader
image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.r);
image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g);
image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.b);
#else
ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep;

buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.r));
buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g));
buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.b));
#endif
}

if (type_to == 2) // PIXEL_BGR
{
#if NCNN_image_shader
image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.b);
image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g);
image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.r);
#else
ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep;

buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.b));
buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g));
buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.r));
#endif
}

if (type_to == 3) // PIXEL_GRAY
{
int v_offset = gy * outw + gx;

// coeffs for r g b = 0.299f, 0.587f, 0.114f
float v = clamp(rgb.r * 0.299f + rgb.g * 0.587f + rgb.b * 0.114f, 0.f, 255.f);

#if NCNN_image_shader
image3d_st1(vkmat_blob, ivec3(gx, gy, 0), v);
#else
int v_offset = gy * outw + gx;

buffer_st1(vkmat_blob_data, v_offset, afp(v));
#endif
}

if (type_to == 4) // PIXEL_RGBA
{
int v_offset = gy * outw + gx;

vec4 rgba;
rgba.rgb = rgb;
rgba.a = 255.f;

#if NCNN_image_shader
image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba);
#else
int v_offset = gy * outw + gx;

buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba));
#endif
}

if (type_to == 5) // PIXEL_BGRA
{
int v_offset = gy * outw + gx;

vec4 rgba;
rgba.bgr = rgb;
rgba.a = 255.f;

#if NCNN_image_shader
image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba);
#else
int v_offset = gy * outw + gx;

buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba));
#endif
}
}

+ 469
- 34
src/gpu.cpp View File

@@ -27,6 +27,9 @@
#include <vector>

#include "mat.h"
#include "command.h"
#include "layer_type.h"
#include "layer.h"

#if __ANDROID__
#define ENABLE_VALIDATION_LAYER 0
@@ -603,6 +606,10 @@ int create_gpu_instance()
gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment;
gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize;
gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity;
gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D;
gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D;
gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D;

gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod;

@@ -810,6 +817,54 @@ int create_gpu_instance()
gpu_info.support_fp16_arithmetic = true;
}

// check format
gpu_info.support_image_storage = false;
gpu_info.support_image_fp16_packed = false;
gpu_info.support_image_fp16_storage = false;
gpu_info.support_image_fp16_arithmetic = false;
{
VkFormatProperties r32f_formatProperties;
VkFormatProperties rgba32f_formatProperties;
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32_SFLOAT, &r32f_formatProperties);
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32G32B32A32_SFLOAT, &rgba32f_formatProperties);

if ((r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)
&& (r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)
&& (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)
&& (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT))
gpu_info.support_image_storage = true;
}
{
VkFormatProperties rgba16f_formatProperties;
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties);

if ((rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)
&& (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT))
gpu_info.support_image_fp16_packed = true;
}
{
VkFormatProperties r16f_formatProperties;
VkFormatProperties rgba16f_formatProperties;
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16_SFLOAT, &r16f_formatProperties);
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties);

if ((r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)
&& (r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)
&& (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)
&& (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT))
gpu_info.support_image_fp16_storage = true;
}
if (gpu_info.support_fp16_arithmetic)
{
gpu_info.support_image_fp16_arithmetic = true;
}

if (physicalDeviceProperties.vendorID == 0x1ae0 && physicalDeviceProperties.deviceID == 0xc0de)
{
// swiftshader image r16f is not supported
gpu_info.support_image_fp16_storage = false;
}

fprintf(stderr, "[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName,
gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count,
@@ -822,6 +877,10 @@ int create_gpu_instance()
gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);

fprintf(stderr, "[%u %s] imgfp32=%d imgfp16p=%d imgfp16s=%d imgfp16a=%d\n", i, physicalDeviceProperties.deviceName,
gpu_info.support_image_storage, gpu_info.support_image_fp16_packed,
gpu_info.support_image_fp16_storage, gpu_info.support_image_fp16_arithmetic);

gpu_info_index++;
}

@@ -833,7 +892,7 @@ int create_gpu_instance()
// resolve shader info
for (int i=0; i<layer_shader_registry_entry_count; i++)
{
layer_shader_infos[i] = resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size);
resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size, layer_shader_infos[i]);
}

return 0;
@@ -1043,8 +1102,8 @@ VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index])
for (uint32_t i = 0; i < info.compute_queue_count; i++)
{
vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]);
blob_allocators[i] = new VkBlobBufferAllocator(this);
staging_allocators[i] = new VkStagingBufferAllocator(this);
blob_allocators[i] = new VkBlobAllocator(this);
staging_allocators[i] = new VkStagingAllocator(this);
}
if (info.compute_queue_family_index != info.graphics_queue_family_index)
{
@@ -1062,10 +1121,49 @@ VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index])
vkGetDeviceQueue(device, info.transfer_queue_family_index, i, &transfer_queues[i]);
}
}

// prepare immutable texelfetch sampler
{
VkSamplerCreateInfo samplerCreateInfo;
samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
samplerCreateInfo.pNext = 0;
samplerCreateInfo.flags = 0;
samplerCreateInfo.magFilter = VK_FILTER_NEAREST;
samplerCreateInfo.minFilter = VK_FILTER_NEAREST;
samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
samplerCreateInfo.mipLodBias = 0.0f;
samplerCreateInfo.anisotropyEnable = VK_FALSE;
samplerCreateInfo.maxAnisotropy = 1;
samplerCreateInfo.compareEnable = VK_FALSE;
samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER;
samplerCreateInfo.minLod = 0.0f;
samplerCreateInfo.maxLod = 0.0f;
samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
samplerCreateInfo.unnormalizedCoordinates = VK_TRUE;

texelfetch_sampler = 0;
ret = vkCreateSampler(device, &samplerCreateInfo, 0, &texelfetch_sampler);
if (ret != VK_SUCCESS)
{
fprintf(stderr, "vkCreateSampler failed %d\n", ret);
}
}

create_utility_operator();
}

VulkanDevice::~VulkanDevice()
{
destroy_utility_operator();

if (texelfetch_sampler)
{
vkDestroySampler(device, texelfetch_sampler, 0);
}

for (uint32_t i = 0; i < info.compute_queue_count; i++)
{
delete blob_allocators[i];
@@ -1436,40 +1534,69 @@ void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const
fprintf(stderr, "FATAL ERROR! reclaim_staging_allocator get wild allocator %p\n", allocator);
}

static inline bool string_ends_with_fp16p(const char* name)
const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const
{
int len = strlen(name);
if (len < 6)
return false;
return &texelfetch_sampler;
}

return memcmp(name + len - 6, "_fp16p", 6) == 0;
// Forward src to dst through the float32-to-float16 cast utility operator
// that matches the buffer storage options in opt.
void VulkanDevice::cast_float32_to_float16(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 2 when use_fp16_storage, else 1 when use_fp16_packed, else 0
    int slot = 0;
    if (opt.use_fp16_storage)
        slot = 2;
    else if (opt.use_fp16_packed)
        slot = 1;

    uop_cast_float32_to_float16[slot]->forward(src, dst, cmd, opt);
}

static inline bool string_ends_with_fp16pa(const char* name)
void VulkanDevice::cast_float32_to_float16(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const
{
int len = strlen(name);
if (len < 7)
return false;
int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3;
uop_cast_float32_to_float16[uoi]->forward(src, dst, cmd, opt);
}

return memcmp(name + len - 7, "_fp16pa", 7) == 0;
// Forward src to dst through the float16-to-float32 cast utility operator
// that matches the buffer storage options in opt.
void VulkanDevice::cast_float16_to_float32(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 2 when use_fp16_storage, else 1 when use_fp16_packed, else 0
    int slot = 0;
    if (opt.use_fp16_storage)
        slot = 2;
    else if (opt.use_fp16_packed)
        slot = 1;

    uop_cast_float16_to_float32[slot]->forward(src, dst, cmd, opt);
}

static inline bool string_ends_with_fp16s(const char* name)
void VulkanDevice::cast_float16_to_float32(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const
{
int len = strlen(name);
if (len < 6)
return false;
int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3;
uop_cast_float16_to_float32[uoi]->forward(src, dst, cmd, opt);
}

return memcmp(name + len - 6, "_fp16s", 6) == 0;
// Forward src to dst through the pack1 packing utility operator that matches
// the buffer storage options in opt.
void VulkanDevice::packing_pack1(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 2 when use_fp16_storage, else 1 when use_fp16_packed, else 0
    int slot = 0;
    if (opt.use_fp16_storage)
        slot = 2;
    else if (opt.use_fp16_packed)
        slot = 1;

    uop_packing_pack1[slot]->forward(src, dst, cmd, opt);
}

static inline bool string_ends_with_fp16sa(const char* name)
void VulkanDevice::packing_pack1(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const
{
int len = strlen(name);
if (len < 7)
return false;
int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3;
uop_packing_pack1[uoi]->forward(src, dst, cmd, opt);
}

return memcmp(name + len - 7, "_fp16sa", 7) == 0;
// Forward src to dst through the pack4 packing utility operator that matches
// the buffer storage options in opt.
void VulkanDevice::packing_pack4(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 2 when use_fp16_storage, else 1 when use_fp16_packed, else 0
    int slot = 0;
    if (opt.use_fp16_storage)
        slot = 2;
    else if (opt.use_fp16_packed)
        slot = 1;

    uop_packing_pack4[slot]->forward(src, dst, cmd, opt);
}

// Forward src to dst through the pack4 packing utility operator that matches
// the image storage options in opt.
void VulkanDevice::packing_pack4(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 5 when use_image_fp16_storage, else 4 when use_image_fp16_packed, else 3
    int slot = 3;
    if (opt.use_image_fp16_storage)
        slot = 5;
    else if (opt.use_image_fp16_packed)
        slot = 4;

    uop_packing_pack4[slot]->forward(src, dst, cmd, opt);
}

// Forward src to dst through the pack8 packing utility operator that matches
// the buffer storage options in opt.
void VulkanDevice::packing_pack8(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 2 when use_fp16_storage, else 1 when use_fp16_packed, else 0
    int slot = 0;
    if (opt.use_fp16_storage)
        slot = 2;
    else if (opt.use_fp16_packed)
        slot = 1;

    uop_packing_pack8[slot]->forward(src, dst, cmd, opt);
}

// Forward src to dst through the pack8 packing utility operator that matches
// the image storage options in opt.
void VulkanDevice::packing_pack8(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const
{
    // utility operator slot: 5 when use_image_fp16_storage, else 4 when use_image_fp16_packed, else 3
    int slot = 3;
    if (opt.use_image_fp16_storage)
        slot = 5;
    else if (opt.use_image_fp16_packed)
        slot = 4;

    uop_packing_pack8[slot]->forward(src, dst, cmd, opt);
}

int VulkanDevice::create_shader_module()
@@ -1490,28 +1617,56 @@ int VulkanDevice::create_shader_module()
// 2 = fp16pa
// 3 = fp16s
// 4 = fp16sa
// 5 = image
// 6 = image_fp16p
// 7 = image_fp16s
// 8 = image_fp16a

if (!info.support_fp16_packed)
{
if (i % 5 == 1)
if (i % 9 == 1)
continue;
}

if (!info.support_fp16_packed || !info.support_fp16_arithmetic)
{
if (i % 5 == 2)
if (i % 9 == 2)
continue;
}

if (!info.support_fp16_storage)
{
if (i % 5 == 3)
if (i % 9 == 3)
continue;
}

if (!info.support_fp16_storage || !info.support_fp16_arithmetic)
{
if (i % 5 == 4)
if (i % 9 == 4)
continue;
}

if (!info.support_image_storage)
{
if (i % 9 == 5)
continue;
}

if (!info.support_image_storage || !info.support_image_fp16_packed)
{
if (i % 9 == 6)
continue;
}

if (!info.support_image_storage || !info.support_image_fp16_storage)
{
if (i % 9 == 7)
continue;
}

if (!info.support_image_storage || !info.support_image_fp16_storage || !info.support_image_fp16_arithmetic)
{
if (i % 9 == 8)
continue;
}

@@ -1606,6 +1761,214 @@ int VulkanDevice::init_device_extension()
return 0;
}

int VulkanDevice::create_utility_operator()
{
Option opt[6];

opt[0].use_fp16_packed = false;
opt[0].use_fp16_storage = false;
opt[0].use_image_storage = false;
opt[0].use_image_fp16_packed = false;
opt[0].use_image_fp16_storage = false;
opt[0].use_shader_pack8 = true;

opt[1].use_fp16_packed = true;
opt[1].use_fp16_storage = false;
opt[1].use_image_storage = false;
opt[1].use_image_fp16_packed = false;
opt[1].use_image_fp16_storage = false;
opt[1].use_shader_pack8 = true;

opt[2].use_fp16_packed = true;
opt[2].use_fp16_storage = true;
opt[2].use_image_storage = false;
opt[2].use_image_fp16_packed = false;
opt[2].use_image_fp16_storage = false;
opt[2].use_shader_pack8 = true;

opt[3].use_fp16_packed = false;
opt[3].use_fp16_storage = false;
opt[3].use_image_storage = true;
opt[3].use_image_fp16_packed = false;
opt[3].use_image_fp16_storage = false;
opt[3].use_shader_pack8 = true;

opt[4].use_fp16_packed = false;
opt[4].use_fp16_storage = false;
opt[4].use_image_storage = true;
opt[4].use_image_fp16_packed = true;
opt[4].use_image_fp16_storage = false;
opt[4].use_shader_pack8 = true;

opt[5].use_fp16_packed = false;
opt[5].use_fp16_storage = false;
opt[5].use_image_storage = true;
opt[5].use_image_fp16_packed = true;
opt[5].use_image_fp16_storage = true;
opt[5].use_shader_pack8 = true;

for (int i = 0; i < 6; i++)
{
uop_cast_float32_to_float16[i] = 0;
uop_cast_float16_to_float32[i] = 0;
uop_packing_pack1[i] = 0;
uop_packing_pack4[i] = 0;
uop_packing_pack8[i] = 0;

if (i == 1 && !info.support_fp16_packed)
continue;

if (i == 2 && !info.support_fp16_storage)
continue;

if (i == 3 && !info.support_image_storage)
continue;

if (i == 4 && (!info.support_image_storage || !info.support_image_fp16_packed))
continue;

if (i == 5 && (!info.support_image_storage || !info.support_image_fp16_storage))
continue;

{
uop_cast_float32_to_float16[i] = ncnn::create_layer(ncnn::LayerType::Cast);
uop_cast_float32_to_float16[i]->vkdev = this;

ncnn::ParamDict pd;
pd.set(0, 1);
pd.set(1, 2);

uop_cast_float32_to_float16[i]->load_param(pd);
}

{
uop_cast_float16_to_float32[i] = ncnn::create_layer(ncnn::LayerType::Cast);
uop_cast_float16_to_float32[i]->vkdev = this;

ncnn::ParamDict pd;
pd.set(0, 2);
pd.set(1, 1);

uop_cast_float16_to_float32[i]->load_param(pd);
}

{
uop_packing_pack1[i] = ncnn::create_layer(ncnn::LayerType::Packing);
uop_packing_pack1[i]->vkdev = this;

ncnn::ParamDict pd;
pd.set(0, 1);

uop_packing_pack1[i]->load_param(pd);
}

{
uop_packing_pack4[i] = ncnn::create_layer(ncnn::LayerType::Packing);
uop_packing_pack4[i]->vkdev = this;

ncnn::ParamDict pd;
pd.set(0, 4);

uop_packing_pack4[i]->load_param(pd);
}

{
uop_packing_pack8[i] = ncnn::create_layer(ncnn::LayerType::Packing);
uop_packing_pack8[i]->vkdev = this;

ncnn::ParamDict pd;
pd.set(0, 8);

uop_packing_pack8[i]->load_param(pd);
}

uop_cast_float32_to_float16[i]->create_pipeline(opt[i]);
uop_cast_float16_to_float32[i]->create_pipeline(opt[i]);
uop_packing_pack1[i]->create_pipeline(opt[i]);
uop_packing_pack4[i]->create_pipeline(opt[i]);
uop_packing_pack8[i]->create_pipeline(opt[i]);
}

return 0;
}

// Destroy the pipelines of the utility cast/packing operators and free the
// operator layers themselves.
//
// One operator set exists per storage configuration, indexed as
//   0 = fp32, 1 = fp16p, 2 = fp16s, 3 = image, 4 = image_fp16p, 5 = image_fp16s
// Configurations the device does not support are skipped.
// Every released slot is reset to 0 so that no dangling pointer is left
// behind, and empty slots are tolerated instead of being dereferenced.
void VulkanDevice::destroy_utility_operator()
{
    // option table, one entry per storage configuration index
    Option opt[6];

    opt[0].use_fp16_packed = false;
    opt[0].use_fp16_storage = false;
    opt[0].use_image_storage = false;
    opt[0].use_image_fp16_packed = false;
    opt[0].use_image_fp16_storage = false;
    opt[0].use_shader_pack8 = true;

    opt[1].use_fp16_packed = true;
    opt[1].use_fp16_storage = false;
    opt[1].use_image_storage = false;
    opt[1].use_image_fp16_packed = false;
    opt[1].use_image_fp16_storage = false;
    opt[1].use_shader_pack8 = true;

    opt[2].use_fp16_packed = true;
    opt[2].use_fp16_storage = true;
    opt[2].use_image_storage = false;
    opt[2].use_image_fp16_packed = false;
    opt[2].use_image_fp16_storage = false;
    opt[2].use_shader_pack8 = true;

    opt[3].use_fp16_packed = false;
    opt[3].use_fp16_storage = false;
    opt[3].use_image_storage = true;
    opt[3].use_image_fp16_packed = false;
    opt[3].use_image_fp16_storage = false;
    opt[3].use_shader_pack8 = true;

    opt[4].use_fp16_packed = false;
    opt[4].use_fp16_storage = false;
    opt[4].use_image_storage = true;
    opt[4].use_image_fp16_packed = true;
    opt[4].use_image_fp16_storage = false;
    opt[4].use_shader_pack8 = true;

    opt[5].use_fp16_packed = false;
    opt[5].use_fp16_storage = false;
    opt[5].use_image_storage = true;
    opt[5].use_image_fp16_packed = true;
    opt[5].use_image_fp16_storage = true;
    opt[5].use_shader_pack8 = true;

    for (int i = 0; i < 6; i++)
    {
        if (i == 1 && !info.support_fp16_packed)
            continue;

        if (i == 2 && !info.support_fp16_storage)
            continue;

        if (i == 3 && !info.support_image_storage)
            continue;

        if (i == 4 && (!info.support_image_storage || !info.support_image_fp16_packed))
            continue;

        if (i == 5 && (!info.support_image_storage || !info.support_image_fp16_storage))
            continue;

        // release all pipelines first, then the layers, as before
        if (uop_cast_float32_to_float16[i])
            uop_cast_float32_to_float16[i]->destroy_pipeline(opt[i]);
        if (uop_cast_float16_to_float32[i])
            uop_cast_float16_to_float32[i]->destroy_pipeline(opt[i]);
        if (uop_packing_pack1[i])
            uop_packing_pack1[i]->destroy_pipeline(opt[i]);
        if (uop_packing_pack4[i])
            uop_packing_pack4[i]->destroy_pipeline(opt[i]);
        if (uop_packing_pack8[i])
            uop_packing_pack8[i]->destroy_pipeline(opt[i]);

        // deleting a null pointer is a no-op, so no guard is needed here
        delete uop_cast_float32_to_float16[i];
        delete uop_cast_float16_to_float32[i];
        delete uop_packing_pack1[i];
        delete uop_packing_pack4[i];
        delete uop_packing_pack8[i];

        // clear the slots so a later access or a repeated call cannot
        // touch freed memory
        uop_cast_float32_to_float16[i] = 0;
        uop_cast_float16_to_float32[i] = 0;
        uop_packing_pack1[i] = 0;
        uop_packing_pack4[i] = 0;
        uop_packing_pack8[i] = 0;
    }
}

VulkanDevice* get_gpu_device(int device_index)
{
if (device_index < 0 || device_index >= g_gpu_count)
@@ -1630,16 +1993,30 @@ const ShaderInfo& get_shader_info(int shader_type_index)
return layer_shader_infos[shader_type_index];
}

ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size)
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{
shader_info.specialization_count = 0;
shader_info.binding_count = 0;
shader_info.push_constant_count = 0;

uint32_t parameter_id = -233;

int specialization_count = 0;
int binding_count = 0;
int push_constant_count = 0;

// id -> binding_type
std::vector<int> id_types;

// binding_id -> binding_type
std::vector<int> binding_types;

const uint32_t* p = spv_data;

int bound = p[3];

id_types.resize(bound);

// skip magic version generator bound schema
p += 5;

@@ -1668,28 +2045,86 @@ ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size)
push_constant_count++;
}
}
else if (op == 25) // OpTypeImage
{
uint32_t id = p[1];
id_types[id] = 2;
}
else if (op == 27) // OpTypeSampledImage
{
uint32_t id = p[1];
id_types[id] = 3;
}
else if (op == 32) // OpTypePointer
{
uint32_t id = p[1];
uint32_t storage_class = p[2];
uint32_t type = p[3];
if (storage_class == 0) // UniformConstant
{
id_types[id] = id_types[type];
}
if (storage_class == 2) // Uniform
{
id_types[id] = id_types[type];
}
}
else if (op == 59) // OpVariable
{
uint32_t id = p[1];
uint32_t var_id = p[2];
uint32_t storage_class = p[3];
if (storage_class == 0) // UniformConstant
{
id_types[var_id] = id_types[id];
}
if (storage_class == 2) // Uniform
{
id_types[var_id] = id_types[id];
}
}
else if (op == 71) // OpDecorate
{
uint32_t id = p[1];
uint32_t decoration = p[2];
uint32_t binding_id = p[3];
if (decoration == 1) // SpecId
{
specialization_count++;
}
if (decoration == 3) // BufferBlock
{
id_types[id] = 1;
}
else if (decoration == 33) // Binding
{
binding_count++;
binding_count = std::max(binding_count, (int)binding_id + 1);

binding_types.resize(binding_count);
binding_types[binding_id] = id;
}
}

p += wordcount;
}

ShaderInfo si;
si.specialization_count = specialization_count;
si.binding_count = binding_count;
si.push_constant_count = push_constant_count;
if (binding_count > 16)
{
fprintf(stderr, "too many binding %d\n", binding_count);
return -1;
}

shader_info.specialization_count = specialization_count;
shader_info.binding_count = binding_count;
shader_info.push_constant_count = push_constant_count;

return si;
// resolve binding_types
for (int i=0; i<binding_count; i++)
{
shader_info.binding_types[i] = id_types[ binding_types[i] ];
}

return 0;
}

} // namespace ncnn


+ 57
- 1
src/gpu.h View File

@@ -100,6 +100,10 @@ public:
size_t memory_map_alignment;
size_t buffer_offset_alignment;
size_t non_coherent_atom_size;
size_t buffer_image_granularity;
uint32_t max_image_dimension_1d;
uint32_t max_image_dimension_2d;
uint32_t max_image_dimension_3d;
float timestamp_period;

// runtime
@@ -127,6 +131,12 @@ public:
bool support_int8_storage;
bool support_int8_arithmetic;

// image feature
bool support_image_storage;
bool support_image_fp16_packed;
bool support_image_fp16_storage;
bool support_image_fp16_arithmetic;

// ycbcr conversion feature
bool support_ycbcr_conversion;

@@ -154,6 +164,11 @@ public:
const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());

class VkAllocator;
class VkCompute;
class VkMat;
class VkImageMat;
class Layer;
class Option;
class VulkanDevice
{
public:
@@ -188,6 +203,21 @@ public:
VkAllocator* acquire_staging_allocator() const;
void reclaim_staging_allocator(VkAllocator* allocator) const;

// immutable sampler for texelfetch
const VkSampler* immutable_texelfetch_sampler() const;

// utility operator
void cast_float32_to_float16(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const;
void cast_float32_to_float16(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const;
void cast_float16_to_float32(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const;
void cast_float16_to_float32(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack1(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack1(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack4(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack4(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack8(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const;
void packing_pack8(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const;

// VK_KHR_bind_memory2
PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
@@ -234,6 +264,10 @@ protected:
// device extension
int init_device_extension();

// utility operator
int create_utility_operator();
void destroy_utility_operator();

private:
VkDevice device;
std::vector<VkShaderModule> shader_modules;
@@ -251,6 +285,22 @@ private:
// default staging allocator for each queue
mutable std::vector<VkAllocator*> staging_allocators;
mutable Mutex staging_allocator_lock;

// nearest sampler for texelfetch
VkSampler texelfetch_sampler;

// utility operator
// 0 = fp32
// 1 = fp16p
// 2 = fp16s
// 3 = image
// 4 = image_fp16p
// 5 = image_fp16s
ncnn::Layer* uop_cast_float32_to_float16[6];
ncnn::Layer* uop_cast_float16_to_float32[6];
ncnn::Layer* uop_packing_pack1[6];
ncnn::Layer* uop_packing_pack4[6];
ncnn::Layer* uop_packing_pack8[6];
};

VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
@@ -262,10 +312,16 @@ public:
int specialization_count;
int binding_count;
int push_constant_count;

// 0 = null
// 1 = storage buffer
// 2 = storage image
// 3 = combined image sampler
int binding_types[16];// 16 is large enough I think ...
};

const ShaderInfo& get_shader_info(int shader_type_index);
ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size);
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);

} // namespace ncnn



+ 35
- 0
src/layer.cpp View File

@@ -39,6 +39,7 @@ Layer::Layer()
support_packing = false;

support_bf16_storage = false;
support_image_storage = false;

#if NCNN_VULKAN
vkdev = 0;
@@ -137,6 +138,30 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co
return forward_inplace(top_blob, cmd, opt);
}

// Default multi-blob image forward: for in-place capable layers, clone every
// bottom blob into the matching top blob and run forward_inplace on the clones.
// Returns -1 when the layer does not support in-place operation, otherwise
// the result of forward_inplace.
int Layer::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
    if (support_inplace)
    {
        top_blobs.resize(bottom_blobs.size());

        const int blob_count = (int)top_blobs.size();
        for (int k = 0; k < blob_count; k++)
        {
            cmd.record_clone(bottom_blobs[k], top_blobs[k], opt);
        }

        return forward_inplace(top_blobs, cmd, opt);
    }

    return -1;
}

// Default single-blob image forward: for in-place capable layers, clone the
// bottom blob into the top blob and run forward_inplace on the clone.
// Returns -1 when the layer does not support in-place operation, otherwise
// the result of forward_inplace.
int Layer::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    if (support_inplace)
    {
        cmd.record_clone(bottom_blob, top_blob, opt);

        return forward_inplace(top_blob, cmd, opt);
    }

    return -1;
}

int Layer::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return -1;
@@ -146,6 +171,16 @@ int Layer::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const
{
return -1;
}

// Base-class default for multi-blob in-place image inference.
// Ignores all arguments and always returns -1 (non-success); layers that
// support this path are expected to override it.
int Layer::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return -1;
}

// Base-class default for single-blob in-place image inference.
// Ignores all arguments and always returns -1 (non-success); layers that
// support this path are expected to override it.
int Layer::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return -1;
}
#endif // NCNN_VULKAN

static const layer_registry_entry layer_registry[] =


+ 13
- 0
src/layer.h View File

@@ -73,6 +73,9 @@ public:
// accept bf16
bool support_bf16_storage;

// shader image storage
bool support_image_storage;

public:
// implement inference
// return 0 if success
@@ -95,11 +98,21 @@ public:
virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

// implement inference
// return 0 if success
virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

// implement inplace inference
// return 0 if success
virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;

// implement inplace inference
// return 0 if success
virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;

public:
// assigned immediately after creating this layer
const VulkanDevice* vkdev;


+ 16
- 1
src/layer/input.cpp View File

@@ -22,7 +22,10 @@ Input::Input()
{
one_blob_only = true;
support_inplace = true;
support_vulkan = false;
support_vulkan = true;
support_packing = true;
support_bf16_storage = true;
support_image_storage = true;
}

int Input::load_param(const ParamDict& pd)
@@ -39,4 +42,16 @@ int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons
return 0;
}

#if NCNN_VULKAN
// Buffer-storage in-place forward for the Input layer.
// Does nothing: the blob is left untouched, no command is recorded,
// and 0 (success) is returned.
int Input::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return 0;
}

// Image-storage in-place forward for the Input layer.
// Does nothing: the blob is left untouched, no command is recorded,
// and 0 (success) is returned.
int Input::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return 0;
}
#endif // NCNN_VULKAN

} // namespace ncnn

+ 5
- 0
src/layer/input.h View File

@@ -28,6 +28,11 @@ public:

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

#if NCNN_VULKAN
virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

public:
int w;
int h;


+ 6
- 0
src/layer/noop.cpp View File

@@ -23,6 +23,7 @@ Noop::Noop()
support_inplace = true;
support_vulkan = true;
support_packing = true;
support_image_storage = true;
}

int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
@@ -35,6 +36,11 @@ int Noop::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /
{
return 0;
}

// Image-storage in-place forward for the Noop layer.
// Does nothing: the blobs are left untouched, no command is recorded,
// and 0 (success) is returned.
int Noop::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
return 0;
}
#endif // NCNN_VULKAN

} // namespace ncnn

+ 1
- 0
src/layer/noop.h View File

@@ -28,6 +28,7 @@ public:

#if NCNN_VULKAN
virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN
};



+ 12
- 2
src/layer/split.cpp View File

@@ -25,6 +25,7 @@ Split::Split()
support_vulkan = true;
support_packing = true;
support_bf16_storage = true;
support_image_storage = true;
}

int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const
@@ -41,8 +42,6 @@ int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
#if NCNN_VULKAN
int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
// fprintf(stderr, "Split::forward %p\n", bottom_blobs[0].buffer());

const VkMat& bottom_blob = bottom_blobs[0];
for (size_t i=0; i<top_blobs.size(); i++)
{
@@ -51,6 +50,17 @@ int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& t

return 0;
}

int Split::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
const VkImageMat& bottom_blob = bottom_blobs[0];
for (size_t i=0; i<top_blobs.size(); i++)
{
top_blobs[i] = bottom_blob;
}

return 0;
}
#endif // NCNN_VULKAN

} // namespace ncnn

+ 1
- 0
src/layer/split.h View File

@@ -28,6 +28,7 @@ public:

#if NCNN_VULKAN
virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

public:


+ 38
- 1
src/layer/vulkan/absval_vulkan.cpp View File

@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(AbsVal_vulkan)
AbsVal_vulkan::AbsVal_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_absval = 0;
pipeline_absval_pack4 = 0;
@@ -39,7 +40,19 @@ int AbsVal_vulkan::create_pipeline(const Option& opt)
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

size_t elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
}
@@ -148,4 +161,28 @@ int AbsVal_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const
return 0;
}

// Record the in-place absval shader on an image-storage blob.
// The same image is bound as both shader bindings, and the pipeline variant
// is chosen from the blob's elempack (8, 4, or anything else).
// Always returns 0.
int AbsVal_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const
{
    const int pack = bottom_top_blob.elempack;

    // binding 0 and binding 1 both refer to the blob being processed
    std::vector<VkImageMat> bindings(2);
    bindings[0] = bottom_top_blob;
    bindings[1] = bottom_top_blob;

    // push constants: dims, w, h, c, cstep (cstep is passed as 0 here)
    std::vector<vk_constant_type> constants(5);
    constants[0].i = bottom_top_blob.dims;
    constants[1].i = bottom_top_blob.w;
    constants[2].i = bottom_top_blob.h;
    constants[3].i = bottom_top_blob.c;
    constants[4].i = 0;

    const Pipeline* pipeline = pipeline_absval;
    if (pack == 8)
    {
        pipeline = pipeline_absval_pack8;
    }
    else if (pack == 4)
    {
        pipeline = pipeline_absval_pack4;
    }

    cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);

    return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/absval_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using AbsVal::forward_inplace;
virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;

public:
Pipeline* pipeline_absval;


+ 115
- 1
src/layer/vulkan/cast_vulkan.cpp View File

@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Cast_vulkan)
Cast_vulkan::Cast_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_cast_fp32_to_fp16 = 0;
pipeline_cast_fp32_to_fp16_pack4 = 0;
@@ -49,7 +50,22 @@ int Cast_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -285,4 +301,102 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
return 0;
}

int Cast_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
if (type_from == type_to)
{
top_blob = bottom_blob;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int dims = bottom_blob.dims;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

size_t out_elemsize = elemsize;
if (type_to == 1)
{
// float32
out_elemsize = 4 * elempack;
}
else if (type_to == 2)
{
// float16
out_elemsize = 2 * elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (elempack == 8) out_elemsize = 8*2u;
if (elempack == 4) out_elemsize = 4*2u;
if (elempack == 1) out_elemsize = 4u;
}

if (!opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
// fallback to fp32 :(
out_elemsize = 4 * elempack;
}
}
else if (type_to == 3)
{
// int8
out_elemsize = elempack;
}

if (dims == 1)
{
top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
}
else if (dims == 2)
{
top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
}
else if (dims == 3)
{
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
}
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

const Pipeline* pipeline = 0;

if (type_from == 1 && type_to == 2)
{
pipeline = elempack == 8 ? pipeline_cast_fp32_to_fp16_pack8
: elempack == 4 ? pipeline_cast_fp32_to_fp16_pack4
: pipeline_cast_fp32_to_fp16;
}
if (type_from == 2 && type_to == 1)
{
pipeline = elempack == 8 ? pipeline_cast_fp16_to_fp32_pack8
: elempack == 4 ? pipeline_cast_fp16_to_fp32_pack4
: pipeline_cast_fp16_to_fp32;
}

// TODO more cast type

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/cast_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using Cast::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
Pipeline* pipeline_cast_fp32_to_fp16;


+ 493
- 1
src/layer/vulkan/concat_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Concat_vulkan)
Concat_vulkan::Concat_vulkan()
{
support_vulkan = true;
support_image_storage = true;

packing_pack4 = 0;
packing_pack8 = 0;
@@ -77,7 +78,19 @@ int Concat_vulkan::create_pipeline(const Option& opt)
}

size_t elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
}
@@ -761,4 +774,483 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
return 0;
}

int Concat_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
int dims = bottom_blobs[0].dims;

if (dims == 1) // axis == 0
{
// concat vector
// total length
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;
int top_w = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
elemsize = std::min(elemsize, bottom_blob.elemsize);
elempack = std::min(elempack, bottom_blob.elempack);
top_w += bottom_blob.w * bottom_blob.elempack;
}

int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkImageMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob_unpacked;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0;//top_blob_unpacked.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = 0;
if (bottom_blob.elempack == 1 && elempack == 1)
{
pipeline = pipeline_concat[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 4)
{
pipeline = pipeline_concat_pack4[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 1)
{
pipeline = pipeline_concat_pack4to1[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 8)
{
pipeline = pipeline_concat_pack8[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 4)
{
pipeline = pipeline_concat_pack8to4[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 1)
{
pipeline = pipeline_concat_pack8to1[b%2];
}

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

woffset += bottom_blob.w * bottom_blob.elempack / elempack;
}

// packing
if (elempack < out_elempack)
{
const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4;
packing->forward(top_blob_unpacked, top_blob, cmd, opt);
}

return 0;
}

if (dims == 2 && axis == 0)
{
// concat image
int w = bottom_blobs[0].w;

// total height
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;
int top_h = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
elemsize = std::min(elemsize, bottom_blob.elemsize);
elempack = std::min(elempack, bottom_blob.elempack);
top_h += bottom_blob.h * bottom_blob.elempack;
}

int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkImageMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

int hoffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob_unpacked;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0;//top_blob_unpacked.cstep;
constants[10].i = hoffset;

const Pipeline* pipeline = 0;
if (bottom_blob.elempack == 1 && elempack == 1)
{
pipeline = pipeline_concat[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 4)
{
pipeline = pipeline_concat_pack4[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 1)
{
pipeline = pipeline_concat_pack4to1[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 8)
{
pipeline = pipeline_concat_pack8[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 4)
{
pipeline = pipeline_concat_pack8to4[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 1)
{
pipeline = pipeline_concat_pack8to1[b%2];
}

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

hoffset += bottom_blob.h * bottom_blob.elempack / elempack;
}

// packing
if (elempack < out_elempack)
{
const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4;
packing->forward(top_blob_unpacked, top_blob, cmd, opt);
}

return 0;
}

if (dims == 2 && axis == 1)
{
// interleave image row
int h = bottom_blobs[0].h;
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;

// total width
int top_w = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
top_w += bottom_blob.w;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2]
: elempack == 4 ? pipeline_concat_pack4[b%2]
: pipeline_concat[b%2];

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

woffset += bottom_blob.w;
}

return 0;
}

if (dims == 3 && axis == 0)
{
// concat dim
int w = bottom_blobs[0].w;
int h = bottom_blobs[0].h;

// total channels
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;
int top_channels = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
elemsize = std::min(elemsize, bottom_blob.elemsize);
elempack = std::min(elempack, bottom_blob.elempack);
top_channels += bottom_blob.c * bottom_blob.elempack;
}

int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

VkImageMat top_blob_unpacked = top_blob;
if (elempack < out_elempack)
{
top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

int coffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob_unpacked;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0;//top_blob_unpacked.cstep;
constants[10].i = coffset;

const Pipeline* pipeline = 0;
if (bottom_blob.elempack == 1 && elempack == 1)
{
pipeline = pipeline_concat[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 4)
{
pipeline = pipeline_concat_pack4[b%2];
}
else if (bottom_blob.elempack == 4 && elempack == 1)
{
pipeline = pipeline_concat_pack4to1[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 8)
{
pipeline = pipeline_concat_pack8[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 4)
{
pipeline = pipeline_concat_pack8to4[b%2];
}
else if (bottom_blob.elempack == 8 && elempack == 1)
{
pipeline = pipeline_concat_pack8to1[b%2];
}

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

coffset += bottom_blob.c * bottom_blob.elempack / elempack;
}

// packing
if (elempack < out_elempack)
{
const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4;
packing->forward(top_blob_unpacked, top_blob, cmd, opt);
}

return 0;
}

if (dims == 3 && axis == 1)
{
// interleave dim height
int w = bottom_blobs[0].w;
int channels = bottom_blobs[0].c;
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;

// total height
int top_h = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
top_h += bottom_blob.h;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

int hoffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = hoffset;

const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2]
: elempack == 4 ? pipeline_concat_pack4[b%2]
: pipeline_concat[b%2];

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

hoffset += bottom_blob.h;
}

return 0;
}

if (dims == 3 && axis == 2)
{
// interleave dim width
int h = bottom_blobs[0].h;
int channels = bottom_blobs[0].c;
size_t elemsize = bottom_blobs[0].elemsize;
int elempack = bottom_blobs[0].elempack;

// total width
int top_w = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];
top_w += bottom_blob.w;
}

VkImageMat& top_blob = top_blobs[0];
top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

int woffset = 0;
for (size_t b=0; b<bottom_blobs.size(); b++)
{
const VkImageMat& bottom_blob = bottom_blobs[b];

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(11);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = woffset;

const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2]
: elempack == 4 ? pipeline_concat_pack4[b%2]
: pipeline_concat[b%2];

cmd.record_pipeline(pipeline, bindings, constants, bottom_blob);

woffset += bottom_blob.w;
}

return 0;
}

return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/concat_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using Concat::forward;
virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

public:
ncnn::Layer* packing_pack4;


+ 600
- 93
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Convolution_vulkan)
Convolution_vulkan::Convolution_vulkan()
{
support_vulkan = true;
support_image_storage = true;

padding = 0;

@@ -149,7 +150,22 @@ int Convolution_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -210,22 +226,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
if (is_conv1x1s1d1)
{
pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
if (opt.use_image_storage)
{
Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2));
local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2));
local_size_xyz_local.c = std::min(4, out_shape_packed.c);
}
pipeline_convolution_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local);
}
else
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output));

std::vector<vk_specialization_type> specializations(4 + 8);
specializations[0].i = bias_term;
specializations[1].i = activation_type;
specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[4 + 0].i = shape_bordered_packed.dims;
specializations[4 + 1].i = shape_bordered_packed.cstep / 4;
specializations[4 + 2].i = shape_bordered_packed.c;
specializations[4 + 3].i = shape_bordered_packed.cstep / 4;
specializations[4 + 4].i = out_shape_packed.dims;
specializations[4 + 5].i = out_shape_packed.cstep / 4;
specializations[4 + 6].i = out_shape_packed.c;
specializations[4 + 7].i = out_shape_packed.cstep / 4;

}
pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1, opt, specializations);
}
else
@@ -242,22 +257,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
if (is_conv1x1s1d1)
{
pipeline_convolution_pack4_1x1s1d1 = new Pipeline(vkdev);
if (opt.use_image_storage)
{
Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2));
local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2));
local_size_xyz_local.c = std::min(4, out_shape_packed.c);
}
pipeline_convolution_pack4_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local);
}
else
{
pipeline_convolution_pack4_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 4));

std::vector<vk_specialization_type> specializations(4 + 8);
specializations[0].i = bias_term;
specializations[1].i = activation_type;
specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[4 + 0].i = shape_bordered_packed.dims;
specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h;
specializations[4 + 2].i = shape_bordered_packed.c;
specializations[4 + 3].i = shape_bordered_packed.cstep;
specializations[4 + 4].i = out_shape_packed.dims;
specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h;
specializations[4 + 6].i = out_shape_packed.c;
specializations[4 + 7].i = out_shape_packed.cstep;

}
pipeline_convolution_pack4_1x1s1d1->create(LayerShaderType::convolution_pack4_1x1s1d1, opt, specializations);
}
else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
@@ -419,22 +433,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
if (is_conv1x1s1d1)
{
pipeline_convolution_pack8_1x1s1d1 = new Pipeline(vkdev);
if (opt.use_image_storage)
{
Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2));
local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2));
local_size_xyz_local.c = std::min(4, out_shape_packed.c);
}
pipeline_convolution_pack8_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local);
}
else
{
pipeline_convolution_pack8_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 8));

std::vector<vk_specialization_type> specializations(4 + 8);
specializations[0].i = bias_term;
specializations[1].i = activation_type;
specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[4 + 0].i = shape_bordered_packed.dims;
specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h;
specializations[4 + 2].i = shape_bordered_packed.c;
specializations[4 + 3].i = shape_bordered_packed.cstep;
specializations[4 + 4].i = out_shape_packed.dims;
specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h;
specializations[4 + 6].i = out_shape_packed.c;
specializations[4 + 7].i = out_shape_packed.cstep;

}
pipeline_convolution_pack8_1x1s1d1->create(LayerShaderType::convolution_pack8_1x1s1d1, opt, specializations);
}
else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
@@ -695,6 +708,21 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt)

int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
if (padding)
{
padding->upload_model(cmd, opt);
}

if (winograd_padding)
{
winograd_padding->upload_model(cmd, opt);
}

if (winograd_crop)
{
winograd_crop->upload_model(cmd, opt);
}

const int maxk = kernel_w * kernel_h;
int num_input = weight_data_size / maxk / num_output;

@@ -738,8 +766,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}
}

cmd.record_upload(weight_data_packed, weight_data_gpu, opt);

if (opt.use_image_storage)
{
cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
}
else
{
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
}

bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;

@@ -862,7 +896,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}
}

cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm_image, opt);
}
else
{
cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt);
}
}
}

@@ -952,7 +993,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}
}

cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm_image, opt);
}
else
{
cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt);
}
}
}

@@ -961,7 +1009,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

if (innerproduct)
@@ -1070,6 +1129,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}

bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;

if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16)
{
// winograd23
@@ -1353,7 +1413,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
return 0;
}


top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
@@ -1364,19 +1423,21 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = weight_data_gpu;
bindings[3] = bias_term ? bias_data_gpu : bindings[2];// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

// record
if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.cstep / 4;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep / 4;
constants[4].i = top_blob.dims;
constants[5].i = top_blob.cstep / 4;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep / 4;

VkMat dispatcher;
dispatcher.w = top_blob.cstep / 4;
dispatcher.h = 1;
@@ -1386,16 +1447,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}
else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = top_blob.dims;
constants[5].i = top_blob.w * top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
@@ -1405,16 +1456,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}
else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = top_blob.dims;
constants[5].i = top_blob.w * top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
@@ -1424,18 +1465,484 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}
else
{
std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
const Pipeline* pipeline = 0;
if (elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_convolution;
}
else if (elempack == 4 && out_elempack == 4)
{
pipeline = pipeline_convolution_pack4;
}
else if (elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_convolution_pack1to4;
}
else if (elempack == 4 && out_elempack == 1)
{
pipeline = pipeline_convolution_pack4to1;
}
else if (elempack == 8 && out_elempack == 8)
{
pipeline = pipeline_convolution_pack8;
}
else if (elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_convolution_pack1to8;
}
else if (elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_convolution_pack4to8;
}
else if (elempack == 8 && out_elempack == 4)
{
pipeline = pipeline_convolution_pack8to4;
}
else if (elempack == 8 && out_elempack == 1)
{
pipeline = pipeline_convolution_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}

return 0;
}

int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

// flattened blob, implement as InnerProduct
if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
{
int num_input = weight_data_size / num_output;
if (bottom_blob.w * bottom_blob.elempack == num_input)
{
return innerproduct->forward(bottom_blob, top_blob, cmd, opt);
}
}

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

VkImageMat bottom_blob_bordered = bottom_blob;
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
}
else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
{
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
padding_params[1] = hpad - hpad / 2;
padding_params[2] = wpad / 2;
padding_params[3] = wpad - wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}
else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
{
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
padding_params[1] = hpad / 2;
padding_params[2] = wpad - wpad / 2;
padding_params[3] = wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;

int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;

if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16)
{
// winograd23
int outw_bordered = (outw + 1) / 2 * 2;
int outh_bordered = (outh + 1) / 2 * 2;

int w_bordered = outw_bordered + 2;
int h_bordered = outh_bordered + 2;

int block_x = outw_bordered / 2;
int block_y = outh_bordered / 2;

// pad to 2n+2
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = 0;
padding_params[1] = h_bordered - bottom_blob_bordered.h;
padding_params[2] = 0;
padding_params[3] = w_bordered - bottom_blob_bordered.w;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob_bordered;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}

// transform input
VkImageMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob_bordered;
bindings[1] = bottom_tm_blob;

std::vector<vk_constant_type> constants(7);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0;//bottom_blob_bordered.cstep;
constants[4].i = 0;//bottom_tm_blob.cstep;
constants[5].i = block_x;
constants[6].i = block_y;

VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher);
}

// gemm
VkImageMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = bottom_tm_blob;
bindings[1] = top_tm_blob;
bindings[2] = weight_data_gpu_pack4_tm_image;

std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[1].i = 0;//bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[4].i = 0;//top_tm_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher);
}

// transform output
VkImageMat top_blob_bordered;
{
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob_bordered.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = top_tm_blob;
bindings[1] = top_blob_bordered;
bindings[2] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[1].i = 0;//top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
constants[4].i = top_blob_bordered.w;
constants[5].i = top_blob_bordered.h;
constants[6].i = 0;//top_blob_bordered.cstep;

VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = top_blob_bordered.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher);
}

// crop top_blob
{
VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

crop_params[0] = 0;
crop_params[1] = 0;
crop_params[2] = 0;
crop_params[3] = outw;
crop_params[4] = outh;
crop_params[5] = num_output;

std::vector<VkImageMat> crop_inputs(2);
crop_inputs[0] = top_blob_bordered;
crop_inputs[1] = crop_param_blob;

std::vector<VkImageMat> crop_outputs(1);
winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
}

return 0;
}
if (elempack == 8 && out_elempack == 8 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16)
{
// winograd23
int outw_bordered = (outw + 1) / 2 * 2;
int outh_bordered = (outh + 1) / 2 * 2;

int w_bordered = outw_bordered + 2;
int h_bordered = outh_bordered + 2;

int block_x = outw_bordered / 2;
int block_y = outh_bordered / 2;

// pad to 2n+2
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = 0;
padding_params[1] = h_bordered - bottom_blob_bordered.h;
padding_params[2] = 0;
padding_params[3] = w_bordered - bottom_blob_bordered.w;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob_bordered;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}

// transform input
VkImageMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob_bordered;
bindings[1] = bottom_tm_blob;

std::vector<vk_constant_type> constants(7);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0;//bottom_blob_bordered.cstep;
constants[4].i = 0;//bottom_tm_blob.cstep;
constants[5].i = block_x;
constants[6].i = block_y;

VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher);
}

// gemm
VkImageMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = bottom_tm_blob;
bindings[1] = top_tm_blob;
bindings[2] = weight_data_gpu_pack8_tm_image;

std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[1].i = 0;//bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[4].i = 0;//top_tm_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher);
}

// transform output
VkImageMat top_blob_bordered;
{
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob_bordered.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = top_tm_blob;
bindings[1] = top_blob_bordered;
bindings[2] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[1].i = 0;//top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
constants[4].i = top_blob_bordered.w;
constants[5].i = top_blob_bordered.h;
constants[6].i = 0;//top_blob_bordered.cstep;

VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = top_blob_bordered.c;

cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher);
}

// crop top_blob
{
VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

crop_params[0] = 0;
crop_params[1] = 0;
crop_params[2] = 0;
crop_params[3] = outw;
crop_params[4] = outh;
crop_params[5] = num_output;

std::vector<VkImageMat> crop_inputs(2);
crop_inputs[0] = top_blob_bordered;
crop_inputs[1] = crop_param_blob;

std::vector<VkImageMat> crop_outputs(1);
winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
}

return 0;
}

top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = 0;//bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

// record
if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
VkImageMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = top_blob.c;

cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
}
else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
VkImageMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = top_blob.c;

cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher);
}
else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
{
VkImageMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = top_blob.c;

cmd.record_pipeline(pipeline_convolution_pack8_1x1s1d1, bindings, constants, dispatcher);
}
else
{
const Pipeline* pipeline = 0;
if (elempack == 1 && out_elempack == 1)
{


+ 6
- 0
src/layer/vulkan/convolution_vulkan.h View File

@@ -31,6 +31,7 @@ public:

using Convolution::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
ncnn::Layer* padding;
@@ -38,6 +39,9 @@ public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

VkImageMat weight_data_gpu_image;
VkImageMat bias_data_gpu_image;

Pipeline* pipeline_convolution;
Pipeline* pipeline_convolution_1x1s1d1;
Pipeline* pipeline_convolution_pack4;
@@ -55,12 +59,14 @@ public:
ncnn::Layer* winograd_padding;
ncnn::Layer* winograd_crop;
VkMat weight_data_gpu_pack4_tm;
VkImageMat weight_data_gpu_pack4_tm_image;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output;

// pack8 winograd23
VkMat weight_data_gpu_pack8_tm;
VkImageMat weight_data_gpu_pack8_tm_image;
Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input;
Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_gemm;
Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output;


+ 306
- 5
src/layer/vulkan/convolutiondepthwise_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(ConvolutionDepthWise_vulkan)
ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
{
support_vulkan = true;
support_image_storage = true;

padding = 0;
packing_unpack = 0;
@@ -106,7 +107,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -199,7 +215,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt)

size_t elemsize_g;
size_t out_elemsize_g;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize_g = elempack_g * 2u;
out_elemsize_g = out_elempack_g * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
}
else if (opt.use_image_storage)
{
elemsize_g = elempack_g * 4u;
out_elemsize_g = out_elempack_g * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize_g = elempack_g * 2u;
out_elemsize_g = out_elempack_g * 2u;
@@ -415,6 +446,21 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)

int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
if (padding)
{
padding->upload_model(cmd, opt);
}

if (packing_unpack)
{
packing_unpack->upload_model(cmd, opt);
}

if (packing_pack)
{
packing_pack->upload_model(cmd, opt);
}

const int maxk = kernel_w * kernel_h;
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

@@ -430,12 +476,25 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt

cmd.record_upload(weight_data_packed, weight_data_gpu, opt);

cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -493,14 +552,32 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt
}
}

cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt);
}
else
{
cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt);
}

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack_g);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -730,4 +807,228 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
return 0;
}

int ConvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

VkImageMat bottom_blob_bordered = bottom_blob;
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
}
else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
{
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
padding_params[1] = hpad - hpad / 2;
padding_params[2] = wpad / 2;
padding_params[3] = wpad - wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}
else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
{
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
padding_params[1] = hpad / 2;
padding_params[2] = wpad - wpad / 2;
padding_params[3] = wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;

int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

// depth-wise
if (channels == group / elempack && group / elempack == num_output / elempack)
{
std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = 0;//bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8
: elempack == 4 ? pipeline_convolutiondepthwise_pack4
: pipeline_convolutiondepthwise;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

const int channels_g = channels * elempack / group;
const int num_output_g = num_output / group;

int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack_g == 8) out_elemsize_g = 8*2u;
if (out_elempack_g == 4) out_elemsize_g = 4*2u;
if (out_elempack_g == 1) out_elemsize_g = 4u;
}

// unpacking
VkImageMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
if (elempack > elempack_g)
{
Option opt_pack1 = opt;
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

packing_unpack->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1);
}

VkImageMat top_blob_unpacked = top_blob;
if (out_elempack_g < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered_unpacked;
bindings[1] = top_blob_unpacked;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered_unpacked.dims;
constants[1].i = bottom_blob_bordered_unpacked.w;
constants[2].i = bottom_blob_bordered_unpacked.h;
constants[3].i = bottom_blob_bordered_unpacked.c;
constants[4].i = 0;//bottom_blob_bordered_unpacked.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0;//top_blob_unpacked.cstep;

const Pipeline* pipeline = 0;
if (elempack_g == 1 && out_elempack_g == 1)
{
pipeline = pipeline_convolutiondepthwise_group;
}
else if (elempack_g == 4 && out_elempack_g == 4)
{
pipeline = pipeline_convolutiondepthwise_group_pack4;
}
else if (elempack_g == 1 && out_elempack_g == 4)
{
pipeline = pipeline_convolutiondepthwise_group_pack1to4;
}
else if (elempack_g == 4 && out_elempack_g == 1)
{
pipeline = pipeline_convolutiondepthwise_group_pack4to1;
}
else if (elempack_g == 8 && out_elempack_g == 8)
{
pipeline = pipeline_convolutiondepthwise_group_pack8;
}
else if (elempack_g == 1 && out_elempack_g == 8)
{
pipeline = pipeline_convolutiondepthwise_group_pack1to8;
}
else if (elempack_g == 4 && out_elempack_g == 8)
{
pipeline = pipeline_convolutiondepthwise_group_pack4to8;
}
else if (elempack_g == 8 && out_elempack_g == 4)
{
pipeline = pipeline_convolutiondepthwise_group_pack8to4;
}
else if (elempack_g == 8 && out_elempack_g == 1)
{
pipeline = pipeline_convolutiondepthwise_group_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

// packing
if (out_elempack_g < out_elempack)
{
packing_pack->forward(top_blob_unpacked, top_blob, cmd, opt);
}
else
{
top_blob = top_blob_unpacked;
}

return 0;
}

} // namespace ncnn

+ 4
- 0
src/layer/vulkan/convolutiondepthwise_vulkan.h View File

@@ -31,11 +31,15 @@ public:

using ConvolutionDepthWise::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

VkImageMat weight_data_gpu_image;
VkImageMat bias_data_gpu_image;

ncnn::Layer* padding;
ncnn::Layer* packing_unpack;
ncnn::Layer* packing_pack;


+ 287
- 2
src/layer/vulkan/crop_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Crop_vulkan)
Crop_vulkan::Crop_vulkan()
{
support_vulkan = true;
support_image_storage = true;

packing_pack1 = 0;
packing_pack4 = 0;
@@ -104,7 +105,22 @@ int Crop_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -134,7 +150,19 @@ int Crop_vulkan::create_pipeline(const Option& opt)
if (bottom_shapes.size() == 1 && shape.dims != 0 && elempack == out_elempack && elempack > offset_elempack)
{
size_t offset_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
offset_elemsize = offset_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
offset_elemsize = offset_elempack == 1 ? 4u : offset_elempack * 2u;
}
else if (opt.use_image_storage)
{
offset_elemsize = offset_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
offset_elemsize = offset_elempack * 2u;
}
@@ -598,4 +626,261 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM
return 0;
}

int Crop_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int dims = bottom_blob.dims;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int _woffset, _hoffset, _coffset;
int _outw, _outh, _outc;
resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);

// TODO vec and image crop

if (dims == 3)
{
int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1;

int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

// unpacking
VkImageMat bottom_blob_unpacked = bottom_blob;
if (elempack == out_elempack && elempack > offset_elempack)
{
Option opt_pack1 = opt;
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

const Layer* packing = offset_elempack == 4 ? packing_pack4 : packing_pack1;
packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
}

top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob_unpacked;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(13);
constants[0].i = bottom_blob_unpacked.dims;
constants[1].i = bottom_blob_unpacked.w;
constants[2].i = bottom_blob_unpacked.h;
constants[3].i = bottom_blob_unpacked.c;
constants[4].i = 0;//bottom_blob_unpacked.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = _woffset;
constants[11].i = _hoffset;
constants[12].i = _coffset;

const Pipeline* pipeline = 0;
if (elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_crop;
}
else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4)
{
constants[12].i = _coffset / 4;

pipeline = pipeline_crop_pack4;
}
else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_crop_pack1to4;
}
else if (elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_crop_pack1to4;
}
else if (elempack == 4 && out_elempack == 1)
{
pipeline = pipeline_crop_pack4to1;
}
else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8)
{
constants[12].i = _coffset / 8;

pipeline = pipeline_crop_pack8;
}
else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_crop_pack4to8;
}
else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_crop_pack1to8;
}
else if (elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_crop_pack1to8;
}
else if (elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_crop_pack4to8;
}
else if (elempack == 8 && out_elempack == 4)
{
pipeline = pipeline_crop_pack8to4;
}
else if (elempack == 8 && out_elempack == 1)
{
pipeline = pipeline_crop_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}

return 0;
}

// Image-storage (VkImageMat) crop driven by a second input blob.
//
// bottom_blobs[0] is the blob to crop. bottom_blobs[1] supplies the region:
// when woffset == -233 its host-mapped contents are read as an int array of
// crop parameters, otherwise only its shape is used as the reference.
//
// Only 3-dimensional inputs are handled here; for any other dims nothing is
// recorded and top_blobs is left untouched (see the TODO below).
//
// bottom_blobs  { input, reference-or-parameter blob }
// top_blobs     top_blobs[0] receives the cropped blob
// cmd           command recorder the crop pipeline is appended to
// opt           runtime options (allocators, pack8 and image fp16 flags)
//
// Returns 0 on success, -100 when a needed blob comes back empty.
int Crop_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
    const VkImageMat& bottom_blob = bottom_blobs[0];
    const VkImageMat& reference_blob = bottom_blobs[1];

    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int _woffset, _hoffset, _coffset;
    int _outw, _outh, _outc;
    if (woffset == -233)
    {
        resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob.mapped(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
    }

    // TODO vec and image crop

    if (dims == 3)
    {
        // widest packing the channel offset is aligned to
        int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1;

        // The offset packing can never usefully exceed the input packing.
        // Without this clamp, e.g. elempack == 4 with _coffset == 8 and
        // _outc == 4 (pack8 enabled) matches none of the branches below and
        // a null pipeline would be recorded.
        if (offset_elempack > elempack)
            offset_elempack = elempack;

        int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        // fp16-packed images (without fp16 storage) use 2 bytes per lane when packed, 4 bytes for pack1
        if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
        {
            if (out_elempack == 8) out_elemsize = 8*2u;
            if (out_elempack == 4) out_elemsize = 4*2u;
            if (out_elempack == 1) out_elemsize = 4u;
        }

        // unpacking
        // needed when input and output share a packing but the channel offset
        // is not aligned to it: drop the input to the offset's packing first
        VkImageMat bottom_blob_unpacked = bottom_blob;
        if (elempack == out_elempack && elempack > offset_elempack)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

            const Layer* packing = offset_elempack == 4 ? packing_pack4 : packing_pack1;
            packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        VkImageMat& top_blob = top_blobs[0];

        top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        std::vector<VkImageMat> bindings(2);
        bindings[0] = bottom_blob_unpacked;
        bindings[1] = top_blob;

        std::vector<vk_constant_type> constants(13);
        constants[0].i = bottom_blob_unpacked.dims;
        constants[1].i = bottom_blob_unpacked.w;
        constants[2].i = bottom_blob_unpacked.h;
        constants[3].i = bottom_blob_unpacked.c;
        constants[4].i = 0;//bottom_blob_unpacked.cstep;
        constants[5].i = top_blob.dims;
        constants[6].i = top_blob.w;
        constants[7].i = top_blob.h;
        constants[8].i = top_blob.c;
        constants[9].i = 0;//top_blob.cstep;
        constants[10].i = _woffset;
        constants[11].i = _hoffset;
        constants[12].i = _coffset;

        const Pipeline* pipeline = 0;
        if (elempack == 1 && out_elempack == 1)
        {
            pipeline = pipeline_crop;
        }
        else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4)
        {
            // same-pack pipeline takes the channel offset divided by the pack size
            constants[12].i = _coffset / 4;

            pipeline = pipeline_crop_pack4;
        }
        else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4)
        {
            pipeline = pipeline_crop_pack1to4;
        }
        else if (elempack == 1 && out_elempack == 4)
        {
            pipeline = pipeline_crop_pack1to4;
        }
        else if (elempack == 4 && out_elempack == 1)
        {
            pipeline = pipeline_crop_pack4to1;
        }
        else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8)
        {
            constants[12].i = _coffset / 8;

            pipeline = pipeline_crop_pack8;
        }
        else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8)
        {
            pipeline = pipeline_crop_pack4to8;
        }
        else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8)
        {
            pipeline = pipeline_crop_pack1to8;
        }
        else if (elempack == 1 && out_elempack == 8)
        {
            pipeline = pipeline_crop_pack1to8;
        }
        else if (elempack == 4 && out_elempack == 8)
        {
            pipeline = pipeline_crop_pack4to8;
        }
        else if (elempack == 8 && out_elempack == 4)
        {
            pipeline = pipeline_crop_pack8to4;
        }
        else if (elempack == 8 && out_elempack == 1)
        {
            pipeline = pipeline_crop_pack8to1;
        }

        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
    }

    return 0;
}

} // namespace ncnn

+ 4
- 0
src/layer/vulkan/crop_vulkan.h View File

@@ -32,6 +32,10 @@ public:

virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

public:
ncnn::Layer* packing_pack1;
ncnn::Layer* packing_pack4;


+ 245
- 3
src/layer/vulkan/deconvolution_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Deconvolution_vulkan)
Deconvolution_vulkan::Deconvolution_vulkan()
{
support_vulkan = true;
support_image_storage = true;

crop = 0;
output_pad = 0;
@@ -130,7 +131,22 @@ int Deconvolution_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -316,6 +332,21 @@ int Deconvolution_vulkan::destroy_pipeline(const Option& opt)

int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
if (crop)
{
crop->upload_model(cmd, opt);
}

if (output_pad)
{
output_pad->upload_model(cmd, opt);
}

if (output_crop)
{
output_crop->upload_model(cmd, opt);
}

const int maxk = kernel_w * kernel_h;
int num_input = weight_data_size / maxk / num_output;

@@ -376,14 +407,32 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}
}

cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
}
else
{
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
}

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -582,4 +631,197 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
return 0;
}

// Image-storage (VkImageMat) forward pass for deconvolution.
//
// Records the deconvolution pipeline into an intermediate "bordered" output,
// then post-processes it according to the layer parameters:
//   - explicit pads (> 0): optional output padding, then crop to the
//     bordered size minus the pads;
//   - output_w / output_h given: optional output padding, then crop using a
//     host-mapped parameter blob filled for the -233 / -234 pad sentinels;
//   - otherwise: optional output padding, or the bordered blob is the result.
//
// bottom_blob  input image blob
// top_blob     receives the final output blob
// cmd          command recorder the pipelines are appended to
// opt          runtime options (allocators, pack8 and image fp16 flags)
//
// Returns 0 on success, -100 when a needed blob comes back empty.
int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    // fp16-packed images (without fp16 storage) use 2 bytes per lane when packed, 4 bytes for pack1
    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
    {
        if (out_elempack == 8) out_elemsize = 8*2u;
        if (out_elempack == 4) out_elemsize = 4*2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    // the bordered result is only a temporary when some post-processing step follows
    VkImageMat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
    }
    else
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    std::vector<VkImageMat> bindings(4);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob_bordered;
    bindings[2] = weight_data_gpu_image;
    bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob.dims;
    constants[1].i = bottom_blob.w;
    constants[2].i = bottom_blob.h;
    constants[3].i = bottom_blob.c;
    constants[4].i = 0;//bottom_blob.cstep;
    constants[5].i = top_blob_bordered.dims;
    constants[6].i = top_blob_bordered.w;
    constants[7].i = top_blob_bordered.h;
    constants[8].i = top_blob_bordered.c;
    constants[9].i = 0;//top_blob_bordered.cstep;

    // one pipeline per (input pack, output pack) pair; all nine pairs of {1,4,8} are covered
    const Pipeline* pipeline = 0;
    if (elempack == 1 && out_elempack == 1)
    {
        pipeline = pipeline_deconvolution;
    }
    else if (elempack == 4 && out_elempack == 4)
    {
        pipeline = pipeline_deconvolution_pack4;
    }
    else if (elempack == 1 && out_elempack == 4)
    {
        pipeline = pipeline_deconvolution_pack1to4;
    }
    else if (elempack == 4 && out_elempack == 1)
    {
        pipeline = pipeline_deconvolution_pack4to1;
    }
    else if (elempack == 8 && out_elempack == 8)
    {
        pipeline = pipeline_deconvolution_pack8;
    }
    else if (elempack == 1 && out_elempack == 8)
    {
        pipeline = pipeline_deconvolution_pack1to8;
    }
    else if (elempack == 4 && out_elempack == 8)
    {
        pipeline = pipeline_deconvolution_pack4to8;
    }
    else if (elempack == 8 && out_elempack == 4)
    {
        pipeline = pipeline_deconvolution_pack8to4;
    }
    else if (elempack == 8 && out_elempack == 1)
    {
        pipeline = pipeline_deconvolution_pack8to1;
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);

    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        VkImageMat top_blob_bordered_adj = top_blob_bordered;
        if (output_pad_right > 0 || output_pad_bottom > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
            output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
            if (top_blob_bordered_adj.empty())
                return -100;
        }

        {
            // shape-only reference blob: carries the target w / h for the crop layer
            VkImageMat reference_blob;
            reference_blob.dims = 2;
            reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right;
            reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom;
            reference_blob.elempack = 1;

            std::vector<VkImageMat> crop_bottom_blobs(2);
            crop_bottom_blobs[0] = top_blob_bordered_adj;
            crop_bottom_blobs[1] = reference_blob;
            std::vector<VkImageMat> crop_top_blobs(1);
            crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
            top_blob = crop_top_blobs[0];
        }
        if (top_blob.empty())
            return -100;
    }
    else if (output_w > 0 && output_h > 0)
    {
        VkImageMat top_blob_bordered_adj = top_blob_bordered;
        if (output_pad_right > 0 || output_pad_bottom > 0)
        {
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
            output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
            if (top_blob_bordered_adj.empty())
                return -100;
        }

        int wcut = top_blob_bordered_adj.w - output_w;
        int hcut = top_blob_bordered_adj.h - output_h;

        // Six ints are written below (three offsets, three output sizes), so
        // the blob must hold six elements; it was previously created with
        // four, making crop_params[4] and crop_params[5] out-of-bounds writes.
        VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
        if (crop_param_blob.empty())
            return -100;

        int* crop_params = crop_param_blob.mapped();

        // NOTE(review): when the pads carry neither sentinel, crop_params is
        // passed to the crop layer unfilled -- confirm whether that
        // configuration can reach this branch.
        if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
        {
            // onnx padding=SAME_UPPER
            crop_params[0] = wcut / 2;
            crop_params[1] = hcut / 2;
            crop_params[2] = 0;
            crop_params[3] = top_blob_bordered_adj.w - wcut;
            crop_params[4] = top_blob_bordered_adj.h - hcut;
            crop_params[5] = top_blob_bordered_adj.c;
        }
        else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
        {
            // onnx padding=SAME_LOWER
            crop_params[0] = wcut - wcut / 2;
            crop_params[1] = hcut - hcut / 2;
            crop_params[2] = 0;
            crop_params[3] = top_blob_bordered_adj.w - wcut;
            crop_params[4] = top_blob_bordered_adj.h - hcut;
            crop_params[5] = top_blob_bordered_adj.c;
        }

        std::vector<VkImageMat> crop_inputs(2);
        crop_inputs[0] = top_blob_bordered_adj;
        crop_inputs[1] = crop_param_blob;

        std::vector<VkImageMat> crop_outputs(1);
        output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
        top_blob = crop_outputs[0];
        if (top_blob.empty())
            return -100;
    }
    else
    {
        if (output_pad_right > 0 || output_pad_bottom > 0)
        {
            output_pad->forward(top_blob_bordered, top_blob, cmd, opt);
            if (top_blob.empty())
                return -100;
        }
        else
        {
            top_blob = top_blob_bordered;
        }
    }

    return 0;
}

} // namespace ncnn

+ 4
- 0
src/layer/vulkan/deconvolution_vulkan.h View File

@@ -31,11 +31,15 @@ public:

using Deconvolution::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

VkImageMat weight_data_gpu_image;
VkImageMat bias_data_gpu_image;

ncnn::Layer* crop;
ncnn::Layer* output_pad;
ncnn::Layer* output_crop;


+ 457
- 5
src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_vulkan)
DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
{
support_vulkan = true;
support_image_storage = true;

crop = 0;
output_pad = 0;
@@ -136,7 +137,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -233,7 +249,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt)

size_t elemsize_g;
size_t out_elemsize_g;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize_g = elempack_g * 2u;
out_elemsize_g = out_elempack_g * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u;
out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u;
}
else if (opt.use_image_storage)
{
elemsize_g = elempack_g * 4u;
out_elemsize_g = out_elempack_g * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize_g = elempack_g * 2u;
out_elemsize_g = out_elempack_g * 2u;
@@ -463,6 +494,31 @@ int DeconvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt)

int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
if (crop)
{
crop->upload_model(cmd, opt);
}

if (output_pad)
{
output_pad->upload_model(cmd, opt);
}

if (output_crop)
{
output_crop->upload_model(cmd, opt);
}

if (packing_unpack)
{
packing_unpack->upload_model(cmd, opt);
}

if (packing_pack)
{
packing_pack->upload_model(cmd, opt);
}

const int maxk = kernel_w * kernel_h;
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

@@ -495,12 +551,25 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o

cmd.record_upload(weight_data_r2_packed, weight_data_gpu, opt);

cmd.record_upload(weight_data_r2_packed, weight_data_gpu_image, opt);

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -558,14 +627,32 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o
}
}

cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt);
}
else
{
cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt);
}

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack_g);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -936,4 +1023,369 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
return 0;
}

int DeconvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

int outw = (w - 1) * stride_w + kernel_extent_w;
int outh = (h - 1) * stride_h + kernel_extent_h;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

VkImageMat top_blob_bordered;
if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
}
else
{
top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
}
if (top_blob_bordered.empty())
return -100;

// depth-wise
if (channels == group / elempack && group / elempack == num_output / elempack)
{
std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob;
bindings[1] = top_blob_bordered;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob_bordered.dims;
constants[6].i = top_blob_bordered.w;
constants[7].i = top_blob_bordered.h;
constants[8].i = top_blob_bordered.c;
constants[9].i = 0;//top_blob_bordered.cstep;

const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
: elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
: pipeline_deconvolutiondepthwise;

// record
cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);

if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
{
VkImageMat top_blob_bordered_adj = top_blob_bordered;
if (output_pad_right > 0 || output_pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
if (top_blob_bordered_adj.empty())
return -100;
}

{
VkImageMat reference_blob;
reference_blob.dims = 2;
reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right;
reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom;
reference_blob.elempack = 1;

std::vector<VkImageMat> crop_bottom_blobs(2);
crop_bottom_blobs[0] = top_blob_bordered_adj;
crop_bottom_blobs[1] = reference_blob;
std::vector<VkImageMat> crop_top_blobs(1);
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
top_blob = crop_top_blobs[0];
}
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else if (output_w > 0 && output_h > 0)
{
VkImageMat top_blob_bordered_adj = top_blob_bordered;
if (output_pad_right > 0 || output_pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
if (top_blob_bordered_adj.empty())
return -100;
}

int wcut = top_blob_bordered_adj.w - output_w;
int hcut = top_blob_bordered_adj.h - output_h;

VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
{
// onnx padding=SAME_UPPER
crop_params[0] = wcut / 2;
crop_params[1] = hcut / 2;
crop_params[2] = 0;
crop_params[3] = top_blob_bordered_adj.w - wcut;
crop_params[4] = top_blob_bordered_adj.h - hcut;
crop_params[5] = top_blob_bordered_adj.c;
}
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
{
// onnx padding=SAME_LOWER
crop_params[0] = wcut - wcut / 2;
crop_params[1] = hcut - hcut / 2;
crop_params[2] = 0;
crop_params[3] = top_blob_bordered_adj.w - wcut;
crop_params[4] = top_blob_bordered_adj.h - hcut;
crop_params[5] = top_blob_bordered_adj.c;
}

std::vector<VkImageMat> crop_inputs(2);
crop_inputs[0] = top_blob_bordered_adj;
crop_inputs[1] = crop_param_blob;

std::vector<VkImageMat> crop_outputs(1);
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
if (output_pad_right > 0 || output_pad_bottom > 0)
{
output_pad->forward(top_blob_bordered, top_blob, cmd, opt);
if (top_blob.empty())
return -100;
}
else
{
top_blob = top_blob_bordered;
}
}

return 0;
}

const int channels_g = channels * elempack / group;
const int num_output_g = num_output / group;

int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
size_t out_elemsize_g = elemsize / elempack * out_elempack_g;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack_g == 8) out_elemsize_g = 8*2u;
if (out_elempack_g == 4) out_elemsize_g = 4*2u;
if (out_elempack_g == 1) out_elemsize_g = 4u;
}

// unpacking
VkImageMat bottom_blob_unpacked = bottom_blob;
if (elempack > elempack_g)
{
Option opt_pack1 = opt;
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

packing_unpack->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
}

VkImageMat top_blob_unpacked = top_blob_bordered;
if (out_elempack_g < out_elempack)
{
top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_unpacked;
bindings[1] = top_blob_unpacked;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_unpacked.dims;
constants[1].i = bottom_blob_unpacked.w;
constants[2].i = bottom_blob_unpacked.h;
constants[3].i = bottom_blob_unpacked.c;
constants[4].i = 0;//bottom_blob_unpacked.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0;//top_blob_unpacked.cstep;

const Pipeline* pipeline = 0;
if (elempack_g == 1 && out_elempack_g == 1)
{
pipeline = pipeline_deconvolutiondepthwise_group;
}
else if (elempack_g == 4 && out_elempack_g == 4)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
}
else if (elempack_g == 1 && out_elempack_g == 4)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
}
else if (elempack_g == 4 && out_elempack_g == 1)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
}
else if (elempack_g == 8 && out_elempack_g == 8)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack8;
}
else if (elempack_g == 1 && out_elempack_g == 8)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
}
else if (elempack_g == 4 && out_elempack_g == 8)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
}
else if (elempack_g == 8 && out_elempack_g == 4)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
}
else if (elempack_g == 8 && out_elempack_g == 1)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

// packing
if (out_elempack_g < out_elempack)
{
packing_pack->forward(top_blob_unpacked, top_blob_bordered, cmd, opt);
}
else
{
top_blob_bordered = top_blob_unpacked;
}

if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
{
VkImageMat top_blob_bordered_adj = top_blob_bordered;
if (output_pad_right > 0 || output_pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
if (top_blob_bordered_adj.empty())
return -100;
}

{
VkImageMat reference_blob;
reference_blob.dims = 2;
reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right;
reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom;
reference_blob.elempack = 1;

std::vector<VkImageMat> crop_bottom_blobs(2);
crop_bottom_blobs[0] = top_blob_bordered_adj;
crop_bottom_blobs[1] = reference_blob;
std::vector<VkImageMat> crop_top_blobs(1);
crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
top_blob = crop_top_blobs[0];
}
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else if (output_w > 0 && output_h > 0)
{
VkImageMat top_blob_bordered_adj = top_blob_bordered;
if (output_pad_right > 0 || output_pad_bottom > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
if (top_blob_bordered_adj.empty())
return -100;
}

int wcut = top_blob_bordered_adj.w - output_w;
int hcut = top_blob_bordered_adj.h - output_h;

VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* crop_params = crop_param_blob.mapped();

if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
{
// onnx padding=SAME_UPPER
crop_params[0] = wcut / 2;
crop_params[1] = hcut / 2;
crop_params[2] = 0;
crop_params[3] = top_blob_bordered_adj.w - wcut;
crop_params[4] = top_blob_bordered_adj.h - hcut;
crop_params[5] = top_blob_bordered_adj.c;
}
else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
{
// onnx padding=SAME_LOWER
crop_params[0] = wcut - wcut / 2;
crop_params[1] = hcut - hcut / 2;
crop_params[2] = 0;
crop_params[3] = top_blob_bordered_adj.w - wcut;
crop_params[4] = top_blob_bordered_adj.h - hcut;
crop_params[5] = top_blob_bordered_adj.c;
}

std::vector<VkImageMat> crop_inputs(2);
crop_inputs[0] = top_blob_bordered_adj;
crop_inputs[1] = crop_param_blob;

std::vector<VkImageMat> crop_outputs(1);
output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
if (top_blob.empty())
return -100;

outw = top_blob.w;
outh = top_blob.h;
}
else
{
if (output_pad_right > 0 || output_pad_bottom > 0)
{
output_pad->forward(top_blob_bordered, top_blob, cmd, opt);
if (top_blob.empty())
return -100;
}
else
{
top_blob = top_blob_bordered;
}
}

return 0;
}

} // namespace ncnn

+ 4
- 0
src/layer/vulkan/deconvolutiondepthwise_vulkan.h View File

@@ -31,11 +31,15 @@ public:

using DeconvolutionDepthWise::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

VkImageMat weight_data_gpu_image;
VkImageMat bias_data_gpu_image;

ncnn::Layer* crop;
ncnn::Layer* output_pad;
ncnn::Layer* output_crop;


+ 76
- 1
src/layer/vulkan/eltwise_vulkan.cpp View File

@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Eltwise_vulkan)
Eltwise_vulkan::Eltwise_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_eltwise[0] = 0;
pipeline_eltwise[1] = 0;
@@ -42,7 +43,19 @@ int Eltwise_vulkan::create_pipeline(const Option& opt)
if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

size_t elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
}
@@ -207,4 +220,66 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
return 0;
}

int Eltwise_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
const VkImageMat& bottom_blob = bottom_blobs[0];
const VkImageMat& bottom_blob1 = bottom_blobs[1];

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

VkImageMat& top_blob = top_blobs[0];
top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = bottom_blob;
bindings[1] = bottom_blob1;
bindings[2] = top_blob;

std::vector<vk_constant_type> constants(5 + 2);
constants[0].i = top_blob.dims;
constants[1].i = top_blob.w;
constants[2].i = top_blob.h;
constants[3].i = top_blob.c;
constants[4].i = 0;//top_blob.cstep;
constants[5].f = coeffs.w == 0 ? 1.f : coeffs[0];
constants[6].f = coeffs.w == 0 ? 1.f : coeffs[1];

const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1]
: elempack == 4 ? pipeline_eltwise_pack4[1]
: pipeline_eltwise[1];

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

for (size_t b=2; b<bottom_blobs.size(); b++)
{
std::vector<VkImageMat> bindings(3);
bindings[0] = top_blob;
bindings[1] = bottom_blobs[b];
bindings[2] = top_blob;// TODO use separated pipeline ?

std::vector<vk_constant_type> constants(5 + 2);
constants[0].i = top_blob.dims;
constants[1].i = top_blob.w;
constants[2].i = top_blob.h;
constants[3].i = top_blob.c;
constants[4].i = 0;//top_blob.cstep;
constants[5].f = 1.f;
constants[6].f = coeffs.w == 0 ? 1 : coeffs[b];

const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b%2]
: elempack == 4 ? pipeline_eltwise_pack4[b%2]
: pipeline_eltwise[b%2];

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
}

return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/eltwise_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using Eltwise::forward;
virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

public:
Pipeline* pipeline_eltwise[2];


+ 96
- 1
src/layer/vulkan/flatten_vulkan.cpp View File

@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Flatten_vulkan)
Flatten_vulkan::Flatten_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_flatten = 0;
pipeline_flatten_pack4 = 0;
@@ -47,7 +48,22 @@ int Flatten_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -256,4 +272,83 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

int Flatten_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int dims = bottom_blob.dims;

if (dims == 1)
{
top_blob = bottom_blob;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int total = w * h * channels * elempack;

int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / elempack * out_elempack;

if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

const Pipeline* pipeline = 0;
if (elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_flatten;
}
else if (elempack == 4 && out_elempack == 4)
{
pipeline = pipeline_flatten_pack4;
}
else if (elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_flatten_pack1to4;
}
else if (elempack == 8 /*&& out_elempack == 8*/)
{
pipeline = pipeline_flatten_pack8;
}
else if (elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_flatten_pack1to8;
}
else if (elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_flatten_pack4to8;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/flatten_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using Flatten::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
Pipeline* pipeline_flatten;


+ 121
- 3
src/layer/vulkan/innerproduct_vulkan.cpp View File

@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(InnerProduct_vulkan)
InnerProduct_vulkan::InnerProduct_vulkan()
{
support_vulkan = true;
support_image_storage = true;

flatten = 0;

@@ -72,7 +73,17 @@ int InnerProduct_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -269,14 +280,32 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}
}

cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
}
else
{
cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
}

if (bias_term)
{
Mat bias_data_packed;
convert_packing(bias_data, bias_data_packed, out_elempack);

cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
if (opt.use_image_storage)
{
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
}
else
{
cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
}
}
else if (opt.use_image_storage)
{
cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
}

return 0;
@@ -371,4 +400,93 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
return 0;
}

// Image-storage forward pass of the fully connected layer.
//
// The input is first run through the flatten sub-layer (allocating from the
// workspace allocator), then a single dispatch writes num_output values into
// top_blob using weight_data_gpu_image and bias_data_gpu_image.
//
// Returns 0 on success, the flatten sub-layer's error code when flattening
// fails, and -100 when the output image cannot be created.
int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    // flatten
    VkImageMat bottom_blob_flattened = bottom_blob;
    {
        Option opt_flatten = opt;
        opt_flatten.blob_vkallocator = opt.workspace_vkallocator;

        // do not continue with a blob the sub-layer failed to produce
        int ret = flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten);
        if (ret != 0)
            return ret;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    // widest output packing that divides num_output (8 only with use_shader_pack8)
    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
    {
        // fp16-packed images: 2 bytes per lane when packed, 4 bytes for pack1
        if (out_elempack == 8) out_elemsize = 8*2u;
        if (out_elempack == 4) out_elemsize = 4*2u;
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    std::vector<VkImageMat> bindings(4);
    bindings[0] = bottom_blob_flattened;
    bindings[1] = top_blob;
    bindings[2] = weight_data_gpu_image;
    bindings[3] = bias_data_gpu_image;// TODO use dummy buffer

    std::vector<vk_constant_type> constants(10);
    // input shape
    constants[0].i = bottom_blob_flattened.dims;
    constants[1].i = bottom_blob_flattened.w;
    constants[2].i = bottom_blob_flattened.h;
    constants[3].i = bottom_blob_flattened.c;
    constants[4].i = 0;//bottom_blob_flattened.cstep;
    // output shape
    constants[5].i = top_blob.dims;
    constants[6].i = top_blob.w;
    constants[7].i = top_blob.h;
    constants[8].i = top_blob.c;
    constants[9].i = 0;//top_blob.cstep;

    // one pipeline per (input packing, output packing) pair
    const Pipeline* pipeline = 0;
    if (elempack == 1 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct;
    }
    else if (elempack == 4 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack4;
    }
    else if (elempack == 1 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack1to4;
    }
    else if (elempack == 4 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack4to1;
    }
    else if (elempack == 8 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack8;
    }
    else if (elempack == 1 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack1to8;
    }
    else if (elempack == 4 && out_elempack == 8)
    {
        pipeline = pipeline_innerproduct_pack4to8;
    }
    else if (elempack == 8 && out_elempack == 4)
    {
        pipeline = pipeline_innerproduct_pack8to4;
    }
    else if (elempack == 8 && out_elempack == 1)
    {
        pipeline = pipeline_innerproduct_pack8to1;
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
}

} // namespace ncnn

+ 4
- 0
src/layer/vulkan/innerproduct_vulkan.h View File

@@ -31,6 +31,7 @@ public:

using InnerProduct::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
ncnn::Layer* flatten;
@@ -38,6 +39,9 @@ public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

VkImageMat weight_data_gpu_image;
VkImageMat bias_data_gpu_image;

Pipeline* pipeline_innerproduct;
Pipeline* pipeline_innerproduct_pack4;
Pipeline* pipeline_innerproduct_pack1to4;


+ 142
- 1
src/layer/vulkan/packing_vulkan.cpp View File

@@ -22,6 +22,7 @@ DEFINE_LAYER_CREATOR(Packing_vulkan)
Packing_vulkan::Packing_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_packing_1to4 = 0;
pipeline_packing_4to1 = 0;
@@ -37,7 +38,19 @@ int Packing_vulkan::create_pipeline(const Option& opt)
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
out_elemsize = out_elempack * 2u;
}
@@ -284,4 +297,132 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

int Packing_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int elempack = bottom_blob.elempack;

if (elempack == out_elempack)
{
top_blob = bottom_blob;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int dims = bottom_blob.dims;
size_t elemsize = bottom_blob.elemsize;

if (!use_padding)
{
// identity if use_padding not allowed
if (dims == 1 && w * elempack % out_elempack != 0)
{
top_blob = bottom_blob;
return 0;
}
if (dims == 2 && h * elempack % out_elempack != 0)
{
top_blob = bottom_blob;
return 0;
}
if (dims == 3 && channels * elempack % out_elempack != 0)
{
top_blob = bottom_blob;
return 0;
}
}

if (dims == 1)
{
int outw = (w * elempack + out_elempack - 1) / out_elempack;
size_t out_elemsize = elemsize / elempack * out_elempack;
if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}

if (dims == 2)
{
int outh = (h * elempack + out_elempack - 1) / out_elempack;
size_t out_elemsize = elemsize / elempack * out_elempack;
if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}

if (dims == 3)
{
int outc = (channels * elempack + out_elempack - 1) / out_elempack;
size_t out_elemsize = elemsize / elempack * out_elempack;
if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
{
if (out_elempack == 8) out_elemsize = 8*2u;
if (out_elempack == 4) out_elemsize = 4*2u;
if (out_elempack == 1) out_elemsize = 4u;
}

top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;
}

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

if (elempack == 1 && out_elempack == 4)
{
cmd.record_pipeline(pipeline_packing_1to4, bindings, constants, top_blob);
}
if (elempack == 4 && out_elempack == 1)
{
cmd.record_pipeline(pipeline_packing_4to1, bindings, constants, bottom_blob);
}
if (elempack == 1 && out_elempack == 8)
{
cmd.record_pipeline(pipeline_packing_1to8, bindings, constants, top_blob);
}
if (elempack == 4 && out_elempack == 8)
{
cmd.record_pipeline(pipeline_packing_4to8, bindings, constants, top_blob);
}
if (elempack == 8 && out_elempack == 4)
{
cmd.record_pipeline(pipeline_packing_8to4, bindings, constants, bottom_blob);
}
if (elempack == 8 && out_elempack == 1)
{
cmd.record_pipeline(pipeline_packing_8to1, bindings, constants, bottom_blob);
}

return 0;
}

} // namespace ncnn

+ 1
- 0
src/layer/vulkan/packing_vulkan.h View File

@@ -29,6 +29,7 @@ public:

using Packing::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
Pipeline* pipeline_packing_1to4;


+ 152
- 2
src/layer/vulkan/padding_vulkan.cpp View File

@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Padding_vulkan)
Padding_vulkan::Padding_vulkan()
{
support_vulkan = true;
support_image_storage = true;

pipeline_padding = 0;
pipeline_padding_pack4 = 0;
@@ -46,7 +47,22 @@ int Padding_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -139,14 +155,28 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/)
// Uploads the per-channel pad values to the GPU.
//
// With opt.use_image_storage the data goes to per_channel_pad_data_gpu_image,
// otherwise to per_channel_pad_data_gpu; exactly one of the two is written.
// When the layer has no per-channel pad data, image storage still receives a
// one-element Mat (presumably so the image binding used by forward() is
// backed by a real image -- see the "TODO use dummy buffer" notes there).
//
// Always returns 0.
int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    if (per_channel_pad_data_size == 0)
    {
        if (opt.use_image_storage)
        {
            cmd.record_upload(Mat(1), per_channel_pad_data_gpu_image, opt);
        }

        return 0;
    }

    // widest packing that divides the pad data length (8 only with use_shader_pack8)
    int elempack = opt.use_shader_pack8 && per_channel_pad_data_size % 8 == 0 ? 8 : per_channel_pad_data_size % 4 == 0 ? 4 : 1;

    Mat per_channel_pad_data_packed;
    convert_packing(per_channel_pad_data, per_channel_pad_data_packed, elempack);

    if (opt.use_image_storage)
    {
        cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu_image, opt);
    }
    else
    {
        cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu, opt);
    }

    return 0;
}
@@ -271,4 +301,124 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
return 0;
}

int Padding_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
if (top == 0 && bottom == 0 && left == 0 && right == 0)
{
top_blob = bottom_blob;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

// TODO vec and image padding

int outw = w + left + right;
int outh = h + top + bottom;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = bottom_blob;
bindings[1] = top_blob;
bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(12);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = left;
constants[11].i = top;

const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8
: elempack == 4 ? pipeline_padding_pack4
: pipeline_padding;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

int Padding_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
{
const VkImageMat& bottom_blob = bottom_blobs[0];
const VkImageMat& reference_blob = bottom_blobs[1];

VkImageMat& top_blob = top_blobs[0];

int _top;
int _bottom;
int _left;
int _right;
{
const int* param_data = reference_blob.mapped();

_top = param_data[0];
_bottom = param_data[1];
_left = param_data[2];
_right = param_data[3];
}

if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0)
{
top_blob = bottom_blob;
return 0;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

// TODO vec and image padding

int outw = w + _left + _right;
int outh = h + _top + _bottom;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(3);
bindings[0] = bottom_blob;
bindings[1] = top_blob;
bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer

std::vector<vk_constant_type> constants(12);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = _left;
constants[11].i = _top;

const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8
: elempack == 4 ? pipeline_padding_pack4
: pipeline_padding;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

} // namespace ncnn

+ 5
- 0
src/layer/vulkan/padding_vulkan.h View File

@@ -34,8 +34,13 @@ public:

virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;

public:
VkMat per_channel_pad_data_gpu;
VkImageMat per_channel_pad_data_gpu_image;
Pipeline* pipeline_padding;
Pipeline* pipeline_padding_pack4;
Pipeline* pipeline_padding_pack8;


+ 197
- 1
src/layer/vulkan/pooling_vulkan.cpp View File

@@ -25,6 +25,7 @@ DEFINE_LAYER_CREATOR(Pooling_vulkan)
Pooling_vulkan::Pooling_vulkan()
{
support_vulkan = true;
support_image_storage = true;

padding = 0;
pipeline_pooling = 0;
@@ -112,7 +113,22 @@ int Pooling_vulkan::create_pipeline(const Option& opt)

size_t elemsize;
size_t out_elemsize;
if (opt.use_fp16_storage)
if (opt.use_image_storage && opt.use_image_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
}
else if (opt.use_image_storage && opt.use_image_fp16_packed)
{
elemsize = elempack == 1 ? 4u : elempack * 2u;
out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u;
}
else if (opt.use_image_storage)
{
elemsize = elempack * 4u;
out_elemsize = out_elempack * 4u;
}
else if (opt.use_fp16_storage)
{
elemsize = elempack * 2u;
out_elemsize = out_elempack * 2u;
@@ -277,6 +293,16 @@ int Pooling_vulkan::destroy_pipeline(const Option& opt)
return 0;
}

// Forwards the model upload to the padding sub-layer when one exists.
// The sub-layer's return value is not inspected; this always returns 0.
int Pooling_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    if (!padding)
        return 0;

    padding->upload_model(cmd, opt);
    return 0;
}

int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
@@ -447,4 +473,174 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
return 0;
}

int Pooling_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

if (global_pooling)
{
top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0;//bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;

const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8
: elempack == 4 ? pipeline_pooling_global_pack4
: pipeline_pooling_global;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

VkImageMat bottom_blob_bordered = bottom_blob;

int wtailpad = 0;
int htailpad = 0;

if (pad_mode == 0) // full padding
{
int wtail = (w + pad_left + pad_right - kernel_w) % stride_w;
int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h;

if (wtail != 0)
wtailpad = stride_w - wtail;
if (htail != 0)
htailpad = stride_h - htail;

Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = pad_top;
padding_params[1] = pad_bottom + htailpad;
padding_params[2] = pad_left;
padding_params[3] = pad_right + wtailpad;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
else if (pad_mode == 1) // valid padding
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
}
else if (pad_mode == 2) // tensorflow padding=SAME or onnx padding=SAME_UPPER
{
int wpad = kernel_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad / 2;
padding_params[1] = hpad - hpad / 2;
padding_params[2] = wpad / 2;
padding_params[3] = wpad - wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}
else if (pad_mode == 3) // onnx padding=SAME_LOWER
{
int wpad = kernel_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
int* padding_params = padding_param_blob.mapped();

padding_params[0] = hpad - hpad / 2;
padding_params[1] = hpad / 2;
padding_params[2] = wpad - wpad / 2;
padding_params[3] = wpad / 2;

std::vector<VkImageMat> padding_inputs(2);
padding_inputs[0] = bottom_blob;
padding_inputs[1] = padding_param_blob;

std::vector<VkImageMat> padding_outputs(1);
padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}
}

w = bottom_blob_bordered.w;
h = bottom_blob_bordered.h;

int outw = (w - kernel_w) / stride_w + 1;
int outh = (h - kernel_h) / stride_h + 1;

top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(2);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;

std::vector<vk_constant_type> constants(12);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = 0;//bottom_blob_bordered.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0;//top_blob.cstep;
constants[10].i = wtailpad;
constants[11].i = htailpad;

const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8
: elempack == 4 ? pipeline_pooling_pack4
: pipeline_pooling;

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}

} // namespace ncnn

+ 3
- 0
src/layer/vulkan/pooling_vulkan.h View File

@@ -27,8 +27,11 @@ public:
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);

virtual int upload_model(VkTransfer& cmd, const Option& opt);

using Pooling::forward;
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
ncnn::Layer* padding;


+ 40
- 0
src/layer/vulkan/shader/absval.comp View File

@@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -52,11 +61,42 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
afp v;
if (psc(dims) == 1)
{
v = image1d_ld1(bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

afp v = buffer_ld1(bottom_top_blob_data, gi);
#endif

v = abs(v);

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_st1(top_blob_1d, gx, v);
}
else if (psc(dims) == 2)
{
image2d_st1(top_blob_2d, ivec2(gx, gy), v);
}
else // if (psc(dims) == 3)
{
image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v);
}
#else
buffer_st1(bottom_top_blob_data, gi, v);
#endif
}

+ 40
- 0
src/layer/vulkan/shader/absval_pack4.comp View File

@@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -52,11 +61,42 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
afpvec4 v;
if (psc(dims) == 1)
{
v = image1d_ld4(bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

afpvec4 v = buffer_ld4(bottom_top_blob_data, gi);
#endif

v = abs(v);

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_st4(top_blob_1d, gx, v);
}
else if (psc(dims) == 2)
{
image2d_st4(top_blob_2d, ivec2(gx, gy), v);
}
else // if (psc(dims) == 3)
{
image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v);
}
#else
buffer_st4(bottom_top_blob_data, gi, v);
#endif
}

+ 40
- 0
src/layer/vulkan/shader/absval_pack8.comp View File

@@ -33,7 +33,16 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -53,12 +62,43 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
afpvec8 v;
if (psc(dims) == 1)
{
v = image1d_ld8(bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);
#endif

v[0] = abs(v[0]);
v[1] = abs(v[1]);

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_st8(top_blob_1d, gx, v);
}
else if (psc(dims) == 2)
{
image2d_st8(top_blob_2d, ivec2(gx, gy), v);
}
else // if (psc(dims) == 3)
{
image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);
}
#else
buffer_st8(bottom_top_blob_data, gi, v);
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp16_to_fp32.comp View File

@@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, r32f) writeonly uniform highp image1D top_blob_1d;
layout (binding = 1, r32f) writeonly uniform highp image2D top_blob_2d;
layout (binding = 1, r32f) writeonly uniform highp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -65,9 +74,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

top_blob_data[gi] = float(buffer_ld1(bottom_blob_data, v_offset));
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp View File

@@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, rgba32f) writeonly uniform highp image1D top_blob_1d;
layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d;
layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -65,9 +74,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

top_blob_data[gi] = vec4(buffer_ld4(bottom_blob_data, v_offset));
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp View File

@@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, rgba32f) writeonly uniform highp image1D top_blob_1d;
layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d;
layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { mat2x4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -66,9 +75,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

top_blob_data[gi] = mat2x4(buffer_ld8(bottom_blob_data, v_offset));
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp32_to_fp16.comp View File

@@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform highp sampler1D bottom_blob_1d;
layout (binding = 0) uniform highp sampler2D bottom_blob_2d;
layout (binding = 0) uniform highp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -65,9 +74,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

buffer_st1(top_blob_data, gi, afp(bottom_blob_data[v_offset]));
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp View File

@@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform highp sampler1D bottom_blob_1d;
layout (binding = 0) uniform highp sampler2D bottom_blob_2d;
layout (binding = 0) uniform highp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -65,9 +74,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

buffer_st4(top_blob_data, gi, afpvec4(bottom_blob_data[v_offset]));
#endif
}

+ 24
- 0
src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp View File

@@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform highp sampler1D bottom_blob_1d;
layout (binding = 0) uniform highp sampler2D bottom_blob_2d;
layout (binding = 0) uniform highp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { mat2x4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -66,9 +75,24 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

const int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

buffer_st8(top_blob_data, gi, afpvec8(bottom_blob_data[v_offset]));
#endif
}

+ 27
- 0
src/layer/vulkan/shader/concat.comp View File

@@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,6 +78,23 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy));
if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -78,4 +104,5 @@ void main()
int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x;

buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi);
#endif
}

+ 27
- 0
src/layer/vulkan/shader/concat_pack4.comp View File

@@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,6 +78,23 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp4(top_blob_1d, gx + p.offset, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
if (axis == 0) image2d_cp4(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy));
if (axis == 1) image2d_cp4(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
if (axis == 0) image3d_cp4(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 1) image3d_cp4(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 2) image3d_cp4(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -78,4 +104,5 @@ void main()
int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x;

buffer_cp4(top_blob_data, v_offset, bottom_blob_data, gi);
#endif
}

+ 78
- 0
src/layer/vulkan/shader/concat_pack4to1.comp View File

@@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,6 +78,74 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
afpvec4 v = image1d_ld4(bottom_blob_1d, gx);

int gx4 = gx * 4 + p.offset;

image1d_st1(top_blob_1d, gx4 + 0, v.r);
image1d_st1(top_blob_1d, gx4 + 1, v.g);
image1d_st1(top_blob_1d, gx4 + 2, v.b);
image1d_st1(top_blob_1d, gx4 + 3, v.a);
}
else if (psc(dims) == 2)
{
afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy));

if (axis == 0)
{
int gy4 = gy * 4 + p.offset;

image2d_st1(top_blob_2d, ivec2(gx, gy4 + 0), v.r);
image2d_st1(top_blob_2d, ivec2(gx, gy4 + 1), v.g);
image2d_st1(top_blob_2d, ivec2(gx, gy4 + 2), v.b);
image2d_st1(top_blob_2d, ivec2(gx, gy4 + 3), v.a);
}
if (axis == 1)
{
int gx4 = gx * 4 + p.offset;

image2d_st1(top_blob_2d, ivec2(gx4 + 0, gy), v.r);
image2d_st1(top_blob_2d, ivec2(gx4 + 1, gy), v.g);
image2d_st1(top_blob_2d, ivec2(gx4 + 2, gy), v.b);
image2d_st1(top_blob_2d, ivec2(gx4 + 3, gy), v.a);
}
}
else // if (psc(dims) == 3)
{
afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz));

if (axis == 0)
{
int gz4 = gz * 4 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 0), v.r);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 1), v.g);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 2), v.b);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 3), v.a);
}
if (axis == 1)
{
int gy4 = gy * 4 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx, gy4 + 0, gz), v.r);
image3d_st1(top_blob_3d, ivec3(gx, gy4 + 1, gz), v.g);
image3d_st1(top_blob_3d, ivec3(gx, gy4 + 2, gz), v.b);
image3d_st1(top_blob_3d, ivec3(gx, gy4 + 3, gz), v.a);
}
if (axis == 2)
{
int gx4 = gx * 4 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx4 + 0, gy, gz), v.r);
image3d_st1(top_blob_3d, ivec3(gx4 + 1, gy, gz), v.g);
image3d_st1(top_blob_3d, ivec3(gx4 + 2, gy, gz), v.b);
image3d_st1(top_blob_3d, ivec3(gx4 + 3, gy, gz), v.a);
}
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -83,4 +160,5 @@ void main()
ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis];

buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi);
#endif
}

+ 27
- 0
src/layer/vulkan/shader/concat_pack8.comp View File

@@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -70,6 +79,23 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
image1d_cp8(top_blob_1d, gx + p.offset, bottom_blob_1d, gx);
}
else if (psc(dims) == 2)
{
if (axis == 0) image2d_cp8(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy));
if (axis == 1) image2d_cp8(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy));
}
else // if (psc(dims) == 3)
{
if (axis == 0) image3d_cp8(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 1) image3d_cp8(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz));
if (axis == 2) image3d_cp8(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz));
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -79,4 +105,5 @@ void main()
int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x;

buffer_cp8(top_blob_data, v_offset, bottom_blob_data, gi);
#endif
}

+ 102
- 0
src/layer/vulkan/shader/concat_pack8to1.comp View File

@@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -70,6 +79,98 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
if (psc(dims) == 1)
{
afpvec8 v = image1d_ld8(bottom_blob_1d, gx);

int gx8 = gx * 8 + p.offset;

image1d_st1(top_blob_1d, gx8 + 0, v[0].r);
image1d_st1(top_blob_1d, gx8 + 1, v[0].g);
image1d_st1(top_blob_1d, gx8 + 2, v[0].b);
image1d_st1(top_blob_1d, gx8 + 3, v[0].a);
image1d_st1(top_blob_1d, gx8 + 4, v[1].r);
image1d_st1(top_blob_1d, gx8 + 5, v[1].g);
image1d_st1(top_blob_1d, gx8 + 6, v[1].b);
image1d_st1(top_blob_1d, gx8 + 7, v[1].a);
}
else if (psc(dims) == 2)
{
afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy));

if (axis == 0)
{
int gy8 = gy * 8 + p.offset;

image2d_st1(top_blob_2d, ivec2(gx, gy8 + 0), v[0].r);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 1), v[0].g);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 2), v[0].b);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 3), v[0].a);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 4), v[1].r);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 5), v[1].g);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 6), v[1].b);
image2d_st1(top_blob_2d, ivec2(gx, gy8 + 7), v[1].a);
}
if (axis == 1)
{
int gx8 = gx * 8 + p.offset;

image2d_st1(top_blob_2d, ivec2(gx8 + 0, gy), v[0].r);
image2d_st1(top_blob_2d, ivec2(gx8 + 1, gy), v[0].g);
image2d_st1(top_blob_2d, ivec2(gx8 + 2, gy), v[0].b);
image2d_st1(top_blob_2d, ivec2(gx8 + 3, gy), v[0].a);
image2d_st1(top_blob_2d, ivec2(gx8 + 4, gy), v[1].r);
image2d_st1(top_blob_2d, ivec2(gx8 + 5, gy), v[1].g);
image2d_st1(top_blob_2d, ivec2(gx8 + 6, gy), v[1].b);
image2d_st1(top_blob_2d, ivec2(gx8 + 7, gy), v[1].a);
}
}
else // if (psc(dims) == 3)
{
afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz));

if (axis == 0)
{
int gz8 = gz * 8 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 0), v[0].r);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 1), v[0].g);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 2), v[0].b);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 3), v[0].a);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 4), v[1].r);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 5), v[1].g);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 6), v[1].b);
image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 7), v[1].a);
}
if (axis == 1)
{
int gy8 = gy * 8 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx, gy8 + 0, gz), v[0].r);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 1, gz), v[0].g);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 2, gz), v[0].b);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 3, gz), v[0].a);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 4, gz), v[1].r);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 5, gz), v[1].g);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 6, gz), v[1].b);
image3d_st1(top_blob_3d, ivec3(gx, gy8 + 7, gz), v[1].a);
}
if (axis == 2)
{
int gx8 = gx * 8 + p.offset;

image3d_st1(top_blob_3d, ivec3(gx8 + 0, gy, gz), v[0].r);
image3d_st1(top_blob_3d, ivec3(gx8 + 1, gy, gz), v[0].g);
image3d_st1(top_blob_3d, ivec3(gx8 + 2, gy, gz), v[0].b);
image3d_st1(top_blob_3d, ivec3(gx8 + 3, gy, gz), v[0].a);
image3d_st1(top_blob_3d, ivec3(gx8 + 4, gy, gz), v[1].r);
image3d_st1(top_blob_3d, ivec3(gx8 + 5, gy, gz), v[1].g);
image3d_st1(top_blob_3d, ivec3(gx8 + 6, gy, gz), v[1].b);
image3d_st1(top_blob_3d, ivec3(gx8 + 7, gy, gz), v[1].a);
}
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -85,4 +186,5 @@ void main()
ivec4 vv_offset = v_offset + 4 * gxyz4[psc(dims) - 1 - axis];

buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi);
#endif
}

+ 67
- 0
src/layer/vulkan/shader/concat_pack8to4.comp View File

@@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler1D bottom_blob_1d;
layout (binding = 0) uniform unfp sampler2D bottom_blob_2d;
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -70,6 +79,63 @@ void main()
if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
return;

#if NCNN_image_shader
// Image-storage path: each source texel is read as an afpvec8 and its two halves v[0], v[1]
// are written to two consecutive destination positions (2 * coord + p.offset and the next one)
// along one dimension; the remaining coordinates are passed through unchanged.
// NOTE(review): p.offset is declared outside this hunk -- presumably the start of this input
// inside the concatenated output; confirm against the push-constant block.
if (psc(dims) == 1)
{
afpvec8 v = image1d_ld8(bottom_blob_1d, gx);

// 1D: the split is along x.
int gx2 = gx * 2 + p.offset;

image1d_st4(top_blob_1d, gx2 + 0, v[0]);
image1d_st4(top_blob_1d, gx2 + 1, v[1]);

}
else if (psc(dims) == 2)
{
afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy));

if (axis == 0)
{
// 2D, axis 0: the split is along y.
int gy2 = gy * 2 + p.offset;

image2d_st4(top_blob_2d, ivec2(gx, gy2 + 0), v[0]);
image2d_st4(top_blob_2d, ivec2(gx, gy2 + 1), v[1]);
}
if (axis == 1)
{
// 2D, axis 1: the split is along x.
int gx2 = gx * 2 + p.offset;

image2d_st4(top_blob_2d, ivec2(gx2 + 0, gy), v[0]);
image2d_st4(top_blob_2d, ivec2(gx2 + 1, gy), v[1]);
}
}
else // if (psc(dims) == 3)
{
afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz));

if (axis == 0)
{
// 3D, axis 0: the split is along z.
int gz2 = gz * 2 + p.offset;

image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 0), v[0]);
image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 1), v[1]);
}
if (axis == 1)
{
// 3D, axis 1: the split is along y.
int gy2 = gy * 2 + p.offset;

image3d_st4(top_blob_3d, ivec3(gx, gy2 + 0, gz), v[0]);
image3d_st4(top_blob_3d, ivec3(gx, gy2 + 1, gz), v[1]);
}
if (axis == 2)
{
// 3D, axis 2: the split is along x.
int gx2 = gx * 2 + p.offset;

image3d_st4(top_blob_3d, ivec3(gx2 + 0, gy, gz), v[0]);
image3d_st4(top_blob_3d, ivec3(gx2 + 1, gy, gz), v[1]);
}
}
#else
const int gi = gz * psc(cstep) + gy * psc(w) + gx;

ivec3 gxyz = ivec3(gx, gy, gz);
@@ -84,4 +150,5 @@ void main()
ivec2 v_offset = v_offset_0 + ivec2(0, 1) * gxyz4[psc(dims) - 1 - axis];

buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi);
#endif
}

+ 38
- 0
src/layer/vulkan/shader/convolution.comp View File

@@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias (same indices in both modes).
#if NCNN_image_shader
// Image storage: input and weights are 3D samplers, bias is a 1D sampler, output is a write-only 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
// Buffer storage: every blob is an array of sfp elements.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -82,13 +89,39 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
sum = afp(0.f);
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z, walk the kernel_h x kernel_w window
// and add weight * input into sum.
for (int z = 0; z < psc(c); z++)
{
// Window origin for this output position; sy/sx advance by the dilation per kernel row/column.
int sy = gy * stride_h;
// wx is the x coordinate into the weight image; it counts kernel taps across the whole
// window of channel z (reset per channel, not per kernel row).
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
// weight texel (wx, z, gz) pairs with input texel (sx, sy, z)
sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z));

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -106,6 +139,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -127,7 +161,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 69
- 20
src/layer/vulkan/shader/convolution_1x1s1d1.comp View File

@@ -21,26 +21,40 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int size_4 = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep_4 = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outsize_4 = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep_4 = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
@@ -50,40 +64,67 @@ layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
int dims;
int size_4;
int w;
int h;
int c;
int cstep_4;
int cstep;

int outdims;
int outsize_4;
int outw;
int outh;
int outc;
int outcstep_4;
int outcstep;
} p;

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 2;
int gy = int(gl_GlobalInvocationID.y) * 2;
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outsize_4) || gy >= 1 || gz >= psc(outc))
if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
return;
#endif

afpvec4 sum;

if (bias_term == 1)
{
#if NCNN_image_shader
sum = afpvec4(image1d_ld1(bias_blob, gz));
#else
sum = afpvec4(buffer_ld1(bias_data, gz));
#endif
}
else
{
sum = afpvec4(0.f);
}

#if NCNN_image_shader
// Image-storage 1x1 path: one weight texel (0, z, gz) per input channel z is shared by four
// input positions (gx, gy), (gx+1, gy), (gx, gy+1), (gx+1, gy+1); their running sums are
// kept in sum.r, sum.g, sum.b and sum.a respectively.
for (int z = 0; z < psc(c); z++)
{
afp k = image3d_ld1(weight_blob, ivec3(0, z, gz));

sum.r += k * image3d_ld1(bottom_blob, ivec3(gx, gy, z));
sum.g += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy, z));
sum.b += k * image3d_ld1(bottom_blob, ivec3(gx, gy+1, z));
sum.a += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy+1, z));
}
#else // NCNN_image_shader
int w_offset = gz * psc(c);
int v_offset = gx;

@@ -96,8 +137,9 @@ void main()
#endif

w_offset += 1;
v_offset += psc(cstep_4);
v_offset += psc(cstep) / 4;
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -119,11 +161,18 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

const int gi = gz * psc(outcstep_4) + gx;
#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum.r);
image3d_st1(top_blob, ivec3(gx+1, gy, gz), sum.g);
image3d_st1(top_blob, ivec3(gx, gy+1, gz), sum.b);
image3d_st1(top_blob, ivec3(gx+1, gy+1, gz), sum.a);
#else
const int gi = gz * psc(outcstep) + gx;

#if NCNN_fp16_packed
top_blob_data[gi] = sum;
#else
buffer_st4(top_blob_data, gi, sum);
#endif
#endif
}

+ 42
- 0
src/layer/vulkan/shader/convolution_pack1to4.comp View File

@@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias (same indices in both modes).
#if NCNN_image_shader
// Image storage: input and weights are 3D samplers, bias is a 1D sampler, output is a write-only 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
// Buffer storage: the input is an sfp array; output, weights and bias are sfpvec4 arrays.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -82,13 +89,43 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
sum = afpvec4(0.f);
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z and every kernel tap, a single input
// value v scales a 4-wide weight texel k and the product is added to the 4-wide sum.
for (int z = 0; z < psc(c); z++)
{
int sy = gy * stride_h;
// wx is the x coordinate into the weight image; one texel per kernel tap, reset per channel.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz));

sum += v * k;

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -110,6 +147,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -131,7 +169,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 44
- 0
src/layer/vulkan/shader/convolution_pack1to8.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias (same indices in both modes).
#if NCNN_image_shader
// Image storage: input and weights are 3D samplers, bias is a 1D sampler, output is a write-only 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
// Buffer storage: the input is an sfp array; output, weights and bias are sfpvec8 arrays.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,13 +90,45 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z and every kernel tap, a single input
// value v scales an 8-wide weight k; the two halves k[0] and k[1] are accumulated separately
// into sum[0] and sum[1].
for (int z = 0; z < psc(c); z++)
{
int sy = gy * stride_h;
// wx is the x coordinate passed to image3d_ld8; it advances by one per kernel tap, reset per channel.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz));

// sum += v * k;
sum[0] += v * k[0];
sum[1] += v * k[1];

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -113,6 +152,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -138,7 +178,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 47
- 0
src/layer/vulkan/shader/convolution_pack4.comp View File

@@ -49,6 +49,12 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
@@ -58,6 +64,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -87,13 +94,48 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
sum = afpvec4(0.f);
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z and every kernel tap, a 4-wide input
// texel v is multiplied by a 4x4 weight block built from four consecutive weight texels.
for (int z = 0; z < psc(c); z++)
{
int sy = gy * stride_h;
// wx is the x coordinate into the weight image; each kernel tap occupies 4 texels, reset per channel.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z));

// Assuming afpmat4 is a GLSL 4x4 matrix type, each loaded texel becomes one column,
// so component i of (v * k) is dot(v, texel wx + i).
afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 3, z, gz))
);

sum += v * k;

sx += dilation_w;
wx += 4;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -125,6 +167,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -146,7 +189,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 75
- 16
src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp View File

@@ -21,26 +21,40 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int size = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
@@ -50,28 +64,40 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
int dims;
int size;
int w;
int h;
int c;
int cstep;

int outdims;
int outsize;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 2;
int gy = int(gl_GlobalInvocationID.y) * 2;
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
return;
#endif

afpvec4 sum0;
afpvec4 sum1;
@@ -80,7 +106,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
afpvec4 b = image1d_ld4(bias_blob, gz);
#else
afpvec4 b = buffer_ld4(bias_data, gz);
#endif
sum0 = b;
sum1 = b;
sum2 = b;
@@ -94,6 +124,27 @@ void main()
sum3 = afpvec4(0.f);
}

#if NCNN_image_shader
// Image-storage 1x1 path: per input channel z, four input texels at (gx, gy), (gx+1, gy),
// (gx, gy+1), (gx+1, gy+1) are each multiplied by the same 4x4 weight block and accumulated
// into sum0..sum3 respectively.
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(gx, gy, z));
afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(gx+1, gy, z));
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(gx, gy+1, z));
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(gx+1, gy+1, z));

// The weight block for (z, gz) is the four texels at x = 0..3 of the weight image.
afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(0, z, gz)),
image3d_ld4(weight_blob, ivec3(1, z, gz)),
image3d_ld4(weight_blob, ivec3(2, z, gz)),
image3d_ld4(weight_blob, ivec3(3, z, gz))
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else // NCNN_image_shader
int w_offset = gz * psc(c);
int v_offset = gx;

@@ -124,6 +175,7 @@ void main()
w_offset += 1;
v_offset += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -157,10 +209,17 @@ void main()
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum0);
image3d_st4(top_blob, ivec3(gx+1, gy, gz), sum1);
image3d_st4(top_blob, ivec3(gx, gy+1, gz), sum2);
image3d_st4(top_blob, ivec3(gx+1, gy+1, gz), sum3);
#else
int gi = gz * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, sum3);
#endif
}

+ 37
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp View File

@@ -33,6 +33,11 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
@@ -41,6 +46,7 @@ layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[];
#else
layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; };
#endif
#endif

layout (push_constant) uniform parameter
{
@@ -66,6 +72,29 @@ void main()
afpvec4 sum2 = afpvec4(0.f);
afpvec4 sum3 = afpvec4(0.f);

#if NCNN_image_shader
// Image-storage path: the weight block for column gx starts at texel x = gx * 4.
int wx = gx * 4;

// Per input channel z, four rows gy..gy+3 of column gx are multiplied by the same 4x4 weight
// block (texels wx..wx+3 at (z, gz)) and accumulated into sum0..sum3.
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z));
afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z));
afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z));
afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z));

afpmat4 k = afpmat4(
image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz))
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
int v_offset = gy * 16 + gx;
int w_offset = gz * psc(c) * 16 + gx;

@@ -96,11 +125,19 @@ void main()
v_offset += psc(cstep);
w_offset += 16;
}
#endif

#if NCNN_image_shader
image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
#else
int gi = gz * psc(outcstep) + gy * 16 + gx;

buffer_st4(top_tm_blob_data, gi + 0, sum0);
if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1);
if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2);
if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3);
#endif
}

+ 51
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp View File

@@ -36,8 +36,13 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = source blob (read), 1 = transformed blob (written).
#if NCNN_image_shader
// Image storage: a 3D sampler for the source and a write-only 3D image for the result.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob;
#else
// Buffer storage: both blobs are sfpvec4 arrays.
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -62,6 +67,30 @@ void main()
return;

// load 4x4
#if NCNN_image_shader
// Image-storage path: the 4x4 tile starts at (gx * 2, gy * 2) in slice gz, so tiles of
// neighbouring invocations overlap by two texels in each direction.
int sx = gx * 2;
int sy = gy * 2;

// vRC holds the texel at row R (y offset) and column C (x offset) of the tile.
afpvec4 v00 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz));
afpvec4 v01 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz));
afpvec4 v02 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz));
afpvec4 v03 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 0, gz));

afpvec4 v10 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz));
afpvec4 v11 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz));
afpvec4 v12 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz));
afpvec4 v13 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 1, gz));

afpvec4 v20 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz));
afpvec4 v21 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz));
afpvec4 v22 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz));
afpvec4 v23 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 2, gz));

afpvec4 v30 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 3, gz));
afpvec4 v31 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 3, gz));
afpvec4 v32 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 3, gz));
afpvec4 v33 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 3, gz));
#else
int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2;
ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w);

@@ -84,6 +113,7 @@ void main()
afpvec4 v31 = buffer_ld4(bottom_blob_data, v_offset.a + 1);
afpvec4 v32 = buffer_ld4(bottom_blob_data, v_offset.a + 2);
afpvec4 v33 = buffer_ld4(bottom_blob_data, v_offset.a + 3);
#endif

// const float itm[4][4] = {
// {1.0f, 0.0f, -1.0f, 0.0f},
@@ -134,6 +164,26 @@ void main()
v33 = m33 - m31;

// store 16
#if NCNN_image_shader
// Image-storage path: each tile gets its own row y = gy * p.block_x + gx in slice gz;
// the 16 transformed values are laid out along x = 0..15 in row-major tile order (v00..v33).
int y = gy * p.block_x + gx;

image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10);
image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11);
image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12);
image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13);
image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20);
image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21);
image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22);
image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23);
image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30);
image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31);
image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32);
image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16;

buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00);
@@ -152,4 +202,5 @@ void main()
buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33);
#endif
}

+ 41
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp View File

@@ -41,9 +41,15 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = transformed blob (read), 1 = output blob (written), 2 = bias.
#if NCNN_image_shader
// Image storage: a 3D sampler for the transformed blob, a write-only 3D image for the output, a 1D sampler for bias.
layout (binding = 0) uniform unfp sampler3D top_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler1D bias_blob;
#else
// Buffer storage: all three blobs are sfpvec4 arrays.
layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -68,6 +74,26 @@ void main()
return;

// load 16
#if NCNN_image_shader
// Image-storage path: the tile for this invocation lives in row sy = gy * p.block_x + gx of
// slice gz; its 16 values are read from x = 0..15 in row-major tile order (v00..v33).
int sy = gy * p.block_x + gx;

afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz));
afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz));
afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz));
afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz));
afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz));
afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz));
afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz));
afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz));
afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz));
afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz));
afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz));
afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz));
afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz));
afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz));
afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz));
afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16;

afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0);
@@ -86,6 +112,7 @@ void main()
afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13);
afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14);
afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15);
#endif

// const float itm[2][4] = {
// {1.0f, 1.0f, 1.0f, 0.0f},
@@ -105,7 +132,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
const afpvec4 bias_value = image1d_ld4(bias_blob, gz);
#else
const afpvec4 bias_value = buffer_ld4(bias_data, gz);
#endif

v00 = bias_value + m00 + m01 + m02;
v10 = bias_value + m10 + m11 + m12;
@@ -155,6 +186,15 @@ void main()
}

// store 2x2
#if NCNN_image_shader
// Image-storage path: each invocation writes a 2x2 output block whose top-left texel is
// (gx * 2, gy * 2) in slice gz; vRC goes to row offset R, column offset C.
int x = gx * 2;
int y = gy * 2;

image3d_st4(top_blob, ivec3(x, y, gz), v00);
image3d_st4(top_blob, ivec3(x + 1, y, gz), v01);
image3d_st4(top_blob, ivec3(x, y + 1, gz), v10);
image3d_st4(top_blob, ivec3(x + 1, y + 1, gz), v11);
#else
int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2;
int v_offset_1 = v_offset_0 + psc(outw);

@@ -162,4 +202,5 @@ void main()
buffer_st4(top_blob_data, v_offset_0 + 1, v01);
buffer_st4(top_blob_data, v_offset_1 + 0, v10);
buffer_st4(top_blob_data, v_offset_1 + 1, v11);
#endif
}

+ 42
- 0
src/layer/vulkan/shader/convolution_pack4to1.comp View File

@@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias (same indices in both modes).
#if NCNN_image_shader
// Image storage: input and weights are 3D samplers, bias is a 1D sampler, output is a write-only 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
// Buffer storage: input and weights are sfpvec4 arrays; output and bias are sfp arrays.
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -82,13 +89,43 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
sum = afp(0.f);
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z and every kernel tap, the 4-wide
// input texel and the 4-wide weight texel are reduced with dot() into the scalar sum.
for (int z = 0; z < psc(c); z++)
{
int sy = gy * stride_h;
// wx is the x coordinate into the weight image; one texel per kernel tap, reset per channel.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz));

sum += dot(v, k);

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -110,6 +147,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -131,7 +169,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 57
- 0
src/layer/vulkan/shader/convolution_pack4to8.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias (same indices in both modes).
#if NCNN_image_shader
// Image storage: input and weights are 3D samplers, bias is a 1D sampler, output is a write-only 3D image.
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
// Buffer storage: input and weights are sfpvec4 arrays; output and bias are sfpvec8 arrays.
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,13 +90,58 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
}

#if NCNN_image_shader
// Image-storage accumulation: for every input channel z and every kernel tap, the 4-wide
// input texel v is dotted with eight consecutive weight texels, producing the eight output
// lanes held in sum[0].rgba and sum[1].rgba.
for (int z = 0; z < psc(c); z++)
{
int sy = gy * stride_h;
// wx is the x coordinate into the weight image; each kernel tap occupies 8 texels, reset per channel.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z));

// k0..k3 feed the first half of the output, k4..k7 the second half.
afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz));

// sum += v * k;
sum[0].r += dot(v, k0);
sum[0].g += dot(v, k1);
sum[0].b += dot(v, k2);
sum[0].a += dot(v, k3);
sum[1].r += dot(v, k4);
sum[1].g += dot(v, k5);
sum[1].b += dot(v, k6);
sum[1].a += dot(v, k7);

sx += dilation_w;
wx += 8;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -126,6 +178,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -151,7 +204,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 57
- 0
src/layer/vulkan/shader/convolution_pack8.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,13 +90,58 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
}

#if NCNN_image_shader
// Image-shader accumulation: for every input slice z in [0, psc(c)) and every
// kernel tap (y, x), read one afpvec8 input and eight afpvec8 weights. Lane n of
// the 8-lane sum receives dot(v[0], kn[0]) + dot(v[1], kn[1]).
// gx/gy/gz and sum are set up in the part of main() elided from this view.
for (int z = 0; z < psc(c); z++)
{
// sy: input row for the first kernel row; wx: weight x-coordinate, reset per z
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z));

// eight consecutive weight entries along x, one per output lane
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz));

// sum += v * k;
sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]);
sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]);
sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]);
sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]);
sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]);
sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]);
sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]);
sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]);

// step to the next tap: input moves by the dilation, weights by 8 entries
// (wx keeps counting across kernel rows; it is not reset per row)
sx += dilation_w;
wx += 8;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -126,6 +178,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -151,7 +204,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 108
- 15
src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp View File

@@ -22,52 +22,78 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int size = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
int dims;
int size;
int w;
int h;
int c;
int cstep;

int outdims;
int outsize;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 2;
int gy = int(gl_GlobalInvocationID.y) * 2;
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
return;
#endif

afpvec8 sum0;
afpvec8 sum1;
@@ -76,7 +102,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
afpvec8 b = image1d_ld8(bias_blob, gz);
#else
afpvec8 b = buffer_ld8(bias_data, gz);
#endif
sum0 = b;
sum1 = b;
sum2 = b;
@@ -90,6 +120,61 @@ void main()
sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f));
}

#if NCNN_image_shader
// Image-shader accumulation for the 1x1 case: one invocation produces four
// outputs. For each input slice z it reads the four inputs of the 2x2 patch at
// (gx, gy) and eight weights (x = 0..7), then updates sum0..sum3 with the same
// weights; lane n of each sum gets dot(v[0], kn[0]) + dot(v[1], kn[1]).
for (int z = 0; z < psc(c); z++)
{
// v0: (gx, gy)   v1: (gx+1, gy)   v2: (gx, gy+1)   v3: (gx+1, gy+1)
afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(gx, gy, z));
afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(gx+1, gy, z));
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(gx, gy+1, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(gx+1, gy+1, z));

// weights are shared by all four patch positions
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz));

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * 8;
int v_offset = gx;

@@ -149,6 +234,7 @@ void main()
w_offset += 8;
v_offset += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -198,10 +284,17 @@ void main()
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}

// Write the four accumulated results.
#if NCNN_image_shader
// image path: sum0..sum3 go to the 2x2 patch at (gx, gy) in slice gz.
// NOTE(review): the gx+1 / gy+1 stores are not bounds-checked here, unlike the
// buffer path below; presumably out-of-range image stores are dropped by the
// implementation -- confirm against image3d_st8 and the target drivers.
image3d_st8(top_blob, ivec3(gx, gy, gz), sum0);
image3d_st8(top_blob, ivec3(gx+1, gy, gz), sum1);
image3d_st8(top_blob, ivec3(gx, gy+1, gz), sum2);
image3d_st8(top_blob, ivec3(gx+1, gy+1, gz), sum3);
#else
// buffer path: four consecutive elements starting at gi; elements 1..3 are
// written only while gx + n stays below psc(outcstep)
int gi = gz * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outcstep)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outcstep)) buffer_st8(top_blob_data, gi + 3, sum3);
#endif
}

+ 71
- 0
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp View File

@@ -34,9 +34,15 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; };
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -62,6 +68,63 @@ void main()
afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f));

#if NCNN_image_shader
// Image-shader accumulation for the gemm step: one invocation produces four
// results (rows gy+0..gy+3 at column gx). The weight x-coordinate starts at
// gx * 8 and is constant over the loop; only z changes between iterations.
int wx = gx * 8;

for (int z = 0; z < psc(c); z++)
{
// four inputs from consecutive rows of the same column gx
afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z));
afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z));
afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z));
afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z));

// eight weights shared by all four rows, one per output lane
afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz));

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else
int v_offset = gy * 16 + gx;
int w_offset = (gz * psc(c) * 16 + gx) * 8;

@@ -121,11 +184,19 @@ void main()
v_offset += psc(cstep);
w_offset += 16 * 8;
}
#endif

// Write the four accumulated results (rows gy+0..gy+3 of column gx).
#if NCNN_image_shader
// NOTE(review): rows gy+1..gy+3 are stored without the psc(outh) guard used in
// the buffer path below; presumably out-of-range image stores are dropped --
// confirm against image3d_st8 and the target drivers.
image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
#else
// buffer path: rows are 16 elements apart; rows 1..3 are written only while
// gy + n stays below psc(outh)
int gi = gz * psc(outcstep) + gy * 16 + gx;

buffer_st8(top_tm_blob_data, gi + 0, sum0);
if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1);
if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2);
if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3);
#endif
}

+ 51
- 0
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp View File

@@ -37,8 +37,13 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -63,6 +68,30 @@ void main()
return;

// load 4x4
#if NCNN_image_shader
// Image-shader load of the 4x4 input tile whose top-left corner is
// (gx * 2, gy * 2) in slice gz. In vRC, R is the row offset added to sy and
// C is the column offset added to sx.
int sx = gx * 2;
int sy = gy * 2;

// row sy + 0
afpvec8 v00 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz));
afpvec8 v01 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz));
afpvec8 v02 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz));
afpvec8 v03 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 0, gz));

// row sy + 1
afpvec8 v10 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz));
afpvec8 v11 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz));
afpvec8 v12 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz));
afpvec8 v13 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 1, gz));

// row sy + 2
afpvec8 v20 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz));
afpvec8 v21 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz));
afpvec8 v22 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz));
afpvec8 v23 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 2, gz));

// row sy + 3
afpvec8 v30 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 3, gz));
afpvec8 v31 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 3, gz));
afpvec8 v32 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 3, gz));
afpvec8 v33 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 3, gz));
#else
int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2;
ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w);

@@ -85,6 +114,7 @@ void main()
afpvec8 v31 = buffer_ld8(bottom_blob_data, v_offset.a + 1);
afpvec8 v32 = buffer_ld8(bottom_blob_data, v_offset.a + 2);
afpvec8 v33 = buffer_ld8(bottom_blob_data, v_offset.a + 3);
#endif

// const float itm[4][4] = {
// {1.0f, 0.0f, -1.0f, 0.0f},
@@ -135,6 +165,26 @@ void main()
v33 = m33 - m31;

// store 16
#if NCNN_image_shader
// Image-shader store of the 16 transformed values. All 16 go to the same image
// row y = gy * p.block_x + gx in slice gz; the x-coordinate 0..15 enumerates
// v00..v33 in row-major order (x = R * 4 + C for vRC).
int y = gy * p.block_x + gx;

image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10);
image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11);
image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12);
image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13);
image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20);
image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21);
image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22);
image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23);
image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30);
image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31);
image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32);
image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16;

buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00);
@@ -153,4 +203,5 @@ void main()
buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33);
#endif
}

+ 41
- 0
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp View File

@@ -42,9 +42,15 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D top_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler1D bias_blob;
#else
layout (binding = 0) readonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,6 +75,26 @@ void main()
return;

// load 16
#if NCNN_image_shader
// Image-shader load of the 16 values for this tile. All 16 come from the same
// image row sy = gy * p.block_x + gx in slice gz; the x-coordinate 0..15
// enumerates v00..v33 in row-major order (x = R * 4 + C for vRC).
int sy = gy * p.block_x + gx;

afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz));
afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz));
afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz));
afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz));
afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz));
afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz));
afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz));
afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz));
afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz));
afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz));
afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz));
afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz));
afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz));
afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz));
afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz));
afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16;

afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0);
@@ -87,6 +113,7 @@ void main()
afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13);
afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14);
afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15);
#endif

// const float itm[2][4] = {
// {1.0f, 1.0f, 1.0f, 0.0f},
@@ -106,7 +133,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
const afpvec8 bias_value = image1d_ld8(bias_blob, gz);
#else
const afpvec8 bias_value = buffer_ld8(bias_data, gz);
#endif

v00 = bias_value + m00 + m01 + m02;
v10 = bias_value + m10 + m11 + m12;
@@ -172,6 +203,15 @@ void main()
}

// store 2x2
#if NCNN_image_shader
// Image-shader store of the 2x2 output tile whose top-left corner is
// (gx * 2, gy * 2) in slice gz: v00/v01 form the top row, v10/v11 the bottom.
int x = gx * 2;
int y = gy * 2;

image3d_st8(top_blob, ivec3(x, y, gz), v00);
image3d_st8(top_blob, ivec3(x + 1, y, gz), v01);
image3d_st8(top_blob, ivec3(x, y + 1, gz), v10);
image3d_st8(top_blob, ivec3(x + 1, y + 1, gz), v11);
#else
int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2;
int v_offset_1 = v_offset_0 + psc(outw);

@@ -179,4 +219,5 @@ void main()
buffer_st8(top_blob_data, v_offset_0 + 1, v01);
buffer_st8(top_blob_data, v_offset_1 + 0, v10);
buffer_st8(top_blob_data, v_offset_1 + 1, v11);
#endif
}

+ 43
- 0
src/layer/vulkan/shader/convolution_pack8to1.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,13 +90,44 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
sum = afp(0.f);
}

#if NCNN_image_shader
// Image-shader accumulation: for every input slice z in [0, psc(c)) and every
// kernel tap (y, x), read one afpvec8 input and one afpvec8 weight and add
// their 8-element dot product to the scalar sum.
// gx/gy/gz and sum are set up in the part of main() elided from this view.
for (int z = 0; z < psc(c); z++)
{
// sy: input row for the first kernel row; wx: weight x-coordinate, reset per z
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz));

// sum += dot(v, k);
// written as two 4-wide dots over the halves of the afpvec8 values
sum += dot(v[0], k[0]) + dot(v[1], k[1]);

// next tap: input moves by the dilation, weights by one entry
// (wx keeps counting across kernel rows)
sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -112,6 +150,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -133,7 +172,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 49
- 0
src/layer/vulkan/shader/convolution_pack8to4.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,13 +90,50 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
sum = afpvec4(0.f);
}

#if NCNN_image_shader
// Image-shader accumulation: for every input slice z in [0, psc(c)) and every
// kernel tap (y, x), read one afpvec8 input and four afpvec8 weights. Component
// n of the 4-wide sum receives dot(v[0], kn[0]) + dot(v[1], kn[1]).
// gx/gy/gz and sum are set up in the part of main() elided from this view.
for (int z = 0; z < psc(c); z++)
{
// sy: input row for the first kernel row; wx: weight x-coordinate, reset per z
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z));

// four consecutive weight entries along x, one per output component
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz));

// sum += v * k
sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]);
sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]);
sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]);
sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]);

// next tap: input moves by the dilation, weights by 4 entries
// (wx keeps counting across kernel rows)
sx += dilation_w;
wx += 4;
}

sy += dilation_h;
}
}
#else // NCNN_image_shader
int w_offset = gz * psc(c) * kernel_w * kernel_h;

for (int z = 0; z < psc(c); z++)
@@ -118,6 +162,7 @@ void main()
w_offset += kernel_w;
}
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -139,7 +184,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 35
- 0
src/layer/vulkan/shader/convolutiondepthwise.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler2D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,7 +90,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
@@ -91,6 +102,25 @@ void main()
}

// depth-wise convolution
#if NCNN_image_shader
// Image-shader depth-wise accumulation: input and output use the same slice gz.
// For every kernel tap (y, x) the scalar weight at (wx, gz) of the 2D weight
// image is multiplied by the scalar input at (sx, sy, gz) and added to sum.
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz));

// next tap: input moves by the dilation, weights by one texel
// (wx keeps counting across kernel rows)
sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
#else // NCNN_image_shader
int w_offset = gz * kernel_w * kernel_h;
int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;

@@ -104,6 +134,7 @@ void main()
v_offset += dilation_h * psc(w);
w_offset += kernel_w;
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -125,7 +156,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 42
- 0
src/layer/vulkan/shader/convolutiondepthwise_group.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,7 +90,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
@@ -97,6 +108,32 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image-shader grouped accumulation: the output slice gz reads channels_g input
// slices starting at gg * channels_g (gg is the group id computed just above).
// Weights are addressed as (wx, z, gz) with z the channel index inside the group.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy: input row for the first kernel row; wx: weight x-coordinate, reset per z
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz));

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}

// advance to the next input slice of this group
sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -117,6 +154,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -138,7 +176,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 46
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,7 +90,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
@@ -97,6 +108,36 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image-shader grouped accumulation with scalar input and 4-wide output: the
// output slice gz reads channels_g input slices starting at gg * channels_g
// (gg is the group id computed just above). Each tap multiplies one scalar
// input by one afpvec4 weight and adds the result to the 4-wide sum.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy: input row for the first kernel row; wx: weight x-coordinate, reset per z
int sy = gy * stride_h;
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
// sx restarts at the left edge of the window for every kernel row
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz));

afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz));

// scalar input scales all four weight components
sum += v * k;

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}

// advance to the next input slice of this group
sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -121,6 +162,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -142,7 +184,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 48
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
@@ -98,6 +109,38 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate into weight_blob; reset for every z and bumped once per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
// One afp input value per tap.
afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz));

// One afpvec8 weight per tap, addressed by (tap, z, gz).
afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz));

// sum += v * k;
// Written out per half: the afpvec8 values are updated as [0] and [1] separately.
sum[0] += v * k[0];
sum[1] += v * k[1];

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -124,6 +167,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -149,7 +193,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 51
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp View File

@@ -50,6 +50,12 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
@@ -59,6 +65,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -88,7 +95,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
@@ -102,6 +113,41 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate into weight_blob; each tap consumes four consecutive
// texels, so it advances by 4 per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz));

// Assemble the per-tap afpmat4 from the four texels wx .. wx + 3.
afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)),
image3d_ld4(weight_blob, ivec3(wx + 3, z, gz))
);

// Vector-times-matrix product accumulated into the 4-component sum.
sum += v * k;

sx += dilation_w;
wx += 4;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -136,6 +182,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -157,7 +204,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 46
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,7 +90,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
@@ -97,6 +108,36 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate into weight_blob; reset for every z and bumped once per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz));

afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz));

// The 4-component input and weight collapse to one value via dot().
sum += dot(v, k);

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -121,6 +162,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -142,7 +184,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 61
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
@@ -98,6 +109,51 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate into weight_blob; each tap consumes eight consecutive
// texels, so it advances by 8 per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz));

// k0 .. k7: one afpvec4 weight per output component of the afpvec8 sum.
afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz));

// sum += v * k;
// Written out per component: k0..k3 feed sum[0], k4..k7 feed sum[1].
sum[0].r += dot(v, k0);
sum[0].g += dot(v, k1);
sum[0].b += dot(v, k2);
sum[0].a += dot(v, k3);
sum[1].r += dot(v, k4);
sum[1].g += dot(v, k5);
sum[1].b += dot(v, k6);
sum[1].a += dot(v, k7);

sx += dilation_w;
wx += 8;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -137,6 +193,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -162,7 +219,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 61
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
@@ -98,6 +109,51 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate passed to image3d_ld8 on weight_blob; each tap consumes
// eight consecutive afpvec8 entries, so it advances by 8 per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz));

// k0 .. k7: one afpvec8 weight per output component of the afpvec8 sum.
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz));

// sum += v * k
// Each output component is the dot product over both halves ([0] and [1])
// of the input and of the matching weight.
sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]);
sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]);
sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]);
sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]);
sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]);
sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]);
sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]);
sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]);

sx += dilation_w;
wx += 8;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -137,6 +193,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -162,7 +219,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 47
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld1(bias_blob, gz);
#else
sum = buffer_ld1(bias_data, gz);
#endif
}
else
{
@@ -98,6 +109,37 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate passed to image3d_ld8 on weight_blob; bumped once per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz));

afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz));

// sum += dot(v, k);
// Written out as the sum of the dot products of the two halves.
sum += dot(v[0], k[0]) + dot(v[1], k[1]);

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -123,6 +165,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -144,7 +187,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st1(top_blob_data, gi, sum);
#endif
}

+ 53
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
@@ -98,6 +109,43 @@ void main()
// group id
const int gg = gz / num_output_g;

#if NCNN_image_shader
// Image path: accumulate the group convolution by reading bottom_blob and
// weight_blob through image loads instead of flat buffer offsets.
// sz is the z coordinate into bottom_blob; it starts at gg * channels_g and
// advances by one per iteration of the z loop.
int sz = gg * channels_g;

for (int z = 0; z < channels_g; z++)
{
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate passed to image3d_ld8 on weight_blob; each tap consumes
// four consecutive afpvec8 entries, so it advances by 4 per tap.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz));

// k0 .. k3: one afpvec8 weight per component of the 4-component sum.
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz));

// sum += v * k
// Each output component is the dot product over both halves ([0] and [1])
// of the input and of the matching weight.
sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]);
sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]);
sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]);
sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]);

sx += dilation_w;
wx += 4;
}

sy += dilation_h;
}

sz += 1;
}
#else // NCNN_image_shader
int w_offset = gz * channels_g * kernel_w * kernel_h;
int v_offset_0 = gg * channels_g * psc(cstep);

@@ -129,6 +177,7 @@ void main()

v_offset_0 += psc(cstep);
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -150,7 +199,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 39
- 0
src/layer/vulkan/shader/convolutiondepthwise_pack4.comp View File

@@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler2D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -83,7 +90,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld4(bias_blob, gz);
#else
sum = buffer_ld4(bias_data, gz);
#endif
}
else
{
@@ -91,6 +102,29 @@ void main()
}

// depth-wise convolution
#if NCNN_image_shader
// Image path of the depth-wise convolution: input slice gz is convolved with
// the weights stored in row gz of the 2D weight image.
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate into weight_blob; bumped once per tap and never reset, so
// it counts through all kernel_w * kernel_h taps.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz));

afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz));

// Component-wise multiply-accumulate.
sum += v * k;

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
#else // NCNN_image_shader
int w_offset = gz * kernel_w * kernel_h;
int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;

@@ -108,6 +142,7 @@ void main()
v_offset += dilation_h * psc(w);
w_offset += kernel_w;
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -129,7 +164,11 @@ void main()
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_blob_data, gi, sum);
#endif
}

+ 41
- 0
src/layer/vulkan/shader/convolutiondepthwise_pack8.comp View File

@@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler2D weight_blob;
layout (binding = 3) uniform unfp sampler1D bias_blob;
#else // NCNN_image_shader
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
#endif // NCNN_image_shader

layout (push_constant) uniform parameter
{
@@ -84,7 +91,11 @@ void main()

if (bias_term == 1)
{
#if NCNN_image_shader
sum = image1d_ld8(bias_blob, gz);
#else
sum = buffer_ld8(bias_data, gz);
#endif
}
else
{
@@ -92,6 +103,31 @@ void main()
}

// depth-wise convolution
#if NCNN_image_shader
// Image path of the depth-wise convolution: input slice gz is convolved with
// the weights stored in row gz of the 2D weight image.
// sy / sx: bottom_blob coordinates of the current kernel tap for output (gx, gy).
int sy = gy * stride_h;
// wx: x coordinate passed to image2d_ld8 on weight_blob; bumped once per tap
// and never reset, so it counts through all kernel_w * kernel_h taps.
int wx = 0;

for (int y = 0; y < kernel_h; y++)
{
int sx = gx * stride_w;

for (int x = 0; x < kernel_w; x++)
{
afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz));

afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz));

// sum += v * k;
// Written out per half: component-wise multiply-accumulate on [0] and [1].
sum[0] += v[0] * k[0];
sum[1] += v[1] * k[1];

sx += dilation_w;
wx += 1;
}

sy += dilation_h;
}
#else // NCNN_image_shader
int w_offset = gz * kernel_w * kernel_h;
int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;

@@ -111,6 +147,7 @@ void main()
v_offset += dilation_h * psc(w);
w_offset += kernel_w;
}
#endif // NCNN_image_shader

if (activation_type == 1)
{
@@ -136,7 +173,11 @@ void main()
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_blob_data, gi, sum);
#endif
}

+ 11
- 2
src/layer/vulkan/shader/crop.comp View File

@@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,13 +74,17 @@ void main()
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;

const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int x = gx + p.woffset;
int y = gy + p.hoffset;
int z = gz + p.coffset;

#if NCNN_image_shader
// Image path: copy the element at the offset source position (x, y, z) of
// bottom_blob into the output position (gx, gy, gz) of the write-only top_blob.
image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z));
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int v_offset = z * psc(cstep) + y * psc(w) + x;

buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
}

+ 18
- 2
src/layer/vulkan/shader/crop_pack1to4.comp View File

@@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -69,12 +74,23 @@ void main()
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;

int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int x = gx + p.woffset;
int y = gy + p.hoffset;
int z = gz * 4 + p.coffset;

#if NCNN_image_shader
// Image path: gather four single values from consecutive z slices
// (z .. z + 3, where z = gz * 4 + p.coffset) at the same (x, y) into one
// afpvec4 and store it at output position (gx, gy, gz).
afpvec4 v;
v.r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0));
v.g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1));
v.b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2));
v.a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3));

image3d_st4(top_blob, ivec3(gx, gy, gz), v);
#else
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep);

buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
}

+ 22
- 2
src/layer/vulkan/shader/crop_pack1to8.comp View File

@@ -39,8 +39,13 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

layout (push_constant) uniform parameter
{
@@ -70,13 +75,28 @@ void main()
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;

int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

int x = gx + p.woffset;
int y = gy + p.hoffset;
int z = gz * 8 + p.coffset;

#if NCNN_image_shader
// Image path: gather eight single values from consecutive z slices
// (z .. z + 7, where z = gz * 8 + p.coffset) at the same (x, y) into one
// afpvec8 and store it at output position (gx, gy, gz).
// Slices z .. z + 3 fill v[0]; slices z + 4 .. z + 7 fill v[1].
afpvec8 v;
v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0));
v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1));
v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2));
v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3));
v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 4));
v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 5));
v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 6));
v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 7));

image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep);
ivec4 vv_offset = v_offset + 4 * psc(cstep);

buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save