From 62da1228e1999a1dd7bc7d1fccd0eff9ef422e69 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 30 Apr 2020 17:08:30 +0800 Subject: [PATCH] adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... 
* wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader --- .github/workflows/linux-x64-gpu-clang.yml | 6 +- .github/workflows/linux-x64-gpu-gcc.yml | 6 +- .github/workflows/macos-x64-gpu.yml | 21 +- .github/workflows/test-coverage.yml | 6 +- .github/workflows/windows-x64-gpu-vs2019.yml | 6 +- benchmark/benchncnn.cpp | 9 +- cmake/ncnn_generate_shader_spv_header.cmake | 298 +++ .../low-level-operation-api.md | 8 +- src/CMakeLists.txt | 12 + src/allocator.cpp | 1287 ++++++--- src/allocator.h | 137 +- src/command.cpp | 2345 ++++++++++++++--- src/command.h | 41 +- src/convert_ycbcr.comp | 41 +- src/gpu.cpp | 503 +++- src/gpu.h | 58 +- src/layer.cpp | 35 + src/layer.h | 13 + src/layer/input.cpp | 17 +- src/layer/input.h | 5 + src/layer/noop.cpp | 6 + src/layer/noop.h | 1 + src/layer/split.cpp | 14 +- src/layer/split.h | 1 + src/layer/vulkan/absval_vulkan.cpp | 39 +- src/layer/vulkan/absval_vulkan.h | 1 + src/layer/vulkan/cast_vulkan.cpp | 116 +- src/layer/vulkan/cast_vulkan.h | 1 + src/layer/vulkan/concat_vulkan.cpp | 494 +++- src/layer/vulkan/concat_vulkan.h | 1 + src/layer/vulkan/convolution_vulkan.cpp | 693 ++++- src/layer/vulkan/convolution_vulkan.h | 6 + .../vulkan/convolutiondepthwise_vulkan.cpp | 311 ++- .../vulkan/convolutiondepthwise_vulkan.h | 4 + src/layer/vulkan/crop_vulkan.cpp | 289 +- src/layer/vulkan/crop_vulkan.h | 4 + src/layer/vulkan/deconvolution_vulkan.cpp | 248 +- src/layer/vulkan/deconvolution_vulkan.h | 4 + .../vulkan/deconvolutiondepthwise_vulkan.cpp | 462 +++- .../vulkan/deconvolutiondepthwise_vulkan.h | 4 + src/layer/vulkan/eltwise_vulkan.cpp | 77 +- src/layer/vulkan/eltwise_vulkan.h | 1 + src/layer/vulkan/flatten_vulkan.cpp | 97 +- src/layer/vulkan/flatten_vulkan.h | 1 + src/layer/vulkan/innerproduct_vulkan.cpp | 124 +- src/layer/vulkan/innerproduct_vulkan.h | 4 + src/layer/vulkan/packing_vulkan.cpp | 143 +- src/layer/vulkan/packing_vulkan.h | 1 + 
src/layer/vulkan/padding_vulkan.cpp | 154 +- src/layer/vulkan/padding_vulkan.h | 5 + src/layer/vulkan/pooling_vulkan.cpp | 198 +- src/layer/vulkan/pooling_vulkan.h | 3 + src/layer/vulkan/shader/absval.comp | 40 + src/layer/vulkan/shader/absval_pack4.comp | 40 + src/layer/vulkan/shader/absval_pack8.comp | 40 + .../vulkan/shader/cast_fp16_to_fp32.comp | 24 + .../shader/cast_fp16_to_fp32_pack4.comp | 24 + .../shader/cast_fp16_to_fp32_pack8.comp | 24 + .../vulkan/shader/cast_fp32_to_fp16.comp | 24 + .../shader/cast_fp32_to_fp16_pack4.comp | 24 + .../shader/cast_fp32_to_fp16_pack8.comp | 24 + src/layer/vulkan/shader/concat.comp | 27 + src/layer/vulkan/shader/concat_pack4.comp | 27 + src/layer/vulkan/shader/concat_pack4to1.comp | 78 + src/layer/vulkan/shader/concat_pack8.comp | 27 + src/layer/vulkan/shader/concat_pack8to1.comp | 102 + src/layer/vulkan/shader/concat_pack8to4.comp | 67 + src/layer/vulkan/shader/convolution.comp | 38 + .../vulkan/shader/convolution_1x1s1d1.comp | 89 +- .../vulkan/shader/convolution_pack1to4.comp | 42 + .../vulkan/shader/convolution_pack1to8.comp | 44 + .../vulkan/shader/convolution_pack4.comp | 47 + .../shader/convolution_pack4_1x1s1d1.comp | 91 +- ...olution_pack4_3x3s1d1_winograd23_gemm.comp | 37 + ...k4_3x3s1d1_winograd23_transform_input.comp | 51 + ...4_3x3s1d1_winograd23_transform_output.comp | 41 + .../vulkan/shader/convolution_pack4to1.comp | 42 + .../vulkan/shader/convolution_pack4to8.comp | 57 + .../vulkan/shader/convolution_pack8.comp | 57 + .../shader/convolution_pack8_1x1s1d1.comp | 123 +- ...olution_pack8_3x3s1d1_winograd23_gemm.comp | 71 + ...k8_3x3s1d1_winograd23_transform_input.comp | 51 + ...8_3x3s1d1_winograd23_transform_output.comp | 41 + .../vulkan/shader/convolution_pack8to1.comp | 43 + .../vulkan/shader/convolution_pack8to4.comp | 49 + .../vulkan/shader/convolutiondepthwise.comp | 35 + .../shader/convolutiondepthwise_group.comp | 42 + .../convolutiondepthwise_group_pack1to4.comp | 46 + 
.../convolutiondepthwise_group_pack1to8.comp | 48 + .../convolutiondepthwise_group_pack4.comp | 51 + .../convolutiondepthwise_group_pack4to1.comp | 46 + .../convolutiondepthwise_group_pack4to8.comp | 61 + .../convolutiondepthwise_group_pack8.comp | 61 + .../convolutiondepthwise_group_pack8to1.comp | 47 + .../convolutiondepthwise_group_pack8to4.comp | 53 + .../shader/convolutiondepthwise_pack4.comp | 39 + .../shader/convolutiondepthwise_pack8.comp | 41 + src/layer/vulkan/shader/crop.comp | 13 +- src/layer/vulkan/shader/crop_pack1to4.comp | 20 +- src/layer/vulkan/shader/crop_pack1to8.comp | 24 +- src/layer/vulkan/shader/crop_pack4.comp | 13 +- src/layer/vulkan/shader/crop_pack4to1.comp | 15 +- src/layer/vulkan/shader/crop_pack4to8.comp | 28 + src/layer/vulkan/shader/crop_pack8.comp | 13 +- src/layer/vulkan/shader/crop_pack8to1.comp | 15 +- src/layer/vulkan/shader/crop_pack8to4.comp | 20 + src/layer/vulkan/shader/deconvolution.comp | 46 + .../vulkan/shader/deconvolution_pack1to4.comp | 50 + .../vulkan/shader/deconvolution_pack1to8.comp | 52 + .../vulkan/shader/deconvolution_pack4.comp | 55 + .../vulkan/shader/deconvolution_pack4to1.comp | 50 + .../vulkan/shader/deconvolution_pack4to8.comp | 65 + .../vulkan/shader/deconvolution_pack8.comp | 65 + .../vulkan/shader/deconvolution_pack8to1.comp | 51 + .../vulkan/shader/deconvolution_pack8to4.comp | 57 + .../vulkan/shader/deconvolutiondepthwise.comp | 43 + .../shader/deconvolutiondepthwise_group.comp | 49 + ...deconvolutiondepthwise_group_pack1to4.comp | 53 + ...deconvolutiondepthwise_group_pack1to8.comp | 55 + .../deconvolutiondepthwise_group_pack4.comp | 58 + ...deconvolutiondepthwise_group_pack4to1.comp | 53 + ...deconvolutiondepthwise_group_pack4to8.comp | 68 + .../deconvolutiondepthwise_group_pack8.comp | 68 + ...deconvolutiondepthwise_group_pack8to1.comp | 54 + ...deconvolutiondepthwise_group_pack8to4.comp | 60 + .../shader/deconvolutiondepthwise_pack4.comp | 47 + .../shader/deconvolutiondepthwise_pack8.comp | 49 + 
src/layer/vulkan/shader/eltwise.comp | 47 + src/layer/vulkan/shader/eltwise_pack4.comp | 47 + src/layer/vulkan/shader/eltwise_pack8.comp | 47 + src/layer/vulkan/shader/flatten.comp | 21 + src/layer/vulkan/shader/flatten_pack1to4.comp | 36 + src/layer/vulkan/shader/flatten_pack1to8.comp | 51 + src/layer/vulkan/shader/flatten_pack4.comp | 46 + src/layer/vulkan/shader/flatten_pack4to8.comp | 67 + src/layer/vulkan/shader/flatten_pack8.comp | 67 + src/layer/vulkan/shader/innerproduct.comp | 22 + .../vulkan/shader/innerproduct_pack1to4.comp | 26 + .../vulkan/shader/innerproduct_pack1to8.comp | 30 + .../vulkan/shader/innerproduct_pack4.comp | 34 + .../vulkan/shader/innerproduct_pack4to1.comp | 26 + .../vulkan/shader/innerproduct_pack4to8.comp | 45 + .../vulkan/shader/innerproduct_pack8.comp | 45 + .../vulkan/shader/innerproduct_pack8to1.comp | 27 + .../vulkan/shader/innerproduct_pack8to4.comp | 37 + src/layer/vulkan/shader/packing_1to4.comp | 47 + src/layer/vulkan/shader/packing_1to8.comp | 59 + src/layer/vulkan/shader/packing_4to1.comp | 47 + src/layer/vulkan/shader/packing_4to8.comp | 41 + src/layer/vulkan/shader/packing_8to1.comp | 59 + src/layer/vulkan/shader/packing_8to4.comp | 41 + src/layer/vulkan/shader/padding.comp | 31 +- src/layer/vulkan/shader/padding_pack4.comp | 31 +- src/layer/vulkan/shader/padding_pack8.comp | 31 +- src/layer/vulkan/shader/pooling.comp | 65 +- src/layer/vulkan/shader/pooling_global.comp | 34 +- .../vulkan/shader/pooling_global_pack4.comp | 34 +- .../vulkan/shader/pooling_global_pack8.comp | 37 +- src/layer/vulkan/shader/pooling_pack4.comp | 61 +- src/layer/vulkan/shader/pooling_pack8.comp | 66 +- src/layer/vulkan/shader/softmax_div_sum.comp | 62 + .../vulkan/shader/softmax_div_sum_pack4.comp | 62 + .../vulkan/shader/softmax_div_sum_pack8.comp | 62 + .../vulkan/shader/softmax_exp_sub_max.comp | 62 + .../shader/softmax_exp_sub_max_pack4.comp | 62 + .../shader/softmax_exp_sub_max_pack8.comp | 62 + .../vulkan/shader/softmax_reduce_max.comp | 
56 + .../shader/softmax_reduce_max_pack4.comp | 56 + .../shader/softmax_reduce_max_pack8.comp | 56 + .../vulkan/shader/softmax_reduce_sum.comp | 74 +- .../shader/softmax_reduce_sum_pack4.comp | 74 +- .../shader/softmax_reduce_sum_pack8.comp | 74 +- src/layer/vulkan/softmax_vulkan.cpp | 149 ++ src/layer/vulkan/softmax_vulkan.h | 1 + src/mat.cpp | 2 +- src/mat.h | 412 ++- src/net.cpp | 1405 +++++++--- src/net.h | 28 +- src/option.cpp | 5 + src/option.h | 6 + src/pipeline.cpp | 147 +- src/pipeline.h | 10 +- tests/test_cast.cpp | 257 +- tests/test_packing.cpp | 289 +- tests/testutil.h | 262 +- 185 files changed, 15214 insertions(+), 1772 deletions(-) diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml index 33492a538..2b250d747 100644 --- a/.github/workflows/linux-x64-gpu-clang.yml +++ b/.github/workflows/linux-x64-gpu-clang.yml @@ -27,14 +27,14 @@ jobs: uses: actions/cache@v1 with: path: swiftshader-install - key: swiftshader-linux-install + key: swiftshader-linux-install-20200426-3 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v2 with: repository: google/swiftshader path: swiftshader - ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 + ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | @@ -45,7 +45,7 @@ jobs: run: | cd swiftshader mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. 
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j 2 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml index 0e78ac8f3..bf181028f 100644 --- a/.github/workflows/linux-x64-gpu-gcc.yml +++ b/.github/workflows/linux-x64-gpu-gcc.yml @@ -27,14 +27,14 @@ jobs: uses: actions/cache@v1 with: path: swiftshader-install - key: swiftshader-linux-install + key: swiftshader-linux-install-20200426-3 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v2 with: repository: google/swiftshader path: swiftshader - ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 + ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | @@ -45,7 +45,7 @@ jobs: run: | cd swiftshader mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. cmake --build . 
-j 2 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml index 612799964..959f2ab22 100644 --- a/.github/workflows/macos-x64-gpu.yml +++ b/.github/workflows/macos-x64-gpu.yml @@ -25,14 +25,14 @@ jobs: uses: actions/cache@v1 with: path: swiftshader-install - key: swiftshader-macos-install + key: swiftshader-macos-install-20200426-3 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v2 with: repository: google/swiftshader path: swiftshader - ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 + ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | @@ -43,7 +43,7 @@ jobs: run: | cd swiftshader mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j 2 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install @@ -51,16 +51,11 @@ jobs: run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 -# - name: test -# run: | -# find "swiftshader-install/" -# find "vulkansdk-macos-1.1.114.0/" -# export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH -# export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" -# ./vulkansdk-macos-1.1.114.0/macOS/bin/vulkaninfo -# cd build && ctest --output-on-failure -j 2 -# export VK_ICD_FILENAMES="vulkansdk-macos-1.1.114.0/macOS/etc/vulkan/icd.d/MoltenVK_icd.json" -# cd build && ctest --output-on-failure -j 2 + - name: test + run: | + export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 2 macos-clang-gpu-nostdio: runs-on: macos-latest diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index d0c29f1e1..27707711b 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -25,14 +25,14 @@ jobs: uses: actions/cache@v1 with: path: swiftshader-install - key: swiftshader-linux-install + key: swiftshader-linux-install-20200426-3 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v2 with: repository: google/swiftshader path: swiftshader - ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 + ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | @@ -43,7 +43,7 @@ jobs: run: | cd swiftshader mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. 
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. cmake --build . -j 2 mkdir $GITHUB_WORKSPACE/swiftshader-install cp Linux/* $GITHUB_WORKSPACE/swiftshader-install diff --git a/.github/workflows/windows-x64-gpu-vs2019.yml b/.github/workflows/windows-x64-gpu-vs2019.yml index 6cdff29b8..068837b93 100644 --- a/.github/workflows/windows-x64-gpu-vs2019.yml +++ b/.github/workflows/windows-x64-gpu-vs2019.yml @@ -37,14 +37,14 @@ jobs: uses: actions/cache@v1 with: path: swiftshader-install - key: swiftshader-windows-install + key: swiftshader-windows-install-20200426-3 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cache-hit != 'true' uses: actions/checkout@v2 with: repository: google/swiftshader path: swiftshader - ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 + ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a - name: checkout-swiftshader-submodules if: steps.cache-swiftshader.outputs.cache-hit != 'true' run: | @@ -55,7 +55,7 @@ jobs: run: | cd swiftshader mkdir build-vs2019; cd build-vs2019 - cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. cmake --build . 
--config Release -j 2 mkdir "$env:GITHUB_WORKSPACE/swiftshader-install" Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install" diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 329bb68c9..455b6ae3c 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -188,8 +188,8 @@ int main(int argc, char** argv) g_vkdev = ncnn::get_gpu_device(gpu_device); - g_blob_vkallocator = new ncnn::VkBlobBufferAllocator(g_vkdev); - g_staging_vkallocator = new ncnn::VkStagingBufferAllocator(g_vkdev); + g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev); + g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev); } #endif // NCNN_VULKAN @@ -214,6 +214,11 @@ int main(int argc, char** argv) opt.use_int8_storage = true; opt.use_int8_arithmetic = true; opt.use_packing_layout = true; + opt.use_shader_pack8 = false; + opt.use_image_storage = false; + opt.use_image_fp16_packed = true; + opt.use_image_fp16_storage = true; + opt.use_image_fp16_arithmetic = true; ncnn::set_cpu_powersave(powersave); diff --git a/cmake/ncnn_generate_shader_spv_header.cmake b/cmake/ncnn_generate_shader_spv_header.cmake index 6e9d66af5..65dcdff4b 100644 --- a/cmake/ncnn_generate_shader_spv_header.cmake +++ b/cmake/ncnn_generate_shader_spv_header.cmake @@ -184,6 +184,296 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER ) set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) + # image + fp32 + set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image") + + set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h) + add_custom_command( + OUTPUT ${SHADER_image_SPV_HEX_FILE} + COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} + ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4 + -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 + + -Dimfmtc1=r32f -Dimfmtc4=rgba32f + -Dunfp=highp + + "-D 
image1d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" + "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" + "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" + "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" + "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" + "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" + "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" + "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" 
+ "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" + + "-D buffer_ld1(buf,i)=buf[i]" + "-D buffer_st1(buf,i,v)={buf[i]=v;}" + "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" + "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" + "-D buffer_ld2(buf,i)=buf[i]" + "-D buffer_st2(buf,i,v)={buf[i]=v;}" + "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_ld4(buf,i)=buf[i]" + "-D buffer_st4(buf,i,v)={buf[i]=v;}" + "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}" + "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}" + "-D buffer_ld8(buf,i)=buf[i]" + "-D buffer_st8(buf,i,v)={buf[i]=v;}" + "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" + "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" + + "-D sfp2afpmat4(v)=v" + "-D afp2sfpmat4(v)=v" + "-D psc(x)=(x==0?p.x:x)" + -DNCNN_image_shader=1 + -V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC} + DEPENDS ${SHADER_SRC} + COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv" + VERBATIM + ) + set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) + + # image + fp16p + set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p") + + set(SHADER_image_fp16p_SPV_HEX_FILE 
${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h) + add_custom_command( + OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE} + COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} + ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 + -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 + + -Dimfmtc1=r32f -Dimfmtc4=rgba16f + -Dunfp=mediump + + "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" + "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" + "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" + "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" + "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" + "-D 
image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" + "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" + "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" + "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" + + "-D buffer_ld1(buf,i)=buf[i]" + "-D buffer_st1(buf,i,v)={buf[i]=v;}" + "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}" + "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}" + "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])" + "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}" + "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))" + "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}" + "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" + "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" + "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))" + "-D 
buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}" + "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" + "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" + + "-D psc(x)=(x==0?p.x:x)" + -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 + -V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC} + DEPENDS ${SHADER_SRC} + COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv" + VERBATIM + ) + set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) + + # image + fp16s + set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s") + + set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h) + add_custom_command( + OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE} + COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} + ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 + -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 + + -Dimfmtc1=r16f -Dimfmtc4=rgba16f + -Dunfp=mediump + + "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" + "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" + "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D 
image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" + "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" + "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" + "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" + "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" + "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" + "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" + "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" + "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" + "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" + "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" + + "-D buffer_ld1(buf,i)=float(buf[i])" + "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}" + "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}" + "-D 
buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}" + "-D buffer_ld2(buf,i)=vec2(buf[i])" + "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}" + "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_ld4(buf,i)=vec4(buf[i])" + "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}" + "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" + "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}" + "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))" + "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}" + "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}" + "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}" + "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}" + + "-D sfp2afpmat4(v)=v" + "-D afp2sfpmat4(v)=v" + "-D psc(x)=(x==0?p.x:x)" + -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 + -V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC} + DEPENDS ${SHADER_SRC} + COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv" + VERBATIM + ) + set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) + + # image + fp16a + set(SHADER_image_fp16a_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16a") + + set(SHADER_image_fp16a_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h) + add_custom_command( + OUTPUT ${SHADER_image_fp16a_SPV_HEX_FILE} + COMMAND 
${GLSLANGVALIDATOR_EXECUTABLE} + ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4 + -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 + + -Dimfmtc1=r16f -Dimfmtc4=rgba16f + -Dunfp=mediump + + "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" + "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" + "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" + "-D image1d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" + "-D image2d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" + "-D image3d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" + "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" + "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" + "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" + "-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}" + "-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}" + "-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}" + "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" + + "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" + "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" + "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" + "-D image1d_st8(img,p,v)={imageStore(img,p*2,vec4(v[0]));imageStore(img,p*2+1,vec4(v[1]));}" + "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}" + "-D 
image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}" + "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" + "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" + "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" + + "-D buffer_ld1(buf,i)=buf[i]" + "-D buffer_st1(buf,i,v)={buf[i]=v;}" + "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" + "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" + "-D buffer_ld2(buf,i)=buf[i]" + "-D buffer_st2(buf,i,v)={buf[i]=v;}" + "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_ld4(buf,i)=buf[i]" + "-D buffer_st4(buf,i,v)={buf[i]=v;}" + "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" + "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}" + "-D buffer_ld8(buf,i)=buf[i]" + "-D buffer_st8(buf,i,v)={buf[i]=v;}" + "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" + "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" + "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" + "-D sfp2afpmat4(v)=v" + "-D afp2sfpmat4(v)=v" + + "-D psc(x)=(x==0?p.x:x)" + 
-DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1 + -V -s -x -o ${SHADER_image_fp16a_SPV_HEX_FILE} ${SHADER_SRC} + DEPENDS ${SHADER_SRC} + COMMENT "Building SPIR-V module ${SHADER_image_fp16a_SRC_NAME_WE}.spv" + VERBATIM + ) + set_source_files_properties(${SHADER_image_fp16a_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) + set(LOCAL_SHADER_SPV_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h) file(WRITE ${LOCAL_SHADER_SPV_HEADER} @@ -192,6 +482,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER "static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n" "static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" "static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n" + "static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n" + "static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n" + "static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" + "static const uint32_t ${SHADER_image_fp16a_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h\"\n};\n" ) set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE) @@ -202,6 +496,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER ${SHADER_fp16pa_SPV_HEX_FILE} ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_fp16sa_SPV_HEX_FILE} + ${SHADER_image_SPV_HEX_FILE} + ${SHADER_image_fp16p_SPV_HEX_FILE} + ${SHADER_image_fp16s_SPV_HEX_FILE} + ${SHADER_image_fp16a_SPV_HEX_FILE} ) set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE) diff --git 
a/docs/developer-guide/low-level-operation-api.md b/docs/developer-guide/low-level-operation-api.md index e4d2f51ce..d5949675a 100644 --- a/docs/developer-guide/low-level-operation-api.md +++ b/docs/developer-guide/low-level-operation-api.md @@ -141,10 +141,10 @@ ncnn::create_gpu_instance(); { ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); -ncnn::VkWeightBufferAllocator g_weight_vkallocator(vkdev); -ncnn::VkBlobBufferAllocator g_blob_vkallocator(vkdev); -ncnn::VkStagingBufferAllocator g_staging_vkallocator(vkdev); -ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev); +ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); +ncnn::VkBlobAllocator g_blob_vkallocator(vkdev); +ncnn::VkStagingAllocator g_staging_vkallocator(vkdev); +ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); // create layer ncnn::Layer* convolution = ncnn::create_layer("Convolution"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3696a587c..08e1da0dd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,10 @@ macro(ncnn_add_shader SHADER_SRC) string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry 
"{${SHADER_SRC_NAME_WE}_image_fp16a_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16a_spv_data)},\n") list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) @@ -75,6 +79,14 @@ macro(ncnn_add_shader SHADER_SRC) math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16a = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") endmacro() macro(ncnn_add_layer class) diff --git a/src/allocator.cpp b/src/allocator.cpp index 5b7569d5c..6cc902400 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -256,7 +256,7 @@ VkAllocator::VkAllocator(const VulkanDevice* _vkdev) : vkdev(_vkdev) static inline size_t round_up(size_t n, size_t multiple) { - return (n + n - 1) / multiple * multiple; + return (n + multiple - 1) / multiple * multiple; } static inline size_t round_down(size_t n, size_t multiple) @@ -350,7 +350,7 @@ VkDeviceMemory VkAllocator::allocate_memory(size_t size) return memory; } -VkDeviceMemory 
VkAllocator::allocate_dedicated_memory(size_t size, VkBuffer buffer) +VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, VkImage image, VkBuffer buffer) { VkMemoryAllocateInfo memoryAllocateInfo; memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; @@ -361,7 +361,7 @@ VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, VkBuffer buff VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; memoryDedicatedAllocateInfo.pNext = 0; - memoryDedicatedAllocateInfo.image = 0; + memoryDedicatedAllocateInfo.image = image; memoryDedicatedAllocateInfo.buffer = buffer; memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; @@ -376,6 +376,68 @@ VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, VkBuffer buff return memory; } +VkImage VkAllocator::create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage) +{ + VkImageCreateInfo imageCreateInfo; + imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + imageCreateInfo.pNext = 0; + imageCreateInfo.flags = 0; + imageCreateInfo.imageType = type; + imageCreateInfo.format = format; + imageCreateInfo.extent.width = width; + imageCreateInfo.extent.height = height; + imageCreateInfo.extent.depth = depth; + imageCreateInfo.mipLevels = 1; + imageCreateInfo.arrayLayers = 1; + imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageCreateInfo.tiling = tiling; + imageCreateInfo.usage = usage; + imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + imageCreateInfo.queueFamilyIndexCount = 0; + imageCreateInfo.pQueueFamilyIndices = 0; + imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImage image; + VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateImage failed %d %d %d %d %d %d %d %d\n", ret, type, width, height, 
depth, format, tiling, usage); + return 0; + } + + return image; +} + +VkImageView VkAllocator::create_imageview(VkImageViewType type, VkImage image, VkFormat format) +{ + VkImageViewCreateInfo imageViewCreateInfo; + imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + imageViewCreateInfo.pNext = 0; + imageViewCreateInfo.flags = 0; + imageViewCreateInfo.image = image; + imageViewCreateInfo.viewType = type; + imageViewCreateInfo.format = format; + imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + imageViewCreateInfo.subresourceRange.baseMipLevel = 0; + imageViewCreateInfo.subresourceRange.levelCount = 1; + imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; + imageViewCreateInfo.subresourceRange.layerCount = 1; + + VkImageView imageview; + VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateImageView failed %d\n", ret); + return 0; + } + + return imageview; +} + static inline size_t least_common_multiple(size_t a, size_t b) { if (a == b) @@ -393,9 +455,10 @@ static inline size_t least_common_multiple(size_t a, size_t b) return lcm; } -VkBlobBufferAllocator::VkBlobBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) +VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { buffer_offset_alignment = vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; if (vkdev->info.type == 1) { @@ -410,23 +473,23 @@ VkBlobBufferAllocator::VkBlobBufferAllocator(const VulkanDevice* _vkdev) : VkAll block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M } 
-VkBlobBufferAllocator::~VkBlobBufferAllocator() +VkBlobAllocator::~VkBlobAllocator() { clear(); } -void VkBlobBufferAllocator::clear() +void VkBlobAllocator::clear() { -// fprintf(stderr, "VkBlobBufferAllocator %lu\n", buffer_blocks.size()); +// fprintf(stderr, "VkBlobAllocator %lu\n", buffer_blocks.size()); for (size_t i=0; i >::iterator it = budgets[i].begin(); -// while (it != budgets[i].end()) +// std::list< std::pair >::iterator it = buffer_budgets[i].begin(); +// while (it != buffer_budgets[i].end()) // { -// fprintf(stderr, "VkBlobBufferAllocator budget %p %lu %lu\n", ptr->buffer, it->first, it->second); +// fprintf(stderr, "VkBlobAllocator budget %p %lu %lu\n", ptr->buffer, it->first, it->second); // it++; // } @@ -440,10 +503,27 @@ void VkBlobBufferAllocator::clear() } buffer_blocks.clear(); - budgets.clear(); + buffer_budgets.clear(); + + for (size_t i=0; i >::iterator it = image_memory_budgets[i].begin(); +// while (it != image_memory_budgets[i].end()) +// { +// fprintf(stderr, "VkBlobAllocator budget %p %lu %lu\n", memory, it->first, it->second); +// it++; +// } + + vkFreeMemory(vkdev->vkdevice(), memory, 0); + } + image_memory_blocks.clear(); + + image_memory_budgets.clear(); } -VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) +VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) { size_t aligned_size = alignSize(size, buffer_offset_alignment); @@ -452,8 +532,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) // find first spare space in buffer_blocks for (int i=0; i >::iterator it = budgets[i].begin(); - while (it != budgets[i].end()) + std::list< std::pair >::iterator it = buffer_budgets[i].begin(); + while (it != buffer_budgets[i].end()) { size_t budget_size = it->second; if (budget_size < aligned_size) @@ -473,10 +553,10 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - // adjust budgets + // adjust buffer_budgets if 
(budget_size == aligned_size) { - budgets[i].erase(it); + buffer_budgets[i].erase(it); } else { @@ -484,7 +564,7 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) it->second -= aligned_size; } -// fprintf(stderr, "VkBlobBufferAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); +// fprintf(stderr, "VkBlobAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); return ptr; } @@ -523,6 +603,7 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) block->memory = allocate_memory(memoryRequirements.size); + // ignore memoryRequirements.alignment as we always bind at zero offset vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); block->mapped_ptr = 0; @@ -544,22 +625,22 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - // adjust budgets + // adjust buffer_budgets std::list< std::pair > budget; if (new_block_size > aligned_size) { budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); } - budgets.push_back(budget); + buffer_budgets.push_back(budget); -// fprintf(stderr, "VkBlobBufferAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); +// fprintf(stderr, "VkBlobAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); return ptr; } -void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) +void VkBlobAllocator::fastFree(VkBufferMemory* ptr) { -// fprintf(stderr, "VkBlobBufferAllocator F %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); +// fprintf(stderr, "VkBlobAllocator F %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); const int buffer_block_count = buffer_blocks.size(); @@ -575,7 +656,7 @@ void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) if (block_index == -1) { - fprintf(stderr, "FATAL ERROR! unlocked VkBlobBufferAllocator get wild %p\n", ptr->buffer); + fprintf(stderr, "FATAL ERROR! 
unlocked VkBlobAllocator get wild %p\n", ptr->buffer); delete ptr; @@ -583,10 +664,10 @@ void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) } // merge - std::list< std::pair >::iterator it_merge_left = budgets[block_index].end(); - std::list< std::pair >::iterator it_merge_right = budgets[block_index].end(); - std::list< std::pair >::iterator it = budgets[block_index].begin(); - for ( ; it != budgets[block_index].end(); it++) + std::list< std::pair >::iterator it_merge_left = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it = buffer_budgets[block_index].begin(); + for ( ; it != buffer_budgets[block_index].end(); it++) { if (it->first + it->second == ptr->offset) { @@ -598,16 +679,16 @@ void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) } } - if (it_merge_left != budgets[block_index].end() && it_merge_right != budgets[block_index].end()) + if (it_merge_left != buffer_budgets[block_index].end() && it_merge_right != buffer_budgets[block_index].end()) { it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; - budgets[block_index].erase(it_merge_right); + buffer_budgets[block_index].erase(it_merge_right); } - else if (it_merge_left != budgets[block_index].end()) + else if (it_merge_left != buffer_budgets[block_index].end()) { it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first; } - else if (it_merge_right != budgets[block_index].end()) + else if (it_merge_right != buffer_budgets[block_index].end()) { it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset; it_merge_right->first = ptr->offset; @@ -617,192 +698,170 @@ void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) if (ptr->offset == 0) { // chain leading block - budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity)); + 
buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity)); } else { - budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity)); + buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity)); } } delete ptr; } -VkWeightBufferAllocator::VkWeightBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) +VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) { - buffer_offset_alignment = vkdev->info.buffer_offset_alignment; - - if (vkdev->info.type == 1) + if (elempack != 1 && elempack != 4 && elempack != 8) { - // on integrated gpu, there may be device local only memory too, eg. AMD APU - // assuming larger alignment always keeps us safe :) - - // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size - buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); - buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + fprintf(stderr, "elempack must be 1 4 8\n"); + return 0; } - block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M -} + // resolve format + VkFormat format = VK_FORMAT_UNDEFINED; -VkWeightBufferAllocator::~VkWeightBufferAllocator() -{ - clear(); -} + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } -void VkWeightBufferAllocator::clear() -{ -// fprintf(stderr, "VkWeightBufferAllocator %lu %lu\n", buffer_blocks.size(), dedicated_buffer_blocks.size()); + // resolve image width 
height depth + int width = w; + int height = h; + int depth = c; - buffer_block_free_spaces.clear(); + // large elempack spills on image w + if (elempack == 8) width *= 2; - for (size_t i=0; ivkdevice(), ptr->memory); - - vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); - vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; - delete ptr; + if (width > (int)vkdev->info.max_image_dimension_1d) + { + fprintf(stderr, "image dimension too large %d > %d\n", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } } - buffer_blocks.clear(); - - for (size_t i=0; ivkdevice(), ptr->memory); - - vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); - vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; - delete ptr; + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + fprintf(stderr, "image dimension too large %d %d > %d\n", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } } - dedicated_buffer_blocks.clear(); -} - -VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) -{ -// fprintf(stderr, "VkWeightBufferAllocator fastMalloc %lu\n", size); - - size_t aligned_size = alignSize(size, buffer_offset_alignment); - - const int buffer_block_count = buffer_blocks.size(); - - // find first spare space in buffer_blocks - int block_index = -1; - size_t block_offset = 0; - for (int i=0; i= aligned_size) + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if (width > (int)vkdev->info.max_image_dimension_3d || height > (int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) { - block_index = i; - block_offset = block_size - free_size; - break; + fprintf(stderr, "image dimension too large %d %d %d > %d\n", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; } } - 
if (block_index != -1) - { - // return sub buffer - VkBufferMemory* ptr = new VkBufferMemory; + VkImageMemory* ptr = new VkImageMemory; - ptr->buffer = buffer_blocks[block_index]->buffer; - ptr->offset = block_offset; - ptr->memory = buffer_blocks[block_index]->memory; - ptr->capacity = aligned_size; - ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr; - ptr->access_flags = 0; - ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); - buffer_block_free_spaces[block_index] -= aligned_size; + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; - return ptr; - } + // TODO respect VK_KHR_dedicated_allocation ? + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); - size_t new_block_size = std::max(block_size, aligned_size); + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); - // create new block - VkBufferMemory* block = new VkBufferMemory; + size_t aligned_size = alignSize(size, alignment); - block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); - block->offset = 0; + const int image_memory_block_count = image_memory_blocks.size(); - if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + // find first spare space in image_memory_blocks + for (int i=0; ibuffer; + std::list< std::pair >::iterator it = image_memory_budgets[i].begin(); + while (it != image_memory_budgets[i].end()) + { + // we cannot use it->first directly for base offset alignment + 
size_t bind_base_offset = it->first; + size_t bind_offset = alignSize(bind_base_offset, alignment); + size_t budget_size = it->second; + if (budget_size < aligned_size + (bind_offset - bind_base_offset)) + { + it++; + continue; + } - VkMemoryRequirements2KHR memoryRequirements2; - memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; - memoryRequirements2.pNext = 0; + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; - VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; - memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; - memoryDedicatedRequirements.pNext = 0; - memoryRequirements2.pNext = &memoryDedicatedRequirements; + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); - vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2); + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; - bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + ptr->imageview = create_imageview(imageview_type, ptr->image, format); - if (dedicatedAllocation) - { - // setup memory type and alignment - if (memory_type_index == (uint32_t)-1) + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) { - if (vkdev->info.type == 1) - { - // integrated gpu, prefer unified memory - memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); - } - else - { - // discrete gpu, device local - memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, 
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - } + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); - mappable = vkdev->is_mappable(memory_type_index); - coherent = vkdev->is_coherent(memory_type_index); + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; } - block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, block->buffer); - - vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); - - block->mapped_ptr = 0; - if (mappable) + // adjust image_memory_budgets + if (budget_size == aligned_size) { - vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + image_memory_budgets[i].erase(it); + } + else + { + it->first += aligned_size; + it->second -= aligned_size; } - dedicated_buffer_blocks.push_back(block); - - // return sub buffer - VkBufferMemory* ptr = new VkBufferMemory; - - ptr->buffer = block->buffer; - ptr->offset = 0; - ptr->memory = block->memory; - ptr->capacity = new_block_size; - ptr->mapped_ptr = block->mapped_ptr; - ptr->access_flags = 0; - ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; +// fprintf(stderr, "VkBlobAllocator M %p +%lu %lu\n", ptr->memory, ptr->bind_offset, ptr->bind_capacity); return ptr; } } - VkMemoryRequirements memoryRequirements; - vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); - // setup memory type and alignment if (memory_type_index == (uint32_t)-1) { @@ -821,158 +880,697 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) coherent = vkdev->is_coherent(memory_type_index); } - block->memory = allocate_memory(memoryRequirements.size); - - vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); - -// fprintf(stderr, 
"VkWeightBufferAllocator M %p\n", block->buffer); + // create new block + size_t new_block_size = std::max(block_size, aligned_size); - block->mapped_ptr = 0; - if (mappable) - { - vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); - } + // bind at memory offset + ptr->memory = allocate_memory(new_block_size); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; - buffer_blocks.push_back(block); + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); - buffer_block_free_spaces.push_back(new_block_size - aligned_size); + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; - // return sub buffer - VkBufferMemory* ptr = new VkBufferMemory; + ptr->imageview = create_imageview(imageview_type, ptr->image, format); - ptr->buffer = block->buffer; - ptr->offset = 0; - ptr->memory = block->memory; - ptr->capacity = aligned_size; - ptr->mapped_ptr = block->mapped_ptr; ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; - return ptr; -} - -void VkWeightBufferAllocator::fastFree(VkBufferMemory* ptr) -{ -// fprintf(stderr, "VkWeightBufferAllocator F %p\n", ptr->buffer); + // adjust image_memory_budgets + image_memory_blocks.push_back(ptr->memory); - delete ptr; -} + std::list< std::pair > budget; + if (new_block_size > aligned_size) + { + budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); + } + image_memory_budgets.push_back(budget); -VkStagingBufferAllocator::VkStagingBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) -{ - mappable = true; - coherent = true; +// fprintf(stderr, "VkBlobAllocator M %p +%lu %lu\n", ptr->memory, ptr->bind_offset, ptr->bind_capacity); - size_compare_ratio = 192;// 0.75f * 256 + return ptr; } 
-VkStagingBufferAllocator::~VkStagingBufferAllocator() +void VkBlobAllocator::fastFree(VkImageMemory* ptr) { - clear(); -} +// fprintf(stderr, "VkBlobAllocator F %p +%lu %lu\n", ptr->memory, ptr->bind_offset, ptr->bind_capacity); -void VkStagingBufferAllocator::set_size_compare_ratio(float scr) -{ - if (scr < 0.f || scr > 1.f) + const int image_memory_block_count = image_memory_blocks.size(); + + int block_index = -1; + for (int i=0; imemory) + { + block_index = i; + break; + } } - size_compare_ratio = (unsigned int)(scr * 256); -} + if (block_index == -1) + { + fprintf(stderr, "FATAL ERROR! unlocked VkBlobAllocator get wild %p\n", ptr->memory); + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } + + return; + } + + // merge + std::list< std::pair >::iterator it_merge_left = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it = image_memory_budgets[block_index].begin(); + for ( ; it != image_memory_budgets[block_index].end(); it++) + { + if (it->first + it->second == ptr->bind_offset) + { + it_merge_left = it; + } + else if (ptr->bind_offset + ptr->bind_capacity == it->first) + { + it_merge_right = it; + } + } + + if (it_merge_left != image_memory_budgets[block_index].end() && it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; + image_memory_budgets[block_index].erase(it_merge_right); + } + else if (it_merge_left != image_memory_budgets[block_index].end()) + { + it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first; + } + else if (it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset; + 
it_merge_right->first = ptr->bind_offset; + } + else + { + if (ptr->bind_offset == 0) + { + // chain leading block + image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + else + { + image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + } + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } +} + +VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) +{ + buffer_offset_alignment = vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; + + if (vkdev->info.type == 1) + { + // on integrated gpu, there may be device local only memory too, eg. AMD APU + // assuming larger alignment always keeps us safe :) + + // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + } + + block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M +} + +VkWeightAllocator::~VkWeightAllocator() +{ + clear(); +} + +void VkWeightAllocator::clear() +{ +// fprintf(stderr, "VkWeightAllocator %lu %lu\n", buffer_blocks.size(), dedicated_buffer_blocks.size()); + + buffer_block_free_spaces.clear(); + + for (size_t i=0; ivkdevice(), ptr->memory); + + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_blocks.clear(); + + for (size_t i=0; ivkdevice(), ptr->memory); + + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + dedicated_buffer_blocks.clear(); + + 
image_memory_block_free_spaces.clear(); + + for (size_t i=0; ivkdevice(), memory, 0); + } + image_memory_blocks.clear(); + + for (size_t i=0; ivkdevice(), memory, 0); + } + dedicated_image_memory_blocks.clear(); +} + +VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) +{ +// fprintf(stderr, "VkWeightAllocator fastMalloc %lu\n", size); + + size_t aligned_size = alignSize(size, buffer_offset_alignment); + + const int buffer_block_count = buffer_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size) + { + size_t block_offset = block_size - free_size; + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = buffer_blocks[i]->buffer; + ptr->offset = block_offset; + ptr->memory = buffer_blocks[i]->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + buffer_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + + size_t new_block_size = std::max(block_size, aligned_size); + + // create new block + VkBufferMemory* block = new VkBufferMemory; + + block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + block->offset = 0; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; + bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; + bufferMemoryRequirementsInfo2.pNext = 0; + bufferMemoryRequirementsInfo2.buffer = block->buffer; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + 
memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; + + vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2); + + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + + if (dedicatedAllocation) + { + // setup memory type and alignment + if (memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(memory_type_index); + coherent = vkdev->is_coherent(memory_type_index); + } + + block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, 0, block->buffer); + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + dedicated_buffer_blocks.push_back(block); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = new_block_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; + } + } + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, 
&memoryRequirements); + + // setup memory type and alignment + if (memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(memory_type_index); + coherent = vkdev->is_coherent(memory_type_index); + } + + block->memory = allocate_memory(memoryRequirements.size); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + +// fprintf(stderr, "VkWeightAllocator M %p\n", block->buffer); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + buffer_blocks.push_back(block); + + buffer_block_free_spaces.push_back(new_block_size - aligned_size); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; +} + +void VkWeightAllocator::fastFree(VkBufferMemory* ptr) +{ +// fprintf(stderr, "VkWeightAllocator F %p\n", ptr->buffer); + + delete ptr; +} + +VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64) + { + fprintf(stderr, "elempack must be 1 4 8 16 32 64\n"); + return 0; + } + + // resolve format + VkFormat format = 
VK_FORMAT_UNDEFINED; + + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } + + // resolve image width height depth + int width = w; + int height = h; + int depth = c; + + // large elempack spills on image w + if (elempack == 8) width *= 2; + if (elempack == 16) width *= 4; + if (elempack == 32) width *= 8; + if (elempack == 64) width *= 16; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + + if (width > (int)vkdev->info.max_image_dimension_1d) + { + fprintf(stderr, "image dimension too large %d > %d\n", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + fprintf(stderr, "image dimension too large %d %d > %d\n", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if (width > (int)vkdev->info.max_image_dimension_3d || height > 
(int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) + { + fprintf(stderr, "image dimension too large %d %d %d > %d\n", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; + } + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2; + imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2; + imageMemoryRequirementsInfo2.pNext = 0; + imageMemoryRequirementsInfo2.image = ptr->image; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; -void VkStagingBufferAllocator::clear() -{ -// fprintf(stderr, "VkStagingBufferAllocator %lu\n", budgets.size()); + vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2); - std::list::iterator it = budgets.begin(); - for (; it != budgets.end(); it++) - { - VkBufferMemory* ptr = *it; + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; -// fprintf(stderr, "VkStagingBufferAllocator F %p\n", ptr->buffer); + if (dedicatedAllocation) + { + // setup memory 
type and alignment + if (memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } - vkUnmapMemory(vkdev->vkdevice(), ptr->memory); - vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); - vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + mappable = vkdev->is_mappable(memory_type_index); + coherent = vkdev->is_coherent(memory_type_index); + } - delete ptr; - } - budgets.clear(); -} + // bind memory + ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, ptr->image, 0); + ptr->bind_offset = 0; + ptr->bind_capacity = memoryRequirements2.memoryRequirements.size; -VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size) -{ - // find free budget - std::list::iterator it = budgets.begin(); - for (; it != budgets.end(); it++) - { - VkBufferMemory* ptr = *it; + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); - size_t capacity = ptr->capacity; + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; - // size_compare_ratio ~ 100% - if (capacity >= size && ((capacity * size_compare_ratio) >> 8) <= size) - { - budgets.erase(it); + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; -// fprintf(stderr, "VkStagingBufferAllocator M %p %lu reused %lu\n", 
ptr->buffer, size, capacity); + dedicated_image_memory_blocks.push_back(ptr->memory); return ptr; } } - VkBufferMemory* ptr = new VkBufferMemory; + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); - ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); - ptr->offset = 0; + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); - VkMemoryRequirements memoryRequirements; - vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); + size_t aligned_size = alignSize(size, alignment); - // setup memory type + const int image_memory_block_count = image_memory_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size + (bind_offset - bind_base_offset)) + { + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; + + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) + { + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); + + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; + } + + image_memory_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + + // setup memory type and alignment if (memory_type_index == 
(uint32_t)-1) { - memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(memory_type_index); + coherent = vkdev->is_coherent(memory_type_index); } - ptr->memory = allocate_memory(memoryRequirements.size); + // create new block + size_t new_block_size = std::max(block_size, aligned_size); - vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + // bind at memory offset + ptr->memory = allocate_memory(new_block_size); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; - ptr->capacity = size; + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); - vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; -// fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size); + image_memory_blocks.push_back(ptr->memory); + image_memory_block_free_spaces.push_back(new_block_size - aligned_size); return ptr; } -void 
VkStagingBufferAllocator::fastFree(VkBufferMemory* ptr) +void VkWeightAllocator::fastFree(VkImageMemory* ptr) { -// fprintf(stderr, "VkStagingBufferAllocator F %p\n", ptr->buffer); +// fprintf(stderr, "VkWeightAllocator F %p\n", ptr->memory); - // return to budgets - budgets.push_back(ptr); + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } } -VkWeightStagingBufferAllocator::VkWeightStagingBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) +VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { mappable = true; coherent = true; + + size_compare_ratio = 192;// 0.75f * 256 +} + +VkStagingAllocator::~VkStagingAllocator() +{ + clear(); +} + +void VkStagingAllocator::set_size_compare_ratio(float scr) +{ + if (scr < 0.f || scr > 1.f) + { + fprintf(stderr, "invalid size compare ratio %f\n", scr); + return; + } + + size_compare_ratio = (unsigned int)(scr * 256); } -VkWeightStagingBufferAllocator::~VkWeightStagingBufferAllocator() +void VkStagingAllocator::clear() { +// fprintf(stderr, "VkStagingAllocator %lu\n", buffer_budgets.size()); + + for (std::list::iterator it = buffer_budgets.begin(); it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + +// fprintf(stderr, "VkStagingAllocator F %p\n", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_budgets.clear(); } -VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) +VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) { + // find free budget + std::list::iterator it = buffer_budgets.begin(); + for (; it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + + size_t capacity = ptr->capacity; + + // size_compare_ratio ~ 100% + if (capacity >= size && ((capacity * 
size_compare_ratio) >> 8) <= size) + { + buffer_budgets.erase(it); + +// fprintf(stderr, "VkStagingAllocator M %p %lu reused %lu\n", ptr->buffer, size, capacity); + + return ptr; + } + } + VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); @@ -989,6 +1587,7 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) ptr->memory = allocate_memory(memoryRequirements.size); + // ignore memoryRequirements.alignment as we always bind at zero offset vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); ptr->capacity = size; @@ -998,169 +1597,136 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; -// fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size); +// fprintf(stderr, "VkStagingAllocator M %p %lu\n", ptr->buffer, size); return ptr; } -void VkWeightStagingBufferAllocator::fastFree(VkBufferMemory* ptr) +void VkStagingAllocator::fastFree(VkBufferMemory* ptr) { -// fprintf(stderr, "VkWeightStagingBufferAllocator F %p\n", ptr->buffer); - - vkUnmapMemory(vkdev->vkdevice(), ptr->memory); - vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); - vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); +// fprintf(stderr, "VkStagingAllocator F %p\n", ptr->buffer); - delete ptr; + // return to buffer_budgets + buffer_budgets.push_back(ptr); } -VkImageAllocator::VkImageAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) +VkImageMemory* VkStagingAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) { - memory_type_index = (uint32_t)-1; -} + // staging image is mainly used for storing small piece of dynamic parameters + // we allocate host memory as a fake image, it's simple and good -VkImage VkImageAllocator::create_image(int width, int height, VkFormat format, VkImageUsageFlags usage) -{ - 
VkImageCreateInfo imageCreateInfo; - imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - imageCreateInfo.pNext = 0; - imageCreateInfo.flags = 0; - imageCreateInfo.imageType = VK_IMAGE_TYPE_2D; - imageCreateInfo.format = format; - imageCreateInfo.extent.width = width; - imageCreateInfo.extent.height = height; - imageCreateInfo.extent.depth = 1; - imageCreateInfo.mipLevels = 1; - imageCreateInfo.arrayLayers = 1; - imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; - imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; - imageCreateInfo.usage = usage; - imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - imageCreateInfo.queueFamilyIndexCount = 0; - imageCreateInfo.pQueueFamilyIndices = 0; - imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + const size_t size = w * h * c * elemsize; - VkImage image; - VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); - if (ret != VK_SUCCESS) + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) { - fprintf(stderr, "vkCreateImage failed %d\n", ret); - return 0; + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; } - return image; -} + VkImageMemory* ptr = new VkImageMemory; -VkImageView VkImageAllocator::create_imageview(VkImage image, VkFormat format) -{ - VkImageViewCreateInfo imageViewCreateInfo; - imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - imageViewCreateInfo.pNext = 0; - imageViewCreateInfo.flags = 0; - imageViewCreateInfo.image = image; - imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; - imageViewCreateInfo.format = format; - imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; - imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; - imageViewCreateInfo.components.b 
= VK_COMPONENT_SWIZZLE_IDENTITY; - imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; - imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - imageViewCreateInfo.subresourceRange.baseMipLevel = 0; - imageViewCreateInfo.subresourceRange.levelCount = 1; - imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; - imageViewCreateInfo.subresourceRange.layerCount = 1; + ptr->image = 0; + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = w; + ptr->height = h; + ptr->depth = c; + ptr->format = VK_FORMAT_UNDEFINED; + ptr->memory = 0; + ptr->bind_offset = 0; + ptr->bind_capacity = size; - VkImageView imageview; - VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkCreateImageView failed %d\n", ret); - return 0; - } + ptr->mapped_ptr = malloc(size); - return imageview; + ptr->imageview = 0; + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + ptr->command_refcount = 0; + +// fprintf(stderr, "VkStagingAllocator M %p %d %d %d %d %d\n", ptr->image, dims, width, height, depth, format); + + return ptr; } -VkDeviceMemory VkImageAllocator::allocate_dedicated_memory(size_t size, VkImage image) +void VkStagingAllocator::fastFree(VkImageMemory* ptr) { - VkMemoryAllocateInfo memoryAllocateInfo; - memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - memoryAllocateInfo.pNext = 0; - memoryAllocateInfo.allocationSize = size; - memoryAllocateInfo.memoryTypeIndex = memory_type_index; - - VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; - memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; - memoryDedicatedAllocateInfo.pNext = 0; - memoryDedicatedAllocateInfo.image = image; - memoryDedicatedAllocateInfo.buffer = 0; - memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; +// fprintf(stderr, 
"VkStagingAllocator F %p\n", ptr->image); - VkDeviceMemory memory = 0; - VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkAllocateMemory failed %d\n", ret); - } + free(ptr->mapped_ptr); - return memory; + delete ptr; } -VkSimpleImageAllocator::VkSimpleImageAllocator(const VulkanDevice* _vkdev) : VkImageAllocator(_vkdev) +VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { + mappable = true; + coherent = true; } -VkSimpleImageAllocator::~VkSimpleImageAllocator() +VkWeightStagingAllocator::~VkWeightStagingAllocator() { } -VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkFormat format) +VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) { - VkImageMemory* ptr = new VkImageMemory; + VkBufferMemory* ptr = new VkBufferMemory; - ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + ptr->offset = 0; VkMemoryRequirements memoryRequirements; - vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); + vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); // setup memory type if (memory_type_index == (uint32_t)-1) { - if (vkdev->info.type == 1) - { - // integrated gpu, prefer unified memory - memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0); - } - else - { - // discrete gpu, device local - memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - } + memory_type_index = 
vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); } ptr->memory = allocate_memory(memoryRequirements.size); - vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, 0); + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + + ptr->capacity = size; - ptr->imageview = create_imageview(ptr->image, format); + vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); ptr->access_flags = 0; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; +// fprintf(stderr, "VkWeightStagingAllocator M %p %lu\n", ptr->buffer, size); + return ptr; } -void VkSimpleImageAllocator::fastFree(VkImageMemory* ptr) +void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr) { - vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); - vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); +// fprintf(stderr, "VkWeightStagingAllocator F %p\n", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } #if __ANDROID_API__ >= 26 -VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb) : VkImageAllocator(_vkdev), hb(_hb) +VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb) : VkAllocator(_vkdev), hb(_hb) { samplerYcbcrConversion = 0; @@ -1176,7 +1742,7 @@ VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator() } } -VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/, int /*height*/, VkFormat /*format*/) +VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, 
size_t /*elemsize*/, int /*elempack*/) { VkResult ret; @@ -1299,6 +1865,7 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/, ptr->memory = memory; ptr->imageview = imageview; ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; diff --git a/src/allocator.h b/src/allocator.h index a70b61c8f..18aad01d4 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -199,17 +199,54 @@ public: int refcount; }; +class VkImageMemory +{ +public: + VkImage image; + VkImageView imageview; + + // underlying info assigned by allocator + VkImageType image_type; + VkImageViewType imageview_type; + int width; + int height; + int depth; + VkFormat format; + + VkDeviceMemory memory; + void* mapped_ptr; + + // the base offset assigned by allocator + size_t bind_offset; + size_t bind_capacity; + + // image state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkImageLayout image_layout; + mutable VkPipelineStageFlags stage_flags; + + // in-execution state, modified by command functions internally + mutable int command_refcount; + + // initialize and modified by mat + int refcount; +}; + class VkAllocator { public: VkAllocator(const VulkanDevice* _vkdev); virtual ~VkAllocator() { clear(); } virtual void clear() {} + virtual VkBufferMemory* fastMalloc(size_t size) = 0; virtual void fastFree(VkBufferMemory* ptr) = 0; virtual int flush(VkBufferMemory* ptr); virtual int invalidate(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) = 0; + virtual void fastFree(VkImageMemory* ptr) = 0; + public: const VulkanDevice* vkdev; uint32_t memory_type_index; @@ -219,14 +256,17 @@ public: protected: VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); VkDeviceMemory allocate_memory(size_t size); - VkDeviceMemory allocate_dedicated_memory(size_t size, VkBuffer buffer); + 
VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image, VkBuffer buffer); + + VkImage create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage); + VkImageView create_imageview(VkImageViewType type, VkImage image, VkFormat format); }; -class VkBlobBufferAllocator : public VkAllocator +class VkBlobAllocator : public VkAllocator { public: - VkBlobBufferAllocator(const VulkanDevice* vkdev); - virtual ~VkBlobBufferAllocator(); + VkBlobAllocator(const VulkanDevice* vkdev); + virtual ~VkBlobAllocator(); public: // release all budgets immediately @@ -234,19 +274,24 @@ public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); + virtual void fastFree(VkImageMemory* ptr); private: size_t block_size; size_t buffer_offset_alignment; - std::vector< std::list< std::pair > > budgets; + size_t bind_memory_offset_alignment; + std::vector< std::list< std::pair > > buffer_budgets; std::vector buffer_blocks; + std::vector< std::list< std::pair > > image_memory_budgets; + std::vector image_memory_blocks; }; -class VkWeightBufferAllocator : public VkAllocator +class VkWeightAllocator : public VkAllocator { public: - VkWeightBufferAllocator(const VulkanDevice* vkdev); - virtual ~VkWeightBufferAllocator(); + VkWeightAllocator(const VulkanDevice* vkdev); + virtual ~VkWeightAllocator(); public: // release all blocks immediately @@ -255,20 +300,26 @@ public: public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); + virtual void fastFree(VkImageMemory* ptr); private: size_t block_size; size_t buffer_offset_alignment; + size_t bind_memory_offset_alignment; std::vector buffer_block_free_spaces; std::vector buffer_blocks; std::vector 
dedicated_buffer_blocks; + std::vector image_memory_block_free_spaces; + std::vector image_memory_blocks; + std::vector dedicated_image_memory_blocks; }; -class VkStagingBufferAllocator : public VkAllocator +class VkStagingAllocator : public VkAllocator { public: - VkStagingBufferAllocator(const VulkanDevice* vkdev); - virtual ~VkStagingBufferAllocator(); + VkStagingAllocator(const VulkanDevice* vkdev); + virtual ~VkStagingAllocator(); public: // ratio range 0 ~ 1 @@ -280,82 +331,42 @@ public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); + virtual void fastFree(VkImageMemory* ptr); private: unsigned int size_compare_ratio;// 0~256 - std::list budgets; + std::list buffer_budgets; }; -class VkWeightStagingBufferAllocator : public VkAllocator +class VkWeightStagingAllocator : public VkAllocator { public: - VkWeightStagingBufferAllocator(const VulkanDevice* vkdev); - virtual ~VkWeightStagingBufferAllocator(); + VkWeightStagingAllocator(const VulkanDevice* vkdev); + virtual ~VkWeightStagingAllocator(); public: virtual VkBufferMemory* fastMalloc(size_t size); virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; } + virtual void fastFree(VkImageMemory* /*ptr*/) {} private: }; -class VkImageMemory -{ -public: - VkImage image; - VkImageView imageview; - - VkDeviceMemory memory; - - // image state, modified by command functions internally - mutable VkAccessFlags access_flags; - mutable VkPipelineStageFlags stage_flags; - - // initialize and modified by mat - int refcount; -}; - -class VkImageAllocator : public VkAllocator -{ -public: - VkImageAllocator(const VulkanDevice* _vkdev); - virtual ~VkImageAllocator() { clear(); } - virtual void clear() {} - virtual VkImageMemory* fastMalloc(int width, int height, 
VkFormat format) = 0; - virtual void fastFree(VkImageMemory* ptr) = 0; - -protected: - virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; } - virtual void fastFree(VkBufferMemory* /*ptr*/) {} - -protected: - VkImage create_image(int width, int height, VkFormat format, VkImageUsageFlags usage); - VkImageView create_imageview(VkImage image, VkFormat format); - VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image); -}; - -class VkSimpleImageAllocator : public VkImageAllocator -{ -public: - VkSimpleImageAllocator(const VulkanDevice* vkdev); - virtual ~VkSimpleImageAllocator(); - -public: - virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format); - virtual void fastFree(VkImageMemory* ptr); -}; - #if __ANDROID_API__ >= 26 class ImportAndroidHardwareBufferPipeline; -class VkAndroidHardwareBufferImageAllocator : public VkImageAllocator +class VkAndroidHardwareBufferImageAllocator : public VkAllocator { public: VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb); virtual ~VkAndroidHardwareBufferImageAllocator(); public: - virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); virtual void fastFree(VkImageMemory* ptr); + virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; } + virtual void fastFree(VkBufferMemory* /*ptr*/) {} public: int init(); diff --git a/src/command.cpp b/src/command.cpp index 361401a9e..e9c10cfdb 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -39,6 +39,26 @@ VkCompute::VkCompute(const VulkanDevice* _vkdev) : vkdev(_vkdev) VkCompute::~VkCompute() { + for (size_t i=0; icommand_refcount, -1); + if (ptr->refcount == 0 && old_command_refcount == 1) + { + // no userspace reference and we are the last command reference + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; 
+ } + else + { + // reference exists in user code or other command + } + } + image_blocks_to_destroy.clear(); + if (!vkdev->info.support_VK_KHR_push_descriptor) { for (size_t i=0; imappable) + Mat src_fp16; + if (src.elemsize == src.elempack * 4u) { - // memcpy src to device - memcpy(dst.mapped_ptr(), src.data, src.total() * src.elemsize); - dst.allocator->flush(dst.data); - - // mark device host-write @ null - dst.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; - dst.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; - - return; + // cpu cast to fp16 (discrete gpu) + if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))) + { + ncnn::cast_float32_to_float16(src, src_fp16, opt); + } + else + { + src_fp16 = src; + } + } + else + { + src_fp16 = src; } - // create staging - VkMat dst_staging; - dst_staging.create_like(src, opt.staging_vkallocator); + // upload + VkMat dst_unpacked; + record_clone(src_fp16, dst_unpacked, opt); - // memcpy src to staging - memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); - dst_staging.allocator->flush(dst_staging.data); + VkMat dst_unpacked_fp16; + if (dst_unpacked.elemsize == dst_unpacked.elempack * 4u) + { + // cast to fp16 (integrated gpu) + if (vkdev->info.type != 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_unpacked.elempack % 4 == 0))) + { + vkdev->cast_float32_to_float16(dst_unpacked, dst_unpacked_fp16, *this, opt); + } + else + { + dst_unpacked_fp16 = dst_unpacked; + } + } + else + { + dst_unpacked_fp16 = dst_unpacked; + } - // barrier staging host-write @ null to transfer-read @ compute + // packing + if (opt.use_shader_pack8) { - VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = 0; - barriers[0].srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; - barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barriers[0].srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = dst_staging.buffer(); - barriers[0].offset = dst_staging.buffer_offset(); - barriers[0].size = dst_staging.buffer_capacity(); + vkdev->packing_pack8(dst_unpacked_fp16, dst, *this, opt); + if (dst.elempack != 8) + vkdev->packing_pack4(dst_unpacked_fp16, dst, *this, opt); + } + else + vkdev->packing_pack4(dst_unpacked_fp16, dst, *this, opt); +} - VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; +void VkCompute::record_upload(const Mat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_upload image\n"); - if (vkdev->info.support_VK_KHR_push_descriptor) + Mat src_fp16; + if (src.elemsize == src.elempack * 4u) + { + // cpu cast to fp16 (discrete gpu) + if (vkdev->info.type == 0 && (opt.use_image_fp16_storage || (opt.use_image_fp16_packed && src.elempack % 4 == 0))) { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); - delete[] barriers; + ncnn::cast_float32_to_float16(src, src_fp16, opt); } else { - record r; - r.type = record::TYPE_buffer_barrers; - r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; - delayed_records.push_back(r); + src_fp16 = src; } } - - // record staging to device + else { - VkBufferCopy* regions = new VkBufferCopy[1]; - regions[0].srcOffset = dst_staging.buffer_offset(); - regions[0].dstOffset = dst.buffer_offset(); - regions[0].size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + src_fp16 = src; + } - if (vkdev->info.support_VK_KHR_push_descriptor) + // upload + VkMat dst_staging; + record_clone(src_fp16, dst_staging, opt); + + // staging to image + VkImageMat dst_unpacked; + record_clone(dst_staging, dst_unpacked, opt); + 
+ VkImageMat dst_unpacked_fp16; + if (dst_unpacked.elemsize == dst_unpacked.elempack * 4u) + { + // cast to fp16 (integrated gpu) + if (vkdev->info.type != 0 && (opt.use_image_fp16_storage || (opt.use_image_fp16_packed && dst_unpacked.elempack % 4 == 0))) { - vkCmdCopyBuffer(compute_command_buffer, dst_staging.buffer(), dst.buffer(), 1, regions); - delete[] regions; + vkdev->cast_float32_to_float16(dst_unpacked, dst_unpacked_fp16, *this, opt); } else { - record r; - r.type = record::TYPE_copy_buffer; - r.command_buffer = compute_command_buffer; - r.copy_buffer.src = dst_staging.buffer(); - r.copy_buffer.dst = dst.buffer(); - r.copy_buffer.region_count = 1; - r.copy_buffer.regions = regions; - delayed_records.push_back(r); + dst_unpacked_fp16 = dst_unpacked; } } + else + { + dst_unpacked_fp16 = dst_unpacked; + } - // mark device transfer-write @ queue - dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; - dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; - - // stash staging - upload_staging_buffers.push_back(dst_staging); + // packing + if (opt.use_shader_pack8) + { + vkdev->packing_pack8(dst_unpacked_fp16, dst, *this, opt); + if (dst.elempack != 8) + vkdev->packing_pack4(dst_unpacked_fp16, dst, *this, opt); + } + else + vkdev->packing_pack4(dst_unpacked_fp16, dst, *this, opt); } void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) { -// fprintf(stderr, "record_download\n"); +// fprintf(stderr, "record_download buffer\n"); - // create dst - dst.create_like(src, opt.blob_allocator); + VkMat src_unpacked_fp16; + if (opt.use_packing_layout) + { + vkdev->packing_pack4(src, src_unpacked_fp16, *this, opt); + } + else + { + // unpacking + vkdev->packing_pack1(src, src_unpacked_fp16, *this, opt); + } - if (src.allocator->mappable) + // cast to fp32 (integrated gpu) + VkMat src_unpacked; + if (src_unpacked_fp16.elemsize == src_unpacked_fp16.elempack * 2u) { - // barrier device any @ compute to host-read @ compute - if 
(src.data->access_flags != VK_ACCESS_HOST_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) + if (vkdev->info.type != 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src_unpacked_fp16.elempack % 4 == 0 ))) { - VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = 0; - barriers[0].srcAccessMask = src.data->access_flags; - barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src.buffer(); - barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); - - VkPipelineStageFlags src_stage = src.data->stage_flags; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; - - if (vkdev->info.support_VK_KHR_push_descriptor) - { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); - delete[] barriers; - } - else - { - record r; - r.type = record::TYPE_buffer_barrers; - r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; - delayed_records.push_back(r); - } - - // mark device host-read @ any - src.data->access_flags = VK_ACCESS_HOST_READ_BIT; - src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + vkdev->cast_float16_to_float32(src_unpacked_fp16, src_unpacked, *this, opt); + } + else + { + src_unpacked = src_unpacked_fp16; } + } + else + { + src_unpacked = src_unpacked_fp16; + } - // stash download post buffer and mat - download_post_buffers.push_back(src); - download_post_mats.push_back(dst); + // download + Mat dst_fp16; + record_clone(src_unpacked, dst_fp16, opt); - // post memcpy device to dst + // cast to fp32 (discrete gpu) + if (dst_fp16.elemsize == dst_fp16.elempack * 2u) + { + if 
(vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0))) { + int dims = dst_fp16.dims; + if (dims == 1) + dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 2) + dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 3) + dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + + download_post_mats_fp16.push_back(dst_fp16); + download_post_mats.push_back(dst); + record r; - r.type = record::TYPE_post_download; + r.type = record::TYPE_post_cast_float16_to_float32; r.command_buffer = 0; - r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = download_post_mats_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = download_post_mats.size() - 1; delayed_records.push_back(r); } - - return; + else + { + dst = dst_fp16; + } } - - if (src.data->access_flags != VK_ACCESS_TRANSFER_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + else { - // barrier device any @ compute to transfer-read @ compute - VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = 0; - barriers[0].srcAccessMask = src.data->access_flags; - barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src.buffer(); - barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + dst = dst_fp16; + } +} - VkPipelineStageFlags src_stage = src.data->stage_flags; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; +void VkCompute::record_download(const VkImageMat& 
src, Mat& dst, const Option& opt) +{ +// fprintf(stderr, "record_download image\n"); - if (vkdev->info.support_VK_KHR_push_descriptor) + VkImageMat src_unpacked_fp16; + if (opt.use_packing_layout) + { + vkdev->packing_pack4(src, src_unpacked_fp16, *this, opt); + } + else + { + // unpacking + vkdev->packing_pack1(src, src_unpacked_fp16, *this, opt); + } + + // cast to fp32 (integrated gpu) + VkImageMat src_unpacked; + if (src_unpacked_fp16.elemsize == src_unpacked_fp16.elempack * 2u) + { + if (vkdev->info.type != 0 && (opt.use_image_fp16_storage || (opt.use_image_fp16_packed && src_unpacked_fp16.elempack % 4 == 0))) { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); - delete[] barriers; + vkdev->cast_float16_to_float32(src_unpacked_fp16, src_unpacked, *this, opt); } else { - record r; - r.type = record::TYPE_buffer_barrers; - r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; - delayed_records.push_back(r); + src_unpacked = src_unpacked_fp16; } - - // mark device transfer-read @ transfer - src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; - src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + else + { + src_unpacked = src_unpacked_fp16; } - // create staging + // image to staging VkMat src_staging; - src_staging.create_like(src, opt.staging_vkallocator); + record_clone(src_unpacked, src_staging, opt); - // record device to staging - { - VkBufferCopy* regions = new VkBufferCopy[1]; - regions[0].srcOffset = src.buffer_offset(); - regions[0].dstOffset = src_staging.buffer_offset(); - regions[0].size = std::min(src.buffer_capacity(), src_staging.buffer_capacity()); + // download + Mat dst_fp16; + record_clone(src_staging, dst_fp16, opt); - if (vkdev->info.support_VK_KHR_push_descriptor) + // cast to fp32 (discrete gpu) + if (dst_fp16.elemsize == 
dst_fp16.elempack * 2u) + { + if (vkdev->info.type == 0 && (opt.use_image_fp16_storage || (opt.use_image_fp16_packed && dst_fp16.elempack % 4 == 0))) { - vkCmdCopyBuffer(compute_command_buffer, src.buffer(), src_staging.buffer(), 1, regions); - delete[] regions; + int dims = dst_fp16.dims; + if (dims == 1) + dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 2) + dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 3) + dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + + download_post_mats_fp16.push_back(dst_fp16); + download_post_mats.push_back(dst); + + record r; + r.type = record::TYPE_post_cast_float16_to_float32; + r.command_buffer = 0; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = download_post_mats_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = download_post_mats.size() - 1; + delayed_records.push_back(r); } else { - record r; - r.type = record::TYPE_copy_buffer; + dst = dst_fp16; + } + } + else + { + dst = dst_fp16; + } +} + +void VkCompute::record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_buffer_to_image\n"); + + if ((opt.use_image_fp16_storage && opt.use_fp16_storage) + || (!opt.use_image_fp16_storage && !opt.use_fp16_storage && !opt.use_fp16_packed)) + { + // fp16 to fp16s copy or fp32 to fp32 copy + record_clone(src, dst, opt); + } + else if (opt.use_image_fp16_storage && !opt.use_fp16_storage && !opt.use_fp16_packed) + { + // fp32 to fp16 + VkImageMat src_fp32; + record_clone(src, src_fp32, opt); + if (src_fp32.empty()) + return; + + vkdev->cast_float32_to_float16(src_fp32, dst, *this, opt); + } + else if (opt.use_image_fp16_storage && !opt.use_fp16_storage && opt.use_fp16_packed) + { + // fp16p to fp32 to fp16 + VkMat src_fp32; + 
vkdev->cast_float16_to_float32(src, src_fp32, *this, opt); + if (src_fp32.empty()) + return; + + VkImageMat dst_fp32; + record_clone(src_fp32, dst_fp32, opt); + if (dst_fp32.empty()) + return; + + vkdev->cast_float32_to_float16(dst_fp32, dst, *this, opt); + } + else if (!opt.use_image_fp16_storage && opt.use_fp16_storage) + { + // fp16s to fp32 + VkImageMat dst_fp16; + record_clone(src, dst_fp16, opt); + if (dst_fp16.empty()) + return; + + vkdev->cast_float16_to_float32(dst_fp16, dst, *this, opt); + } + else if (!opt.use_image_fp16_storage && !opt.use_fp16_storage && opt.use_fp16_packed) + { + // fp16p to fp32 + VkMat src_fp32; + vkdev->cast_float32_to_float16(src, src_fp32, *this, opt); + + record_clone(src_fp32, dst, opt); + } + else + { + fprintf(stderr, "FATAL ERROR! unsupported record_buffer_to_image option\n"); + } +} + +void VkCompute::record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_image_to_buffer\n"); + + if ((opt.use_image_fp16_storage && opt.use_fp16_storage) + || (!opt.use_image_fp16_storage && !opt.use_fp16_storage && !opt.use_fp16_packed)) + { + // fp16 to fp16s copy or fp32 to fp32 copy + record_clone(src, dst, opt); + } + else if (opt.use_image_fp16_storage && !opt.use_fp16_storage && !opt.use_fp16_packed) + { + // fp16 to fp32 + VkImageMat src_fp32; + vkdev->cast_float16_to_float32(src, src_fp32, *this, opt); + if (src_fp32.empty()) + return; + + record_clone(src_fp32, dst, opt); + } + else if (opt.use_image_fp16_storage && !opt.use_fp16_storage && opt.use_fp16_packed) + { + // fp16 to fp32 to fp16p + VkImageMat src_fp32; + vkdev->cast_float16_to_float32(src, src_fp32, *this, opt); + if (src_fp32.empty()) + return; + + VkMat dst_fp32; + record_clone(src_fp32, dst_fp32, opt); + if (dst_fp32.empty()) + return; + + vkdev->cast_float32_to_float16(dst_fp32, dst, *this, opt); + } + else if (!opt.use_image_fp16_storage && opt.use_fp16_storage) + { + // fp32 to fp16s + VkImageMat src_fp16; + 
vkdev->cast_float32_to_float16(src, src_fp16, *this, opt); + if (src_fp16.empty()) + return; + + record_clone(src_fp16, dst, opt); + } + else if (!opt.use_image_fp16_storage && !opt.use_fp16_storage && opt.use_fp16_packed) + { + // fp32 to fp16p + VkMat dst_fp32; + record_clone(src, dst_fp32, opt); + if (dst_fp32.empty()) + return; + + vkdev->cast_float32_to_float16(dst_fp32, dst, *this, opt); + } + else + { + fprintf(stderr, "FATAL ERROR! unsupported record_image_to_buffer option\n"); + } +} + +void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone host to buffer\n"); + + if (!opt.blob_vkallocator->mappable) + { + // host to staging + VkMat dst_staging; + Option opt_staging = opt; + opt_staging.blob_vkallocator = opt.staging_vkallocator; + record_clone(src, dst_staging, opt_staging); + + // staging to device + record_clone(dst_staging, dst, opt); + + // stash staging + upload_staging_buffers.push_back(dst_staging); + + return; + } + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + // memcpy src to device + memcpy(dst.mapped_ptr(), src.data, src.total() * src.elemsize); + dst.allocator->flush(dst.data); + + // mark device host-write @ null + dst.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; +} + +void VkCompute::record_clone(const Mat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone host to image\n"); + + // host to staging + VkMat dst_staging; + Option opt_staging = opt; + opt_staging.blob_vkallocator = opt.staging_vkallocator; + record_clone(src, dst_staging, opt_staging); + + // staging to image + record_clone(dst_staging, dst, opt); + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone buffer to host\n"); + + if (!src.allocator->mappable) + { 
+ // device to staging + VkMat src_staging; + Option opt_staging = opt; + opt_staging.blob_vkallocator = opt.staging_vkallocator; + record_clone(src, src_staging, opt_staging); + + // staging to host + record_clone(src_staging, dst, opt); + + return; + } + + // create dst + dst.create_like(src, opt.blob_allocator); + if (dst.empty()) + return; + + // barrier device any @ compute to host-read @ compute + if (src.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) + { + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device host-read @ any + src.data->access_flags = VK_ACCESS_HOST_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + } + + // stash download post buffer and mat + download_post_buffers.push_back(src); + download_post_mats_fp16.push_back(dst); + + // post memcpy device to dst + { + record r; + r.type = record::TYPE_post_download; + 
r.command_buffer = 0; + r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + r.post_download.download_post_mat_fp16_offset = download_post_mats_fp16.size() - 1; + delayed_records.push_back(r); + } +} + +void VkCompute::record_clone(const VkImageMat& src, Mat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone image to host\n"); + + // image to staging + VkMat src_staging; + Option opt_staging = opt; + opt_staging.blob_vkallocator = opt.staging_vkallocator; + record_clone(src, src_staging, opt_staging); + + // staging to host + record_clone(src_staging, dst, opt); +} + +void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone buffer to buffer\n"); + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + { + // barrier device any @ compute to transfer-read @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = 
src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device transfer-read @ transfer + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + { + // barrier device any @ null to transfer-write @ compute + + // mark device transfer-write @ transfer + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // record device to staging + { + VkBufferCopy* regions = new VkBufferCopy[1]; + regions[0].srcOffset = src.buffer_offset(); + regions[0].dstOffset = dst.buffer_offset(); + regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdCopyBuffer(compute_command_buffer, src.buffer(), dst.buffer(), 1, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_buffer; + r.command_buffer = compute_command_buffer; + r.copy_buffer.src = src.buffer(); + r.copy_buffer.dst = dst.buffer(); + r.copy_buffer.region_count = 1; + r.copy_buffer.regions = regions; + delayed_records.push_back(r); + } + } +} + +void VkCompute::record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone image to image\n"); + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + // image layout transform any @ any to transfer-src-optimal @ compute + if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + { + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask 
= VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].oldLayout = src.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = src.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image transfer-src-optimal @ compute + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // image layout transform undefined @ null to transfer-dst-optimal @ compute + { + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = 0; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = 
dst.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image transfer-dst-optimal @ compute + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // record device to staging + { + VkImageCopy* regions = new VkImageCopy[1]; + regions[0].srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[0].srcSubresource.mipLevel = 0; + regions[0].srcSubresource.baseArrayLayer = 0; + regions[0].srcSubresource.layerCount = 1; + regions[0].srcOffset.x = 0; + regions[0].srcOffset.y = 0; + regions[0].srcOffset.z = 0; + regions[0].dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[0].dstSubresource.mipLevel = 0; + regions[0].dstSubresource.baseArrayLayer = 0; + regions[0].dstSubresource.layerCount = 1; + regions[0].dstOffset.x = 0; + regions[0].dstOffset.y = 0; + regions[0].dstOffset.z = 0; + regions[0].extent.width = src.data->width; + regions[0].extent.height = src.data->height; + regions[0].extent.depth = src.data->depth; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + 
vkCmdCopyImage(compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_image; + r.command_buffer = compute_command_buffer; + r.copy_image.src = src.image(); + r.copy_image.src_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + r.copy_image.dst = dst.image(); + r.copy_image.dst_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + r.copy_image.region_count = 1; + r.copy_image.regions = regions; + delayed_records.push_back(r); + } + } + + // image and imageview can not be destroyed until command execution ends + NCNN_XADD(&src.data->command_refcount, 1); + NCNN_XADD(&dst.data->command_refcount, 1); + image_blocks_to_destroy.push_back(src.data); + image_blocks_to_destroy.push_back(dst.data); +} + +void VkCompute::record_clone(const VkMat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone buffer to image\n"); + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + // barrier device any @ any to transfer-read @ compute + if (src.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + 
vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device transfer-read @ compute + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // image layout transform undefined @ null to transfer-dst-optimal @ compute + { + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = 0; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = dst.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; 
+ r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image transfer-dst-optimal @ compute + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // record device to image + { + int region_count; + VkBufferImageCopy* regions; + if (dst.elemsize * dst.w * dst.h % 16 == 0) + { + region_count = 1; + regions = new VkBufferImageCopy[1]; + regions[0].bufferOffset = src.buffer_offset(); + regions[0].bufferRowLength = 0; + regions[0].bufferImageHeight = 0; + regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[0].imageSubresource.mipLevel = 0; + regions[0].imageSubresource.baseArrayLayer = 0; + regions[0].imageSubresource.layerCount = 1; + regions[0].imageOffset.x = 0; + regions[0].imageOffset.y = 0; + regions[0].imageOffset.z = 0; + regions[0].imageExtent.width = dst.data->width; + regions[0].imageExtent.height = dst.data->height; + regions[0].imageExtent.depth = dst.data->depth; + } + else + { + region_count = dst.c; + regions = new VkBufferImageCopy[region_count]; + for (int i = 0; i < region_count; i++) + { + regions[i].bufferOffset = src.buffer_offset() + src.cstep * src.elemsize * i; + regions[i].bufferRowLength = 0; + regions[i].bufferImageHeight = 0; + regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[i].imageSubresource.mipLevel = 0; + regions[i].imageSubresource.baseArrayLayer = 0; + regions[i].imageSubresource.layerCount = 1; + regions[i].imageOffset.x = 0; + regions[i].imageOffset.y = 0; + regions[i].imageOffset.z = i; + regions[i].imageExtent.width = dst.data->width; + regions[i].imageExtent.height = dst.data->height; + regions[i].imageExtent.depth = 1; + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdCopyBufferToImage(compute_command_buffer, src.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, region_count, 
regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_buffer_to_image; + r.command_buffer = compute_command_buffer; + r.copy_buffer_to_image.src = src.buffer(); + r.copy_buffer_to_image.dst = dst.image(); + r.copy_buffer_to_image.layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + r.copy_buffer_to_image.region_count = region_count; + r.copy_buffer_to_image.regions = regions; + delayed_records.push_back(r); + } + } + + // image and imageview can not be destroyed until command execution ends + NCNN_XADD(&dst.data->command_refcount, 1); + image_blocks_to_destroy.push_back(dst.data); +} + +void VkCompute::record_clone(const VkImageMat& src, VkMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_clone image to buffer\n"); + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + // image layout transform any @ any to transfer-src-optimal @ compute + if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + { + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].oldLayout = src.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = src.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = src.data->stage_flags; + 
VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image transfer-src-optimal @ compute + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + { + // barrier device any @ null to transfer-write @ compute + + // mark device transfer-write @ transfer + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + // record image to device + { + int region_count; + VkBufferImageCopy* regions; + if (src.elemsize * src.w * src.h % 16 == 0) + { + region_count = 1; + regions = new VkBufferImageCopy[1]; + regions[0].bufferOffset = dst.buffer_offset(); + regions[0].bufferRowLength = 0; + regions[0].bufferImageHeight = 0; + regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[0].imageSubresource.mipLevel = 0; + regions[0].imageSubresource.baseArrayLayer = 0; + regions[0].imageSubresource.layerCount = 1; + regions[0].imageOffset.x = 0; + regions[0].imageOffset.y = 0; + regions[0].imageOffset.z = 0; + regions[0].imageExtent.width = src.data->width; + regions[0].imageExtent.height = src.data->height; + regions[0].imageExtent.depth = src.data->depth; + } + else + { + region_count = src.c; + regions = new VkBufferImageCopy[region_count]; + for (int i = 0; i < region_count; i++) + { + regions[i].bufferOffset = dst.buffer_offset() + dst.cstep * dst.elemsize * i; + 
regions[i].bufferRowLength = 0; + regions[i].bufferImageHeight = 0; + regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[i].imageSubresource.mipLevel = 0; + regions[i].imageSubresource.baseArrayLayer = 0; + regions[i].imageSubresource.layerCount = 1; + regions[i].imageOffset.x = 0; + regions[i].imageOffset.y = 0; + regions[i].imageOffset.z = i; + regions[i].imageExtent.width = src.data->width; + regions[i].imageExtent.height = src.data->height; + regions[i].imageExtent.depth = 1; + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdCopyImageToBuffer(compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.buffer(), region_count, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_image_to_buffer; + r.command_buffer = compute_command_buffer; + r.copy_image_to_buffer.src = src.image(); + r.copy_image_to_buffer.layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + r.copy_image_to_buffer.dst = dst.buffer(); + r.copy_image_to_buffer.region_count = region_count; + r.copy_image_to_buffer.regions = regions; + delayed_records.push_back(r); + } + } + + // image and imageview can not be destroyed until command execution ends + NCNN_XADD(&src.data->command_refcount, 1); + image_blocks_to_destroy.push_back(src.data); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher) +{ +// fprintf(stderr, "record_pipeline %p\n", pipeline); + + const int binding_count = (int)bindings.size(); + const int constant_count = (int)constants.size(); + + if (binding_count != pipeline->shader_info.binding_count) + { + fprintf(stderr, "binding_count not match, expect %d but got %d\n", pipeline->shader_info.binding_count, binding_count); + } + + if (constant_count != pipeline->shader_info.push_constant_count) + { + fprintf(stderr, "push_constant_count not match, expect %d but got %d\n", 
pipeline->shader_info.push_constant_count, constant_count); + } + + for (int i=0; iaccess_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // barrier device any @ compute/null to shader-readwrite @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = binding.buffer(); + barriers[0].offset = binding.buffer_offset(); + barriers[0].size = binding.buffer_capacity(); + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } + + // record bind pipeline + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = 
VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); + } + } + + // record update bindings + if (binding_count > 0) + { + std::vector descriptorBufferInfos(binding_count); + for (int i=0; iinfo.support_VK_KHR_push_descriptor) + { + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorBufferInfos.data()); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + VkDescriptorPoolSize poolSize; + poolSize.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSize.descriptorCount = binding_count; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 1; + descriptorPoolCreateInfo.pPoolSizes = &poolSize; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); + return; + } + } + 
descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorBufferInfos.data()); + } + else + { + std::vector writeDescriptorSets(binding_count); + for (int i=0; ivkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); + } + } + + // record push constants + if (constant_count > 0) + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); + } + else + { + uint32_t size = constant_count * sizeof(vk_constant_type); + unsigned char* constant_values = new unsigned char[size]; + memcpy(constant_values, constants.data(), size); + + record r; + r.type = record::TYPE_push_constants; + r.command_buffer = compute_command_buffer; + r.push_constants.pipeline_layout = pipeline->pipeline_layout; + r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; + r.push_constants.size = size; + r.push_constants.values = constant_values; + delayed_records.push_back(r); + } + } + + // record dispatch + { + uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dispatcher.h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z - 1) / pipeline->local_size_z; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + 
vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); + } + else + { + record r; + r.type = record::TYPE_dispatch; + r.command_buffer = compute_command_buffer; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; + delayed_records.push_back(r); + } + } +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageMat& dispatcher) +{ +// fprintf(stderr, "record_pipeline %p\n", pipeline); + + const int binding_count = (int)bindings.size(); + const int constant_count = (int)constants.size(); + + if (binding_count != pipeline->shader_info.binding_count) + { + fprintf(stderr, "binding_count not match, expect %d but got %d\n", pipeline->shader_info.binding_count, binding_count); + } + + if (constant_count != pipeline->shader_info.push_constant_count) + { + fprintf(stderr, "push_constant_count not match, expect %d but got %d\n", pipeline->shader_info.push_constant_count, constant_count); + } + + // if the same image used for both storage image and combined image sampler + // only apply image layout transition to general + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 2) + { + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-write @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-write @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } + else // if (binding_type == 3) + { + for (int j=0; jshader_info.binding_types[j] == 2 && bindings[i].data == bindings[j].data) + { + // the same image is used as storage image, skip it + continue; + } + } + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-readonly-optimal @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = 
VK_ACCESS_SHADER_READ_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-readonly-optimal @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } + + // image and imageview can not be destroyed until command execution ends + NCNN_XADD(&binding.data->command_refcount, 1); + image_blocks_to_destroy.push_back(binding.data); + } + + // record bind pipeline + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + 
r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); + } + } + + // record update bindings + if (binding_count > 0) + { + std::vector descriptorImageInfos(binding_count); + for (int i=0; iimage_layout; + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorImageInfos.data()); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + int image_binding_count = 0; + int sampler_binding_count = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 2) + image_binding_count++; + else // if (binding_type == 3) + sampler_binding_count++; + } + + VkDescriptorPoolSize poolSizes[2]; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + poolSizes[0].descriptorCount = image_binding_count; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[1].descriptorCount = sampler_binding_count; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 2; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = 
descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); + return; + } + } + descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorImageInfos.data()); + } + else + { + std::vector writeDescriptorSets(binding_count); + for (int i=0; ishader_info.binding_types[i]; + + writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[i].pNext = 0; + writeDescriptorSets[i].dstSet = descriptorset; + writeDescriptorSets[i].dstBinding = i; + writeDescriptorSets[i].dstArrayElement = 0; + writeDescriptorSets[i].descriptorCount = 1; + writeDescriptorSets[i].pImageInfo = &descriptorImageInfos[i]; + writeDescriptorSets[i].pBufferInfo = 0; + writeDescriptorSets[i].pTexelBufferView = 0; + + if (binding_type == 2) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + } + else // if (binding_type == 3) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + } + } + + vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); + } + } + + // record push constants + if (constant_count > 0) + { + if 
(vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); + } + else + { + uint32_t size = constant_count * sizeof(vk_constant_type); + unsigned char* constant_values = new unsigned char[size]; + memcpy(constant_values, constants.data(), size); + + record r; + r.type = record::TYPE_push_constants; r.command_buffer = compute_command_buffer; - r.copy_buffer.src = src.buffer(); - r.copy_buffer.dst = src_staging.buffer(); - r.copy_buffer.region_count = 1; - r.copy_buffer.regions = regions; + r.push_constants.pipeline_layout = pipeline->pipeline_layout; + r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; + r.push_constants.size = size; + r.push_constants.values = constant_values; delayed_records.push_back(r); } } - // barrier staging transfer-write @ compute to host-read @ compute + // record dispatch { - VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = 0; - barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src_staging.buffer(); - barriers[0].offset = src_staging.buffer_offset(); - barriers[0].size = src_staging.buffer_capacity(); - - VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; + uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dispatcher.h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z - 1) / pipeline->local_size_z; if (vkdev->info.support_VK_KHR_push_descriptor) { - 
vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); - delete[] barriers; + vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); } else { record r; - r.type = record::TYPE_buffer_barrers; + r.type = record::TYPE_dispatch; r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; delayed_records.push_back(r); } } +} - // stash download post buffer and mat - download_post_buffers.push_back(src_staging); - download_post_mats.push_back(dst); - - // post memcpy device to dst +#if NCNN_BENCHMARK +void VkCompute::record_write_timestamp(uint32_t query) +{ + if (vkdev->info.support_VK_KHR_push_descriptor) + { + if (query_pool) + vkCmdWriteTimestamp(compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, query_pool, query); + } + else { record r; - r.type = record::TYPE_post_download; - r.command_buffer = 0; - r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + r.type = record::TYPE_write_timestamp; + r.command_buffer = compute_command_buffer; + r.write_timestamp.query = query; delayed_records.push_back(r); } } +#endif // NCNN_BENCHMARK -void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) +#if __ANDROID_API__ >= 26 +void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst) { -// fprintf(stderr, "record_clone\n"); - - if (src.data->access_flags != VK_ACCESS_TRANSFER_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + // image layout transform undefined @ null to general @ compute { - // barrier device any @ compute to transfer-read @ compute - 
VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[0].pNext = 0; - barriers[0].srcAccessMask = src.data->access_flags; - barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcAccessMask = 0; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src.buffer(); - barriers[0].offset = src.buffer_offset(); - barriers[0].size = src.buffer_capacity(); + barriers[0].image = src.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; - VkPipelineStageFlags src_stage = src.data->stage_flags; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor) { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); delete[] barriers; } else { record r; - r.type = record::TYPE_buffer_barrers; - r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; - delayed_records.push_back(r); - } - - // mark device 
transfer-read @ transfer - src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; - src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; - } - - // create dst - dst.create_like(src, opt.blob_vkallocator); - - // record device to staging - { - VkBufferCopy* regions = new VkBufferCopy[1]; - regions[0].srcOffset = src.buffer_offset(); - regions[0].dstOffset = dst.buffer_offset(); - regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); - - if (vkdev->info.support_VK_KHR_push_descriptor) - { - vkCmdCopyBuffer(compute_command_buffer, src.buffer(), dst.buffer(), 1, regions); - delete[] regions; - } - else - { - record r; - r.type = record::TYPE_copy_buffer; + r.type = record::TYPE_image_barrers; r.command_buffer = compute_command_buffer; - r.copy_buffer.src = src.buffer(); - r.copy_buffer.dst = dst.buffer(); - r.copy_buffer.region_count = 1; - r.copy_buffer.regions = regions; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; delayed_records.push_back(r); } } - // mark device transfer-write @ transfer - dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; - dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; -} - -void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher) -{ -// fprintf(stderr, "record_pipeline %p\n", pipeline); - - const size_t binding_count = bindings.size(); - const size_t constant_count = constants.size(); - - for (size_t i=0; iaccess_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) - { - // barrier device any @ compute/null to shader-readwrite @ compute - VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = 0; - barriers[0].srcAccessMask = binding.data->access_flags; - 
barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = binding.buffer(); - barriers[0].offset = binding.buffer_offset(); - barriers[0].size = binding.buffer_capacity(); - - VkPipelineStageFlags src_stage = binding.data->stage_flags; - VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - if (vkdev->info.support_VK_KHR_push_descriptor) - { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); - delete[] barriers; - } - else - { - record r; - r.type = record::TYPE_buffer_barrers; - r.command_buffer = compute_command_buffer; - r.buffer_barrers.src_stage = src_stage; - r.buffer_barrers.dst_stage = dst_stage; - r.buffer_barrers.barrier_count = 1; - r.buffer_barrers.barriers = barriers; - delayed_records.push_back(r); - } - - // mark device shader-readwrite @ compute - binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - } - } - // record bind pipeline { if (vkdev->info.support_VK_KHR_push_descriptor) @@ -484,36 +1740,51 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector 0) { - std::vector descriptorBufferInfos(binding_count); - for (size_t i=0; isampler; + descriptorImageInfo.imageView = src.imageview(); + descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + VkDescriptorBufferInfo descriptorBufferInfo; + descriptorBufferInfo.buffer = dst.buffer(); + descriptorBufferInfo.offset = dst.buffer_offset(); + descriptorBufferInfo.range = dst.total() * dst.elemsize; if (vkdev->info.support_VK_KHR_push_descriptor) { - vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorBufferInfos.data()); + struct 
ImportAndroidHardwareBufferDescriptorInfo + { + VkDescriptorImageInfo imageInfo; + VkDescriptorBufferInfo bufferInfo; + VkDescriptorBufferInfo buffer4Info; + }; + + ImportAndroidHardwareBufferDescriptorInfo info; + info.imageInfo = descriptorImageInfo; + info.bufferInfo = descriptorBufferInfo; + info.buffer4Info = descriptorBufferInfo; + + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, &info); } else { // create new descriptor_pool and descriptorset VkDescriptorPool descriptor_pool; { - VkDescriptorPoolSize poolSize; - poolSize.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - poolSize.descriptorCount = binding_count; + VkDescriptorPoolSize poolSizes[2]; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[0].descriptorCount = 1; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSizes[1].descriptorCount = 2; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptorPoolCreateInfo.pNext = 0; descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; descriptorPoolCreateInfo.maxSets = 1; - descriptorPoolCreateInfo.poolSizeCount = 1; - descriptorPoolCreateInfo.pPoolSizes = &poolSize; + descriptorPoolCreateInfo.poolSizeCount = 2; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); if (ret != VK_SUCCESS) @@ -544,26 +1815,55 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorinfo.support_VK_KHR_descriptor_update_template) { - vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorBufferInfos.data()); + struct ImportAndroidHardwareBufferDescriptorInfo + { + VkDescriptorImageInfo imageInfo; + VkDescriptorBufferInfo bufferInfo; + 
VkDescriptorBufferInfo buffer4Info; + }; + + ImportAndroidHardwareBufferDescriptorInfo info; + info.imageInfo = descriptorImageInfo; + info.bufferInfo = descriptorBufferInfo; + info.buffer4Info = descriptorBufferInfo; + + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, &info); } else { - std::vector writeDescriptorSets(binding_count); - for (size_t i=0; ivkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0); } record r; @@ -577,35 +1877,11 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector 0) - { - if (vkdev->info.support_VK_KHR_push_descriptor) - { - vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); - } - else - { - uint32_t size = constant_count * sizeof(vk_constant_type); - unsigned char* constant_values = new unsigned char[size]; - memcpy(constant_values, constants.data(), size); - - record r; - r.type = record::TYPE_push_constants; - r.command_buffer = compute_command_buffer; - r.push_constants.pipeline_layout = pipeline->pipeline_layout; - r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; - r.push_constants.size = size; - r.push_constants.values = constant_values; - delayed_records.push_back(r); - } - } - // record dispatch { - uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x - 1) / pipeline->local_size_x; - uint32_t group_count_y = (dispatcher.h + pipeline->local_size_y - 1) / pipeline->local_size_y; - uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z - 1) / pipeline->local_size_z; + uint32_t group_count_x = (dst.w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dst.h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dst.c + pipeline->local_size_z - 1) / 
pipeline->local_size_z; if (vkdev->info.support_VK_KHR_push_descriptor) { @@ -624,37 +1900,17 @@ void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vectorinfo.support_VK_KHR_push_descriptor) - { - if (query_pool) - vkCmdWriteTimestamp(compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, query_pool, query); - } - else - { - record r; - r.type = record::TYPE_write_timestamp; - r.command_buffer = compute_command_buffer; - r.write_timestamp.query = query; - delayed_records.push_back(r); - } -} -#endif // NCNN_BENCHMARK - -#if __ANDROID_API__ >= 26 -void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst) +void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst) { // image layout transform undefined @ null to general @ compute { - VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[2]; barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[0].pNext = 0; barriers[0].srcAccessMask = 0; barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[0].image = src.image(); @@ -663,13 +1919,27 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar barriers[0].subresourceRange.levelCount = 1; barriers[0].subresourceRange.baseArrayLayer = 0; barriers[0].subresourceRange.layerCount = 1; + barriers[1].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[1].pNext = 0; + barriers[1].srcAccessMask = 0; + barriers[1].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + 
barriers[1].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barriers[1].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[1].image = dst.image(); + barriers[1].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[1].subresourceRange.baseMipLevel = 0; + barriers[1].subresourceRange.levelCount = 1; + barriers[1].subresourceRange.baseArrayLayer = 0; + barriers[1].subresourceRange.layerCount = 1; VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; if (vkdev->info.support_VK_KHR_push_descriptor) { - vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 2, barriers); delete[] barriers; } else @@ -679,7 +1949,7 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar r.command_buffer = compute_command_buffer; r.image_barrers.src_stage = src_stage; r.image_barrers.dst_stage = dst_stage; - r.image_barrers.barrier_count = 1; + r.image_barrers.barrier_count = 2; r.image_barrers.barriers = barriers; delayed_records.push_back(r); } @@ -704,31 +1974,20 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar // record update bindings { - VkDescriptorImageInfo descriptorImageInfo; - descriptorImageInfo.sampler = pipeline->sampler; - descriptorImageInfo.imageView = src.imageview(); - descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - - VkDescriptorBufferInfo descriptorBufferInfo; - descriptorBufferInfo.buffer = dst.buffer(); - descriptorBufferInfo.offset = dst.buffer_offset(); - descriptorBufferInfo.range = dst.total() * dst.elemsize; + VkDescriptorImageInfo descriptorImageInfos[3]; + descriptorImageInfos[0].sampler = pipeline->sampler; + descriptorImageInfos[0].imageView = 
src.imageview(); + descriptorImageInfos[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + descriptorImageInfos[1].sampler = 0; + descriptorImageInfos[1].imageView = dst.imageview(); + descriptorImageInfos[1].imageLayout = VK_IMAGE_LAYOUT_GENERAL; + descriptorImageInfos[2].sampler = 0; + descriptorImageInfos[2].imageView = dst.imageview(); + descriptorImageInfos[2].imageLayout = VK_IMAGE_LAYOUT_GENERAL; if (vkdev->info.support_VK_KHR_push_descriptor) { - struct ImportAndroidHardwareBufferDescriptorInfo - { - VkDescriptorImageInfo imageInfo; - VkDescriptorBufferInfo bufferInfo; - VkDescriptorBufferInfo buffer4Info; - }; - - ImportAndroidHardwareBufferDescriptorInfo info; - info.imageInfo = descriptorImageInfo; - info.bufferInfo = descriptorBufferInfo; - info.buffer4Info = descriptorBufferInfo; - - vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, &info); + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorImageInfos); } else { @@ -738,8 +1997,8 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar VkDescriptorPoolSize poolSizes[2]; poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; poolSizes[0].descriptorCount = 1; - poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - poolSizes[1].descriptorCount = 1; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + poolSizes[1].descriptorCount = 2; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; @@ -778,19 +2037,7 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar if (vkdev->info.support_VK_KHR_descriptor_update_template) { - struct ImportAndroidHardwareBufferDescriptorInfo - { - VkDescriptorImageInfo imageInfo; - VkDescriptorBufferInfo bufferInfo; - 
VkDescriptorBufferInfo buffer4Info; - }; - - ImportAndroidHardwareBufferDescriptorInfo info; - info.imageInfo = descriptorImageInfo; - info.bufferInfo = descriptorBufferInfo; - info.buffer4Info = descriptorBufferInfo; - - vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, &info); + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorImageInfos); } else { @@ -802,7 +2049,7 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar writeDescriptorSets[0].dstArrayElement = 0; writeDescriptorSets[0].descriptorCount = 1; writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - writeDescriptorSets[0].pImageInfo = &descriptorImageInfo; + writeDescriptorSets[0].pImageInfo = &descriptorImageInfos[0]; writeDescriptorSets[0].pBufferInfo = 0; writeDescriptorSets[0].pTexelBufferView = 0; writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; @@ -811,9 +2058,9 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar writeDescriptorSets[1].dstBinding = 1; writeDescriptorSets[1].dstArrayElement = 0; writeDescriptorSets[1].descriptorCount = 1; - writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - writeDescriptorSets[1].pImageInfo = 0; - writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo; + writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + writeDescriptorSets[1].pImageInfo = &descriptorImageInfos[1]; + writeDescriptorSets[1].pBufferInfo = 0; writeDescriptorSets[1].pTexelBufferView = 0; writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writeDescriptorSets[2].pNext = 0; @@ -821,9 +2068,9 @@ void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwar writeDescriptorSets[2].dstBinding = 2; writeDescriptorSets[2].dstArrayElement = 0; 
writeDescriptorSets[2].descriptorCount = 1; - writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - writeDescriptorSets[2].pImageInfo = 0; - writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo; + writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + writeDescriptorSets[2].pImageInfo = &descriptorImageInfos[2]; + writeDescriptorSets[2].pBufferInfo = 0; writeDescriptorSets[2].pTexelBufferView = 0; vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0); @@ -892,6 +2139,24 @@ int VkCompute::submit_and_wait() delete[] r.copy_buffer.regions; break; } + case record::TYPE_copy_image: + { + vkCmdCopyImage(r.command_buffer, r.copy_image.src, r.copy_image.src_layout, r.copy_image.dst, r.copy_image.dst_layout, r.copy_image.region_count, r.copy_image.regions); + delete[] r.copy_image.regions; + break; + } + case record::TYPE_copy_buffer_to_image: + { + vkCmdCopyBufferToImage(r.command_buffer, r.copy_buffer_to_image.src, r.copy_buffer_to_image.dst, r.copy_buffer_to_image.layout, r.copy_buffer_to_image.region_count, r.copy_buffer_to_image.regions); + delete[] r.copy_buffer_to_image.regions; + break; + } + case record::TYPE_copy_image_to_buffer: + { + vkCmdCopyImageToBuffer(r.command_buffer, r.copy_image_to_buffer.src, r.copy_image_to_buffer.layout, r.copy_image_to_buffer.dst, r.copy_image_to_buffer.region_count, r.copy_image_to_buffer.regions); + delete[] r.copy_image_to_buffer.regions; + break; + } case record::TYPE_bind_pipeline: { vkCmdBindPipeline(r.command_buffer, r.bind_pipeline.bind_point, r.bind_pipeline.pipeline); @@ -939,9 +2204,9 @@ int VkCompute::submit_and_wait() } #endif // NCNN_BENCHMARK case record::TYPE_post_download: + case record::TYPE_post_cast_float16_to_float32: default: break; - } } } @@ -1003,23 +2268,24 @@ int VkCompute::submit_and_wait() case record::TYPE_post_download: { const VkMat& src = download_post_buffers[r.post_download.download_post_buffer_mat_offset]; - Mat& dst = 
download_post_mats[r.post_download.download_post_buffer_mat_offset]; + Mat& dst = download_post_mats_fp16[r.post_download.download_post_mat_fp16_offset]; src.allocator->invalidate(src.data); memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); break; } - case record::TYPE_copy_buffer: - case record::TYPE_bind_pipeline: - case record::TYPE_bind_descriptorsets: - case record::TYPE_push_constants: - case record::TYPE_dispatch: - case record::TYPE_memory_barrers: - case record::TYPE_buffer_barrers: - case record::TYPE_image_barrers: + case record::TYPE_post_cast_float16_to_float32: + { + const Mat& src = download_post_mats_fp16[r.post_cast_float16_to_float32.download_post_mat_fp16_offset]; + Mat& dst = download_post_mats[r.post_cast_float16_to_float32.download_post_mat_offset]; + + Option opt; + opt.blob_allocator = dst.allocator; + ncnn::cast_float16_to_float32(src, dst, opt); + break; + } default: break; - } } @@ -1242,7 +2508,7 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) // fprintf(stderr, "record_upload src = %d | %d %d %d @ %d\n", src.dims, src.w, src.h, src.c, src.elempack); // NOTE keep the hack here ? 
- if (src.elemsize / src.elempack == 4) + if (src.elemsize == src.elempack * 4u) { if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) { @@ -1260,6 +2526,11 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) // create dst dst.create_like(src_flattened, opt.blob_vkallocator); + if (dst.empty()) + { + return; + } + if (dst.allocator->mappable) { // memcpy src_flattened to device @@ -1298,6 +2569,7 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) // memcpy src_flattened to staging memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst_staging.allocator->flush(dst_staging.data); VkCommandBuffer command_buffer; if (vkdev->info.unified_compute_transfer_queue) @@ -1361,7 +2633,7 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) } else { - // queue ownership transfer any @ transfer to shader-read @ compute + // queue ownership transfer transfer-write @ transfer to shader-read @ compute // release { @@ -1410,6 +2682,203 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) upload_staging_buffers.push_back(dst_staging); } +void VkTransfer::record_upload(const Mat& src, VkImageMat& dst, const Option& opt) +{ +// fprintf(stderr, "record_upload image src = %d | %d %d %d @ %d\n", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? 
+ if (src.elemsize == src.elempack * 4u) + { + if (opt.use_image_fp16_storage || (opt.use_image_fp16_packed && src.elempack % 4 == 0)) + { + Mat src_fp16; + cast_float32_to_float16(src, src_fp16); + + record_upload(src_fp16, dst, opt); + + return; + } + } + + // create dst + dst.create_like(src, opt.blob_vkallocator); + if (dst.empty()) + return; + + // create staging + VkMat dst_staging; + dst_staging.create_like(src, opt.staging_vkallocator); + + // memcpy src to staging + memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); + dst_staging.allocator->flush(dst_staging.data); + + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } + + // barrier staging host-write @ null to transfer-read @ queue + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst_staging.buffer(); + barrier.offset = dst_staging.buffer_offset(); + barrier.size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // image layout transform undefined @ null to transfer-dst-optimal @ queue + { + VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = dst.image(); + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); + } + + // record staging to image + { + const int channels = dst.c; + VkBufferImageCopy* regions = new VkBufferImageCopy[channels]; + for (int i = 0; i < channels; i++) + { + regions[i].bufferOffset = dst_staging.buffer_offset() + dst_staging.cstep * dst_staging.elemsize * i; + regions[i].bufferRowLength = 0; + regions[i].bufferImageHeight = 0; + regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + regions[i].imageSubresource.mipLevel = 0; + regions[i].imageSubresource.baseArrayLayer = 0; + regions[i].imageSubresource.layerCount = 1; + regions[i].imageOffset.x = 0; + regions[i].imageOffset.y = 0; + regions[i].imageOffset.z = i; + regions[i].imageExtent.width = dst.data->width; + regions[i].imageExtent.height = dst.data->height; + regions[i].imageExtent.depth = 1; + } + + vkCmdCopyBufferToImage(command_buffer, dst_staging.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, channels, regions); + delete[] regions; + } + + if (vkdev->info.unified_compute_transfer_queue) + { + // image layout transform transfer-dst-optimal @ compute to shader-readonly-optimal @ compute + { + VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + 
barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = dst.image(); + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); + } + } + else + { + // queue ownership transfer transfer-write @ transfer to shader-read @ compute + + // release + { + VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.image = dst.image(); + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); + } + + // acquire + { + VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = 
VK_ACCESS_SHADER_READ_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.image = dst.image(); + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, &barrier); + } + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + int VkTransfer::submit_and_wait() { // fprintf(stderr, "submit_and_wait\n"); diff --git a/src/command.h b/src/command.h index 133e279b9..b9084bb0a 100644 --- a/src/command.h +++ b/src/command.h @@ -35,18 +35,44 @@ public: public: void record_upload(const Mat& src, VkMat& dst, const Option& opt); + void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); + void record_download(const VkMat& src, Mat& dst, const Option& opt); + void record_download(const VkImageMat& src, Mat& dst, const Option& opt); + + void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt); + + void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt); + + void record_clone(const Mat& src, VkMat& dst, const Option& opt); + + void record_clone(const Mat& src, VkImageMat& dst, const Option& opt); + + void record_clone(const 
VkMat& src, Mat& dst, const Option& opt); + + void record_clone(const VkImageMat& src, Mat& dst, const Option& opt); + void record_clone(const VkMat& src, VkMat& dst, const Option& opt); + void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt); + + void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt); + + void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt); + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageMat& dispatcher); + #if NCNN_BENCHMARK void record_write_timestamp(uint32_t query); #endif // NCNN_BENCHMARK #if __ANDROID_API__ >= 26 void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst); + + void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst); #endif // __ANDROID_API__ >= 26 int submit_and_wait(); @@ -75,8 +101,11 @@ protected: std::vector upload_staging_buffers; std::vector download_post_buffers; + std::vector download_post_mats_fp16; std::vector download_post_mats; + std::vector image_blocks_to_destroy; + // the good-old path for device without VK_KHR_push_descriptor std::vector descriptor_pools; std::vector descriptorsets; @@ -86,6 +115,9 @@ protected: enum { TYPE_copy_buffer, + TYPE_copy_image, + TYPE_copy_buffer_to_image, + TYPE_copy_image_to_buffer, TYPE_bind_pipeline, TYPE_bind_descriptorsets, TYPE_push_constants, @@ -99,6 +131,7 @@ protected: #endif // NCNN_BENCHMARK TYPE_post_download, + TYPE_post_cast_float16_to_float32, }; int type; @@ -107,6 +140,9 @@ protected: union { struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; + struct { VkImage src; VkImageLayout 
src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; + struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; + struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; @@ -122,7 +158,8 @@ protected: struct { uint32_t query; } write_timestamp; #endif // NCNN_BENCHMARK - struct { uint32_t download_post_buffer_mat_offset; } post_download; + struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; + struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32; }; }; @@ -143,6 +180,8 @@ public: public: void record_upload(const Mat& src, VkMat& dst, const Option& opt); + void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); + int submit_and_wait(); protected: diff --git a/src/convert_ycbcr.comp b/src/convert_ycbcr.comp index 3459eff1f..57a370b75 100644 --- a/src/convert_ycbcr.comp +++ b/src/convert_ycbcr.comp @@ -34,8 +34,13 @@ layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; layout (binding = 0) uniform sampler2D android_hardware_buffer_image; +#if NCNN_image_shader +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D vkmat_blob; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D vkmat_pack4_blob; +#else layout (binding = 1) writeonly buffer vkmat_blob { sfp vkmat_blob_data[]; }; layout (binding = 2) writeonly buffer vkmat_pack4_blob { sfpvec4 vkmat_pack4_blob_data[]; }; +#endif void main() { @@ -108,51 +113,75 @@ void main() if (type_to == 1) // 
PIXEL_RGB { +#if NCNN_image_shader + image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.r); + image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g); + image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.b); +#else ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep; buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.r)); buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g)); buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.b)); +#endif } if (type_to == 2) // PIXEL_BGR { +#if NCNN_image_shader + image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.b); + image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g); + image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.r); +#else ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep; buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.b)); buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g)); buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.r)); +#endif } if (type_to == 3) // PIXEL_GRAY { - int v_offset = gy * outw + gx; - // coeffs for r g b = 0.299f, 0.587f, 0.114f float v = clamp(rgb.r * 0.299f + rgb.g * 0.587f + rgb.b * 0.114f, 0.f, 255.f); +#if NCNN_image_shader + image3d_st1(vkmat_blob, ivec3(gx, gy, 0), v); +#else + int v_offset = gy * outw + gx; + buffer_st1(vkmat_blob_data, v_offset, afp(v)); +#endif } if (type_to == 4) // PIXEL_RGBA { - int v_offset = gy * outw + gx; - vec4 rgba; rgba.rgb = rgb; rgba.a = 255.f; +#if NCNN_image_shader + image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba); +#else + int v_offset = gy * outw + gx; + buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba)); +#endif } if (type_to == 5) // PIXEL_BGRA { - int v_offset = gy * outw + gx; - vec4 rgba; rgba.bgr = rgb; rgba.a = 255.f; +#if NCNN_image_shader + image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba); +#else + int v_offset = gy * outw + gx; + buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba)); +#endif } } diff --git a/src/gpu.cpp b/src/gpu.cpp index 0b56239f3..07498d169 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -27,6 +27,9 @@ #include #include 
"mat.h" +#include "command.h" +#include "layer_type.h" +#include "layer.h" #if __ANDROID__ #define ENABLE_VALIDATION_LAYER 0 @@ -603,6 +606,10 @@ int create_gpu_instance() gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment; gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment; gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize; + gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity; + gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D; + gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D; + gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D; gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod; @@ -810,6 +817,54 @@ int create_gpu_instance() gpu_info.support_fp16_arithmetic = true; } + // check format + gpu_info.support_image_storage = false; + gpu_info.support_image_fp16_packed = false; + gpu_info.support_image_fp16_storage = false; + gpu_info.support_image_fp16_arithmetic = false; + { + VkFormatProperties r32f_formatProperties; + VkFormatProperties rgba32f_formatProperties; + vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32_SFLOAT, &r32f_formatProperties); + vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32G32B32A32_SFLOAT, &rgba32f_formatProperties); + + if ((r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + && (r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) + && (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + && (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) + gpu_info.support_image_storage = true; + } + { + VkFormatProperties rgba16f_formatProperties; + 
vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties); + + if ((rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) + gpu_info.support_image_fp16_packed = true; + } + { + VkFormatProperties r16f_formatProperties; + VkFormatProperties rgba16f_formatProperties; + vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16_SFLOAT, &r16f_formatProperties); + vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties); + + if ((r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + && (r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) + && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) + gpu_info.support_image_fp16_storage = true; + } + if (gpu_info.support_fp16_arithmetic) + { + gpu_info.support_image_fp16_arithmetic = true; + } + + if (physicalDeviceProperties.vendorID == 0x1ae0 && physicalDeviceProperties.deviceID == 0xc0de) + { + // swiftshader image r16f is not supported + gpu_info.support_image_fp16_storage = false; + } + fprintf(stderr, "[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName, gpu_info.compute_queue_family_index, gpu_info.compute_queue_count, gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count, @@ -822,6 +877,10 @@ int create_gpu_instance() gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); + fprintf(stderr, "[%u %s] imgfp32=%d imgfp16p=%d imgfp16s=%d imgfp16a=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.support_image_storage, 
gpu_info.support_image_fp16_packed, + gpu_info.support_image_fp16_storage, gpu_info.support_image_fp16_arithmetic); + gpu_info_index++; } @@ -833,7 +892,7 @@ int create_gpu_instance() // resolve shader info for (int i=0; iforward(src, dst, cmd, opt); } -static inline bool string_ends_with_fp16pa(const char* name) +void VulkanDevice::cast_float32_to_float16(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const { - int len = strlen(name); - if (len < 7) - return false; + int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; + uop_cast_float32_to_float16[uoi]->forward(src, dst, cmd, opt); +} - return memcmp(name + len - 7, "_fp16pa", 7) == 0; +void VulkanDevice::cast_float16_to_float32(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; + uop_cast_float16_to_float32[uoi]->forward(src, dst, cmd, opt); } -static inline bool string_ends_with_fp16s(const char* name) +void VulkanDevice::cast_float16_to_float32(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const { - int len = strlen(name); - if (len < 6) - return false; + int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; + uop_cast_float16_to_float32[uoi]->forward(src, dst, cmd, opt); +} - return memcmp(name + len - 6, "_fp16s", 6) == 0; +void VulkanDevice::packing_pack1(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; + uop_packing_pack1[uoi]->forward(src, dst, cmd, opt); } -static inline bool string_ends_with_fp16sa(const char* name) +void VulkanDevice::packing_pack1(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const { - int len = strlen(name); - if (len < 7) - return false; + int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 
4 : 3; + uop_packing_pack1[uoi]->forward(src, dst, cmd, opt); +} - return memcmp(name + len - 7, "_fp16sa", 7) == 0; +void VulkanDevice::packing_pack4(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; + uop_packing_pack4[uoi]->forward(src, dst, cmd, opt); +} + +void VulkanDevice::packing_pack4(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; + uop_packing_pack4[uoi]->forward(src, dst, cmd, opt); +} + +void VulkanDevice::packing_pack8(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; + uop_packing_pack8[uoi]->forward(src, dst, cmd, opt); +} + +void VulkanDevice::packing_pack8(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const +{ + int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 
4 : 3; + uop_packing_pack8[uoi]->forward(src, dst, cmd, opt); } int VulkanDevice::create_shader_module() @@ -1490,28 +1617,56 @@ int VulkanDevice::create_shader_module() // 2 = fp16pa // 3 = fp16s // 4 = fp16sa + // 5 = image + // 6 = image_fp16p + // 7 = image_fp16s + // 8 = image_fp16a if (!info.support_fp16_packed) { - if (i % 5 == 1) + if (i % 9 == 1) continue; } if (!info.support_fp16_packed || !info.support_fp16_arithmetic) { - if (i % 5 == 2) + if (i % 9 == 2) continue; } if (!info.support_fp16_storage) { - if (i % 5 == 3) + if (i % 9 == 3) continue; } if (!info.support_fp16_storage || !info.support_fp16_arithmetic) { - if (i % 5 == 4) + if (i % 9 == 4) + continue; + } + + if (!info.support_image_storage) + { + if (i % 9 == 5) + continue; + } + + if (!info.support_image_storage || !info.support_image_fp16_packed) + { + if (i % 9 == 6) + continue; + } + + if (!info.support_image_storage || !info.support_image_fp16_storage) + { + if (i % 9 == 7) + continue; + } + + if (!info.support_image_storage || !info.support_image_fp16_storage || !info.support_image_fp16_arithmetic) + { + if (i % 9 == 8) continue; } @@ -1606,6 +1761,214 @@ int VulkanDevice::init_device_extension() return 0; } +int VulkanDevice::create_utility_operator() +{ + Option opt[6]; + + opt[0].use_fp16_packed = false; + opt[0].use_fp16_storage = false; + opt[0].use_image_storage = false; + opt[0].use_image_fp16_packed = false; + opt[0].use_image_fp16_storage = false; + opt[0].use_shader_pack8 = true; + + opt[1].use_fp16_packed = true; + opt[1].use_fp16_storage = false; + opt[1].use_image_storage = false; + opt[1].use_image_fp16_packed = false; + opt[1].use_image_fp16_storage = false; + opt[1].use_shader_pack8 = true; + + opt[2].use_fp16_packed = true; + opt[2].use_fp16_storage = true; + opt[2].use_image_storage = false; + opt[2].use_image_fp16_packed = false; + opt[2].use_image_fp16_storage = false; + opt[2].use_shader_pack8 = true; + + opt[3].use_fp16_packed = false; + opt[3].use_fp16_storage = 
false; + opt[3].use_image_storage = true; + opt[3].use_image_fp16_packed = false; + opt[3].use_image_fp16_storage = false; + opt[3].use_shader_pack8 = true; + + opt[4].use_fp16_packed = false; + opt[4].use_fp16_storage = false; + opt[4].use_image_storage = true; + opt[4].use_image_fp16_packed = true; + opt[4].use_image_fp16_storage = false; + opt[4].use_shader_pack8 = true; + + opt[5].use_fp16_packed = false; + opt[5].use_fp16_storage = false; + opt[5].use_image_storage = true; + opt[5].use_image_fp16_packed = true; + opt[5].use_image_fp16_storage = true; + opt[5].use_shader_pack8 = true; + + for (int i = 0; i < 6; i++) + { + uop_cast_float32_to_float16[i] = 0; + uop_cast_float16_to_float32[i] = 0; + uop_packing_pack1[i] = 0; + uop_packing_pack4[i] = 0; + uop_packing_pack8[i] = 0; + + if (i == 1 && !info.support_fp16_packed) + continue; + + if (i == 2 && !info.support_fp16_storage) + continue; + + if (i == 3 && !info.support_image_storage) + continue; + + if (i == 4 && (!info.support_image_storage || !info.support_image_fp16_packed)) + continue; + + if (i == 5 && (!info.support_image_storage || !info.support_image_fp16_storage)) + continue; + + { + uop_cast_float32_to_float16[i] = ncnn::create_layer(ncnn::LayerType::Cast); + uop_cast_float32_to_float16[i]->vkdev = this; + + ncnn::ParamDict pd; + pd.set(0, 1); + pd.set(1, 2); + + uop_cast_float32_to_float16[i]->load_param(pd); + } + + { + uop_cast_float16_to_float32[i] = ncnn::create_layer(ncnn::LayerType::Cast); + uop_cast_float16_to_float32[i]->vkdev = this; + + ncnn::ParamDict pd; + pd.set(0, 2); + pd.set(1, 1); + + uop_cast_float16_to_float32[i]->load_param(pd); + } + + { + uop_packing_pack1[i] = ncnn::create_layer(ncnn::LayerType::Packing); + uop_packing_pack1[i]->vkdev = this; + + ncnn::ParamDict pd; + pd.set(0, 1); + + uop_packing_pack1[i]->load_param(pd); + } + + { + uop_packing_pack4[i] = ncnn::create_layer(ncnn::LayerType::Packing); + uop_packing_pack4[i]->vkdev = this; + + ncnn::ParamDict pd; + pd.set(0, 
4); + + uop_packing_pack4[i]->load_param(pd); + } + + { + uop_packing_pack8[i] = ncnn::create_layer(ncnn::LayerType::Packing); + uop_packing_pack8[i]->vkdev = this; + + ncnn::ParamDict pd; + pd.set(0, 8); + + uop_packing_pack8[i]->load_param(pd); + } + + uop_cast_float32_to_float16[i]->create_pipeline(opt[i]); + uop_cast_float16_to_float32[i]->create_pipeline(opt[i]); + uop_packing_pack1[i]->create_pipeline(opt[i]); + uop_packing_pack4[i]->create_pipeline(opt[i]); + uop_packing_pack8[i]->create_pipeline(opt[i]); + } + + return 0; +} + +void VulkanDevice::destroy_utility_operator() +{ + Option opt[6]; + + opt[0].use_fp16_packed = false; + opt[0].use_fp16_storage = false; + opt[0].use_image_storage = false; + opt[0].use_image_fp16_packed = false; + opt[0].use_image_fp16_storage = false; + opt[0].use_shader_pack8 = true; + + opt[1].use_fp16_packed = true; + opt[1].use_fp16_storage = false; + opt[1].use_image_storage = false; + opt[1].use_image_fp16_packed = false; + opt[1].use_image_fp16_storage = false; + opt[1].use_shader_pack8 = true; + + opt[2].use_fp16_packed = true; + opt[2].use_fp16_storage = true; + opt[2].use_image_storage = false; + opt[2].use_image_fp16_packed = false; + opt[2].use_image_fp16_storage = false; + opt[2].use_shader_pack8 = true; + + opt[3].use_fp16_packed = false; + opt[3].use_fp16_storage = false; + opt[3].use_image_storage = true; + opt[3].use_image_fp16_packed = false; + opt[3].use_image_fp16_storage = false; + opt[3].use_shader_pack8 = true; + + opt[4].use_fp16_packed = false; + opt[4].use_fp16_storage = false; + opt[4].use_image_storage = true; + opt[4].use_image_fp16_packed = true; + opt[4].use_image_fp16_storage = false; + opt[4].use_shader_pack8 = true; + + opt[5].use_fp16_packed = false; + opt[5].use_fp16_storage = false; + opt[5].use_image_storage = true; + opt[5].use_image_fp16_packed = true; + opt[5].use_image_fp16_storage = true; + opt[5].use_shader_pack8 = true; + + for (int i = 0; i < 6; i++) + { + if (i == 1 && 
!info.support_fp16_packed) + continue; + + if (i == 2 && !info.support_fp16_storage) + continue; + + if (i == 3 && !info.support_image_storage) + continue; + + if (i == 4 && (!info.support_image_storage || !info.support_image_fp16_packed)) + continue; + + if (i == 5 && (!info.support_image_storage || !info.support_image_fp16_storage)) + continue; + + uop_cast_float32_to_float16[i]->destroy_pipeline(opt[i]); + uop_cast_float16_to_float32[i]->destroy_pipeline(opt[i]); + uop_packing_pack1[i]->destroy_pipeline(opt[i]); + uop_packing_pack4[i]->destroy_pipeline(opt[i]); + uop_packing_pack8[i]->destroy_pipeline(opt[i]); + + delete uop_cast_float32_to_float16[i]; + delete uop_cast_float16_to_float32[i]; + delete uop_packing_pack1[i]; + delete uop_packing_pack4[i]; + delete uop_packing_pack8[i]; + } +} + VulkanDevice* get_gpu_device(int device_index) { if (device_index < 0 || device_index >= g_gpu_count) @@ -1630,16 +1993,30 @@ const ShaderInfo& get_shader_info(int shader_type_index) return layer_shader_infos[shader_type_index]; } -ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size) +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) { + shader_info.specialization_count = 0; + shader_info.binding_count = 0; + shader_info.push_constant_count = 0; + uint32_t parameter_id = -233; int specialization_count = 0; int binding_count = 0; int push_constant_count = 0; + // id -> binding_type + std::vector id_types; + + // binding_id -> binding_type + std::vector binding_types; + const uint32_t* p = spv_data; + int bound = p[3]; + + id_types.resize(bound); + // skip magic version generator bound schema p += 5; @@ -1668,28 +2045,86 @@ ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size) push_constant_count++; } } + else if (op == 25) // OpTypeImage + { + uint32_t id = p[1]; + id_types[id] = 2; + } + else if (op == 27) // OpTypeSampledImage + { + uint32_t id = p[1]; + id_types[id] = 3; + } 
+ else if (op == 32) // OpTypePointer + { + uint32_t id = p[1]; + uint32_t storage_class = p[2]; + uint32_t type = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[id] = id_types[type]; + } + if (storage_class == 2) // Uniform + { + id_types[id] = id_types[type]; + } + } + else if (op == 59) // OpVariable + { + uint32_t id = p[1]; + uint32_t var_id = p[2]; + uint32_t storage_class = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[var_id] = id_types[id]; + } + if (storage_class == 2) // Uniform + { + id_types[var_id] = id_types[id]; + } + } else if (op == 71) // OpDecorate { + uint32_t id = p[1]; uint32_t decoration = p[2]; + uint32_t binding_id = p[3]; if (decoration == 1) // SpecId { specialization_count++; } + if (decoration == 3) // BufferBlock + { + id_types[id] = 1; + } else if (decoration == 33) // Binding { - binding_count++; + binding_count = std::max(binding_count, (int)binding_id + 1); + + binding_types.resize(binding_count); + binding_types[binding_id] = id; } } p += wordcount; } - ShaderInfo si; - si.specialization_count = specialization_count; - si.binding_count = binding_count; - si.push_constant_count = push_constant_count; + if (binding_count > 16) + { + fprintf(stderr, "too many binding %d\n", binding_count); + return -1; + } + + shader_info.specialization_count = specialization_count; + shader_info.binding_count = binding_count; + shader_info.push_constant_count = push_constant_count; - return si; + // resolve binding_types + for (int i=0; i shader_modules; @@ -251,6 +285,22 @@ private: // default staging allocator for each queue mutable std::vector staging_allocators; mutable Mutex staging_allocator_lock; + + // nearest sampler for texelfetch + VkSampler texelfetch_sampler; + + // utility operator + // 0 = fp32 + // 1 = fp16p + // 2 = fp16s + // 3 = image + // 4 = image_fp16p + // 5 = image_fp16s + ncnn::Layer* uop_cast_float32_to_float16[6]; + ncnn::Layer* uop_cast_float16_to_float32[6]; + ncnn::Layer* 
uop_packing_pack1[6]; + ncnn::Layer* uop_packing_pack4[6]; + ncnn::Layer* uop_packing_pack8[6]; }; VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); @@ -262,10 +312,16 @@ public: int specialization_count; int binding_count; int push_constant_count; + + // 0 = null + // 1 = storage buffer + // 2 = storage image + // 3 = combined image sampler + int binding_types[16];// 16 is large enough I think ... }; const ShaderInfo& get_shader_info(int shader_type_index); -ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size); +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); } // namespace ncnn diff --git a/src/layer.cpp b/src/layer.cpp index 340caaf05..3d1c9c19d 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -39,6 +39,7 @@ Layer::Layer() support_packing = false; support_bf16_storage = false; + support_image_storage = false; #if NCNN_VULKAN vkdev = 0; @@ -137,6 +138,30 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co return forward_inplace(top_blob, cmd, opt); } +int Layer::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + if (!support_inplace) + return -1; + + top_blobs.resize(bottom_blobs.size()); + for (int i = 0; i < (int)top_blobs.size(); i++) + { + cmd.record_clone(bottom_blobs[i], top_blobs[i], opt); + } + + return forward_inplace(top_blobs, cmd, opt); +} + +int Layer::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (!support_inplace) + return -1; + + cmd.record_clone(bottom_blob, top_blob, opt); + + return forward_inplace(top_blob, cmd, opt); +} + int Layer::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const { return -1; @@ -146,6 +171,16 @@ int Layer::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const { return -1; } + +int 
Layer::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return -1; +} + +int Layer::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return -1; +} #endif // NCNN_VULKAN static const layer_registry_entry layer_registry[] = diff --git a/src/layer.h b/src/layer.h index 7d90de3d0..8b6bb0875 100644 --- a/src/layer.h +++ b/src/layer.h @@ -73,6 +73,9 @@ public: // accept bf16 bool support_bf16_storage; + // shader image storage + bool support_image_storage; + public: // implement inference // return 0 if success @@ -95,11 +98,21 @@ public: virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + // implement inference + // return 0 if success + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + // implement inplace inference // return 0 if success virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + // implement inplace inference + // return 0 if success + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + public: // assigned immediately after creating this layer const VulkanDevice* vkdev; diff --git a/src/layer/input.cpp b/src/layer/input.cpp index a8d451e50..d30a13d81 100644 --- a/src/layer/input.cpp +++ b/src/layer/input.cpp @@ -22,7 +22,10 @@ Input::Input() { one_blob_only = true; support_inplace = true; - 
support_vulkan = false; + support_vulkan = true; + support_packing = true; + support_bf16_storage = true; + support_image_storage = true; } int Input::load_param(const ParamDict& pd) @@ -39,4 +42,16 @@ int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons return 0; } +#if NCNN_VULKAN +int Input::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return 0; +} + +int Input::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return 0; +} +#endif // NCNN_VULKAN + } // namespace ncnn diff --git a/src/layer/input.h b/src/layer/input.h index 2d12d54d6..bd7f31cc8 100644 --- a/src/layer/input.h +++ b/src/layer/input.h @@ -28,6 +28,11 @@ public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +#if NCNN_VULKAN + virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; +#endif // NCNN_VULKAN + public: int w; int h; diff --git a/src/layer/noop.cpp b/src/layer/noop.cpp index a26f8f1cf..e69e2964d 100644 --- a/src/layer/noop.cpp +++ b/src/layer/noop.cpp @@ -23,6 +23,7 @@ Noop::Noop() support_inplace = true; support_vulkan = true; support_packing = true; + support_image_storage = true; } int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, const Option& /*opt*/) const @@ -35,6 +36,11 @@ int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& / { return 0; } + +int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return 0; +} #endif // NCNN_VULKAN } // namespace ncnn diff --git a/src/layer/noop.h b/src/layer/noop.h index ab62106e2..1fb7af35c 100644 --- a/src/layer/noop.h +++ b/src/layer/noop.h @@ -28,6 +28,7 @@ public: #if NCNN_VULKAN virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, 
const Option& opt) const; + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; #endif // NCNN_VULKAN }; diff --git a/src/layer/split.cpp b/src/layer/split.cpp index 64c39763c..a842ec60b 100644 --- a/src/layer/split.cpp +++ b/src/layer/split.cpp @@ -25,6 +25,7 @@ Split::Split() support_vulkan = true; support_packing = true; support_bf16_storage = true; + support_image_storage = true; } int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& /*opt*/) const @@ -41,8 +42,6 @@ int Split::forward(const std::vector& bottom_blobs, std::vector& top_b #if NCNN_VULKAN int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const { -// fprintf(stderr, "Split::forward %p\n", bottom_blobs[0].buffer()); - const VkMat& bottom_blob = bottom_blobs[0]; for (size_t i=0; i& bottom_blobs, std::vector& t return 0; } + +int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + const VkImageMat& bottom_blob = bottom_blobs[0]; + for (size_t i=0; i& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; #endif // NCNN_VULKAN public: diff --git a/src/layer/vulkan/absval_vulkan.cpp b/src/layer/vulkan/absval_vulkan.cpp index 0727c47cf..c012e0d4d 100644 --- a/src/layer/vulkan/absval_vulkan.cpp +++ b/src/layer/vulkan/absval_vulkan.cpp @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(AbsVal_vulkan) AbsVal_vulkan::AbsVal_vulkan() { support_vulkan = true; + support_image_storage = true; pipeline_absval = 0; pipeline_absval_pack4 = 0; @@ -39,7 +40,19 @@ int AbsVal_vulkan::create_pipeline(const Option& opt) if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 
4 : 1; size_t elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; } @@ -148,4 +161,28 @@ int AbsVal_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const return 0; } +int AbsVal_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0;//bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_absval_pack8 + : elempack == 4 ? 
pipeline_absval_pack4 + : pipeline_absval; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/absval_vulkan.h b/src/layer/vulkan/absval_vulkan.h index 173f838ba..d14c2ac53 100644 --- a/src/layer/vulkan/absval_vulkan.h +++ b/src/layer/vulkan/absval_vulkan.h @@ -29,6 +29,7 @@ public: using AbsVal::forward_inplace; virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; public: Pipeline* pipeline_absval; diff --git a/src/layer/vulkan/cast_vulkan.cpp b/src/layer/vulkan/cast_vulkan.cpp index 003319496..b7f67ed89 100644 --- a/src/layer/vulkan/cast_vulkan.cpp +++ b/src/layer/vulkan/cast_vulkan.cpp @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Cast_vulkan) Cast_vulkan::Cast_vulkan() { support_vulkan = true; + support_image_storage = true; pipeline_cast_fp32_to_fp16 = 0; pipeline_cast_fp32_to_fp16_pack4 = 0; @@ -49,7 +50,22 @@ int Cast_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -285,4 +301,102 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c return 0; } +int Cast_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (type_from == type_to) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + size_t out_elemsize = elemsize; + if (type_to == 1) + { + // float32 + out_elemsize = 4 * elempack; + } + else if (type_to == 2) + { + // float16 + out_elemsize = 2 * elempack; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { + if (elempack == 8) out_elemsize = 8*2u; + if (elempack == 4) out_elemsize = 4*2u; + if (elempack == 1) out_elemsize = 4u; + } + + if (!opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { + // fallback to fp32 :( + out_elemsize = 4 * elempack; + } + } + else if (type_to == 3) + { + // int8 + out_elemsize = elempack; + } + + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); + } + else if (dims == 2) + { + top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); + } + else if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); + } + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + 
constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + + const Pipeline* pipeline = 0; + + if (type_from == 1 && type_to == 2) + { + pipeline = elempack == 8 ? pipeline_cast_fp32_to_fp16_pack8 + : elempack == 4 ? pipeline_cast_fp32_to_fp16_pack4 + : pipeline_cast_fp32_to_fp16; + } + if (type_from == 2 && type_to == 1) + { + pipeline = elempack == 8 ? pipeline_cast_fp16_to_fp32_pack8 + : elempack == 4 ? pipeline_cast_fp16_to_fp32_pack4 + : pipeline_cast_fp16_to_fp32; + } + + // TODO more cast type + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/cast_vulkan.h b/src/layer/vulkan/cast_vulkan.h index 5d951fa71..c184c7439 100644 --- a/src/layer/vulkan/cast_vulkan.h +++ b/src/layer/vulkan/cast_vulkan.h @@ -29,6 +29,7 @@ public: using Cast::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: Pipeline* pipeline_cast_fp32_to_fp16; diff --git a/src/layer/vulkan/concat_vulkan.cpp b/src/layer/vulkan/concat_vulkan.cpp index 2baeb9847..272feff5b 100644 --- a/src/layer/vulkan/concat_vulkan.cpp +++ b/src/layer/vulkan/concat_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Concat_vulkan) Concat_vulkan::Concat_vulkan() { support_vulkan = true; + support_image_storage = true; packing_pack4 = 0; packing_pack8 = 0; @@ -77,7 +78,19 @@ int Concat_vulkan::create_pipeline(const Option& opt) } size_t elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; } @@ -761,4 +774,483 @@ int Concat_vulkan::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + + if (dims == 1) // axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = 0;//top_blob_unpacked.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b%2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + const Layer* packing = out_elempack == 8 ? 
packing_pack8 : packing_pack4; + packing->forward(top_blob_unpacked, top_blob, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = 0;//top_blob_unpacked.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b%2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + const Layer* packing = out_elempack == 8 ? 
packing_pack8 : packing_pack4; + packing->forward(top_blob_unpacked, top_blob, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] + : elempack == 4 ? pipeline_concat_pack4[b%2] + : pipeline_concat[b%2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + if (dims == 3 && axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = 0;//top_blob_unpacked.cstep; + constants[10].i = coffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = 
pipeline_concat[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b%2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b%2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b%2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + coffset += bottom_blob.c * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4; + packing->forward(top_blob_unpacked, top_blob, cmd, opt); + } + + return 0; + } + + if (dims == 3 && axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] + : elempack == 4 ? 
pipeline_concat_pack4[b%2] + : pipeline_concat[b%2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h; + } + + return 0; + } + + if (dims == 3 && axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b=0; b bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] + : elempack == 4 ? 
pipeline_concat_pack4[b%2] + : pipeline_concat[b%2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/concat_vulkan.h b/src/layer/vulkan/concat_vulkan.h index 9a326f728..e78d168ee 100644 --- a/src/layer/vulkan/concat_vulkan.h +++ b/src/layer/vulkan/concat_vulkan.h @@ -29,6 +29,7 @@ public: using Concat::forward; virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: ncnn::Layer* packing_pack4; diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index d9f81593a..92ac59f58 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Convolution_vulkan) Convolution_vulkan::Convolution_vulkan() { support_vulkan = true; + support_image_storage = true; padding = 0; @@ -149,7 +150,22 @@ int Convolution_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -210,22 +226,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) if (is_conv1x1s1d1) { pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + if (opt.use_image_storage) + { + Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); + local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); + local_size_xyz_local.c = std::min(4, out_shape_packed.c); + } + pipeline_convolution_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); + } + else + { pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output)); - - std::vector specializations(4 + 8); - specializations[0].i = bias_term; - specializations[1].i = activation_type; - specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; - specializations[4 + 0].i = shape_bordered_packed.dims; - specializations[4 + 1].i = shape_bordered_packed.cstep / 4; - specializations[4 + 2].i = shape_bordered_packed.c; - specializations[4 + 3].i = shape_bordered_packed.cstep / 4; - specializations[4 + 4].i = out_shape_packed.dims; - specializations[4 + 5].i = out_shape_packed.cstep / 4; - specializations[4 + 6].i = out_shape_packed.c; - specializations[4 + 7].i = out_shape_packed.cstep / 4; - + } pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1, opt, specializations); } else @@ -242,22 +257,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) if (is_conv1x1s1d1) { pipeline_convolution_pack4_1x1s1d1 = new Pipeline(vkdev); + if (opt.use_image_storage) + { + Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); + local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); + local_size_xyz_local.c = std::min(4, out_shape_packed.c); + } + pipeline_convolution_pack4_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); + } + else + { pipeline_convolution_pack4_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 4)); - - std::vector specializations(4 + 8); - specializations[0].i = bias_term; - specializations[1].i = activation_type; - specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; - specializations[4 + 0].i = shape_bordered_packed.dims; - specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h; - specializations[4 + 2].i = shape_bordered_packed.c; - specializations[4 + 3].i = shape_bordered_packed.cstep; - specializations[4 + 4].i = out_shape_packed.dims; - specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h; - specializations[4 + 6].i = out_shape_packed.c; - specializations[4 + 7].i = out_shape_packed.cstep; - + } pipeline_convolution_pack4_1x1s1d1->create(LayerShaderType::convolution_pack4_1x1s1d1, opt, specializations); } else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) @@ -419,22 +433,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) if (is_conv1x1s1d1) { pipeline_convolution_pack8_1x1s1d1 = new Pipeline(vkdev); + if (opt.use_image_storage) + { + Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); + local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); + local_size_xyz_local.c = std::min(4, out_shape_packed.c); + } + pipeline_convolution_pack8_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); + } + else + { pipeline_convolution_pack8_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 8)); - - std::vector specializations(4 + 8); - specializations[0].i = bias_term; - specializations[1].i = activation_type; - specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; - specializations[4 + 0].i = shape_bordered_packed.dims; - specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h; - specializations[4 + 2].i = shape_bordered_packed.c; - specializations[4 + 3].i = shape_bordered_packed.cstep; - specializations[4 + 4].i = out_shape_packed.dims; - specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h; - specializations[4 + 6].i = out_shape_packed.c; - specializations[4 + 7].i = out_shape_packed.cstep; - + } pipeline_convolution_pack8_1x1s1d1->create(LayerShaderType::convolution_pack8_1x1s1d1, opt, specializations); } else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) @@ -695,6 +708,21 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { + if (padding) + { + padding->upload_model(cmd, opt); + } + + if (winograd_padding) + { + winograd_padding->upload_model(cmd, opt); + } + + if (winograd_crop) + { + winograd_crop->upload_model(cmd, opt); + } + const int maxk = kernel_w * kernel_h; int num_input = weight_data_size / maxk / num_output; @@ -738,8 +766,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } } - cmd.record_upload(weight_data_packed, weight_data_gpu, opt); - + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; @@ -862,7 +896,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } } - cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt); + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm_image, opt); + } + else + { + cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt); + } } } @@ 
-952,7 +993,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } } - cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt); + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm_image, opt); + } + else + { + cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt); + } } } @@ -961,7 +1009,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) Mat bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + else if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), bias_data_gpu_image, opt); } if (innerproduct) @@ -1070,6 +1129,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) { // winograd23 @@ -1353,7 +1413,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom return 0; } - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -1364,19 +1423,21 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = weight_data_gpu; bindings[3] = bias_term ? 
bias_data_gpu : bindings[2];// TODO use dummy buffer + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + // record if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) { - std::vector constants(8); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.cstep / 4; - constants[2].i = bottom_blob_bordered.c; - constants[3].i = bottom_blob_bordered.cstep / 4; - constants[4].i = top_blob.dims; - constants[5].i = top_blob.cstep / 4; - constants[6].i = top_blob.c; - constants[7].i = top_blob.cstep / 4; - VkMat dispatcher; dispatcher.w = top_blob.cstep / 4; dispatcher.h = 1; @@ -1386,16 +1447,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) { - std::vector constants(8); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h; - constants[2].i = bottom_blob_bordered.c; - constants[3].i = bottom_blob_bordered.cstep; - constants[4].i = top_blob.dims; - constants[5].i = top_blob.w * top_blob.h; - constants[6].i = top_blob.c; - constants[7].i = top_blob.cstep; - VkMat dispatcher; dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; dispatcher.h = 1; @@ -1405,16 +1456,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 
1 && dilation_w == 1 && dilation_h == 1) { - std::vector constants(8); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h; - constants[2].i = bottom_blob_bordered.c; - constants[3].i = bottom_blob_bordered.cstep; - constants[4].i = top_blob.dims; - constants[5].i = top_blob.w * top_blob.h; - constants[6].i = top_blob.c; - constants[7].i = top_blob.cstep; - VkMat dispatcher; dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; dispatcher.h = 1; @@ -1424,18 +1465,484 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } else { - std::vector constants(10); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w; - constants[2].i = bottom_blob_bordered.h; - constants[3].i = bottom_blob_bordered.c; - constants[4].i = bottom_blob_bordered.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = top_blob.cstep; + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_convolution; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_convolution_pack4to1; + } + else if (elempack == 8 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_convolution_pack8to1; + } + cmd.record_pipeline(pipeline, bindings, constants, 
top_blob); + } + + return 0; +} + +int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + int num_input = weight_data_size / num_output; + if (bottom_blob.w * bottom_blob.elempack == num_input) + { + return innerproduct->forward(bottom_blob, top_blob, cmd, opt); + } + } + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + VkImageMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad / 2; + padding_params[1] = hpad - hpad / 2; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom 
== -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad - hpad / 2; + padding_params[1] = hpad / 2; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8*2u; + if (out_elempack == 4) out_elemsize = 4*2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + + if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) + { + // winograd23 + int outw_bordered = (outw + 1) / 2 * 2; + int outh_bordered = (outh + 1) / 2 * 2; + + int w_bordered = outw_bordered + 2; + int h_bordered = outh_bordered + 2; + + int block_x = outw_bordered / 2; + int block_y = outh_bordered / 2; + + // pad to 2n+2 + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = h_bordered - bottom_blob_bordered.h; + padding_params[2] = 0; + padding_params[3] = w_bordered - bottom_blob_bordered.w; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob_bordered; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + + // transform input + VkImageMat bottom_tm_blob; + { + bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + if (bottom_tm_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + bindings[1] = bottom_tm_blob; + + std::vector constants(7); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = 0;//bottom_blob_bordered.cstep; + constants[4].i = 
0;//bottom_tm_blob.cstep; + constants[5].i = block_x; + constants[6].i = block_y; + + VkImageMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = bottom_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); + } + + // gemm + VkImageMat top_tm_blob; + { + top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + if (top_tm_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_tm_blob; + bindings[1] = top_tm_blob; + bindings[2] = weight_data_gpu_pack4_tm_image; + + std::vector constants(5); + constants[0].i = bottom_tm_blob.c; + constants[1].i = 0;//bottom_tm_blob.cstep; + constants[2].i = top_tm_blob.h; + constants[3].i = top_tm_blob.c; + constants[4].i = 0;//top_tm_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = top_tm_blob.w; + dispatcher.h = (top_tm_blob.h + 3) / 4; + dispatcher.c = top_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher); + } + + // transform output + VkImageMat top_blob_bordered; + { + top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob_bordered.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = top_tm_blob; + bindings[1] = top_blob_bordered; + bindings[2] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(7); + constants[0].i = top_tm_blob.c; + constants[1].i = 0;//top_tm_blob.cstep; + constants[2].i = block_x; + constants[3].i = block_y; + constants[4].i = top_blob_bordered.w; + constants[5].i = top_blob_bordered.h; + constants[6].i = 0;//top_blob_bordered.cstep; + + VkImageMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = top_blob_bordered.c; + + 
cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher); + } + + // crop top_blob + { + VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* crop_params = crop_param_blob.mapped(); + + crop_params[0] = 0; + crop_params[1] = 0; + crop_params[2] = 0; + crop_params[3] = outw; + crop_params[4] = outh; + crop_params[5] = num_output; + + std::vector crop_inputs(2); + crop_inputs[0] = top_blob_bordered; + crop_inputs[1] = crop_param_blob; + + std::vector crop_outputs(1); + winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); + top_blob = crop_outputs[0]; + } + + return 0; + } + if (elempack == 8 && out_elempack == 8 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) + { + // winograd23 + int outw_bordered = (outw + 1) / 2 * 2; + int outh_bordered = (outh + 1) / 2 * 2; + + int w_bordered = outw_bordered + 2; + int h_bordered = outh_bordered + 2; + + int block_x = outw_bordered / 2; + int block_y = outh_bordered / 2; + + // pad to 2n+2 + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = h_bordered - bottom_blob_bordered.h; + padding_params[2] = 0; + padding_params[3] = w_bordered - bottom_blob_bordered.w; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob_bordered; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + + // transform input + VkImageMat bottom_tm_blob; + { + bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + if (bottom_tm_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + 
bindings[1] = bottom_tm_blob; + + std::vector constants(7); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = 0;//bottom_blob_bordered.cstep; + constants[4].i = 0;//bottom_tm_blob.cstep; + constants[5].i = block_x; + constants[6].i = block_y; + + VkImageMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = bottom_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); + } + + // gemm + VkImageMat top_tm_blob; + { + top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + if (top_tm_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_tm_blob; + bindings[1] = top_tm_blob; + bindings[2] = weight_data_gpu_pack8_tm_image; + + std::vector constants(5); + constants[0].i = bottom_tm_blob.c; + constants[1].i = 0;//bottom_tm_blob.cstep; + constants[2].i = top_tm_blob.h; + constants[3].i = top_tm_blob.c; + constants[4].i = 0;//top_tm_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = top_tm_blob.w; + dispatcher.h = (top_tm_blob.h + 3) / 4; + dispatcher.c = top_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher); + } + + // transform output + VkImageMat top_blob_bordered; + { + top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob_bordered.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = top_tm_blob; + bindings[1] = top_blob_bordered; + bindings[2] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(7); + constants[0].i = top_tm_blob.c; + constants[1].i = 0;//top_tm_blob.cstep; + constants[2].i = block_x; + constants[3].i = block_y; + constants[4].i = top_blob_bordered.w; + 
constants[5].i = top_blob_bordered.h; + constants[6].i = 0;//top_blob_bordered.cstep; + + VkImageMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = top_blob_bordered.c; + + cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher); + } + + // crop top_blob + { + VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* crop_params = crop_param_blob.mapped(); + + crop_params[0] = 0; + crop_params[1] = 0; + crop_params[2] = 0; + crop_params[3] = outw; + crop_params[4] = outh; + crop_params[5] = num_output; + + std::vector crop_inputs(2); + crop_inputs[0] = top_blob_bordered; + crop_inputs[1] = crop_param_blob; + + std::vector crop_outputs(1); + winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); + top_blob = crop_outputs[0]; + } + + return 0; + } + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0;//bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + + // record + if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, 
bindings, constants, dispatcher); + } + else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher); + } + else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack8_1x1s1d1, bindings, constants, dispatcher); + } + else + { const Pipeline* pipeline = 0; if (elempack == 1 && out_elempack == 1) { diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h index 914f97254..7ec81ed4c 100644 --- a/src/layer/vulkan/convolution_vulkan.h +++ b/src/layer/vulkan/convolution_vulkan.h @@ -31,6 +31,7 @@ public: using Convolution::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: ncnn::Layer* padding; @@ -38,6 +39,9 @@ public: VkMat weight_data_gpu; VkMat bias_data_gpu; + VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + Pipeline* pipeline_convolution; Pipeline* pipeline_convolution_1x1s1d1; Pipeline* pipeline_convolution_pack4; @@ -55,12 +59,14 @@ public: ncnn::Layer* winograd_padding; ncnn::Layer* winograd_crop; VkMat weight_data_gpu_pack4_tm; + VkImageMat weight_data_gpu_pack4_tm_image; Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; Pipeline* 
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; // pack8 winograd23 VkMat weight_data_gpu_pack8_tm; + VkImageMat weight_data_gpu_pack8_tm_image; Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input; Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_gemm; Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output; diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 6b1d7e1be..e8f723019 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(ConvolutionDepthWise_vulkan) ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() { support_vulkan = true; + support_image_storage = true; padding = 0; packing_unpack = 0; @@ -106,7 +107,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -199,7 +215,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt) size_t elemsize_g; size_t out_elemsize_g; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize_g = elempack_g * 2u; + out_elemsize_g = out_elempack_g * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u; + out_elemsize_g = out_elempack_g == 1 ? 
4u : out_elempack_g * 2u; + } + else if (opt.use_image_storage) + { + elemsize_g = elempack_g * 4u; + out_elemsize_g = out_elempack_g * 4u; + } + else if (opt.use_fp16_storage) { elemsize_g = elempack_g * 2u; out_elemsize_g = out_elempack_g * 2u; @@ -415,6 +446,21 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { + if (padding) + { + padding->upload_model(cmd, opt); + } + + if (packing_unpack) + { + packing_unpack->upload_model(cmd, opt); + } + + if (packing_pack) + { + packing_pack->upload_model(cmd, opt); + } + const int maxk = kernel_w * kernel_h; int channels = (weight_data_size / group) / maxk / (num_output / group) * group; @@ -430,12 +476,25 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + if (bias_term) { Mat bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + else if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), bias_data_gpu_image, opt); } return 0; @@ -493,14 +552,32 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt } } - cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); + } if (bias_term) { Mat bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack_g); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + if (opt.use_image_storage) + { + 
cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + else if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), bias_data_gpu_image, opt); } return 0; @@ -730,4 +807,228 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl return 0; } +int ConvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ // VkImageMat overload: pads the input when needed, then records the depth-wise pipeline or the grouped pipeline (with optional unpack/repack) into cmd; returns 0, or -100 when an output image cannot be created + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + VkImageMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { // explicit padding; the padded copy is allocated from the workspace allocator + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); // NOTE(review): the return code is not checked + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { // all pads == -233: padding amounts are computed here and handed to the padding layer as a second input; the smaller half goes to params [0] and [2] + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad / 2; + padding_params[1] = hpad - hpad / 2; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right
== -234 && pad_top == -234 && pad_bottom == -234) + { // all pads == -234: same computation, but the larger half goes to params [0] and [2] + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad - hpad / 2; + padding_params[1] = hpad / 2; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + w = bottom_blob_bordered.w; // from here on w/h describe the padded input + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ?
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { // fp16-packed images: pack1 elements are 4 bytes, pack4/pack8 use 2 bytes per lane + if (out_elempack == 8) out_elemsize = 8*2u; + if (out_elempack == 4) out_elemsize = 4*2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels == group / elempack && group / elempack == num_output / elempack) + { + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0;//bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 + : elempack == 4 ? pipeline_convolutiondepthwise_pack4 + : pipeline_convolutiondepthwise; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); // top_blob is passed in the dispatcher position + + return 0; + } + + const int channels_g = channels * elempack / group; // per-group input channels, counted unpacked + const int num_output_g = num_output / group; + + int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1; + int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ?
4 : 1; + size_t out_elemsize_g = elemsize / elempack * out_elempack_g; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { // same fp16-packed element size rule, applied to the per-group output packing + if (out_elempack_g == 8) out_elemsize_g = 8*2u; + if (out_elempack_g == 4) out_elemsize_g = 4*2u; + if (out_elempack_g == 1) out_elemsize_g = 4u; + } + + // unpacking + VkImageMat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > elempack_g) + { // the input is packed wider than the per-group packing: repack it into a workspace image + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + packing_unpack->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1); + } + + VkImageMat top_blob_unpacked = top_blob; + if (out_elempack_g < out_elempack) + { // the grouped result is written at out_elempack_g into a workspace image and repacked at the end + top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered_unpacked; + bindings[1] = top_blob_unpacked; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(10); + constants[0].i = bottom_blob_bordered_unpacked.dims; + constants[1].i = bottom_blob_bordered_unpacked.w; + constants[2].i = bottom_blob_bordered_unpacked.h; + constants[3].i = bottom_blob_bordered_unpacked.c; + constants[4].i = 0;//bottom_blob_bordered_unpacked.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = 0;//top_blob_unpacked.cstep; + + const Pipeline* pipeline = 0; + if (elempack_g == 1 && out_elempack_g == 1) + { + pipeline = pipeline_convolutiondepthwise_group; + } + else if (elempack_g == 4 && out_elempack_g == 4) + { + pipeline = pipeline_convolutiondepthwise_group_pack4; + } + else if (elempack_g == 1 && out_elempack_g == 4) + { + pipeline = pipeline_convolutiondepthwise_group_pack1to4; + } + else if (elempack_g == 4 &&
out_elempack_g == 1) + { + pipeline = pipeline_convolutiondepthwise_group_pack4to1; + } + else if (elempack_g == 8 && out_elempack_g == 8) + { + pipeline = pipeline_convolutiondepthwise_group_pack8; + } + else if (elempack_g == 1 && out_elempack_g == 8) + { + pipeline = pipeline_convolutiondepthwise_group_pack1to8; + } + else if (elempack_g == 4 && out_elempack_g == 8) + { + pipeline = pipeline_convolutiondepthwise_group_pack4to8; + } + else if (elempack_g == 8 && out_elempack_g == 4) + { + pipeline = pipeline_convolutiondepthwise_group_pack8to4; + } + else if (elempack_g == 8 && out_elempack_g == 1) + { + pipeline = pipeline_convolutiondepthwise_group_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked); + + // packing + if (out_elempack_g < out_elempack) + { + packing_pack->forward(top_blob_unpacked, top_blob, cmd, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h index 7d48635fc..a9c395c88 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h @@ -31,11 +31,15 @@ public: using ConvolutionDepthWise::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: VkMat weight_data_gpu; VkMat bias_data_gpu; + VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + ncnn::Layer* padding; ncnn::Layer* packing_unpack; ncnn::Layer* packing_pack; diff --git a/src/layer/vulkan/crop_vulkan.cpp b/src/layer/vulkan/crop_vulkan.cpp index 4798f38a6..e32bc9ac7 100644 --- a/src/layer/vulkan/crop_vulkan.cpp +++ b/src/layer/vulkan/crop_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Crop_vulkan) Crop_vulkan::Crop_vulkan() { support_vulkan = true; +
support_image_storage = true; packing_pack1 = 0; packing_pack4 = 0; @@ -104,7 +105,22 @@ int Crop_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -134,7 +150,19 @@ int Crop_vulkan::create_pipeline(const Option& opt) if (bottom_shapes.size() == 1 && shape.dims != 0 && elempack == out_elempack && elempack > offset_elempack) { size_t offset_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + offset_elemsize = offset_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + offset_elemsize = offset_elempack == 1 ? 4u : offset_elempack * 2u; + } + else if (opt.use_image_storage) + { + offset_elemsize = offset_elempack * 4u; + } + else if (opt.use_fp16_storage) { offset_elemsize = offset_elempack * 2u; } @@ -598,4 +626,261 @@ int Crop_vulkan::forward(const std::vector& bottom_blobs, std::vector offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + const Layer* packing = offset_elempack == 4 ? 
packing_pack4 : packing_pack1; + packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = 0;//bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if 
(elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +int Crop_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ // VkImageMat multi-input overload: bottom_blobs[1] describes the crop region, either as mapped int parameters (woffset == -233) or through its shape; only 3-dim input is handled here; returns 0, or -100 when the output image cannot be created + const VkImageMat& bottom_blob = bottom_blobs[0]; + const VkImageMat& reference_blob = bottom_blobs[1]; + + int h = bottom_blob.h; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int _woffset, _hoffset, _coffset; + int _outw, _outh, _outc; + if (woffset == -233) + { + resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob.mapped(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + } + else + { + resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + } + + // TODO vec and image crop + + if (dims == 3) + { + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && elempack == 8 && _coffset % 8 == 0 ? 8 : elempack >= 4 && _coffset % 4 == 0 ? 4 : 1; // packing granularity of the channel offset, capped at the input packing: an uncapped 8 with a pack4 input and pack4 output matched no pipeline below and left it null + + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { // fp16-packed images: pack1 elements are 4 bytes, pack4/pack8 use 2 bytes per lane + if (out_elempack == 8) out_elemsize = 8*2u; + if (out_elempack == 4) out_elemsize = 4*2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + // unpacking + VkImageMat bottom_blob_unpacked = bottom_blob; + if (elempack == out_elempack && elempack > offset_elempack) + { // the channel offset is not aligned to the input packing: repack the input down to the offset packing (workspace image) + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + const Layer* packing = offset_elempack == 4 ?
packing_pack4 : packing_pack1; + packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); + } + + VkImageMat& top_blob = top_blobs[0]; + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); // _outc is an unpacked channel count + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = 0;//bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; // overwritten with a packed offset for the pack4 / pack8 pipelines below + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline =
pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/crop_vulkan.h b/src/layer/vulkan/crop_vulkan.h index 3f2a32a21..3cebe0013 100644 --- a/src/layer/vulkan/crop_vulkan.h +++ b/src/layer/vulkan/crop_vulkan.h @@ -32,6 +32,10 @@ public: virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + public: ncnn::Layer* packing_pack1; ncnn::Layer* packing_pack4; diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 08978bb68..6146caa55 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Deconvolution_vulkan) Deconvolution_vulkan::Deconvolution_vulkan() { support_vulkan = true; + support_image_storage = true; crop = 0; output_pad = 0; @@ -130,7 +131,22 @@ int Deconvolution_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ?
4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -316,6 +332,21 @@ int Deconvolution_vulkan::destroy_pipeline(const Option& opt) int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { + if (crop) + { + crop->upload_model(cmd, opt); + } + + if (output_pad) + { + output_pad->upload_model(cmd, opt); + } + + if (output_crop) + { + output_crop->upload_model(cmd, opt); + } + const int maxk = kernel_w * kernel_h; int num_input = weight_data_size / maxk / num_output; @@ -376,14 +407,32 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } } - cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } if (bias_term) { Mat bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + else if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), bias_data_gpu_image, opt); } return 0; @@ -582,4 +631,197 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC return 0; } +int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ // VkImageMat overload: records the deconvolution pipeline into cmd, then applies output padding and/or cropping to produce top_blob; returns 0, or -100 when an image cannot be created + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w
+ kernel_extent_w; + int outh = (h - 1) * stride_h + kernel_extent_h; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) + { // fp16-packed images: pack1 elements are 4 bytes, pack4/pack8 use 2 bytes per lane + if (out_elempack == 8) out_elemsize = 8*2u; + if (out_elempack == 4) out_elemsize = 4*2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkImageMat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { // a pad or crop pass follows, so the raw result is only an intermediate: take it from the workspace allocator + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + } + else + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (top_blob_bordered.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob; + bindings[1] = top_blob_bordered; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image;// TODO use dummy buffer + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob_bordered.dims; + constants[6].i = top_blob_bordered.w; + constants[7].i = top_blob_bordered.h; + constants[8].i = top_blob_bordered.c; + constants[9].i = 0;//top_blob_bordered.cstep; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_deconvolution; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_deconvolution_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_deconvolution_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_deconvolution_pack4to1; + } + else if (elempack == 8
&& out_elempack == 8) + { + pipeline = pipeline_deconvolution_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_deconvolution_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_deconvolution_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_deconvolution_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_deconvolution_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered); + + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { // explicit pads: optional output padding, then crop the pads away using a shape-only reference blob + VkImageMat top_blob_bordered_adj = top_blob_bordered; + if (output_pad_right > 0 || output_pad_bottom > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); + if (top_blob_bordered_adj.empty()) + return -100; + } + + { + VkImageMat reference_blob; // only dims / w / h / elempack are filled in; no image is created for it + reference_blob.dims = 2; + reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right; + reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom; + reference_blob.elempack = 1; + + std::vector crop_bottom_blobs(2); + crop_bottom_blobs[0] = top_blob_bordered_adj; + crop_bottom_blobs[1] = reference_blob; + std::vector crop_top_blobs(1); + crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt); + top_blob = crop_top_blobs[0]; + } + if (top_blob.empty()) + return -100; + + outw = top_blob.w; + outh = top_blob.h; + } + else if (output_w > 0 && output_h > 0) + { // requested output size: optional output padding, then crop down to output_w x output_h + VkImageMat top_blob_bordered_adj = top_blob_bordered; + if (output_pad_right > 0 || output_pad_bottom > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); + if (top_blob_bordered_adj.empty()) + return -100; + } + + int wcut = top_blob_bordered_adj.w - output_w; + int hcut = top_blob_bordered_adj.h -
output_h; + + VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); // six ints are written below: w/h/c offsets followed by output w/h/c + int* crop_params = crop_param_blob.mapped(); + + if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) + { + // onnx padding=SAME_UPPER + crop_params[0] = wcut / 2; + crop_params[1] = hcut / 2; + crop_params[2] = 0; + crop_params[3] = top_blob_bordered_adj.w - wcut; + crop_params[4] = top_blob_bordered_adj.h - hcut; + crop_params[5] = num_output; // unpacked channel count: the crop layer divides it by its own output packing + } + else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234) + { + // onnx padding=SAME_LOWER + crop_params[0] = wcut - wcut / 2; + crop_params[1] = hcut - hcut / 2; + crop_params[2] = 0; + crop_params[3] = top_blob_bordered_adj.w - wcut; + crop_params[4] = top_blob_bordered_adj.h - hcut; + crop_params[5] = num_output; // unpacked channel count: the crop layer divides it by its own output packing + } // NOTE(review): when no pad is -233 or -234 the parameters stay unwritten — confirm that case cannot reach this branch + + std::vector crop_inputs(2); + crop_inputs[0] = top_blob_bordered_adj; + crop_inputs[1] = crop_param_blob; + + std::vector crop_outputs(1); + output_crop->forward(crop_inputs, crop_outputs, cmd, opt); + top_blob = crop_outputs[0]; + if (top_blob.empty()) + return -100; + + outw = top_blob.w; + outh = top_blob.h; + } + else + { + if (output_pad_right > 0 || output_pad_bottom > 0) + { + output_pad->forward(top_blob_bordered, top_blob, cmd, opt); + if (top_blob.empty()) + return -100; + } + else + { + top_blob = top_blob_bordered; + } + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/deconvolution_vulkan.h b/src/layer/vulkan/deconvolution_vulkan.h index 4159d7af2..21814c1b3 100644 --- a/src/layer/vulkan/deconvolution_vulkan.h +++ b/src/layer/vulkan/deconvolution_vulkan.h @@ -31,11 +31,15 @@ public: using Deconvolution::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: VkMat weight_data_gpu; VkMat bias_data_gpu; +
VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + ncnn::Layer* crop; ncnn::Layer* output_pad; ncnn::Layer* output_crop; diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp index 19095bf23..79c0a705d 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_vulkan) DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan() { support_vulkan = true; + support_image_storage = true; crop = 0; output_pad = 0; @@ -136,7 +137,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -233,7 +249,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt) size_t elemsize_g; size_t out_elemsize_g; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize_g = elempack_g * 2u; + out_elemsize_g = out_elempack_g * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u; + out_elemsize_g = out_elempack_g == 1 ? 
4u : out_elempack_g * 2u; + } + else if (opt.use_image_storage) + { + elemsize_g = elempack_g * 4u; + out_elemsize_g = out_elempack_g * 4u; + } + else if (opt.use_fp16_storage) { elemsize_g = elempack_g * 2u; out_elemsize_g = out_elempack_g * 2u; @@ -463,6 +494,31 @@ int DeconvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { + if (crop) + { + crop->upload_model(cmd, opt); + } + + if (output_pad) + { + output_pad->upload_model(cmd, opt); + } + + if (output_crop) + { + output_crop->upload_model(cmd, opt); + } + + if (packing_unpack) + { + packing_unpack->upload_model(cmd, opt); + } + + if (packing_pack) + { + packing_pack->upload_model(cmd, opt); + } + const int maxk = kernel_w * kernel_h; int channels = (weight_data_size / group) / maxk / (num_output / group) * group; @@ -495,12 +551,25 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o cmd.record_upload(weight_data_r2_packed, weight_data_gpu, opt); + cmd.record_upload(weight_data_r2_packed, weight_data_gpu_image, opt); + if (bias_term) { Mat bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + else if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), bias_data_gpu_image, opt); } return 0; @@ -558,14 +627,32 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o } } - cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); + } if (bias_term) { Mat bias_data_packed; 
convert_packing(bias_data, bias_data_packed, out_elempack_g);
 
-        cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
+        if (opt.use_image_storage)
+        {
+            cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
+        }
+        else
+        {
+            cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
+        }
+    }
+    else if (opt.use_image_storage)
+    {
+        cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
     }
 
     return 0;
@@ -936,4 +1023,375 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
     return 0;
 }
 
+// Image-storage variant of the depth-wise / grouped deconvolution forward pass.
+// Uses the depth-wise pipelines when channels, group and num_output line up per
+// element pack; otherwise unpacks to the per-group packing, runs the group
+// pipeline and packs the result back. The bordered result is then cropped and/or
+// padded according to pad_*, output_pad_* and output_w / output_h.
+// Returns 0 on success, or -100 when an image checked along the way is empty.
+int DeconvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    int outw = (w - 1) * stride_w + kernel_extent_w;
+    int outh = (h - 1) * stride_h + kernel_extent_h;
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    size_t out_elemsize = elemsize / elempack * out_elempack;
+
+    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+    {
+        if (out_elempack == 8) out_elemsize = 8*2u;
+        if (out_elempack == 4) out_elemsize = 4*2u;
+        if (out_elempack == 1) out_elemsize = 4u;
+    }
+
+    VkImageMat top_blob_bordered;
+    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
+    {
+        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
+    }
+    else
+    {
+        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    }
+    if (top_blob_bordered.empty())
+        return -100;
+
+    // depth-wise
+    if (channels == group / elempack && group / elempack == num_output / elempack)
+    {
+        std::vector<VkImageMat> bindings(4);
+        bindings[0] = bottom_blob;
+        bindings[1] = top_blob_bordered;
+        bindings[2] = weight_data_gpu_image;
+        bindings[3] = bias_data_gpu_image;// TODO use dummy buffer
+
+        std::vector<vk_constant_type> constants(10);
+        constants[0].i = bottom_blob.dims;
+        constants[1].i = bottom_blob.w;
+        constants[2].i = bottom_blob.h;
+        constants[3].i = bottom_blob.c;
+        constants[4].i = 0;//bottom_blob.cstep;
+        constants[5].i = top_blob_bordered.dims;
+        constants[6].i = top_blob_bordered.w;
+        constants[7].i = top_blob_bordered.h;
+        constants[8].i = top_blob_bordered.c;
+        constants[9].i = 0;//top_blob_bordered.cstep;
+
+        const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8
+                                 : elempack == 4 ? pipeline_deconvolutiondepthwise_pack4
+                                 : pipeline_deconvolutiondepthwise;
+
+        // record
+        cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered);
+
+        if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
+        {
+            VkImageMat top_blob_bordered_adj = top_blob_bordered;
+            if (output_pad_right > 0 || output_pad_bottom > 0)
+            {
+                Option opt_pad = opt;
+                opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+                output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
+                if (top_blob_bordered_adj.empty())
+                    return -100;
+            }
+
+            {
+                VkImageMat reference_blob;
+                reference_blob.dims = 2;
+                reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right;
+                reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom;
+                reference_blob.elempack = 1;
+
+                std::vector<VkImageMat> crop_bottom_blobs(2);
+                crop_bottom_blobs[0] = top_blob_bordered_adj;
+                crop_bottom_blobs[1] = reference_blob;
+                std::vector<VkImageMat> crop_top_blobs(1);
+                crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
+                top_blob = crop_top_blobs[0];
+            }
+            if (top_blob.empty())
+                return -100;
+
+            outw = top_blob.w;
+            outh = top_blob.h;
+        }
+        else if (output_w > 0 && output_h > 0)
+        {
+            VkImageMat top_blob_bordered_adj = top_blob_bordered;
+            if (output_pad_right > 0 || output_pad_bottom > 0)
+            {
+                Option opt_pad = opt;
+                opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+                output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
+                if (top_blob_bordered_adj.empty())
+                    return -100;
+            }
+
+            int wcut = top_blob_bordered_adj.w - output_w;
+            int hcut = top_blob_bordered_adj.h - output_h;
+
+            VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); // 6 ints: crop_params[0..5] are written below
+            int* crop_params = crop_param_blob.mapped();
+
+            if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
+            {
+                // onnx padding=SAME_UPPER
+                crop_params[0] = wcut / 2;
+                crop_params[1] = hcut / 2;
+                crop_params[2] = 0;
+                crop_params[3] = top_blob_bordered_adj.w - wcut;
+                crop_params[4] = top_blob_bordered_adj.h - hcut;
+                crop_params[5] = top_blob_bordered_adj.c;
+            }
+            else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
+            {
+                // onnx padding=SAME_LOWER
+                crop_params[0] = wcut - wcut / 2;
+                crop_params[1] = hcut - hcut / 2;
+                crop_params[2] = 0;
+                crop_params[3] = top_blob_bordered_adj.w - wcut;
+                crop_params[4] = top_blob_bordered_adj.h - hcut;
+                crop_params[5] = top_blob_bordered_adj.c;
+            }
+
+            std::vector<VkImageMat> crop_inputs(2);
+            crop_inputs[0] = top_blob_bordered_adj;
+            crop_inputs[1] = crop_param_blob;
+
+            std::vector<VkImageMat> crop_outputs(1);
+            output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
+            top_blob = crop_outputs[0];
+            if (top_blob.empty())
+                return -100;
+
+            outw = top_blob.w;
+            outh = top_blob.h;
+        }
+        else
+        {
+            if (output_pad_right > 0 || output_pad_bottom > 0)
+            {
+                output_pad->forward(top_blob_bordered, top_blob, cmd, opt);
+                if (top_blob.empty())
+                    return -100;
+            }
+            else
+            {
+                top_blob = top_blob_bordered;
+            }
+        }
+
+        return 0;
+    }
+
+    const int channels_g = channels * elempack / group;
+    const int num_output_g = num_output / group;
+
+    int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
+    int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
+    size_t out_elemsize_g = elemsize / elempack * out_elempack_g;
+
+    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+    {
+        if (out_elempack_g == 8) out_elemsize_g = 8*2u;
+        if (out_elempack_g == 4) out_elemsize_g = 4*2u;
+        if (out_elempack_g == 1) out_elemsize_g = 4u;
+    }
+
+    // unpacking
+    VkImageMat bottom_blob_unpacked = bottom_blob;
+    if (elempack > elempack_g)
+    {
+        Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+        packing_unpack->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
+    }
+
+    VkImageMat top_blob_unpacked = top_blob_bordered;
+    if (out_elempack_g < out_elempack)
+    {
+        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
+        if (top_blob_unpacked.empty())
+            return -100;
+    }
+
+    std::vector<VkImageMat> bindings(4);
+    bindings[0] = bottom_blob_unpacked;
+    bindings[1] = top_blob_unpacked;
+    bindings[2] = weight_data_gpu_image;
+    bindings[3] = bias_data_gpu_image;// TODO use dummy buffer
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob_unpacked.dims;
+    constants[1].i = bottom_blob_unpacked.w;
+    constants[2].i = bottom_blob_unpacked.h;
+    constants[3].i = bottom_blob_unpacked.c;
+    constants[4].i = 0;//bottom_blob_unpacked.cstep;
+    constants[5].i = top_blob_unpacked.dims;
+    constants[6].i = top_blob_unpacked.w;
+    constants[7].i = top_blob_unpacked.h;
+    constants[8].i = top_blob_unpacked.c;
+    constants[9].i = 0;//top_blob_unpacked.cstep;
+
+    const Pipeline* pipeline = 0;
+    if (elempack_g == 1 && out_elempack_g == 1)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group;
+    }
+    else if (elempack_g == 4 && out_elempack_g == 4)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack4;
+    }
+    else if (elempack_g == 1 && out_elempack_g == 4)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
+    }
+    else if (elempack_g == 4 && out_elempack_g == 1)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
+    }
+    else if (elempack_g == 8 && out_elempack_g == 8)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack8;
+    }
+    else if (elempack_g == 1 && out_elempack_g == 8)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack1to8;
+    }
+    else if (elempack_g == 4 && out_elempack_g == 8)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack4to8;
+    }
+    else if (elempack_g == 8 && out_elempack_g == 4)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack8to4;
+    }
+    else if (elempack_g == 8 && out_elempack_g == 1)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack8to1;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
+
+    // packing
+    if (out_elempack_g < out_elempack)
+    {
+        packing_pack->forward(top_blob_unpacked, top_blob_bordered, cmd, opt);
+    }
+    else
+    {
+        top_blob_bordered = top_blob_unpacked;
+    }
+
+    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
+    {
+        VkImageMat top_blob_bordered_adj = top_blob_bordered;
+        if (output_pad_right > 0 || output_pad_bottom > 0)
+        {
+            Option opt_pad = opt;
+            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+            output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
+            if (top_blob_bordered_adj.empty())
+                return -100;
+        }
+
+        {
+            VkImageMat reference_blob;
+            reference_blob.dims = 2;
+            reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right;
+            reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom;
+            reference_blob.elempack = 1;
+
+            std::vector<VkImageMat> crop_bottom_blobs(2);
+            crop_bottom_blobs[0] = top_blob_bordered_adj;
+            crop_bottom_blobs[1] = reference_blob;
+            std::vector<VkImageMat> crop_top_blobs(1);
+            crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt);
+            top_blob = crop_top_blobs[0];
+        }
+        if (top_blob.empty())
+            return -100;
+
+        outw = top_blob.w;
+        outh = top_blob.h;
+    }
+    else if (output_w > 0 && output_h > 0)
+    {
+        VkImageMat top_blob_bordered_adj = top_blob_bordered;
+        if (output_pad_right > 0 || output_pad_bottom > 0)
+        {
+            Option opt_pad = opt;
+            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+            output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad);
+            if (top_blob_bordered_adj.empty())
+                return -100;
+        }
+
+        int wcut = top_blob_bordered_adj.w - output_w;
+        int hcut = top_blob_bordered_adj.h - output_h;
+
+        VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); // 6 ints: crop_params[0..5] are written below
+        int* crop_params = crop_param_blob.mapped();
+
+        if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
+        {
+            // onnx padding=SAME_UPPER
+            crop_params[0] = wcut / 2;
+            crop_params[1] = hcut / 2;
+            crop_params[2] = 0;
+            crop_params[3] = top_blob_bordered_adj.w - wcut;
+            crop_params[4] = top_blob_bordered_adj.h - hcut;
+            crop_params[5] = top_blob_bordered_adj.c;
+        }
+        else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
+        {
+            // onnx padding=SAME_LOWER
+            crop_params[0] = wcut - wcut / 2;
+            crop_params[1] = hcut - hcut / 2;
+            crop_params[2] = 0;
+            crop_params[3] = top_blob_bordered_adj.w - wcut;
+            crop_params[4] = top_blob_bordered_adj.h - hcut;
+            crop_params[5] = top_blob_bordered_adj.c;
+        }
+
+        std::vector<VkImageMat> crop_inputs(2);
+        crop_inputs[0] = top_blob_bordered_adj;
+        crop_inputs[1] = crop_param_blob;
+
+        std::vector<VkImageMat> crop_outputs(1);
+        output_crop->forward(crop_inputs, crop_outputs, cmd, opt);
+        top_blob = crop_outputs[0];
+        if (top_blob.empty())
+            return -100;
+
+        outw = top_blob.w;
+        outh = top_blob.h;
+    }
+    else
+    {
+        if (output_pad_right > 0 || output_pad_bottom > 0)
+        {
+            output_pad->forward(top_blob_bordered, top_blob, cmd, opt);
+            if (top_blob.empty())
+                return -100;
+        }
+        else
+        {
+            top_blob = top_blob_bordered;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
index d8b31d1e0..1eb57e67b 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
@@ -31,11 +31,15 @@ public: using DeconvolutionDepthWise::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: VkMat weight_data_gpu; VkMat bias_data_gpu; + VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + ncnn::Layer* crop; ncnn::Layer* output_pad; ncnn::Layer* output_crop; diff --git a/src/layer/vulkan/eltwise_vulkan.cpp b/src/layer/vulkan/eltwise_vulkan.cpp index 146358ba1..ebcdb3009 100644 --- a/src/layer/vulkan/eltwise_vulkan.cpp +++ b/src/layer/vulkan/eltwise_vulkan.cpp @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Eltwise_vulkan) Eltwise_vulkan::Eltwise_vulkan() { support_vulkan = true; + support_image_storage = true; pipeline_eltwise[0] = 0; pipeline_eltwise[1] = 0; @@ -42,7 +43,19 @@ int Eltwise_vulkan::create_pipeline(const Option& opt) if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; size_t elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u;
+    }
+    else if (opt.use_image_storage)
+    {
+        elemsize = elempack * 4u;
+    }
+    else if (opt.use_fp16_storage)
     {
         elemsize = elempack * 2u;
     }
@@ -207,4 +220,70 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
     return 0;
 }
 
+// Image-storage variant of the eltwise forward pass.
+// Combines bottom_blobs[0] and bottom_blobs[1] into top_blobs[0], then folds every
+// further bottom blob into top_blobs[0] with one additional dispatch per blob.
+// Returns 0 on success, -100 if the output image cannot be allocated.
+int Eltwise_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    const VkImageMat& bottom_blob = bottom_blobs[0];
+    const VkImageMat& bottom_blob1 = bottom_blobs[1];
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    VkImageMat& top_blob = top_blobs[0];
+    top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<VkImageMat> bindings(3);
+    bindings[0] = bottom_blob;
+    bindings[1] = bottom_blob1;
+    bindings[2] = top_blob;
+
+    std::vector<vk_constant_type> constants(5 + 2);
+    constants[0].i = top_blob.dims;
+    constants[1].i = top_blob.w;
+    constants[2].i = top_blob.h;
+    constants[3].i = top_blob.c;
+    constants[4].i = 0;//top_blob.cstep;
+    constants[5].f = coeffs.w == 0 ? 1.f : coeffs[0];
+    constants[6].f = coeffs.w == 0 ? 1.f : coeffs[1];
+
+    const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1]
+                             : elempack == 4 ? pipeline_eltwise_pack4[1]
+                             : pipeline_eltwise[1];
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+    for (size_t b=2; b<bottom_blobs.size(); b++)
+    {
+        std::vector<VkImageMat> bindings(3);
+        bindings[0] = top_blob;
+        bindings[1] = bottom_blobs[b];
+        bindings[2] = top_blob;// TODO use separated pipeline ?
+
+        std::vector<vk_constant_type> constants(5 + 2);
+        constants[0].i = top_blob.dims;
+        constants[1].i = top_blob.w;
+        constants[2].i = top_blob.h;
+        constants[3].i = top_blob.c;
+        constants[4].i = 0;//top_blob.cstep;
+        constants[5].f = 1.f;
+        constants[6].f = coeffs.w == 0 ? 1 : coeffs[b];
+
+        const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b%2]
+                                 : elempack == 4 ? pipeline_eltwise_pack4[b%2]
+                                 : pipeline_eltwise[b%2];
+
+        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/eltwise_vulkan.h b/src/layer/vulkan/eltwise_vulkan.h
index c97df3ffa..2516db55d 100644
--- a/src/layer/vulkan/eltwise_vulkan.h
+++ b/src/layer/vulkan/eltwise_vulkan.h
@@ -29,6 +29,7 @@ public:
     using Eltwise::forward;
 
     virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
 
 public:
     Pipeline* pipeline_eltwise[2];
diff --git a/src/layer/vulkan/flatten_vulkan.cpp b/src/layer/vulkan/flatten_vulkan.cpp
index 25e54efd0..7888339da 100644
--- a/src/layer/vulkan/flatten_vulkan.cpp
+++ b/src/layer/vulkan/flatten_vulkan.cpp
@@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Flatten_vulkan)
 Flatten_vulkan::Flatten_vulkan()
 {
     support_vulkan = true;
+    support_image_storage = true;
 
     pipeline_flatten = 0;
     pipeline_flatten_pack4 = 0;
@@ -47,7 +48,22 @@ int Flatten_vulkan::create_pipeline(const Option& opt)
 
     size_t elemsize;
     size_t out_elemsize;
-    if (opt.use_fp16_storage)
+    if (opt.use_image_storage && opt.use_image_fp16_storage)
+    {
+        elemsize = elempack * 2u;
+        out_elemsize = out_elempack * 2u;
+    }
+    else if (opt.use_image_storage && opt.use_image_fp16_packed)
+    {
+        elemsize = elempack == 1 ? 4u : elempack * 2u;
+        out_elemsize = out_elempack == 1 ?
4u : out_elempack * 2u;
+    }
+    else if (opt.use_image_storage)
+    {
+        elemsize = elempack * 4u;
+        out_elemsize = out_elempack * 4u;
+    }
+    else if (opt.use_fp16_storage)
     {
         elemsize = elempack * 2u;
         out_elemsize = out_elempack * 2u;
@@ -256,4 +272,88 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
     return 0;
 }
 
+// Image-storage variant of flatten: reshapes the input into a 1-D image holding
+// w * h * c * elempack scalars, repacked to the widest element packing that divides
+// that total (8 only when opt.use_shader_pack8 is set, otherwise 4 or 1).
+// A 1-D input is passed through unchanged. Returns 0 on success, -100 if the
+// output image cannot be allocated.
+int Flatten_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    int dims = bottom_blob.dims;
+
+    if (dims == 1)
+    {
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int total = w * h * channels * elempack;
+
+    int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1;
+    size_t out_elemsize = elemsize / elempack * out_elempack;
+
+    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+    {
+        if (out_elempack == 8) out_elemsize = 8*2u;
+        if (out_elempack == 4) out_elemsize = 4*2u;
+        if (out_elempack == 1) out_elemsize = 4u;
+    }
+
+    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<VkImageMat> bindings(2);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob.dims;
+    constants[1].i = bottom_blob.w;
+    constants[2].i = bottom_blob.h;
+    constants[3].i = bottom_blob.c;
+    constants[4].i = 0;//bottom_blob.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = 0;//top_blob.cstep;
+
+    const Pipeline* pipeline = 0;
+    if (elempack == 1 && out_elempack == 1)
+    {
+        pipeline = pipeline_flatten;
+    }
+    else if (elempack == 4 && out_elempack == 4)
+    {
+        pipeline = pipeline_flatten_pack4;
+    }
+    else if (elempack == 1 && out_elempack == 4)
+    {
+        pipeline = pipeline_flatten_pack1to4;
+    }
+    else if (elempack == 8 /*&& out_elempack == 8*/)
+    {
+        pipeline = pipeline_flatten_pack8;
+    }
+    else if (elempack == 1 && out_elempack == 8)
+    {
+        pipeline = pipeline_flatten_pack1to8;
+    }
+    else if (elempack == 4 && out_elempack == 8)
+    {
+        pipeline = pipeline_flatten_pack4to8;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h
index ec229c87f..510cab128 100644
--- a/src/layer/vulkan/flatten_vulkan.h
+++ b/src/layer/vulkan/flatten_vulkan.h
@@ -29,6 +29,7 @@ public:
     using Flatten::forward;
 
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
     Pipeline* pipeline_flatten;
diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp
index 99ef42210..08264f609 100644
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(InnerProduct_vulkan)
 InnerProduct_vulkan::InnerProduct_vulkan()
 {
     support_vulkan = true;
+    support_image_storage = true;
 
     flatten = 0;
 
@@ -72,7 +73,17 @@ int InnerProduct_vulkan::create_pipeline(const Option& opt)
 
     size_t elemsize;
     size_t out_elemsize;
-    if (opt.use_fp16_storage)
+    if (opt.use_image_storage && opt.use_image_fp16_storage)
+    {
+        elemsize = elempack * 2u;
+        out_elemsize = out_elempack * 2u;
+    }
+    else if (opt.use_image_storage)
+    {
+        elemsize = elempack * 4u;
+        out_elemsize = out_elempack * 4u;
+    }
+    else if (opt.use_fp16_storage)
     {
         elemsize = elempack * 2u;
         out_elemsize = out_elempack * 2u;
@@ -269,14 +280,32 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
         }
     }
 
-    cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
+    if (opt.use_image_storage)
+    {
+        cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt);
+    }
+    else
+    {
+        cmd.record_upload(weight_data_packed, weight_data_gpu, opt);
+    }
 
     if (bias_term)
     {
         Mat bias_data_packed;
         convert_packing(bias_data, bias_data_packed, out_elempack);
 
-        cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
+        if (opt.use_image_storage)
+        {
+            cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt);
+        }
+        else
+        {
+            cmd.record_upload(bias_data_packed, bias_data_gpu, opt);
+        }
+    }
+    else if (opt.use_image_storage)
+    {
+        cmd.record_upload(Mat(1), bias_data_gpu_image, opt);
     }
 
     return 0;
@@ -371,4 +400,101 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
     return 0;
 }
 
+// Image-storage variant of the inner-product forward pass.
+// Flattens the input through the flatten sub-layer (using the workspace allocator),
+// then dispatches the pipeline matching the input/output element packing with
+// weight_data_gpu_image and bias_data_gpu_image bound as images.
+// Returns 0 on success, the flatten error code if flattening fails, or -100 if
+// the output image cannot be allocated.
+int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    // flatten
+    VkImageMat bottom_blob_flattened = bottom_blob;
+    {
+        Option opt_flatten = opt;
+        opt_flatten.blob_vkallocator = opt.workspace_vkallocator;
+
+        int ret = flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten);
+        if (ret != 0)
+            return ret; // do not dispatch on a blob the flatten layer failed to produce
+    }
+
+    size_t elemsize = bottom_blob_flattened.elemsize;
+    int elempack = bottom_blob_flattened.elempack;
+
+    int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+    size_t out_elemsize = elemsize / elempack * out_elempack;
+
+    if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+    {
+        if (out_elempack == 8) out_elemsize = 8*2u;
+        if (out_elempack == 4) out_elemsize = 4*2u;
+        if (out_elempack == 1) out_elemsize = 4u;
+    }
+
+    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<VkImageMat> bindings(4);
+    bindings[0] = bottom_blob_flattened;
+    bindings[1] = top_blob;
+    bindings[2] = weight_data_gpu_image;
+    bindings[3] = bias_data_gpu_image;// TODO use dummy buffer
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob_flattened.dims;
+    constants[1].i = bottom_blob_flattened.w;
+    constants[2].i = bottom_blob_flattened.h;
+    constants[3].i = bottom_blob_flattened.c;
+    constants[4].i = 0;//bottom_blob_flattened.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = 0;//top_blob.cstep;
+
+    const Pipeline* pipeline = 0;
+    if (elempack == 1 && out_elempack == 1)
+    {
+        pipeline = pipeline_innerproduct;
+    }
+    else if (elempack == 4 && out_elempack == 4)
+    {
+        pipeline = pipeline_innerproduct_pack4;
+    }
+    else if (elempack == 1 && out_elempack == 4)
+    {
+        pipeline = pipeline_innerproduct_pack1to4;
+    }
+    else if (elempack == 4 && out_elempack == 1)
+    {
+        pipeline = pipeline_innerproduct_pack4to1;
+    }
+    else if (elempack == 8 && out_elempack == 8)
+    {
+        pipeline = pipeline_innerproduct_pack8;
+    }
+    else if (elempack == 1 && out_elempack == 8)
+    {
+        pipeline = pipeline_innerproduct_pack1to8;
+    }
+    else if (elempack == 4 && out_elempack == 8)
+    {
+        pipeline = pipeline_innerproduct_pack4to8;
+    }
+    else if (elempack == 8 && out_elempack == 4)
+    {
+        pipeline = pipeline_innerproduct_pack8to4;
+    }
+    else if (elempack == 8 && out_elempack == 1)
+    {
+        pipeline = pipeline_innerproduct_pack8to1;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob);
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h
index 20e730c9e..886dbd82a 100644
--- a/src/layer/vulkan/innerproduct_vulkan.h
+++ b/src/layer/vulkan/innerproduct_vulkan.h
@@ -31,6 +31,7 @@ public:
     using InnerProduct::forward;
 
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
     ncnn::Layer* flatten;
@@ -38,6 +39,9 @@ public:
     VkMat weight_data_gpu;
     VkMat bias_data_gpu;
 
+    VkImageMat weight_data_gpu_image;
+    VkImageMat bias_data_gpu_image;
+
     Pipeline* pipeline_innerproduct;
     Pipeline* pipeline_innerproduct_pack4;
     Pipeline* pipeline_innerproduct_pack1to4;
diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp
index 8df5cc70b..a1d221222 100644
--- a/src/layer/vulkan/packing_vulkan.cpp
+++ b/src/layer/vulkan/packing_vulkan.cpp
@@ -22,6 +22,7 @@ DEFINE_LAYER_CREATOR(Packing_vulkan)
 Packing_vulkan::Packing_vulkan()
 {
     support_vulkan = true;
+    support_image_storage = true;
 
     pipeline_packing_1to4 = 0;
     pipeline_packing_4to1 = 0;
@@ -37,7 +38,19 @@ int Packing_vulkan::create_pipeline(const Option& opt)
     const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
 
     size_t out_elemsize;
-    if (opt.use_fp16_storage)
+    if (opt.use_image_storage && opt.use_image_fp16_storage)
+    {
+        out_elemsize = out_elempack * 2u;
+    }
+    else if (opt.use_image_storage && opt.use_image_fp16_packed)
+    {
+        out_elemsize = out_elempack == 1 ?
4u : out_elempack * 2u;
+    }
+    else if (opt.use_image_storage)
+    {
+        out_elemsize = out_elempack * 4u;
+    }
+    else if (opt.use_fp16_storage)
     {
         out_elemsize = out_elempack * 2u;
     }
@@ -284,4 +297,138 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
     return 0;
 }
 
+// Image-storage variant of packing: converts bottom_blob to out_elempack.
+// The packed axis is w for 1-D, h for 2-D and c for 3-D inputs; its length is
+// rounded up so a partially filled last element is kept.
+// The input is returned unchanged when it already has out_elempack, or when
+// use_padding is off and the packed axis does not divide evenly.
+// Returns 0 on success, -100 if the output image cannot be allocated.
+int Packing_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    int elempack = bottom_blob.elempack;
+
+    if (elempack == out_elempack)
+    {
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+
+    if (!use_padding)
+    {
+        // identity if use_padding not allowed
+        if (dims == 1 && w * elempack % out_elempack != 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+        if (dims == 2 && h * elempack % out_elempack != 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+        if (dims == 3 && channels * elempack % out_elempack != 0)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+    }
+
+    if (dims == 1)
+    {
+        int outw = (w * elempack + out_elempack - 1) / out_elempack;
+        size_t out_elemsize = elemsize / elempack * out_elempack;
+        if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+        {
+            if (out_elempack == 8) out_elemsize = 8*2u;
+            if (out_elempack == 4) out_elemsize = 4*2u;
+            if (out_elempack == 1) out_elemsize = 4u;
+        }
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+    }
+
+    if (dims == 2)
+    {
+        int outh = (h * elempack + out_elempack - 1) / out_elempack;
+        size_t out_elemsize = elemsize / elempack * out_elempack;
+        if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+        {
+            if (out_elempack == 8) out_elemsize = 8*2u;
+            if (out_elempack == 4) out_elemsize = 4*2u;
+            if (out_elempack == 1) out_elemsize = 4u;
+        }
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+    }
+
+    if (dims == 3)
+    {
+        int outc = (channels * elempack + out_elempack - 1) / out_elempack;
+        size_t out_elemsize = elemsize / elempack * out_elempack;
+        if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage)
+        {
+            if (out_elempack == 8) out_elemsize = 8*2u;
+            if (out_elempack == 4) out_elemsize = 4*2u;
+            if (out_elempack == 1) out_elemsize = 4u;
+        }
+
+        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+    }
+
+    std::vector<VkImageMat> bindings(2);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob.dims;
+    constants[1].i = bottom_blob.w;
+    constants[2].i = bottom_blob.h;
+    constants[3].i = bottom_blob.c;
+    constants[4].i = 0;//bottom_blob.cstep;
+    constants[5].i = top_blob.dims;
+    constants[6].i = top_blob.w;
+    constants[7].i = top_blob.h;
+    constants[8].i = top_blob.c;
+    constants[9].i = 0;//top_blob.cstep;
+
+    if (elempack == 1 && out_elempack == 4)
+    {
+        cmd.record_pipeline(pipeline_packing_1to4, bindings, constants, top_blob);
+    }
+    if (elempack == 4 && out_elempack == 1)
+    {
+        cmd.record_pipeline(pipeline_packing_4to1, bindings, constants, bottom_blob);
+    }
+    if (elempack == 1 && out_elempack == 8)
+    {
+        cmd.record_pipeline(pipeline_packing_1to8, bindings, constants, top_blob);
+    }
+    if (elempack == 4 && out_elempack == 8)
+    {
+        cmd.record_pipeline(pipeline_packing_4to8, bindings, constants, top_blob);
+    }
+    if (elempack == 8 && out_elempack == 4)
+    {
+        cmd.record_pipeline(pipeline_packing_8to4, bindings, constants, bottom_blob);
+    }
+    if (elempack == 8 && out_elempack == 1)
+    {
+        cmd.record_pipeline(pipeline_packing_8to1, bindings, constants, bottom_blob);
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/vulkan/packing_vulkan.h b/src/layer/vulkan/packing_vulkan.h
index ee53186da..76daacb81 100644
--- a/src/layer/vulkan/packing_vulkan.h
+++ b/src/layer/vulkan/packing_vulkan.h
@@
-29,6 +29,7 @@ public: using Packing::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: Pipeline* pipeline_packing_1to4; diff --git a/src/layer/vulkan/padding_vulkan.cpp b/src/layer/vulkan/padding_vulkan.cpp index f614bcc0a..152ed376d 100644 --- a/src/layer/vulkan/padding_vulkan.cpp +++ b/src/layer/vulkan/padding_vulkan.cpp @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Padding_vulkan) Padding_vulkan::Padding_vulkan() { support_vulkan = true; + support_image_storage = true; pipeline_padding = 0; pipeline_padding_pack4 = 0; @@ -46,7 +47,22 @@ int Padding_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -139,14 +155,28 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt) { if (per_channel_pad_data_size == 0) + { + if (opt.use_image_storage) + { + cmd.record_upload(Mat(1), per_channel_pad_data_gpu_image, opt); + } + return 0; + } int elempack = opt.use_shader_pack8 && per_channel_pad_data_size % 8 == 0 ? 8 : per_channel_pad_data_size % 4 == 0 ? 
4 : 1; Mat per_channel_pad_data_packed; convert_packing(per_channel_pad_data, per_channel_pad_data_packed, elempack); - cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu, opt); + if (opt.use_image_storage) + { + cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu_image, opt); + } + else + { + cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu, opt); + } return 0; } @@ -271,4 +301,124 @@ int Padding_vulkan::forward(const std::vector& bottom_blobs, std::vector< return 0; } +int Padding_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // TODO vec and image padding + + int outw = w + left + right; + int outh = h + top + bottom; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = left; + constants[11].i = top; + + const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 + : elempack == 4 ? 
pipeline_padding_pack4 + : pipeline_padding; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +int Padding_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkImageMat& bottom_blob = bottom_blobs[0]; + const VkImageMat& reference_blob = bottom_blobs[1]; + + VkImageMat& top_blob = top_blobs[0]; + + int _top; + int _bottom; + int _left; + int _right; + { + const int* param_data = reference_blob.mapped(); + + _top = param_data[0]; + _bottom = param_data[1]; + _left = param_data[2]; + _right = param_data[3]; + } + + if (_top == 0 && _bottom == 0 && _left == 0 && _right == 0) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // TODO vec and image padding + + int outw = w + _left + _right; + int outh = h + _top + _bottom; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = _left; + constants[11].i = _top; + + const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 + : elempack == 4 ? 
pipeline_padding_pack4 + : pipeline_padding; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h index 171426769..8da5bc418 100644 --- a/src/layer/vulkan/padding_vulkan.h +++ b/src/layer/vulkan/padding_vulkan.h @@ -34,8 +34,13 @@ public: virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + public: VkMat per_channel_pad_data_gpu; + VkImageMat per_channel_pad_data_gpu_image; Pipeline* pipeline_padding; Pipeline* pipeline_padding_pack4; Pipeline* pipeline_padding_pack8; diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp index fd31230af..795f23757 100644 --- a/src/layer/vulkan/pooling_vulkan.cpp +++ b/src/layer/vulkan/pooling_vulkan.cpp @@ -25,6 +25,7 @@ DEFINE_LAYER_CREATOR(Pooling_vulkan) Pooling_vulkan::Pooling_vulkan() { support_vulkan = true; + support_image_storage = true; padding = 0; pipeline_pooling = 0; @@ -112,7 +113,22 @@ int Pooling_vulkan::create_pipeline(const Option& opt) size_t elemsize; size_t out_elemsize; - if (opt.use_fp16_storage) + if (opt.use_image_storage && opt.use_image_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else if (opt.use_image_storage) + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + else if (opt.use_fp16_storage) { elemsize = elempack * 2u; out_elemsize = out_elempack * 2u; @@ -277,6 +293,16 @@ int Pooling_vulkan::destroy_pipeline(const Option& opt) return 0; } +int Pooling_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (padding) + { + padding->upload_model(cmd, opt); + } + + return 0; +} + int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { int w = bottom_blob.w; @@ -447,4 +473,174 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute return 0; } +int Pooling_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = 0;//bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 + : elempack == 4 ? 
pipeline_pooling_global_pack4 + : pipeline_pooling_global; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; + } + + VkImageMat bottom_blob_bordered = bottom_blob; + + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + int wtail = (w + pad_left + pad_right - kernel_w) % stride_w; + int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h; + + if (wtail != 0) + wtailpad = stride_w - wtail; + if (htail != 0) + htailpad = stride_h - htail; + + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = pad_top; + padding_params[1] = pad_bottom + htailpad; + padding_params[2] = pad_left; + padding_params[3] = pad_right + wtailpad; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + else if (pad_mode == 1) // valid padding + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_mode == 2) // tensorflow padding=SAME or onnx padding=SAME_UPPER + { + int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad / 2; + padding_params[1] = hpad - hpad / 2; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; 
+ padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_mode == 3) // onnx padding=SAME_LOWER + { + int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = hpad - hpad / 2; + padding_params[1] = hpad / 2; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0;//bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0;//top_blob.cstep; + constants[10].i = wtailpad; + constants[11].i = htailpad; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 + : elempack == 4 ? 
pipeline_pooling_pack4 + : pipeline_pooling; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h index cebba78cf..82977da5b 100644 --- a/src/layer/vulkan/pooling_vulkan.h +++ b/src/layer/vulkan/pooling_vulkan.h @@ -27,8 +27,11 @@ public: virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + using Pooling::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; public: ncnn::Layer* padding; diff --git a/src/layer/vulkan/shader/absval.comp b/src/layer/vulkan/shader/absval.comp index 0205b0a5d..8eed3231d 100644 --- a/src/layer/vulkan/shader/absval.comp +++ b/src/layer/vulkan/shader/absval.comp @@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -52,11 +61,42 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afp v; + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } + else // if 
(psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif v = abs(v); +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st1(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/absval_pack4.comp b/src/layer/vulkan/shader/absval_pack4.comp index 412dc6a70..eeea8be9e 100644 --- a/src/layer/vulkan/shader/absval_pack4.comp +++ b/src/layer/vulkan/shader/absval_pack4.comp @@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -52,11 +61,42 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec4 v; + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif v = abs(v); +#if NCNN_image_shader + if (psc(dims) == 1) + { + 
image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st4(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/absval_pack8.comp b/src/layer/vulkan/shader/absval_pack8.comp index d9d76506a..16c2bbe90 100644 --- a/src/layer/vulkan/shader/absval_pack8.comp +++ b/src/layer/vulkan/shader/absval_pack8.comp @@ -33,7 +33,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -53,12 +62,43 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v; + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif v[0] = abs(v[0]); v[1] = abs(v[1]); +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else 
buffer_st8(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp16_to_fp32.comp b/src/layer/vulkan/shader/cast_fp16_to_fp32.comp index c3e281283..acef4bdc5 100644 --- a/src/layer/vulkan/shader/cast_fp16_to_fp32.comp +++ b/src/layer/vulkan/shader/cast_fp16_to_fp32.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, r32f) writeonly uniform highp image1D top_blob_1d; +layout (binding = 1, r32f) writeonly uniform highp image2D top_blob_2d; +layout (binding = 1, r32f) writeonly uniform highp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -65,9 +74,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; top_blob_data[gi] = float(buffer_ld1(bottom_blob_data, v_offset)); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp b/src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp index cf39748e8..1d3090bdd 100644 --- a/src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp +++ b/src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 
233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, rgba32f) writeonly uniform highp image1D top_blob_1d; +layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d; +layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -65,9 +74,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; top_blob_data[gi] = vec4(buffer_ld4(bottom_blob_data, v_offset)); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp b/src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp index 315fabaa5..90dc72085 100644 --- a/src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp +++ b/src/layer/vulkan/shader/cast_fp16_to_fp32_pack8.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, rgba32f) writeonly uniform 
highp image1D top_blob_1d; +layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d; +layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { mat2x4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,9 +75,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; top_blob_data[gi] = mat2x4(buffer_ld8(bottom_blob_data, v_offset)); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp32_to_fp16.comp b/src/layer/vulkan/shader/cast_fp32_to_fp16.comp index 146311706..5649cf259 100644 --- a/src/layer/vulkan/shader/cast_fp32_to_fp16.comp +++ b/src/layer/vulkan/shader/cast_fp32_to_fp16.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform highp sampler1D bottom_blob_1d; +layout (binding = 0) uniform highp sampler2D bottom_blob_2d; +layout (binding = 0) uniform highp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout 
(push_constant) uniform parameter { @@ -65,9 +74,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; buffer_st1(top_blob_data, gi, afp(bottom_blob_data[v_offset])); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp b/src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp index 00d860e68..86bf021d4 100644 --- a/src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp +++ b/src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform highp sampler1D bottom_blob_1d; +layout (binding = 0) uniform highp sampler2D bottom_blob_2d; +layout (binding = 0) uniform highp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -65,9 +74,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + 
} + else // if (psc(dims) == 3) + { + image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; buffer_st4(top_blob_data, gi, afpvec4(bottom_blob_data[v_offset])); +#endif } diff --git a/src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp b/src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp index 8873f440c..d1142d6b8 100644 --- a/src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp +++ b/src/layer/vulkan/shader/cast_fp32_to_fp16_pack8.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform highp sampler1D bottom_blob_1d; +layout (binding = 0) uniform highp sampler2D bottom_blob_2d; +layout (binding = 0) uniform highp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { mat2x4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,9 +75,24 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; buffer_st8(top_blob_data, gi, afpvec8(bottom_blob_data[v_offset])); +#endif } diff 
--git a/src/layer/vulkan/shader/concat.comp b/src/layer/vulkan/shader/concat.comp index e3df157db..5c904b42e 100644 --- a/src/layer/vulkan/shader/concat.comp +++ b/src/layer/vulkan/shader/concat.comp @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,6 +78,23 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -78,4 +104,5 @@ void main() int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; buffer_cp1(top_blob_data, v_offset, 
bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/concat_pack4.comp b/src/layer/vulkan/shader/concat_pack4.comp index 9a97021b0..e904aec55 100644 --- a/src/layer/vulkan/shader/concat_pack4.comp +++ b/src/layer/vulkan/shader/concat_pack4.comp @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,6 +78,23 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp4(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp4(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp4(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp4(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp4(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp4(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -78,4 +104,5 @@ void main() int v_offset = gxyz.z * psc(outcstep) + 
gxyz.y * psc(outw) + gxyz.x; buffer_cp4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/concat_pack4to1.comp b/src/layer/vulkan/shader/concat_pack4to1.comp index 6d380b823..bf69cebab 100644 --- a/src/layer/vulkan/shader/concat_pack4to1.comp +++ b/src/layer/vulkan/shader/concat_pack4to1.comp @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,6 +78,74 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec4 v = image1d_ld4(bottom_blob_1d, gx); + + int gx4 = gx * 4 + p.offset; + + image1d_st1(top_blob_1d, gx4 + 0, v.r); + image1d_st1(top_blob_1d, gx4 + 1, v.g); + image1d_st1(top_blob_1d, gx4 + 2, v.b); + image1d_st1(top_blob_1d, gx4 + 3, v.a); + } + else if (psc(dims) == 2) + { + afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy4 = gy * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 3), v.a); + } + if (axis == 1) + { + int gx4 = gx * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx4 + 0, gy), v.r); + image2d_st1(top_blob_2d, 
ivec2(gx4 + 1, gy), v.g); + image2d_st1(top_blob_2d, ivec2(gx4 + 2, gy), v.b); + image2d_st1(top_blob_2d, ivec2(gx4 + 3, gy), v.a); + } + } + else // if (psc(dims) == 3) + { + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz4 = gz * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 3), v.a); + } + if (axis == 1) + { + int gy4 = gy * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 0, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 1, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 2, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 3, gz), v.a); + } + if (axis == 2) + { + int gx4 = gx * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx4 + 0, gy, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx4 + 1, gy, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx4 + 2, gy, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx4 + 3, gy, gz), v.a); + } + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -83,4 +160,5 @@ void main() ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/concat_pack8.comp b/src/layer/vulkan/shader/concat_pack8.comp index 4429a7faa..6353705a5 100644 --- a/src/layer/vulkan/shader/concat_pack8.comp +++ b/src/layer/vulkan/shader/concat_pack8.comp @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D 
top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -70,6 +79,23 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp8(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp8(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp8(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp8(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp8(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp8(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -79,4 +105,5 @@ void main() int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; buffer_cp8(top_blob_data, v_offset, bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/concat_pack8to1.comp b/src/layer/vulkan/shader/concat_pack8to1.comp index a47fcc3c0..ffeedd8c9 100644 --- a/src/layer/vulkan/shader/concat_pack8to1.comp +++ b/src/layer/vulkan/shader/concat_pack8to1.comp @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D 
bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -70,6 +79,98 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx8 = gx * 8 + p.offset; + + image1d_st1(top_blob_1d, gx8 + 0, v[0].r); + image1d_st1(top_blob_1d, gx8 + 1, v[0].g); + image1d_st1(top_blob_1d, gx8 + 2, v[0].b); + image1d_st1(top_blob_1d, gx8 + 3, v[0].a); + image1d_st1(top_blob_1d, gx8 + 4, v[1].r); + image1d_st1(top_blob_1d, gx8 + 5, v[1].g); + image1d_st1(top_blob_1d, gx8 + 6, v[1].b); + image1d_st1(top_blob_1d, gx8 + 7, v[1].a); + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy8 = gy * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 7), v[1].a); + } + if (axis == 1) + { + int gx8 = gx * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx8 + 0, gy), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx8 + 1, gy), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 2, gy), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 3, gy), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx8 + 4, gy), v[1].r); + image2d_st1(top_blob_2d, 
ivec2(gx8 + 5, gy), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 6, gy), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 7, gy), v[1].a); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz8 = gz * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 7), v[1].a); + } + if (axis == 1) + { + int gy8 = gy * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 0, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 1, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 2, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 3, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 4, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 5, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 6, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 7, gz), v[1].a); + } + if (axis == 2) + { + int gx8 = gx * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx8 + 0, gy, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 1, gy, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 2, gy, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 3, gy, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx8 + 4, gy, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 5, gy, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 6, gy, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 7, gy, gz), v[1].a); + } + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -85,4 +186,5 @@ void main() ivec4 
vv_offset = v_offset + 4 * gxyz4[psc(dims) - 1 - axis]; buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/concat_pack8to4.comp b/src/layer/vulkan/shader/concat_pack8to4.comp index c045ae6ba..6890e0f14 100644 --- a/src/layer/vulkan/shader/concat_pack8to4.comp +++ b/src/layer/vulkan/shader/concat_pack8to4.comp @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -70,6 +79,63 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx2 = gx * 2 + p.offset; + + image1d_st4(top_blob_1d, gx2 + 0, v[0]); + image1d_st4(top_blob_1d, gx2 + 1, v[1]); + + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy2 = gy * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 1), v[1]); + } + if (axis == 1) + { + int gx2 = gx * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx2 + 0, gy), v[0]); + image2d_st4(top_blob_2d, ivec2(gx2 + 1, gy), v[1]); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis 
== 0) + { + int gz2 = gz * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 1), v[1]); + } + if (axis == 1) + { + int gy2 = gy * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 0, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 1, gz), v[1]); + } + if (axis == 2) + { + int gx2 = gx * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx2 + 0, gy, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx2 + 1, gy, gz), v[1]); + } + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; ivec3 gxyz = ivec3(gx, gy, gz); @@ -84,4 +150,5 @@ void main() ivec2 v_offset = v_offset_0 + ivec2(0, 1) * gxyz4[psc(dims) - 1 - axis]; buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif } diff --git a/src/layer/vulkan/shader/convolution.comp b/src/layer/vulkan/shader/convolution.comp index f7a38be1e..b2655da75 100644 --- a/src/layer/vulkan/shader/convolution.comp +++ b/src/layer/vulkan/shader/convolution.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,13 +89,39 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { sum = afp(0.f); } +#if 
NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -106,6 +139,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -127,7 +161,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_1x1s1d1.comp index 1fa28abbc..f3fbded03 100644 --- a/src/layer/vulkan/shader/convolution_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_1x1s1d1.comp @@ -21,26 +21,40 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float 
activation_param_1 = 0; + +#define shape_constant_id_offset 10 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int size_4 = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep_4 = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outsize_4 = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep_4 = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; @@ -50,40 +64,67 @@ layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #endif 
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { int dims; - int size_4; + int w; + int h; int c; - int cstep_4; + int cstep; int outdims; - int outsize_4; + int outw; + int outh; int outc; - int outcstep_4; + int outcstep; } p; void main() { +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; +#else int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outsize_4) || gy >= 1 || gz >= psc(outc)) + if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) return; +#endif afpvec4 sum; if (bias_term == 1) { +#if NCNN_image_shader + sum = afpvec4(image1d_ld1(bias_blob, gz)); +#else sum = afpvec4(buffer_ld1(bias_data, gz)); +#endif } else { sum = afpvec4(0.f); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); + + sum.r += k * image3d_ld1(bottom_blob, ivec3(gx, gy, z)); + sum.g += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy, z)); + sum.b += k * image3d_ld1(bottom_blob, ivec3(gx, gy+1, z)); + sum.a += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy+1, z)); + } +#else // NCNN_image_shader int w_offset = gz * psc(c); int v_offset = gx; @@ -96,8 +137,9 @@ void main() #endif w_offset += 1; - v_offset += psc(cstep_4); + v_offset += psc(cstep) / 4; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -119,11 +161,18 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } - const int gi = gz * psc(outcstep_4) + gx; +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum.r); + image3d_st1(top_blob, ivec3(gx+1, gy, gz), sum.g); + image3d_st1(top_blob, ivec3(gx, gy+1, gz), sum.b); + 
image3d_st1(top_blob, ivec3(gx+1, gy+1, gz), sum.a); +#else + const int gi = gz * (psc(outcstep) / 4) + gx; /* top_blob_data is an array of 4-wide vectors and gx counts vec4 elements, so the scalar channel stride outcstep is converted to vec4 units, mirroring the cstep / 4 input stride */ #if NCNN_fp16_packed top_blob_data[gi] = sum; #else buffer_st4(top_blob_data, gi, sum); #endif +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack1to4.comp b/src/layer/vulkan/shader/convolution_pack1to4.comp index 9b16f638f..cd62c247e 100644 --- a/src/layer/vulkan/shader/convolution_pack1to4.comp +++ b/src/layer/vulkan/shader/convolution_pack1to4.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,13 +89,43 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { sum = afpvec4(0.f); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -110,6 +147,7
@@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -131,7 +169,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack1to8.comp b/src/layer/vulkan/shader/convolution_pack1to8.comp index efd8ed6ea..661c31695 100644 --- a/src/layer/vulkan/shader/convolution_pack1to8.comp +++ b/src/layer/vulkan/shader/convolution_pack1to8.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,13 +90,45 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; 
+ sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -113,6 +152,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -138,7 +178,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4.comp b/src/layer/vulkan/shader/convolution_pack4.comp index 81f8dbe40..677fe56f1 100644 --- a/src/layer/vulkan/shader/convolution_pack4.comp +++ b/src/layer/vulkan/shader/convolution_pack4.comp @@ -49,6 +49,12 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -58,6 +64,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -87,13 +94,48 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { sum = afpvec4(0.f); } +#if 
NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -125,6 +167,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -146,7 +189,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp index 147333ed2..5b95a71db 100644 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp @@ -21,26 +21,40 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const 
int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int size = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 
bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -50,28 +64,40 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { int dims; - int size; + int w; + int h; int c; int cstep; int outdims; - int outsize; + int outw; + int outh; int outc; int outcstep; } p; void main() { +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; +#else int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) return; +#endif afpvec4 sum0; afpvec4 sum1; @@ -80,7 +106,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + afpvec4 b = image1d_ld4(bias_blob, gz); +#else afpvec4 b = buffer_ld4(bias_data, gz); +#endif sum0 = b; sum1 = b; sum2 = b; @@ -94,6 +124,27 @@ void main() sum3 = afpvec4(0.f); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(gx, gy, z)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(gx+1, gy, z)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(gx, gy+1, z)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(gx+1, gy+1, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(0, z, gz)), + image3d_ld4(weight_blob, ivec3(1, z, gz)), + image3d_ld4(weight_blob, ivec3(2, z, gz)), + image3d_ld4(weight_blob, ivec3(3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += 
v3 * k; + } +#else // NCNN_image_shader int w_offset = gz * psc(c); int v_offset = gx; @@ -124,6 +175,7 @@ void main() w_offset += 1; v_offset += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -157,10 +209,17 @@ void main() sum3 = afp(1.f) / (afp(1.f) + exp(-sum3)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum0); + image3d_st4(top_blob, ivec3(gx+1, gy, gz), sum1); + image3d_st4(top_blob, ivec3(gx, gy+1, gz), sum2); + image3d_st4(top_blob, ivec3(gx+1, gy+1, gz), sum3); +#else int gi = gz * psc(outcstep) + gx; buffer_st4(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); if (gx + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, sum2); if (gx + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, sum3); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp index f76cda275..6a49a3fc4 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp @@ -33,6 +33,11 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -41,6 +46,7 @@ layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; #else layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; #endif +#endif layout (push_constant) uniform 
parameter { @@ -66,6 +72,29 @@ void main() afpvec4 sum2 = afpvec4(0.f); afpvec4 sum3 = afpvec4(0.f); +#if NCNN_image_shader + int wx = gx * 4; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else int v_offset = gy * 16 + gx; int w_offset = gz * psc(c) * 16 + gx; @@ -96,11 +125,19 @@ void main() v_offset += psc(cstep); w_offset += 16; } +#endif +#if NCNN_image_shader + image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); + image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else int gi = gz * psc(outcstep) + gy * 16 + gx; buffer_st4(top_tm_blob_data, gi + 0, sum0); if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1); if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2); if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp index 606ca10ae..d23548069 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp @@ -36,8 +36,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; 
+#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -62,6 +67,30 @@ void main() return; // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec4 v00 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec4 v01 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec4 v02 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec4 v03 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec4 v10 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec4 v11 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec4 v12 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec4 v13 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec4 v20 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec4 v21 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec4 v22 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec4 v23 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec4 v30 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec4 v31 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec4 v32 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec4 v33 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); @@ -84,6 +113,7 @@ void main() afpvec4 v31 = buffer_ld4(bottom_blob_data, v_offset.a + 1); afpvec4 v32 = buffer_ld4(bottom_blob_data, v_offset.a + 2); afpvec4 v33 = buffer_ld4(bottom_blob_data, v_offset.a + 3); +#endif // const float itm[4][4] = { // {1.0f, 
0.0f, -1.0f, 0.0f}, @@ -134,6 +164,26 @@ void main() v33 = m33 - m31; // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20); + image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30); + image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33); +#else int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); @@ -152,4 +202,5 @@ void main() buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31); buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32); buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp index dee997a88..8f5ba51a6 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp @@ -41,9 +41,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D 
top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -68,6 +74,26 @@ void main() return; // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); + afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); + afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); + afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); + afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); + afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); + afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); + afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); + afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); + afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); + afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); + afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); + afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); + afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); + afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); + afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); +#else int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); @@ -86,6 +112,7 @@ void main() afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); +#endif // const float itm[2][4] = { // {1.0f, 1.0f, 1.0f, 0.0f}, @@ -105,7 +132,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + const afpvec4 bias_value = image1d_ld4(bias_blob, gz); +#else const afpvec4 
bias_value = buffer_ld4(bias_data, gz); +#endif v00 = bias_value + m00 + m01 + m02; v10 = bias_value + m10 + m11 + m12; @@ -155,6 +186,15 @@ void main() } // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st4(top_blob, ivec3(x, y, gz), v00); + image3d_st4(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st4(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st4(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; int v_offset_1 = v_offset_0 + psc(outw); @@ -162,4 +202,5 @@ void main() buffer_st4(top_blob_data, v_offset_0 + 1, v01); buffer_st4(top_blob_data, v_offset_1 + 0, v10); buffer_st4(top_blob_data, v_offset_1 + 1, v11); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4to1.comp b/src/layer/vulkan/shader/convolution_pack4to1.comp index 876cd2f9d..1427e029f 100644 --- a/src/layer/vulkan/shader/convolution_pack4to1.comp +++ b/src/layer/vulkan/shader/convolution_pack4to1.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,13 +89,43 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { sum = afp(0.f); } +#if NCNN_image_shader 
+ for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -110,6 +147,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -131,7 +169,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack4to8.comp b/src/layer/vulkan/shader/convolution_pack4to8.comp index 89320a6c2..531d27677 100644 --- a/src/layer/vulkan/shader/convolution_pack4to8.comp +++ b/src/layer/vulkan/shader/convolution_pack4to8.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,13 +90,58 @@ void main() if (bias_term == 1) { 
+#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -126,6 +178,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -151,7 +204,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8.comp b/src/layer/vulkan/shader/convolution_pack8.comp index 23911bfa4..17a7834d6 100644 --- a/src/layer/vulkan/shader/convolution_pack8.comp +++ b/src/layer/vulkan/shader/convolution_pack8.comp @@ -50,10 +50,17 @@ 
layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,13 +90,58 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); 
+ sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -126,6 +178,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -151,7 +204,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp index 4bf86950c..dce3443d2 100644 --- a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp @@ -22,52 +22,78 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; -#define shape_constant_id_offset 4 +#define 
shape_constant_id_offset 10 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int size = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // 
NCNN_image_shader layout (push_constant) uniform parameter { int dims; - int size; + int w; + int h; int c; int cstep; int outdims; - int outsize; + int outw; + int outh; int outc; int outcstep; } p; void main() { +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; +#else int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) return; +#endif afpvec8 sum0; afpvec8 sum1; @@ -76,7 +102,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + afpvec8 b = image1d_ld8(bias_blob, gz); +#else afpvec8 b = buffer_ld8(bias_data, gz); +#endif sum0 = b; sum1 = b; sum2 = b; @@ -90,6 +120,61 @@ void main() sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(gx, gy, z)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(gx+1, gy, z)); + afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(gx, gy+1, z)); + afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(gx+1, gy+1, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + 
dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * 8; int v_offset = gx; @@ -149,6 +234,7 @@ void main() w_offset += 8; v_offset += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -198,10 +284,17 @@ void main() sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum0); + image3d_st8(top_blob, ivec3(gx+1, gy, gz), sum1); + image3d_st8(top_blob, ivec3(gx, gy+1, 
gz), sum2); + image3d_st8(top_blob, ivec3(gx+1, gy+1, gz), sum3); +#else int gi = gz * psc(outcstep) + gx; buffer_st8(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); if (gx + 2 < psc(outcstep)) buffer_st8(top_blob_data, gi + 2, sum2); if (gx + 3 < psc(outcstep)) buffer_st8(top_blob_data, gi + 3, sum3); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp index 22a436074..a6176741c 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp @@ -34,9 +34,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -62,6 +68,63 @@ void main() afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); +#if NCNN_image_shader + int wx = gx * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = 
image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], 
k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else int v_offset = gy * 16 + gx; int w_offset = (gz * psc(c) * 16 + gx) * 8; @@ -121,11 +184,19 @@ void main() v_offset += psc(cstep); w_offset += 16 * 8; } +#endif +#if NCNN_image_shader + image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); + image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else int gi = gz * psc(outcstep) + gy * 16 + gx; buffer_st8(top_tm_blob_data, gi + 0, sum0); if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1); if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2); if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp index 973339571..18b6d382a 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp @@ -37,8 +37,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -63,6 +68,30 @@ void main() return; // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec8 v00 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec8 v01 = image3d_ld8(bottom_blob, 
ivec3(sx + 1, sy + 0, gz)); + afpvec8 v02 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec8 v03 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec8 v10 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec8 v11 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec8 v12 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec8 v13 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec8 v20 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec8 v21 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec8 v22 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec8 v23 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec8 v30 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec8 v31 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec8 v32 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec8 v33 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); @@ -85,6 +114,7 @@ void main() afpvec8 v31 = buffer_ld8(bottom_blob_data, v_offset.a + 1); afpvec8 v32 = buffer_ld8(bottom_blob_data, v_offset.a + 2); afpvec8 v33 = buffer_ld8(bottom_blob_data, v_offset.a + 3); +#endif // const float itm[4][4] = { // {1.0f, 0.0f, -1.0f, 0.0f}, @@ -135,6 +165,26 @@ void main() v33 = m33 - m31; // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20); 
+ image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30); + image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33); +#else int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); @@ -153,4 +203,5 @@ void main() buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31); buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32); buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp index c07ad41d9..deacb636f 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp @@ -42,9 +42,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,6 +75,26 @@ void main() return; // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); + afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); + afpvec8 v02 = image3d_ld8(top_tm_blob, 
ivec3(2, sy, gz)); + afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); + afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); + afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); + afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); + afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); + afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); + afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); + afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); + afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); + afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); + afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); + afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); + afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); +#else int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); @@ -87,6 +113,7 @@ void main() afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); +#endif // const float itm[2][4] = { // {1.0f, 1.0f, 1.0f, 0.0f}, @@ -106,7 +133,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + const afpvec8 bias_value = image1d_ld8(bias_blob, gz); +#else const afpvec8 bias_value = buffer_ld8(bias_data, gz); +#endif v00 = bias_value + m00 + m01 + m02; v10 = bias_value + m10 + m11 + m12; @@ -172,6 +203,15 @@ void main() } // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st8(top_blob, ivec3(x, y, gz), v00); + image3d_st8(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st8(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st8(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; int v_offset_1 = v_offset_0 + psc(outw); @@ -179,4 +219,5 @@ void main() buffer_st8(top_blob_data, v_offset_0 + 1, 
v01); buffer_st8(top_blob_data, v_offset_1 + 0, v10); buffer_st8(top_blob_data, v_offset_1 + 1, v11); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8to1.comp b/src/layer/vulkan/shader/convolution_pack8to1.comp index b13afd8f9..22fbaad24 100644 --- a/src/layer/vulkan/shader/convolution_pack8to1.comp +++ b/src/layer/vulkan/shader/convolution_pack8to1.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,13 +90,44 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { sum = afp(0.f); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -112,6 +150,7 @@ void main() w_offset += kernel_w; } } +#endif 
// NCNN_image_shader if (activation_type == 1) { @@ -133,7 +172,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolution_pack8to4.comp b/src/layer/vulkan/shader/convolution_pack8to4.comp index b85a816e0..60b6a2fa9 100644 --- a/src/layer/vulkan/shader/convolution_pack8to4.comp +++ b/src/layer/vulkan/shader/convolution_pack8to4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,13 +90,50 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { sum = afpvec4(0.f); } +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = 
image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else // NCNN_image_shader int w_offset = gz * psc(c) * kernel_w * kernel_h; for (int z = 0; z < psc(c); z++) @@ -118,6 +162,7 @@ void main() w_offset += kernel_w; } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -139,7 +184,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise.comp b/src/layer/vulkan/shader/convolutiondepthwise.comp index 1c3b115f1..6e5079aa4 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + 
sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -91,6 +102,25 @@ void main() } // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else // NCNN_image_shader int w_offset = gz * kernel_w * kernel_h; int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; @@ -104,6 +134,7 @@ void main() v_offset += dilation_h * psc(w); w_offset += kernel_w; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -125,7 +156,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group.comp b/src/layer/vulkan/shader/convolutiondepthwise_group.comp index 888de9b88..a73a80299 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer 
bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -97,6 +108,32 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -117,6 +154,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -138,7 +176,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp index 7d971a5b2..8fd8d7836 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; 
+#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -97,6 +108,36 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -121,6 +162,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -142,7 +184,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp index 6f8febedf..1fa212978 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout 
(local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -98,6 +109,38 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -124,6 +167,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -149,7 +193,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } 
diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp index a8dba5229..1a1f40724 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp @@ -50,6 +50,12 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -59,6 +65,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -88,7 +95,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -102,6 +113,41 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 
2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -136,6 +182,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -157,7 +204,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp index 51326e53e..9298121d3 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -97,6 +108,36 @@ void main() // group id 
const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -121,6 +162,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -142,7 +184,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp index 1f1f3dd73..2cbd9e808 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding 
= 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -98,6 +109,51 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -137,6 +193,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -162,7 +219,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); 
+#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp index b8ab54adf..24215e516 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -98,6 +109,51 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + 
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -137,6 +193,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -162,7 +219,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp index 33e29f360..6714b8c46 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { 
sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -98,6 +109,37 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -123,6 +165,7 @@ void main() v_offset_0 += psc(cstep); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -144,7 +187,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp index 85b25a203..e3b9abe94 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; 
layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -98,6 +109,43 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else // NCNN_image_shader int w_offset = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -129,6 +177,7 @@ void main() v_offset_0 += psc(cstep); } 
+#endif // NCNN_image_shader if (activation_type == 1) { @@ -150,7 +199,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp b/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp index c2f81a1b3..04b98ff6b 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -91,6 +102,29 @@ void main() } // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + 
} +#else // NCNN_image_shader int w_offset = gz * kernel_w * kernel_h; int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; @@ -108,6 +142,7 @@ void main() v_offset += dilation_h * psc(w); w_offset += kernel_w; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -129,7 +164,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_pack8.comp b/src/layer/vulkan/shader/convolutiondepthwise_pack8.comp index 6a9466f1e..199e6484e 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_pack8.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_pack8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -92,6 +103,31 @@ void main() } // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + 
for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz)); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else // NCNN_image_shader int w_offset = gz * kernel_w * kernel_h; int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; @@ -111,6 +147,7 @@ void main() v_offset += dilation_h * psc(w); w_offset += kernel_w; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -136,7 +173,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/crop.comp b/src/layer/vulkan/shader/crop.comp index 690f27e02..9e19a2ad6 100644 --- a/src/layer/vulkan/shader/crop.comp +++ b/src/layer/vulkan/shader/crop.comp @@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,13 +74,17 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz + p.coffset; +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + int v_offset = z * psc(cstep) + y * psc(w) + x; buffer_cp1(top_blob_data, gi, 
bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/crop_pack1to4.comp b/src/layer/vulkan/shader/crop_pack1to4.comp index dd4a1e463..5d54c9aac 100644 --- a/src/layer/vulkan/shader/crop_pack1to4.comp +++ b/src/layer/vulkan/shader/crop_pack1to4.comp @@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,12 +74,23 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz * 4 + p.coffset; + +#if NCNN_image_shader + afpvec4 v; + v.r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v.g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v.b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v.a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/crop_pack1to8.comp b/src/layer/vulkan/shader/crop_pack1to8.comp index fd2828a2e..449df006f 100644 --- a/src/layer/vulkan/shader/crop_pack1to8.comp +++ b/src/layer/vulkan/shader/crop_pack1to8.comp @@ -39,8 +39,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform 
unfp image3D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -70,13 +75,28 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz * 8 + p.coffset; + +#if NCNN_image_shader + afpvec8 v; + v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 4)); + v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 5)); + v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 6)); + v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 7)); + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); ivec4 vv_offset = v_offset + 4 * psc(cstep); buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif } diff --git a/src/layer/vulkan/shader/crop_pack4.comp b/src/layer/vulkan/shader/crop_pack4.comp index 6b223d15c..bfa985820 100644 --- a/src/layer/vulkan/shader/crop_pack4.comp +++ b/src/layer/vulkan/shader/crop_pack4.comp @@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ 
-69,13 +74,17 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz + p.coffset; +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + int v_offset = z * psc(cstep) + y * psc(w) + x; buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/crop_pack4to1.comp b/src/layer/vulkan/shader/crop_pack4to1.comp index a65a856d1..efae1b939 100644 --- a/src/layer/vulkan/shader/crop_pack4to1.comp +++ b/src/layer/vulkan/shader/crop_pack4to1.comp @@ -38,12 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,12 +78,17 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz + p.coffset; +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z / 4)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + #if NCNN_fp16_packed int v_offset = ((z / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z % 4) / 2; int lane2 = z % 2; @@ -91,4 +101,5 @@ void main() buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); #endif +#endif } diff 
--git a/src/layer/vulkan/shader/crop_pack4to8.comp b/src/layer/vulkan/shader/crop_pack4to8.comp index b66f238e9..5635208a1 100644 --- a/src/layer/vulkan/shader/crop_pack4to8.comp +++ b/src/layer/vulkan/shader/crop_pack4to8.comp @@ -39,12 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -81,6 +86,28 @@ void main() ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3); ivec4 zz4 = z4 + 4; +#if NCNN_image_shader + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4)); + + afpvec8 v; + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else #if NCNN_fp16_packed ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2; ivec4 lane2 = z4 % 2; @@ -106,4 +133,5 @@ void main() buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); #endif +#endif } diff 
--git a/src/layer/vulkan/shader/crop_pack8.comp b/src/layer/vulkan/shader/crop_pack8.comp index b148f84d8..27a55f57f 100644 --- a/src/layer/vulkan/shader/crop_pack8.comp +++ b/src/layer/vulkan/shader/crop_pack8.comp @@ -39,8 +39,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -70,13 +75,17 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz + p.coffset; +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + int v_offset = z * psc(cstep) + y * psc(w) + x; buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/crop_pack8to1.comp b/src/layer/vulkan/shader/crop_pack8to1.comp index 85ffc515b..f14523716 100644 --- a/src/layer/vulkan/shader/crop_pack8to1.comp +++ b/src/layer/vulkan/shader/crop_pack8to1.comp @@ -39,12 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfp 
top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -74,12 +79,17 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - int x = gx + p.woffset; int y = gy + p.hoffset; int z = gz + p.coffset; +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, z / 8)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[(z % 8) / 4][z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + #if NCNN_fp16_packed int v_offset = ((z / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z % 8) / 2; int lane2 = z % 2; @@ -92,4 +102,5 @@ void main() buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); #endif +#endif } diff --git a/src/layer/vulkan/shader/crop_pack8to4.comp b/src/layer/vulkan/shader/crop_pack8to4.comp index 52bbe03e7..c3b82b5ea 100644 --- a/src/layer/vulkan/shader/crop_pack8to4.comp +++ b/src/layer/vulkan/shader/crop_pack8to4.comp @@ -39,12 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -80,6 +85,20 @@ void main() int y = gy + p.hoffset; ivec4 z4 = gz * 4 + p.coffset + ivec4(0, 1, 2, 3); +#if NCNN_image_shader + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y, z4.a / 8)); + + afpvec4 v; + v.r = v0[(z4.r % 8) / 
4][z4.r % 4]; + v.g = v1[(z4.g % 8) / 4][z4.g % 4]; + v.b = v2[(z4.b % 8) / 4][z4.b % 4]; + v.a = v3[(z4.a % 8) / 4][z4.a % 4]; + + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else #if NCNN_fp16_packed ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z4 % 8) / 2; ivec4 lane2 = z4 % 2; @@ -97,4 +116,5 @@ void main() buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); #endif +#endif } diff --git a/src/layer/vulkan/shader/deconvolution.comp b/src/layer/vulkan/shader/deconvolution.comp index 34146b4b9..ca4ac47bf 100644 --- a/src/layer/vulkan/shader/deconvolution.comp +++ b/src/layer/vulkan/shader/deconvolution.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,7 +89,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -92,6 +103,36 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for 
(int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + for (int z = 0; z < psc(c); z++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -126,6 +167,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -147,7 +189,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack1to4.comp b/src/layer/vulkan/shader/deconvolution_pack1to4.comp index d0472366c..389f1423a 100644 --- a/src/layer/vulkan/shader/deconvolution_pack1to4.comp +++ b/src/layer/vulkan/shader/deconvolution_pack1to4.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,7 +89,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + 
sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -92,6 +103,40 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + for (int z = 0; z < psc(c); z++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -130,6 +175,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -151,7 +197,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack1to8.comp b/src/layer/vulkan/shader/deconvolution_pack1to8.comp index 73ff5b48f..3c30a71b0 100644 --- a/src/layer/vulkan/shader/deconvolution_pack1to8.comp +++ b/src/layer/vulkan/shader/deconvolution_pack1to8.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 
3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -93,6 +104,42 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + for (int z = 0; z < psc(c); z++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -133,6 +180,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -158,7 +206,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git 
a/src/layer/vulkan/shader/deconvolution_pack4.comp b/src/layer/vulkan/shader/deconvolution_pack4.comp index 0499208e5..d0010b75d 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4.comp @@ -49,6 +49,12 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -58,6 +64,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -87,7 +94,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -97,6 +108,45 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = 
(y * kernel_w + x) * 4; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -145,6 +195,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -166,7 +217,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack4to1.comp b/src/layer/vulkan/shader/deconvolution_pack4to1.comp index 5ce9c1a67..ebf1c3bf6 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4to1.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4to1.comp @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -82,7 +89,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = 
image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -92,6 +103,40 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -130,6 +175,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -151,7 +197,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack4to8.comp b/src/layer/vulkan/shader/deconvolution_pack4to8.comp index 9dbfd9e94..0003b4424 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4to8.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4to8.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding 
= 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -93,6 +104,55 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = (y * kernel_w + x) * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += 
dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -146,6 +206,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -171,7 +232,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack8.comp b/src/layer/vulkan/shader/deconvolution_pack8.comp index 233ec1ace..129a6b2f4 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -93,6 +104,55 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y 
< kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = (y * kernel_w + x) * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -146,6 +206,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -171,7 +232,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git 
a/src/layer/vulkan/shader/deconvolution_pack8to1.comp b/src/layer/vulkan/shader/deconvolution_pack8to1.comp index 78ecee2f0..fd23edaf5 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8to1.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8to1.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -93,6 +104,41 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, 
ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -132,6 +178,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -153,7 +200,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolution_pack8to4.comp b/src/layer/vulkan/shader/deconvolution_pack8to4.comp index 9527cae94..5b924961b 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8to4.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8to4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -93,6 +104,47 @@ void main() const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; +#if NCNN_image_shader + for (int y = 
0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = (y * kernel_w + x) * 4; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } + } + } +#else // NCNN_image_shader int w_offset_0 = gz * psc(c) * kernel_w * kernel_h; for (int y = 0; y < kernel_h; y++) @@ -138,6 +190,7 @@ void main() } } } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -159,7 +212,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise.comp b/src/layer/vulkan/shader/deconvolutiondepthwise.comp index 6dc43d754..036b7de73 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) 
writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -94,6 +105,33 @@ void main() const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; // depth-wise deconvolution +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + } + } +#else int v_offset_0 = gz * psc(cstep); int w_offset_0 = gz * kernel_w * kernel_h; @@ -123,6 +161,7 @@ void main() sum += buffer_ld1(weight_data, w_offset) * buffer_ld1(bottom_blob_data, v_offset); } } +#endif if (activation_type == 1) { @@ -144,7 +183,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp 
b/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp index 9ce9b8569..8ed752704 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -100,6 +111,39 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = y * kernel_w + x; + + for (int z = 0; z < channels_g; z++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); 
@@ -135,6 +179,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -156,7 +201,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp index 74eb92968..08395c23d 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -100,6 +111,43 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); 
+ if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = y * kernel_w + x; + + for (int z = 0; z < channels_g; z++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -139,6 +187,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -160,7 +209,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp index 67a9756c9..c43f2ef0b 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = 
image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -101,6 +112,45 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = y * kernel_w + x; + + for (int z = 0; z < channels_g; z++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -142,6 +192,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -167,7 +218,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp index 622223696..7c05b3d67 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp @@ -50,6 +50,12 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp 
sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -59,6 +65,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -88,7 +95,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -105,6 +116,48 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = (y * kernel_w + x) * 4; + + for (int z = 0; z < channels_g; z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -154,6 +207,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -175,7 +229,11 @@ void main() sum = afp(1.f) / (afp(1.f) + 
exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp index e81a5aa90..619a093a6 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -100,6 +111,43 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * 
channels_g; + int wx = y * kernel_w + x; + + for (int z = 0; z < channels_g; z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -139,6 +187,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -160,7 +209,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp index b5bcc492f..08b03f513 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -101,6 +112,58 @@ void main() // group id 
const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = (y * kernel_w + x) * 8; + + for (int z = 0; z < channels_g; z++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -155,6 +218,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -180,7 +244,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp 
b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp index 2fcae2b0e..b0d012e04 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -101,6 +112,58 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = (y * kernel_w + x) * 8; + + for (int z = 0; z < channels_g; z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + 
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -155,6 +218,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -180,7 +244,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp index 525d78d3d..b7f21bc9b 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; 
+#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else sum = buffer_ld1(bias_data, gz); +#endif } else { @@ -101,6 +112,44 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = y * kernel_w + x; + + for (int z = 0; z < channels_g; z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -141,6 +190,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -162,7 +212,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp index ec19a65af..a91772a23 100644 --- 
a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -101,6 +112,50 @@ void main() // group id const int gg = gz / num_output_g; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int sz = gg * channels_g; + int wx = (y * kernel_w + x) * 4; + + for (int z = 0; z < channels_g; z++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, 
ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sz += 1; + } + } + } +#else int w_offset_0 = gz * channels_g * kernel_w * kernel_h; int v_offset_0 = gg * channels_g * psc(cstep); @@ -147,6 +202,7 @@ void main() } } } +#endif if (activation_type == 1) { @@ -168,7 +224,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp index 6b3fec1ab..8fbf30151 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,7 +90,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else sum = buffer_ld4(bias_data, gz); +#endif } else { @@ -94,6 +105,37 @@ void main() const int kernel_extent_h = dilation_h * 
(kernel_h - 1) + 1; // depth-wise deconvolution +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz)); + + sum += v * k; + } + } +#else int v_offset_0 = gz * psc(cstep); int w_offset_0 = gz * kernel_w * kernel_h; @@ -127,6 +169,7 @@ void main() sum += v * k; } } +#endif if (activation_type == 1) { @@ -148,7 +191,11 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp index 1a99e18fd..965cf6531 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { 
sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -84,7 +91,11 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else sum = buffer_ld8(bias_data, gz); +#endif } else { @@ -95,6 +106,39 @@ void main() const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; // depth-wise deconvolution +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= psc(h)) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= psc(w)) + continue; + + int wx = y * kernel_w + x; + + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz)); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + } + } +#else int v_offset_0 = gz * psc(cstep); int w_offset_0 = gz * kernel_w * kernel_h; @@ -130,6 +174,7 @@ void main() sum[1] += v[1] * k[1]; } } +#endif if (activation_type == 1) { @@ -155,7 +200,11 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, sum); +#endif } diff --git a/src/layer/vulkan/shader/eltwise.comp b/src/layer/vulkan/shader/eltwise.comp index 69b025d48..addb1bfb0 100644 --- a/src/layer/vulkan/shader/eltwise.comp +++ b/src/layer/vulkan/shader/eltwise.comp @@ -35,9 +35,21 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) 
uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; layout (binding = 1) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; layout (binding = 2) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -60,10 +72,30 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afp v1; + afp v2; + if (psc(dims) == 1) + { + v1 = image1d_ld1(bottom_blob1_1d, gx); + v2 = image1d_ld1(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld1(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld1(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld1(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld1(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afp v1 = buffer_ld1(bottom_blob1_data, gi); afp v2 = buffer_ld1(bottom_blob2_data, gi); +#endif afp res; @@ -90,5 +122,20 @@ void main() res = max(v1, v2); } +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else buffer_st1(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/eltwise_pack4.comp b/src/layer/vulkan/shader/eltwise_pack4.comp index e6b6d8ff5..c93d1000b 
100644 --- a/src/layer/vulkan/shader/eltwise_pack4.comp +++ b/src/layer/vulkan/shader/eltwise_pack4.comp @@ -35,9 +35,21 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob1 { sfpvec4 bottom_blob1_data[]; }; layout (binding = 1) readonly buffer bottom_blob2 { sfpvec4 bottom_blob2_data[]; }; layout (binding = 2) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -60,10 +72,30 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec4 v1; + afpvec4 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld4(bottom_blob1_1d, gx); + v2 = image1d_ld4(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld4(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld4(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld4(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld4(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec4 v1 = buffer_ld4(bottom_blob1_data, gi); afpvec4 v2 = buffer_ld4(bottom_blob2_data, gi); +#endif afpvec4 res; @@ -90,5 +122,20 @@ void main() res = max(v1, v2); } +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, 
gx, res); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else buffer_st4(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/eltwise_pack8.comp b/src/layer/vulkan/shader/eltwise_pack8.comp index 430230e53..5f767b82f 100644 --- a/src/layer/vulkan/shader/eltwise_pack8.comp +++ b/src/layer/vulkan/shader/eltwise_pack8.comp @@ -36,9 +36,21 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob1 { sfpvec8 bottom_blob1_data[]; }; layout (binding = 1) readonly buffer bottom_blob2 { sfpvec8 bottom_blob2_data[]; }; layout (binding = 2) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -61,10 +73,30 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v1; + afpvec8 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld8(bottom_blob1_1d, gx); + v2 = image1d_ld8(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld8(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld8(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld8(bottom_blob1_3d, ivec3(gx, gy, 
gz)); + v2 = image3d_ld8(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec8 v1 = buffer_ld8(bottom_blob1_data, gi); afpvec8 v2 = buffer_ld8(bottom_blob2_data, gi); +#endif afpvec8 res; @@ -109,5 +141,20 @@ void main() } } +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else buffer_st8(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/flatten.comp b/src/layer/vulkan/shader/flatten.comp index 683d1d779..8cc137789 100644 --- a/src/layer/vulkan/shader/flatten.comp +++ b/src/layer/vulkan/shader/flatten.comp @@ -38,8 +38,14 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -71,7 +77,22 @@ void main() int y = gx % size / psc(w); int x = gx % size % psc(w); +#if NCNN_image_shader + afp v; + + if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(x, y)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(x, y, z)); + } + + image1d_st1(top_blob, gx, v); +#else int v_offset = z * psc(cstep) + y * psc(w) + x; buffer_cp1(top_blob_data, gx, bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/flatten_pack1to4.comp b/src/layer/vulkan/shader/flatten_pack1to4.comp index 6c0fab307..b0ff244e5 100644 --- a/src/layer/vulkan/shader/flatten_pack1to4.comp +++ 
b/src/layer/vulkan/shader/flatten_pack1to4.comp @@ -38,8 +38,14 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -67,6 +73,35 @@ void main() ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + } + + image1d_st4(top_blob, gx, v); +#else ivec4 v_offset; if (psc(dims) == 2) @@ -88,4 +123,5 @@ void main() } buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); +#endif } diff --git a/src/layer/vulkan/shader/flatten_pack1to8.comp b/src/layer/vulkan/shader/flatten_pack1to8.comp index 512979079..38f3f89d3 100644 --- a/src/layer/vulkan/shader/flatten_pack1to8.comp +++ b/src/layer/vulkan/shader/flatten_pack1to8.comp @@ -39,8 +39,14 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if 
NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -69,6 +75,50 @@ void main() ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); ivec4 ii4 = i4 + 4; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, yy4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a)); + } + + image1d_st8(top_blob, gx, 
v); +#else ivec4 v_offset; ivec4 vv_offset; @@ -100,4 +150,5 @@ void main() } buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif } diff --git a/src/layer/vulkan/shader/flatten_pack4.comp b/src/layer/vulkan/shader/flatten_pack4.comp index 9d952f28e..a6827efd4 100644 --- a/src/layer/vulkan/shader/flatten_pack4.comp +++ b/src/layer/vulkan/shader/flatten_pack4.comp @@ -38,12 +38,18 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -71,6 +77,45 @@ void main() ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + v.r = v0[y4.r % 4]; + v.g = v1[y4.g % 4]; + v.b = v2[y4.b % 4]; + v.a = v3[y4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + 
afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + v.r = v0[z4.r % 4]; + v.g = v1[z4.g % 4]; + v.b = v2[z4.b % 4]; + v.a = v3[z4.a % 4]; + } + + image1d_st4(top_blob, gx, v); +#else #if NCNN_fp16_packed ivec4 v_offset; ivec4 lane2; @@ -126,4 +171,5 @@ void main() buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); #endif +#endif } diff --git a/src/layer/vulkan/shader/flatten_pack4to8.comp b/src/layer/vulkan/shader/flatten_pack4to8.comp index c07dd3c38..8dfaf3b15 100644 --- a/src/layer/vulkan/shader/flatten_pack4to8.comp +++ b/src/layer/vulkan/shader/flatten_pack4to8.comp @@ -39,12 +39,18 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,6 +79,66 @@ void main() ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); ivec4 ii4 = i4 + 4; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, 
ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else #if NCNN_fp16_packed ivec4 v_offset; ivec4 lane4; @@ -152,4 +218,5 @@ void main() buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); #endif +#endif } diff --git a/src/layer/vulkan/shader/flatten_pack8.comp b/src/layer/vulkan/shader/flatten_pack8.comp index dbd4d6ca6..01a06f451 100644 --- a/src/layer/vulkan/shader/flatten_pack8.comp +++ b/src/layer/vulkan/shader/flatten_pack8.comp @@ -39,12 +39,18 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp 
sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else #if NCNN_fp16_packed layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; #endif layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,6 +79,66 @@ void main() ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); ivec4 ii4 = i4 + 4; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = 
image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else #if NCNN_fp16_packed ivec4 v_offset; ivec4 lane4; @@ -152,4 +218,5 @@ void main() buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); #endif +#endif } diff --git a/src/layer/vulkan/shader/innerproduct.comp b/src/layer/vulkan/shader/innerproduct.comp index b71d0282c..bc2cf298c 100644 --- a/src/layer/vulkan/shader/innerproduct.comp +++ b/src/layer/vulkan/shader/innerproduct.comp @@ -43,10 +43,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp 
bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -76,19 +83,30 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else sum = buffer_ld1(bias_data, gx); +#endif } else { sum = afp(0.f); } +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + sum += image2d_ld1(weight_blob, ivec2(i, gx)) * image1d_ld1(bottom_blob, i); + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) { sum += buffer_ld1(weight_data, w_offset + i) * buffer_ld1(bottom_blob_data, i); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -110,5 +128,9 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else buffer_st1(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack1to4.comp b/src/layer/vulkan/shader/innerproduct_pack1to4.comp index a6ede8bad..cd911b264 100644 --- a/src/layer/vulkan/shader/innerproduct_pack1to4.comp +++ b/src/layer/vulkan/shader/innerproduct_pack1to4.comp @@ -43,10 +43,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -76,13 +83,27 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = 
image1d_ld4(bias_blob, gx); +#else sum = buffer_ld4(bias_data, gx); +#endif } else { sum = afpvec4(0.f); } +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += v * k; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -93,6 +114,7 @@ void main() sum += v * k; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -114,5 +136,9 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else buffer_st4(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack1to8.comp b/src/layer/vulkan/shader/innerproduct_pack1to8.comp index 30d2b89e8..52ac57d7d 100644 --- a/src/layer/vulkan/shader/innerproduct_pack1to8.comp +++ b/src/layer/vulkan/shader/innerproduct_pack1to8.comp @@ -44,10 +44,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -77,13 +84,31 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else sum = buffer_ld8(bias_data, gx); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i 
< psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -96,6 +121,7 @@ void main() sum[0] += v * k[0]; sum[1] += v * k[1]; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -121,5 +147,9 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else buffer_st8(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack4.comp b/src/layer/vulkan/shader/innerproduct_pack4.comp index c6c6606d4..84b5ab6af 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4.comp @@ -43,6 +43,12 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) @@ -52,6 +58,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; #endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -81,13 +88,35 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else sum = buffer_ld4(bias_data, gx); +#endif } else { sum = 
afpvec4(0.f); } +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + afpmat4 k = afpmat4( + image2d_ld4(weight_blob, ivec2(wx + 0, gx)), + image2d_ld4(weight_blob, ivec2(wx + 1, gx)), + image2d_ld4(weight_blob, ivec2(wx + 2, gx)), + image2d_ld4(weight_blob, ivec2(wx + 3, gx)) + ); + + sum += v * k; + + wx += 4; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -108,6 +137,7 @@ void main() sum += v * k; } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -129,5 +159,9 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else buffer_st4(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack4to1.comp b/src/layer/vulkan/shader/innerproduct_pack4to1.comp index ed4263e05..ade1bed47 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4to1.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4to1.comp @@ -43,10 +43,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -76,13 +83,27 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else sum = buffer_ld1(bias_data, gx); +#endif } else { sum = 
afp(0.f); } +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += dot(v, k); + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -93,6 +114,7 @@ void main() sum += dot(v, k); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -114,5 +136,9 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else buffer_st1(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack4to8.comp b/src/layer/vulkan/shader/innerproduct_pack4to8.comp index c7b51f514..f174fe724 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4to8.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4to8.comp @@ -44,10 +44,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -77,13 +84,46 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else sum = buffer_ld8(bias_data, gx); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k0 = 
image2d_ld4(weight_blob, ivec2(wx + 0, gx)); + afpvec4 k1 = image2d_ld4(weight_blob, ivec2(wx + 1, gx)); + afpvec4 k2 = image2d_ld4(weight_blob, ivec2(wx + 2, gx)); + afpvec4 k3 = image2d_ld4(weight_blob, ivec2(wx + 3, gx)); + afpvec4 k4 = image2d_ld4(weight_blob, ivec2(wx + 4, gx)); + afpvec4 k5 = image2d_ld4(weight_blob, ivec2(wx + 5, gx)); + afpvec4 k6 = image2d_ld4(weight_blob, ivec2(wx + 6, gx)); + afpvec4 k7 = image2d_ld4(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + wx += 8; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -109,6 +149,7 @@ void main() sum[1].b += dot(v, k6); sum[1].a += dot(v, k7); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -134,5 +175,9 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else buffer_st8(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack8.comp b/src/layer/vulkan/shader/innerproduct_pack8.comp index f2489a2d1..c13229dd8 100644 --- a/src/layer/vulkan/shader/innerproduct_pack8.comp +++ b/src/layer/vulkan/shader/innerproduct_pack8.comp @@ -44,10 +44,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer 
weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -77,13 +84,46 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else sum = buffer_ld8(bias_data, gx); +#endif } else { sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + afpvec8 k4 = image2d_ld8(weight_blob, ivec2(wx + 4, gx)); + afpvec8 k5 = image2d_ld8(weight_blob, ivec2(wx + 5, gx)); + afpvec8 k6 = image2d_ld8(weight_blob, ivec2(wx + 6, gx)); + afpvec8 k7 = image2d_ld8(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + wx += 8; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -109,6 +149,7 @@ void main() sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -134,5 +175,9 @@ void main() sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); } +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else buffer_st8(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack8to1.comp 
b/src/layer/vulkan/shader/innerproduct_pack8to1.comp index 94f2fc0bd..bd6b8c93d 100644 --- a/src/layer/vulkan/shader/innerproduct_pack8to1.comp +++ b/src/layer/vulkan/shader/innerproduct_pack8to1.comp @@ -44,10 +44,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -77,13 +84,28 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else sum = buffer_ld1(bias_data, gx); +#endif } else { sum = afp(0.f); } +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -95,6 +117,7 @@ void main() // sum += dot(v, k); sum += dot(v[0], k[0]) + dot(v[1], k[1]); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -116,5 +139,9 @@ void main() sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else buffer_st1(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/innerproduct_pack8to4.comp b/src/layer/vulkan/shader/innerproduct_pack8to4.comp index 59b5f2016..1024c09f7 100644 --- 
a/src/layer/vulkan/shader/innerproduct_pack8to4.comp +++ b/src/layer/vulkan/shader/innerproduct_pack8to4.comp @@ -44,10 +44,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -77,13 +84,38 @@ void main() if (bias_term == 1) { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else sum = buffer_ld4(bias_data, gx); +#endif } else { sum = afpvec4(0.f); } +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + wx += 4; + } +#else // NCNN_image_shader int w_offset = gx * psc(w); for (int i = 0; i < psc(w); i++) @@ -101,6 +133,7 @@ void main() sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); } +#endif // NCNN_image_shader if (activation_type == 1) { @@ -122,5 +155,9 @@ void main() 
sum = afp(1.f) / (afp(1.f) + exp(-sum)); } +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else buffer_st4(top_blob_data, gx, sum); +#endif } diff --git a/src/layer/vulkan/shader/packing_1to4.comp b/src/layer/vulkan/shader/packing_1to4.comp index 63136cb41..0bd03c3cb 100644 --- a/src/layer/vulkan/shader/packing_1to4.comp +++ b/src/layer/vulkan/shader/packing_1to4.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -65,6 +74,43 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + int x4 = gx * 4; + + v.r = image1d_ld1(bottom_blob_1d, x4 + 0); + v.g = image1d_ld1(bottom_blob_1d, x4 + 1); + v.b = image1d_ld1(bottom_blob_1d, x4 + 2); + v.a = image1d_ld1(bottom_blob_1d, x4 + 3); + + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + int y4 = gy * 4; + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + int z4 = gz * 4; + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + 
v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else // NCNN_image_shader ivec4 v_offset; if (psc(dims) == 1) @@ -89,4 +135,5 @@ void main() int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/packing_1to8.comp b/src/layer/vulkan/shader/packing_1to8.comp index 750a2f69e..526496ccf 100644 --- a/src/layer/vulkan/shader/packing_1to8.comp +++ b/src/layer/vulkan/shader/packing_1to8.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,6 +75,55 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + int x4 = gx * 8; + + v[0].r = image1d_ld1(bottom_blob_1d, x4 + 0); + v[0].g = image1d_ld1(bottom_blob_1d, x4 + 1); + v[0].b = image1d_ld1(bottom_blob_1d, x4 + 2); + v[0].a = image1d_ld1(bottom_blob_1d, x4 + 3); + v[1].r = image1d_ld1(bottom_blob_1d, x4 + 4); + v[1].g = image1d_ld1(bottom_blob_1d, x4 + 5); + v[1].b = image1d_ld1(bottom_blob_1d, x4 + 6); + v[1].a = 
image1d_ld1(bottom_blob_1d, x4 + 7); + + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + int y4 = gy * 8; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 4)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 5)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 6)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 7)); + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + int z4 = gz * 8; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 4)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 5)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 6)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 7)); + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else // NCNN_image_shader ivec4 v_offset; ivec4 vv_offset; @@ -94,4 +152,5 @@ void main() int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/packing_4to1.comp b/src/layer/vulkan/shader/packing_4to1.comp index ee81a0cd9..1313052ae 100644 --- a/src/layer/vulkan/shader/packing_4to1.comp +++ b/src/layer/vulkan/shader/packing_4to1.comp @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D 
bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -65,6 +74,43 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + + int x4 = gx * 4; + + image1d_st1(top_blob_1d, x4 + 0, v.r); + image1d_st1(top_blob_1d, x4 + 1, v.g); + image1d_st1(top_blob_1d, x4 + 2, v.b); + image1d_st1(top_blob_1d, x4 + 3, v.a); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + int y4 = gy * 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v.a); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + int z4 = gz * 4; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v.a); + } +#else // NCNN_image_shader ivec4 v_offset; if (psc(dims) == 1) @@ -89,4 +135,5 @@ void main() int gi = gz * psc(cstep) + gy * psc(w) + gx; buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/packing_4to8.comp b/src/layer/vulkan/shader/packing_4to8.comp index 9db8d1dfc..43aaa1cde 100644 --- a/src/layer/vulkan/shader/packing_4to8.comp +++ 
b/src/layer/vulkan/shader/packing_4to8.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,6 +75,37 @@ void main() if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) return; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d, x2 + 1); + + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 1)); + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 1)); + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else // NCNN_image_shader ivec2 v_offset; if (psc(dims) == 1) @@ -90,4 +130,5 @@ void main() int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_cp4to8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/packing_8to1.comp b/src/layer/vulkan/shader/packing_8to1.comp index fa2f45978..9f1fe77f6 100644 --- 
a/src/layer/vulkan/shader/packing_8to1.comp +++ b/src/layer/vulkan/shader/packing_8to1.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,6 +75,55 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + + int x4 = gx * 8; + + image1d_st1(top_blob_1d, x4 + 0, v[0].r); + image1d_st1(top_blob_1d, x4 + 1, v[0].g); + image1d_st1(top_blob_1d, x4 + 2, v[0].b); + image1d_st1(top_blob_1d, x4 + 3, v[0].a); + image1d_st1(top_blob_1d, x4 + 4, v[1].r); + image1d_st1(top_blob_1d, x4 + 5, v[1].g); + image1d_st1(top_blob_1d, x4 + 6, v[1].b); + image1d_st1(top_blob_1d, x4 + 7, v[1].a); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + int y4 = gy * 8; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 7), v[1].a); + } + else // if 
(psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + int z4 = gz * 8; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 7), v[1].a); + } +#else // NCNN_image_shader ivec4 v_offset; ivec4 vv_offset; @@ -94,4 +152,5 @@ void main() int gi = gz * psc(cstep) + gy * psc(w) + gx; buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/packing_8to4.comp b/src/layer/vulkan/shader/packing_8to4.comp index 6d2d2af50..cea711f05 100644 --- a/src/layer/vulkan/shader/packing_8to4.comp +++ b/src/layer/vulkan/shader/packing_8to4.comp @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -66,6 +75,37 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v = 
image1d_ld8(bottom_blob_1d, gx); + + int x2 = gx * 2; + + image1d_st4(top_blob_1d, x2 + 0, v[0]); + image1d_st4(top_blob_1d, x2 + 1, v[1]); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + int y2 = gy * 2; + + image2d_st4(top_blob_2d, ivec2(gx, y2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, y2 + 1), v[1]); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + int z2 = gz * 2; + + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 1), v[1]); + } +#else // NCNN_image_shader ivec2 v_offset; if (psc(dims) == 1) @@ -90,4 +130,5 @@ void main() int gi = gz * psc(cstep) + gy * psc(w) + gx; buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif // NCNN_image_shader } diff --git a/src/layer/vulkan/shader/padding.comp b/src/layer/vulkan/shader/padding.comp index 226344741..c2c8aab9d 100644 --- a/src/layer/vulkan/shader/padding.comp +++ b/src/layer/vulkan/shader/padding.comp @@ -42,9 +42,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; layout (binding = 2) readonly buffer per_channel_pad_blob { sfp per_channel_pad_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -82,35 +88,56 @@ void main() { if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) { +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } else if 
(per_channel_pad == 1) { +#if NCNN_image_shader + afp v = image1d_ld1(per_channel_pad_blob, gz); + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); +#endif } else { afp v = afp(value); +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else buffer_st1(top_blob_data, gi, v); +#endif } } - else if (type == 1) + if (type == 1) { x = clamp(x, 0, psc(w) - 1); y = clamp(y, 0, psc(h) - 1); +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } - else if (type == 2) + if (type == 2) { x = abs(x); y = abs(y); x = (psc(w) - 1) - abs(x - (psc(w) - 1)); y = (psc(h) - 1) - abs(y - (psc(h) - 1)); +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } } diff --git a/src/layer/vulkan/shader/padding_pack4.comp b/src/layer/vulkan/shader/padding_pack4.comp index b9ff22c06..6b7f8962e 100644 --- a/src/layer/vulkan/shader/padding_pack4.comp +++ b/src/layer/vulkan/shader/padding_pack4.comp @@ -42,9 +42,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -82,35 +88,56 @@ void main() { if 
(x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) { +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } else if (per_channel_pad == 1) { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(per_channel_pad_blob, gz); + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); +#endif } else { afpvec4 v = afpvec4(value); +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else buffer_st4(top_blob_data, gi, v); +#endif } } - else if (type == 1) + if (type == 1) { x = clamp(x, 0, psc(w) - 1); y = clamp(y, 0, psc(h) - 1); +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } - else if (type == 2) + if (type == 2) { x = abs(x); y = abs(y); x = (psc(w) - 1) - abs(x - (psc(w) - 1)); y = (psc(h) - 1) - abs(y - (psc(h) - 1)); +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } } diff --git a/src/layer/vulkan/shader/padding_pack8.comp b/src/layer/vulkan/shader/padding_pack8.comp index 911b074e0..aa25c7026 100644 --- a/src/layer/vulkan/shader/padding_pack8.comp +++ b/src/layer/vulkan/shader/padding_pack8.comp @@ -43,9 +43,15 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else layout (binding = 0) readonly buffer bottom_blob { 
sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec8 per_channel_pad_blob_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -83,35 +89,56 @@ void main() { if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) { +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } else if (per_channel_pad == 1) { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(per_channel_pad_blob, gz); + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else buffer_cp8(top_blob_data, gi, per_channel_pad_blob_data, gz); +#endif } else { afpvec8 v = afpvec8(afpvec4(value), afpvec4(value)); +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else buffer_st8(top_blob_data, gi, v); +#endif } } - else if (type == 1) + if (type == 1) { x = clamp(x, 0, psc(w) - 1); y = clamp(y, 0, psc(h) - 1); +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } - else if (type == 2) + if (type == 2) { x = abs(x); y = abs(y); x = (psc(w) - 1) - abs(x - (psc(w) - 1)); y = (psc(h) - 1) - abs(y - (psc(h) - 1)); +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else int v_offset = gz * psc(cstep) + y * psc(w) + x; buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif } } diff --git a/src/layer/vulkan/shader/pooling.comp b/src/layer/vulkan/shader/pooling.comp index d78631b9e..bc0215ff6 100644 --- a/src/layer/vulkan/shader/pooling.comp +++ b/src/layer/vulkan/shader/pooling.comp @@ -53,8 +53,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout 
(local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -89,6 +94,19 @@ void main() { res = afp(-FLT_MAX); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -101,16 +119,39 @@ void main() v_offset += psc(w); } +#endif } - else if (pooling_type == 1 && avgpool_count_include_pad == 0) + if (pooling_type == 1 && avgpool_count_include_pad == 0) { res = afp(0.f); + int area = 0; int sx = gx * stride_w; int sy = gy * stride_h; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - int area = 0; for (int y = 0; y < kernel_h; y++) { @@ -139,13 +180,26 @@ void main() v_offset += psc(w); } +#endif res /= afp(area); } - else if (pooling_type == 1 && avgpool_count_include_pad == 1) + if (pooling_type == 1 && avgpool_count_include_pad == 1) { res = afp(0.f); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) 
+ { + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -157,11 +211,16 @@ void main() v_offset += psc(w); } +#endif res /= afp(kernel_w * kernel_h); } +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), res); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st1(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/pooling_global.comp b/src/layer/vulkan/shader/pooling_global.comp index cbb0de452..07471c0f3 100644 --- a/src/layer/vulkan/shader/pooling_global.comp +++ b/src/layer/vulkan/shader/pooling_global.comp @@ -42,8 +42,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -66,7 +71,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= 1 || gz >= 1) return; int size = psc(w) * psc(h); @@ -78,23 +83,48 @@ void main() { res = afp(-FLT_MAX); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else for (int i = 0; i < size; i++) { afp v = buffer_ld1(bottom_blob_data, v_offset + i); res = max(res, v); } +#endif } - else if (pooling_type == 1) + if (pooling_type == 1) { res = afp(0.f); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; 
x < psc(w); x++) + { + res += image3d_ld1(bottom_blob, ivec3(x, y, gx)); + } + } +#else for (int i = 0; i < size; i++) { res += buffer_ld1(bottom_blob_data, v_offset + i); } +#endif res /= afp(size); } +#if NCNN_image_shader + image1d_st1(top_blob, gx, res); +#else buffer_st1(top_blob_data, gx, res); +#endif } diff --git a/src/layer/vulkan/shader/pooling_global_pack4.comp b/src/layer/vulkan/shader/pooling_global_pack4.comp index 5c1e3c44a..3cdbc52c5 100644 --- a/src/layer/vulkan/shader/pooling_global_pack4.comp +++ b/src/layer/vulkan/shader/pooling_global_pack4.comp @@ -42,8 +42,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -66,7 +71,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= 1 || gz >= 1) return; int size = psc(w) * psc(h); @@ -78,23 +83,48 @@ void main() { res = afpvec4(-FLT_MAX); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else for (int i = 0; i < size; i++) { afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); res = max(res, v); } +#endif } - else if (pooling_type == 1) + if (pooling_type == 1) { res = afpvec4(0.f); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + res += image3d_ld4(bottom_blob, ivec3(x, y, gx)); + } + } +#else for (int i = 0; i < 
size; i++) { res += buffer_ld4(bottom_blob_data, v_offset + i); } +#endif res /= afp(size); } +#if NCNN_image_shader + image1d_st4(top_blob, gx, res); +#else buffer_st4(top_blob_data, gx, res); +#endif } diff --git a/src/layer/vulkan/shader/pooling_global_pack8.comp b/src/layer/vulkan/shader/pooling_global_pack8.comp index c37a83ca0..ccbc6a504 100644 --- a/src/layer/vulkan/shader/pooling_global_pack8.comp +++ b/src/layer/vulkan/shader/pooling_global_pack8.comp @@ -43,8 +43,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -67,7 +72,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= 1 || gz >= 1) return; int size = psc(w) * psc(h); @@ -79,28 +84,56 @@ void main() { res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else for (int i = 0; i < size; i++) { afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); res[0] = max(res[0], v[0]); res[1] = max(res[1], v[1]); } +#endif } - else if (pooling_type == 1) + if (pooling_type == 1) { res = afpvec8(afpvec4(0.f), afpvec4(0.f)); +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] += 
v[0]; + res[1] += v[1]; + } + } +#else for (int i = 0; i < size; i++) { afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); res[0] += v[0]; res[1] += v[1]; } +#endif afp area = afp(size); res[0] /= area; res[1] /= area; } +#if NCNN_image_shader + image1d_st8(top_blob, gx, res); +#else buffer_st8(top_blob_data, gx, res); +#endif } diff --git a/src/layer/vulkan/shader/pooling_pack4.comp b/src/layer/vulkan/shader/pooling_pack4.comp index 63226b805..3c980a7d7 100644 --- a/src/layer/vulkan/shader/pooling_pack4.comp +++ b/src/layer/vulkan/shader/pooling_pack4.comp @@ -53,8 +53,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -89,6 +94,19 @@ void main() { res = afpvec4(-FLT_MAX); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -101,16 +119,39 @@ void main() v_offset += psc(w); } +#endif } else if (pooling_type == 1 && avgpool_count_include_pad == 0) { res = afpvec4(0.f); + int area = 0; int sx = gx * stride_w; int sy = gy * stride_h; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx 
+ x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - int area = 0; for (int y = 0; y < kernel_h; y++) { @@ -139,6 +180,7 @@ void main() v_offset += psc(w); } +#endif res /= afp(area); } @@ -146,6 +188,18 @@ void main() { res = afpvec4(0.f); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -157,11 +211,16 @@ void main() v_offset += psc(w); } +#endif res /= afp(kernel_w * kernel_h); } +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), res); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st4(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/pooling_pack8.comp b/src/layer/vulkan/shader/pooling_pack8.comp index 272dd3672..82f91cc8f 100644 --- a/src/layer/vulkan/shader/pooling_pack8.comp +++ b/src/layer/vulkan/shader/pooling_pack8.comp @@ -54,8 +54,13 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else // NCNN_image_shader layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif // NCNN_image_shader layout (push_constant) uniform parameter { @@ -90,6 +95,20 @@ void main() { res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x 
< kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -103,16 +122,41 @@ void main() v_offset += psc(w); } +#endif } else if (pooling_type == 1 && avgpool_count_include_pad == 0) { res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + int area = 0; int sx = gx * stride_w; int sy = gy * stride_h; +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + area += 1; + } + } +#else int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - int area = 0; for (int y = 0; y < kernel_h; y++) { @@ -143,6 +187,7 @@ void main() v_offset += psc(w); } +#endif res[0] /= afp(area); res[1] /= afp(area); @@ -151,6 +196,20 @@ void main() { res = afpvec8(afpvec4(0.f), afpvec4(0.f)); +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + } + } +#else int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; for (int y = 0; y < kernel_h; y++) @@ -164,13 +223,18 @@ void main() v_offset += psc(w); } +#endif afp area = afp(kernel_w * kernel_h); res[0] /= area; res[1] /= area; } +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), res); +#else const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_blob_data, gi, res); +#endif } diff --git a/src/layer/vulkan/shader/softmax_div_sum.comp 
b/src/layer/vulkan/shader/softmax_div_sum.comp index 18942a439..5db4bd4a3 100644 --- a/src/layer/vulkan/shader/softmax_div_sum.comp +++ b/src/layer/vulkan/shader/softmax_div_sum.comp @@ -40,8 +40,19 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -67,6 +78,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afp v; + afp sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + sum = image1d_ld1(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = 
image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, gi); @@ -97,8 +143,24 @@ void main() { sum = buffer_ld1(sum_workspace_data, gz * psc(h) + gy); } +#endif v /= sum; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st1(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_div_sum_pack4.comp b/src/layer/vulkan/shader/softmax_div_sum_pack4.comp index 5f133fee0..361dda87e 100644 --- a/src/layer/vulkan/shader/softmax_div_sum_pack4.comp +++ b/src/layer/vulkan/shader/softmax_div_sum_pack4.comp @@ -40,8 +40,19 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -67,6 +78,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec4 v; + afpvec4 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = 
image1d_ld4(bottom_blob_1d, gx); + sum = image1d_ld4(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); @@ -97,8 +143,24 @@ void main() { sum = buffer_ld4(sum_workspace_data, gz * psc(h) + gy); } +#endif v /= sum; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st4(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_div_sum_pack8.comp b/src/layer/vulkan/shader/softmax_div_sum_pack8.comp index 7b35d6974..ea347a7e4 100644 --- a/src/layer/vulkan/shader/softmax_div_sum_pack8.comp +++ b/src/layer/vulkan/shader/softmax_div_sum_pack8.comp @@ -41,8 +41,19 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly 
uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -68,6 +79,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v; + afpvec8 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + sum = image1d_ld8(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); @@ -98,9 +144,25 @@ void main() { sum = buffer_ld8(sum_workspace_data, gz * psc(h) + gy); } +#endif v[0] /= sum[0]; v[1] /= sum[1]; +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + 
image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st8(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_exp_sub_max.comp b/src/layer/vulkan/shader/softmax_exp_sub_max.comp index 897740d09..210284df4 100644 --- a/src/layer/vulkan/shader/softmax_exp_sub_max.comp +++ b/src/layer/vulkan/shader/softmax_exp_sub_max.comp @@ -40,8 +40,19 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -67,6 +78,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afp v; + afp max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + max_value = image1d_ld1(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gy)); + } + else 
if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, gi); @@ -97,8 +143,24 @@ void main() { max_value = buffer_ld1(max_workspace_data, gz * psc(h) + gy); } +#endif v = exp(v - max_value); +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st1(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp b/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp index eaee2a983..9fc6c40e4 100644 --- a/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp +++ b/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp @@ -40,8 +40,19 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; 
}; +#endif layout (push_constant) uniform parameter { @@ -67,6 +78,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec4 v; + afpvec4 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld4(bottom_blob_1d, gx); + max_value = image1d_ld4(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); @@ -97,8 +143,24 @@ void main() { max_value = buffer_ld4(max_workspace_data, gz * psc(h) + gy); } +#endif v = exp(v - max_value); +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st4(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_exp_sub_max_pack8.comp b/src/layer/vulkan/shader/softmax_exp_sub_max_pack8.comp index a7b93cbcd..e0a42aa61 100644 --- a/src/layer/vulkan/shader/softmax_exp_sub_max_pack8.comp +++ b/src/layer/vulkan/shader/softmax_exp_sub_max_pack8.comp @@ -41,8 +41,19 @@ layout (local_size_x_id = 233) in; 
layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -68,6 +79,41 @@ void main() if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) return; +#if NCNN_image_shader + afpvec8 v; + afpvec8 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + max_value = image1d_ld8(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gy, gz)); + } +#else const int gi = gz * psc(cstep) + gy * psc(w) + gx; afpvec8 v = 
buffer_ld8(bottom_top_blob_data, gi); @@ -98,9 +144,25 @@ void main() { max_value = buffer_ld8(max_workspace_data, gz * psc(h) + gy); } +#endif v[0] = exp(v[0] - max_value[0]); v[1] = exp(v[1] - max_value[1]); +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else buffer_st8(bottom_top_blob_data, gi, v); +#endif } diff --git a/src/layer/vulkan/shader/softmax_reduce_max.comp b/src/layer/vulkan/shader/softmax_reduce_max.comp index 602b32799..42271ccb5 100644 --- a/src/layer/vulkan/shader/softmax_reduce_max.comp +++ b/src/layer/vulkan/shader/softmax_reduce_max.comp @@ -40,8 +40,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D max_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,10 +81,18 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else afp v = buffer_ld1(bottom_top_blob_data, i); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, 0, max_value); +#else buffer_st1(max_workspace_data, 0, max_value); +#endif return; } @@ -84,11 +100,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afp v = 
image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else buffer_st1(max_workspace_data, gx, max_value); +#endif return; } @@ -96,11 +120,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else buffer_st1(max_workspace_data, gx, max_value); +#endif return; } @@ -108,11 +140,19 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -120,11 +160,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -132,11 +180,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } 
+#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st1(max_workspace_data, gy * psc(h) + gx, max_value); +#endif return; } } diff --git a/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp b/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp index adea8705b..6de110db9 100644 --- a/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp +++ b/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp @@ -40,8 +40,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,12 +81,20 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif max_value = max(max_value, v); } afpvec2 max2 = max(max_value.rg, max_value.ba); max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, 0, max_value); +#else buffer_st4(max_workspace_data, 0, max_value); +#endif return; } @@ -86,13 +102,21 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } afpvec2 max2 = max(max_value.rg, max_value.ba); 
max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, max_value); +#else buffer_st4(max_workspace_data, gx, max_value); +#endif return; } @@ -100,11 +124,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, max_value); +#else buffer_st4(max_workspace_data, gx, max_value); +#endif return; } @@ -112,13 +144,21 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } afpvec2 max2 = max(max_value.rg, max_value.ba); max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -126,11 +166,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -138,11 +186,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif max_value = max(max_value, v); } +#if 
NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st4(max_workspace_data, gy * psc(h) + gx, max_value); +#endif return; } } diff --git a/src/layer/vulkan/shader/softmax_reduce_max_pack8.comp b/src/layer/vulkan/shader/softmax_reduce_max_pack8.comp index 4899dc7ab..66073dad9 100644 --- a/src/layer/vulkan/shader/softmax_reduce_max_pack8.comp +++ b/src/layer/vulkan/shader/softmax_reduce_max_pack8.comp @@ -41,8 +41,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -74,7 +82,11 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } @@ -82,7 +94,11 @@ void main() afpvec2 max2 = max(max4.rg, max4.ba); afp max1 = max(max2.r, max2.g); max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, 0, max_value); +#else buffer_st8(max_workspace_data, 0, max_value); +#endif return; } @@ -90,8 +106,12 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; afpvec8 v = buffer_ld8(bottom_top_blob_data, 
v_offset); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } @@ -99,7 +119,11 @@ void main() afpvec2 max2 = max(max4.rg, max4.ba); afp max1 = max(max2.r, max2.g); max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else buffer_st8(max_workspace_data, gx, max_value); +#endif return; } @@ -107,12 +131,20 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else buffer_st8(max_workspace_data, gx, max_value); +#endif return; } @@ -120,8 +152,12 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } @@ -129,7 +165,11 @@ void main() afpvec2 max2 = max(max4.rg, max4.ba); afp max1 = max(max2.r, max2.g); max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -137,12 +177,20 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), 
max_value); +#else buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif return; } @@ -150,12 +198,20 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif max_value[0] = max(max_value[0], v[0]); max_value[1] = max(max_value[1], v[1]); } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else buffer_st8(max_workspace_data, gy * psc(h) + gx, max_value); +#endif return; } } diff --git a/src/layer/vulkan/shader/softmax_reduce_sum.comp b/src/layer/vulkan/shader/softmax_reduce_sum.comp index a254c03ab..b38d16454 100644 --- a/src/layer/vulkan/shader/softmax_reduce_sum.comp +++ b/src/layer/vulkan/shader/softmax_reduce_sum.comp @@ -40,8 +40,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D sum_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,9 +81,18 @@ void main() { for (int i = 0; i < psc(w); i++) { - sum_value += buffer_ld1(bottom_top_blob_data, i); +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else + afp v = buffer_ld1(bottom_top_blob_data, i); +#endif + sum_value += v; } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, 0, sum_value); +#else buffer_st1(sum_workspace_data, 0, 
sum_value); +#endif return; } @@ -83,10 +100,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; - sum_value += buffer_ld1(bottom_top_blob_data, v_offset); + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else buffer_st1(sum_workspace_data, gx, sum_value); +#endif return; } @@ -94,10 +120,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; - sum_value += buffer_ld1(bottom_top_blob_data, v_offset); + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else buffer_st1(sum_workspace_data, gx, sum_value); +#endif return; } @@ -105,10 +140,19 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; - sum_value += buffer_ld1(bottom_top_blob_data, v_offset); + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif return; } @@ -116,10 +160,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; - sum_value += buffer_ld1(bottom_top_blob_data, v_offset); + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); 
+#endif return; } @@ -127,10 +180,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; - sum_value += buffer_ld1(bottom_top_blob_data, v_offset); + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st1(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif return; } } diff --git a/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp b/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp index 51ebe3378..40b035ac3 100644 --- a/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp +++ b/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp @@ -40,8 +40,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -73,11 +81,20 @@ void main() { for (int i = 0; i < psc(w); i++) { - sum_value += buffer_ld4(bottom_top_blob_data, i); +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else + afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif + sum_value += v; } afpvec2 sum2 = sum_value.rg + sum_value.ba; sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, 0, sum_value); +#else 
buffer_st4(sum_workspace_data, 0, sum_value); +#endif return; } @@ -85,12 +102,21 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; - sum_value += buffer_ld4(bottom_top_blob_data, v_offset); + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } afpvec2 sum2 = sum_value.rg + sum_value.ba; sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else buffer_st4(sum_workspace_data, gx, sum_value); +#endif return; } @@ -98,10 +124,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; - sum_value += buffer_ld4(bottom_top_blob_data, v_offset); + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else buffer_st4(sum_workspace_data, gx, sum_value); +#endif return; } @@ -109,12 +144,21 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; - sum_value += buffer_ld4(bottom_top_blob_data, v_offset); + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } afpvec2 sum2 = sum_value.rg + sum_value.ba; sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif return; } @@ -122,10 +166,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; - sum_value += buffer_ld4(bottom_top_blob_data, v_offset); + 
afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif return; } @@ -133,10 +186,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; - sum_value += buffer_ld4(bottom_top_blob_data, v_offset); + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st4(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif return; } } diff --git a/src/layer/vulkan/shader/softmax_reduce_sum_pack8.comp b/src/layer/vulkan/shader/softmax_reduce_sum_pack8.comp index a09092d63..a4a88024b 100644 --- a/src/layer/vulkan/shader/softmax_reduce_sum_pack8.comp +++ b/src/layer/vulkan/shader/softmax_reduce_sum_pack8.comp @@ -41,8 +41,16 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; layout (binding = 1) writeonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif layout (push_constant) uniform parameter { @@ -74,13 +82,22 @@ void main() { for (int i = 0; i < psc(w); i++) { - sum_value += buffer_ld8(bottom_top_blob_data, i); +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else + 
afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif + sum_value += v; } afpvec4 sum4 = sum_value[0] + sum_value[1]; afpvec2 sum2 = sum4.rg + sum4.ba; afp sum1 = sum2.r + sum2.g; sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, 0, sum_value); +#else buffer_st8(sum_workspace_data, 0, sum_value); +#endif return; } @@ -88,14 +105,23 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else int v_offset = i * psc(w) + gx; - sum_value += buffer_ld8(bottom_top_blob_data, v_offset); + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } afpvec4 sum4 = sum_value[0] + sum_value[1]; afpvec2 sum2 = sum4.rg + sum4.ba; afp sum1 = sum2.r + sum2.g; sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else buffer_st8(sum_workspace_data, gx, sum_value); +#endif return; } @@ -103,10 +129,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else int v_offset = gx * psc(w) + i; - sum_value += buffer_ld8(bottom_top_blob_data, v_offset); + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else buffer_st8(sum_workspace_data, gx, sum_value); +#endif return; } @@ -114,14 +149,23 @@ void main() { for (int i = 0; i < psc(c); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else int v_offset = i * psc(cstep) + gy * psc(w) + gx; - sum_value += buffer_ld8(bottom_top_blob_data, v_offset); + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } afpvec4 sum4 = sum_value[0] + sum_value[1]; afpvec2 sum2 = sum4.rg + sum4.ba; afp sum1 = sum2.r + sum2.g; sum_value = afpvec8(afpvec4(sum1), 
afpvec4(sum1)); +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif return; } @@ -129,10 +173,19 @@ void main() { for (int i = 0; i < psc(h); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else int v_offset = gy * psc(cstep) + i * psc(w) + gx; - sum_value += buffer_ld8(bottom_top_blob_data, v_offset); + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif return; } @@ -140,10 +193,19 @@ void main() { for (int i = 0; i < psc(w); i++) { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else int v_offset = gy * psc(cstep) + gx * psc(w) + i; - sum_value += buffer_ld8(bottom_top_blob_data, v_offset); + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else buffer_st8(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif return; } } diff --git a/src/layer/vulkan/softmax_vulkan.cpp b/src/layer/vulkan/softmax_vulkan.cpp index d9113a7db..c0063d65c 100644 --- a/src/layer/vulkan/softmax_vulkan.cpp +++ b/src/layer/vulkan/softmax_vulkan.cpp @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Softmax_vulkan) Softmax_vulkan::Softmax_vulkan() { support_vulkan = true; + support_image_storage = true; pipeline_softmax_reduce_max = 0; pipeline_softmax_exp_sub_max = 0; @@ -415,4 +416,152 @@ int Softmax_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons return 0; } +int Softmax_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const // image-storage softmax: records reduce-max, exp(v - max), reduce-sum and div-sum on cmd, then returns 0 +{ + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels =
bottom_top_blob.c; + size_t elemsize = bottom_top_blob.elemsize; + int elempack = bottom_top_blob.elempack; + + VkImageMat max_workspace; // written by the reduce-max pass, read by the exp pass + VkImageMat sum_workspace; // written by the reduce-sum pass, read by the div pass + + if (dims == 1) // axis == 0 + { + max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 0) + { + max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 1) + { + max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 0) + { + max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 1) + { + max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 2) + { + max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + } // NOTE(review): there is no final else and the create() results are not checked before use + + // reduce max + { + std::vector<VkImageMat> bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = max_workspace; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0;//bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = 0;//max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8 + : elempack == 4 ?
pipeline_softmax_reduce_max_pack4 + : pipeline_softmax_reduce_max; + + cmd.record_pipeline(pipeline, bindings, constants, max_workspace); + } + + // exp( v - max ) + { + std::vector<VkImageMat> bindings(3); + bindings[0] = bottom_top_blob; // input side + bindings[1] = bottom_top_blob; // output side: the same image, so the pass runs in place + bindings[2] = max_workspace; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0;//bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = 0;//max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8 + : elempack == 4 ? pipeline_softmax_exp_sub_max_pack4 + : pipeline_softmax_exp_sub_max; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + // reduce sum + { + std::vector<VkImageMat> bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0;//bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = 0;//sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8 + : elempack == 4 ?
pipeline_softmax_reduce_sum_pack4 + : pipeline_softmax_reduce_sum; + + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); + } + + // div sum + { + std::vector<VkImageMat> bindings(3); + bindings[0] = bottom_top_blob; // input side + bindings[1] = bottom_top_blob; // output side: the same image, so the pass runs in place + bindings[2] = sum_workspace; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0;//bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = 0;//sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8 + : elempack == 4 ? pipeline_softmax_div_sum_pack4 + : pipeline_softmax_div_sum; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/vulkan/softmax_vulkan.h b/src/layer/vulkan/softmax_vulkan.h index e47f3a1b8..35478d2da 100644 --- a/src/layer/vulkan/softmax_vulkan.h +++ b/src/layer/vulkan/softmax_vulkan.h @@ -29,6 +29,7 @@ public: using Softmax::forward_inplace; virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; public: Pipeline* pipeline_softmax_reduce_max; diff --git a/src/mat.cpp b/src/mat.cpp index 8b792fa8a..98b6f859c 100644 --- a/src/mat.cpp +++ b/src/mat.cpp @@ -188,7 +188,7 @@ VkImageMat VkImageMat::from_android_hardware_buffer(VkAndroidHardwareBufferImage int width = allocator->width(); int height = allocator->height(); - return VkImageMat(width, height, VK_FORMAT_UNDEFINED, allocator); + return VkImageMat(width, height, allocator); } #endif // __ANDROID_API__ >= 26 #endif // NCNN_VULKAN diff --git a/src/mat.h b/src/mat.h index 6a615ad72..39321dff2 100644 ---
a/src/mat.h +++ b/src/mat.h @@ -41,6 +41,7 @@ namespace ncnn { #if NCNN_VULKAN class VkMat; +class VkImageMat; #endif // NCNN_VULKAN // the three dimension matrix @@ -111,6 +112,8 @@ public: #if NCNN_VULKAN // allocate like void create_like(const VkMat& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkImageMat& im, Allocator* allocator = 0); #endif // NCNN_VULKAN // refcount++ void addref(); @@ -306,6 +309,8 @@ public: void create_like(const Mat& m, VkAllocator* allocator); // allocate like void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageMat& im, VkAllocator* allocator); // mapped Mat mapped() const; @@ -365,18 +370,58 @@ class VkImageMat public: // empty VkImageMat(); + // vec + VkImageMat(int w, size_t elemsize, VkAllocator* allocator); // image - VkImageMat(int width, int height, VkFormat format, VkImageAllocator* allocator); + VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); // copy VkImageMat(const VkImageMat& m); + // external vec + VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); // external image - VkImageMat(int width, int height, VkImageMemory* data, VkFormat format, VkImageAllocator* allocator); + VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed vec + VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image 
+ VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); // release ~VkImageMat(); // assign VkImageMat& operator=(const VkImageMat& m); + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); // allocate image - void create(int width, int height, VkFormat format, VkImageAllocator* allocator); + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const Mat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageMat& im, VkAllocator* allocator); + + // mapped + Mat mapped() const; + void* mapped_ptr() const; // refcount++ void addref(); @@ -386,6 +431,9 @@ public: bool empty() const; size_t total() const; + // shape only + Mat shape() const; + // low-level reference VkImage image() const; VkImageView imageview() const; @@ -402,12 +450,28 @@ public: // when points to user-allocated data, the pointer is NULL int* refcount; + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + // the allocator - VkImageAllocator* allocator; + VkAllocator* allocator; - int width; - int 
height; - VkFormat format; + // the dimension rank + int dims; + + int w; + int h; + int c; }; // type for vulkan specialization constant and push constant @@ -1078,6 +1142,17 @@ inline void Mat::create_like(const VkMat& m, Allocator* _allocator) if (_dims == 3) create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); } + +inline void Mat::create_like(const VkImageMat& im, Allocator* _allocator) +{ + int _dims = im.dims; + if (_dims == 1) + create(im.w, im.elemsize, im.elempack, _allocator); + if (_dims == 2) + create(im.w, im.h, im.elemsize, im.elempack, _allocator); + if (_dims == 3) + create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator); +} #endif // NCNN_VULKAN inline void Mat::addref() @@ -1532,6 +1607,17 @@ inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); } +inline void VkMat::create_like(const VkImageMat& im, VkAllocator* _allocator) +{ + int _dims = im.dims; + if (_dims == 1) + create(im.w, im.elemsize, im.elempack, _allocator); + if (_dims == 2) + create(im.w, im.h, im.elemsize, im.elempack, _allocator); + if (_dims == 3) + create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator); +} + inline Mat VkMat::mapped() const { if (!allocator->mappable) @@ -1626,25 +1712,80 @@ inline size_t VkMat::buffer_capacity() const } inline VkImageMat::VkImageMat() - : data(0), refcount(0), allocator(0), width(0), height(0), format(VK_FORMAT_UNDEFINED) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) { } -inline VkImageMat::VkImageMat(int _width, int _height, VkFormat _format, VkImageAllocator* _allocator) - : data(0), refcount(0), allocator(0), width(0), height(0), format(VK_FORMAT_UNDEFINED) +inline VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) { - create(_width, _height, _format, _allocator); + create(_w, 
_elemsize, _allocator); +} + +inline VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _allocator); +} + +inline VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); } inline VkImageMat::VkImageMat(const VkImageMat& m) - : data(m.data), refcount(m.refcount), allocator(m.allocator), width(m.width), height(m.height), format(m.format) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) { if (refcount) NCNN_XADD(refcount, 1); } -inline VkImageMat::VkImageMat(int _width, int _height, VkImageMemory* _data, VkFormat _format, VkImageAllocator* _allocator) - : data(_data), refcount(0), allocator(_allocator), width(_width), height(_height), format(_format) +inline VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), 
elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ +} + +inline VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) { } @@ -1665,37 +1806,238 @@ inline VkImageMat& VkImageMat::operator=(const VkImageMat& m) data = m.data; refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; allocator = m.allocator; - width = m.width; - height = m.height; - format = m.format; + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; return *this; } -inline void VkImageMat::create(int _width, int _height, VkFormat _format, VkImageAllocator* _allocator) +inline void VkImageMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = 
_elemsize; + elempack = 1; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, 
refcount)); + *refcount = 1; + } +} + +inline void VkImageMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) { - if (width == _width && height == _height && format == _format && allocator == _allocator) + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) return; release(); + elemsize = _elemsize; + elempack = _elempack; allocator = _allocator; - width = _width; - height = _height; - format = _format; + dims = 2; + w = _w; + h = _h; + c = 1; if (total() > 0) { - data = allocator->fastMalloc(width, height, format); + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); *refcount = 1; } } +inline void VkImageMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageMat::create_like(const Mat& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkImageMat::create_like(const VkMat& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if 
(_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkImageMat::create_like(const VkImageMat& im, VkAllocator* _allocator) +{ + int _dims = im.dims; + if (_dims == 1) + create(im.w, im.elemsize, im.elempack, _allocator); + if (_dims == 2) + create(im.w, im.h, im.elemsize, im.elempack, _allocator); + if (_dims == 3) + create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator); +} + +inline Mat VkImageMat::mapped() const +{ + if (!allocator->mappable || !data->mapped_ptr) + return Mat(); + + if (dims == 1) + return Mat(w, mapped_ptr(), elemsize, elempack, 0); + + if (dims == 2) + return Mat(w, h, mapped_ptr(), elemsize, elempack, 0); + + if (dims == 3) + return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0); + + return Mat(); +} + +inline void* VkImageMat::mapped_ptr() const +{ + if (!allocator->mappable || !data->mapped_ptr) + return 0; + + return (unsigned char*)data->mapped_ptr + data->bind_offset; +} + inline void VkImageMat::addref() { if (refcount) @@ -1714,9 +2056,13 @@ inline void VkImageMat::release() data = 0; - width = 0; - height = 0; - format = VK_FORMAT_UNDEFINED; + elemsize = 0; + elempack = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; refcount = 0; } @@ -1728,7 +2074,19 @@ inline bool VkImageMat::empty() const inline size_t VkImageMat::total() const { - return width * height; + return w * h * c; +} + +inline Mat VkImageMat::shape() const +{ + if (dims == 1) + return Mat(w * elempack, (void*)0); + if (dims == 2) + return Mat(w, h * elempack, (void*)0); + if (dims == 3) + return Mat(w, h, c * elempack, (void*)0); + + return Mat(); } inline VkImage VkImageMat::image() const diff --git a/src/net.cpp b/src/net.cpp index aa36af833..7a04ccfad 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -42,12 +42,6 @@ Net::Net() vkdev = 0; weight_vkallocator = 0; weight_staging_vkallocator = 0; - - cast_float32_to_float16 = 0; - cast_float16_to_float32 = 0; - packing_pack1 = 0; - packing_pack4 = 0; - packing_pack8 = 0; #endif 
// NCNN_VULKAN } @@ -56,11 +50,6 @@ Net::~Net() clear(); #if NCNN_VULKAN - delete cast_float32_to_float16; - delete cast_float16_to_float32; - delete packing_pack1; - delete packing_pack4; - delete packing_pack8; #endif // NCNN_VULKAN } @@ -165,6 +154,10 @@ int Net::load_param(const DataReader& dr) if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; if (!vkdev->info.support_int8_storage) opt.use_int8_storage = false; if (!vkdev->info.support_int8_arithmetic) opt.use_int8_arithmetic = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + if (!vkdev->info.support_image_fp16_arithmetic) opt.use_image_fp16_arithmetic = false; } #endif // NCNN_VULKAN @@ -355,6 +348,10 @@ int Net::load_param_bin(const DataReader& dr) if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; if (!vkdev->info.support_int8_storage) opt.use_int8_storage = false; if (!vkdev->info.support_int8_arithmetic) opt.use_int8_arithmetic = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + if (!vkdev->info.support_image_fp16_arithmetic) opt.use_image_fp16_arithmetic = false; } #endif // NCNN_VULKAN @@ -529,7 +526,13 @@ int Net::load_model(const DataReader& dr) break; } - int cret = layer->create_pipeline(opt); + Option opt1 = opt; + if (!layer->support_image_storage) + { + opt1.use_image_storage = false; + } + + int cret = layer->create_pipeline(opt1); if (cret != 0) { fprintf(stderr, "layer create_pipeline %d failed\n", (int)i); @@ -864,14 +867,22 @@ void Net::clear() blobs.clear(); for (size_t i=0; idestroy_pipeline(opt); + Layer* layer = layers[i]; + + Option opt1 = opt; + if 
(!layer->support_image_storage) + { + opt1.use_image_storage = false; + } + + int dret = layer->destroy_pipeline(opt1); if (dret != 0) { fprintf(stderr, "layer destroy_pipeline failed\n"); // ignore anyway } - delete layers[i]; + delete layer; } layers.clear(); @@ -917,11 +928,11 @@ int Net::upload_model() // create gpu device allocator if null if (!weight_vkallocator) { - weight_vkallocator = new VkWeightBufferAllocator(vkdev); + weight_vkallocator = new VkWeightAllocator(vkdev); } if (!weight_staging_vkallocator) { - weight_staging_vkallocator = new VkWeightStagingBufferAllocator(vkdev); + weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); } Option opt_upload = opt; @@ -949,89 +960,11 @@ int Net::upload_model() int Net::create_pipeline() { - if (opt.use_fp16_storage && vkdev->info.type != 0) - { - { - cast_float32_to_float16 = ncnn::create_layer(ncnn::LayerType::Cast); - cast_float32_to_float16->vkdev = vkdev; - - ncnn::ParamDict pd; - pd.set(0, 1); - pd.set(1, 2); - - cast_float32_to_float16->load_param(pd); - } - - { - cast_float16_to_float32 = ncnn::create_layer(ncnn::LayerType::Cast); - cast_float16_to_float32->vkdev = vkdev; - - ncnn::ParamDict pd; - pd.set(0, 2); - pd.set(1, 1); - - cast_float16_to_float32->load_param(pd); - } - - cast_float32_to_float16->create_pipeline(opt); - - cast_float16_to_float32->create_pipeline(opt); - } - - { - packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing); - packing_pack1->vkdev = vkdev; - - ncnn::ParamDict pd; - pd.set(0, 1); - - packing_pack1->load_param(pd); - } - - { - packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing); - packing_pack4->vkdev = vkdev; - - ncnn::ParamDict pd; - pd.set(0, 4); - - packing_pack4->load_param(pd); - } - - { - packing_pack8 = ncnn::create_layer(ncnn::LayerType::Packing); - packing_pack8->vkdev = vkdev; - - ncnn::ParamDict pd; - pd.set(0, 8); - - packing_pack8->load_param(pd); - } - - packing_pack1->create_pipeline(opt); - packing_pack4->create_pipeline(opt); 
- packing_pack8->create_pipeline(opt); - return 0; } int Net::destroy_pipeline() { - if (cast_float32_to_float16) - cast_float32_to_float16->destroy_pipeline(opt); - - if (cast_float16_to_float32) - cast_float16_to_float32->destroy_pipeline(opt); - - if (packing_pack1) - packing_pack1->destroy_pipeline(opt); - - if (packing_pack4) - packing_pack4->destroy_pipeline(opt); - - if (packing_pack8) - packing_pack8->destroy_pipeline(opt); - return 0; } #endif // NCNN_VULKAN @@ -1102,7 +1035,7 @@ Layer* Net::create_custom_layer(int index) return layer_creator(); } -int Net::forward_layer(int layer_index, std::vector& blob_mats, Option& opt) const +int Net::forward_layer(int layer_index, std::vector& blob_mats, const Option& opt) const { const Layer* layer = layers[layer_index]; @@ -1305,7 +1238,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, Option& opt } #if NCNN_VULKAN -int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, VkCompute& cmd, Option& opt) const +int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, VkCompute& cmd, const Option& opt) const { const Layer* layer = layers[layer_index]; @@ -1330,50 +1263,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (blob_mats_gpu[bottom_blob_index].dims == 0) { - const Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; - - // cpu cast to fp16 (discrete gpu) - Mat bottom_blob_cpu_fp16; - if (opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float32_to_float16(bottom_blob_cpu, bottom_blob_cpu_fp16, opt); - } - else if (bottom_blob_cpu.elempack == 4 && opt.use_fp16_packed && !opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float32_to_float16(bottom_blob_cpu, bottom_blob_cpu_fp16, opt); - } - else - { - bottom_blob_cpu_fp16 = bottom_blob_cpu; - } - - // upload - VkMat bottom_blob_unpacked; - cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); - - // 
cast to fp16 (integrated gpu) - VkMat bottom_blob_unpacked_fp16; - if (opt.use_fp16_storage && vkdev->info.type != 0) - { - cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); - } - else - { - bottom_blob_unpacked_fp16 = bottom_blob_unpacked; - } - - // packing - VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; - if (opt.use_shader_pack8) - { - packing_pack8->forward(bottom_blob_unpacked_fp16, bottom_blob, cmd, opt); - if (bottom_blob.elempack != 8) - packing_pack4->forward(bottom_blob_unpacked_fp16, bottom_blob, cmd, opt); - } - else - packing_pack4->forward(bottom_blob_unpacked_fp16, bottom_blob, cmd, opt); - -// fprintf(stderr, "upload %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset()); + // host to buffer + cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } } @@ -1446,50 +1337,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (blob_mats_gpu[bottom_blob_index].dims == 0) { - const Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; - - // cast to fp16 (discrete gpu) - Mat bottom_blob_cpu_fp16; - if (opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float32_to_float16(bottom_blob_cpu, bottom_blob_cpu_fp16, opt); - } - else if (bottom_blob_cpu.elempack == 4 && opt.use_fp16_packed && !opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float32_to_float16(bottom_blob_cpu, bottom_blob_cpu_fp16, opt); - } - else - { - bottom_blob_cpu_fp16 = bottom_blob_cpu; - } - - // upload - VkMat bottom_blob_unpacked; - cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); - - // cast to fp16 (integrated gpu) - VkMat bottom_blob_unpacked_fp16; - if (opt.use_fp16_storage && vkdev->info.type != 0) - { - cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); - } - else - { - bottom_blob_unpacked_fp16 = bottom_blob_unpacked; - } - - // packing - VkMat& bottom_blob = 
blob_mats_gpu[bottom_blob_index]; - if (opt.use_shader_pack8) - { - packing_pack8->forward(bottom_blob_unpacked, bottom_blob, cmd, opt); - if (bottom_blob.elempack != 8) - packing_pack4->forward(bottom_blob_unpacked, bottom_blob, cmd, opt); - } - else - packing_pack4->forward(bottom_blob_unpacked, bottom_blob, cmd, opt); - -// fprintf(stderr, "upload %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset()); + // host to buffer + cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } } @@ -1554,7 +1403,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } } } - } else { @@ -1575,50 +1423,15 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (blob_mats[bottom_blob_index].dims == 0) { - VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; - -// fprintf(stderr, "download %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset()); + // buffer to host + cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); - // deep copy for inplace forward if data is shared - if (layer->support_inplace && *bottom_blob.refcount != 1) - { - VkMat bottom_blob_copy; - cmd.record_clone(bottom_blob, bottom_blob_copy, opt); -// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - bottom_blob = bottom_blob_copy; - } - } - - VkMat bottom_blob_unpacked_fp16; - if (opt.use_packing_layout && layer->support_packing) - { - packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); - } - else - { - // unpacking - packing_pack1->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); } - // cast to fp32 (integrated gpu) - VkMat bottom_blob_unpacked; - if (opt.use_fp16_storage && vkdev->info.type != 0) - { - 
cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt); - } - else - { - bottom_blob_unpacked = bottom_blob_unpacked_fp16; - } - - // download - Mat bottom_blob_cpu_fp16; - cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); - cmd.submit_and_wait(); #if NCNN_BENCHMARK @@ -1637,21 +1450,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector #endif // NCNN_BENCHMARK cmd.reset(); - - // cast to fp32 (discrete gpu) - Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; - if (opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(bottom_blob_cpu_fp16, bottom_blob_cpu, opt); - } - else if (bottom_blob_cpu_fp16.elempack == 4 && opt.use_fp16_packed && !opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(bottom_blob_cpu_fp16, bottom_blob_cpu, opt); - } - else - { - bottom_blob_cpu = bottom_blob_cpu_fp16; - } } } @@ -1712,12 +1510,10 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector // store top blob blob_mats[top_blob_index] = top_blob; } - } else { // load bottom blobs - std::vector bottom_blobs_cpu_fp16(layer->bottoms.size()); for (size_t i=0; ibottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; @@ -1733,49 +1529,14 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (blob_mats[bottom_blob_index].dims == 0) { - VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; - -// fprintf(stderr, "download %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset()); + // buffer to host + cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); - // deep copy for inplace forward if data is shared - if (layer->support_inplace && *bottom_blob.refcount != 1) - { - VkMat bottom_blob_copy; - cmd.record_clone(bottom_blob, bottom_blob_copy, opt); 
-// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - bottom_blob = bottom_blob_copy; - } - } - - VkMat bottom_blob_unpacked_fp16; - if (opt.use_packing_layout && layer->support_packing) - { - packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); - } - else - { - // unpacking - packing_pack1->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); - } - - // cast to fp32 (integrated gpu) - VkMat bottom_blob_unpacked; - if (opt.use_fp16_storage && vkdev->info.type != 0) - { - cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt); } - else - { - bottom_blob_unpacked = bottom_blob_unpacked_fp16; - } - - // download - Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; - cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); } } } @@ -1806,26 +1567,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector { int bottom_blob_index = layer->bottoms[i]; - if (blob_mats[bottom_blob_index].dims == 0) - { - const Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; - - // cast to fp32 (discrete gpu) - Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; - if (opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(bottom_blob_cpu_fp16, bottom_blob_cpu, opt); - } - else if (bottom_blob_cpu_fp16.elempack == 4 && opt.use_fp16_packed && !opt.use_fp16_storage && vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(bottom_blob_cpu_fp16, bottom_blob_cpu, opt); - } - else - { - bottom_blob_cpu = bottom_blob_cpu_fp16; - } - } - bottom_blobs[i] = blob_mats[bottom_blob_index]; if (opt.lightmode) @@ -1849,8 +1590,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } } - bottom_blobs_cpu_fp16.clear(); - // forward if (opt.lightmode && layer->support_inplace) { @@ -1896,7 +1635,6 @@ int Net::forward_layer(int 
layer_index, std::vector& blob_mats, std::vector blob_mats[top_blob_index] = top_blobs[i]; } } - } } @@ -1904,99 +1642,865 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector return 0; } -#endif // NCNN_VULKAN -Extractor::Extractor(const Net* _net, size_t blob_count) : net(_net) +int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, std::vector& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const { - blob_mats.resize(blob_count); - opt = net->opt; - -#if NCNN_VULKAN - if (net->opt.use_vulkan_compute) - { - local_blob_vkallocator = 0; - local_staging_vkallocator = 0; - - blob_mats_gpu.resize(blob_count); - } -#endif // NCNN_VULKAN -} + const Layer* layer = layers[layer_index]; -Extractor::~Extractor() -{ - blob_mats.clear(); +// fprintf(stderr, "forward_layer %d %d %s\n", layer->support_vulkan, layer_index, layer->name.c_str()); -#if NCNN_VULKAN - if (net->opt.use_vulkan_compute) + if (layer->support_vulkan) { - blob_mats_gpu.clear(); - - if (local_blob_vkallocator) + if (layer->support_image_storage) { - net->vkdev->reclaim_blob_allocator(local_blob_vkallocator); - } - if (local_staging_vkallocator) - { - net->vkdev->reclaim_staging_allocator(local_staging_vkallocator); - } - } -#endif // NCNN_VULKAN -} - -void Extractor::set_light_mode(bool enable) -{ - opt.lightmode = enable; -} + if (layer->one_blob_only) + { + // load bottom blob + int bottom_blob_index = layer->bottoms[0]; + int top_blob_index = layer->tops[0]; -void Extractor::set_num_threads(int num_threads) -{ - opt.num_threads = num_threads; -} + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } -void Extractor::set_blob_allocator(Allocator* allocator) -{ - 
opt.blob_allocator = allocator; -} + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + // host to image + cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); + } + else + { + // buffer to image + cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); -void Extractor::set_workspace_allocator(Allocator* allocator) -{ - opt.workspace_allocator = allocator; -} + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + } + } + else + { + // buffer to image + cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); -#if NCNN_VULKAN -void Extractor::set_vulkan_compute(bool enable) -{ - if (net->opt.use_vulkan_compute) - { - opt.use_vulkan_compute = enable; - } - else - { - fprintf(stderr, "set_vulkan_compute failed, network use_vulkan_compute disabled\n"); - } -} + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + } -void Extractor::set_blob_vkallocator(VkAllocator* allocator) -{ - opt.blob_vkallocator = allocator; -} + VkImageMat bottom_blob = blob_mats_gpu_image[bottom_blob_index]; -void Extractor::set_workspace_vkallocator(VkAllocator* allocator) -{ - opt.workspace_vkallocator = allocator; -} + if (bottom_blob.empty()) + { + goto IMAGE_ALLOCATION_FAILED; + } -void Extractor::set_staging_vkallocator(VkAllocator* allocator) -{ - opt.staging_vkallocator = allocator; -} -#endif // NCNN_VULKAN + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blob.refcount != 1) + { + VkImageMat bottom_blob_copy; + cmd.record_clone(bottom_blob, bottom_blob_copy, opt); +// fprintf(stderr, "clone %p[+%lu] 
%p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); + bottom_blob = bottom_blob_copy; + } + } -#if NCNN_STRING -int Extractor::input(const char* blob_name, const Mat& in) -{ - int blob_index = net->find_blob_index_by_name(blob_name); - if (blob_index == -1) + // forward + if (opt.lightmode && layer->support_inplace) + { + VkImageMat& bottom_top_blob = bottom_blob; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret == -100) + { + goto IMAGE_ALLOCATION_FAILED; + } + if (ret != 0) + return ret; + + // store top blob + blob_mats_gpu_image[top_blob_index] = bottom_top_blob; + } + else + { + VkImageMat top_blob; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret == -100) + { + goto IMAGE_ALLOCATION_FAILED; + } + if (ret != 0) + return ret; + + // store top blob + blob_mats_gpu_image[top_blob_index] = top_blob; + } + } + else + { + // load bottom blobs + std::vector bottom_blobs(layer->bottoms.size()); + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + // host to image + 
cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); + } + else + { + // buffer to image + cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + } + } + else + { + // buffer to image + cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + } + + bottom_blobs[i] = blob_mats_gpu_image[bottom_blob_index]; + + if (bottom_blobs[i].empty()) + { + goto IMAGE_ALLOCATION_FAILED; + } + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blobs[i].refcount != 1) + { + VkImageMat bottom_blob_copy; + cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); +// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); + bottom_blobs[i] = bottom_blob_copy; + } + } + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret == -100) + { + goto IMAGE_ALLOCATION_FAILED; + } + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu_image[top_blob_index] = bottom_top_blobs[i]; + } + } + else + { + std::vector 
top_blobs(layer->tops.size()); +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret == -100) + { + goto IMAGE_ALLOCATION_FAILED; + } + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu_image[top_blob_index] = top_blobs[i]; + } + } + } + } + else + { + if (layer->one_blob_only) + { + // load bottom blob + int bottom_blob_index = layer->bottoms[0]; + int top_blob_index = layer->tops[0]; + + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + // host to buffer + cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); + } + else + { + // image to buffer + cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + } + else + { + // image to buffer + cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + + VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + // deep copy 
for inplace forward if data is shared + if (layer->support_inplace && *bottom_blob.refcount != 1) + { + VkMat bottom_blob_copy; + cmd.record_clone(bottom_blob, bottom_blob_copy, opt); +// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); + bottom_blob = bottom_blob_copy; + } + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + VkMat& bottom_top_blob = bottom_blob; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blob + blob_mats_gpu[top_blob_index] = bottom_top_blob; + } + else + { + VkMat top_blob; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blob + blob_mats_gpu[top_blob_index] = top_blob; + } + } + else + { + // load bottom blobs + std::vector bottom_blobs(layer->bottoms.size()); + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + // host to buffer + cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], 
opt); + } + else + { + // image to buffer + cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + } + else + { + // image to buffer + cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + + bottom_blobs[i] = blob_mats_gpu[bottom_blob_index]; + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blobs[i].refcount != 1) + { + VkMat bottom_blob_copy; + cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); +// fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); + bottom_blobs[i] = bottom_blob_copy; + } + } + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + } + } + else + { + std::vector top_blobs(layer->tops.size()); +#if NCNN_BENCHMARK + cmd.record_write_timestamp(layer_index * 2); + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); + cmd.record_write_timestamp(layer_index * 2 + 1); +#else + int ret = 
layer->forward(bottom_blobs, top_blobs, cmd, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu[top_blob_index] = top_blobs[i]; + } + } + } + } + } + else + { +IMAGE_ALLOCATION_FAILED: + + if (layer->one_blob_only) + { + // load bottom blob + int bottom_blob_index = layer->bottoms[0]; + int top_blob_index = layer->tops[0]; + + if (blob_mats[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + // buffer to host + cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + else + { + // image to host + cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + } + else + { + // image to host + cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + + cmd.submit_and_wait(); + +#if NCNN_BENCHMARK + std::vector results(layer_index * 2); + cmd.get_query_pool_results(0, layer_index * 2, results); + for (int i=0; iinfo.timestamp_period / 1000; + fprintf(stderr, "%-24s %-30s %8.2lfus |\n", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); + } +#endif // NCNN_BENCHMARK + + cmd.reset(); + } + + Mat 
bottom_blob = blob_mats[bottom_blob_index]; + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blob.refcount != 1) + { + bottom_blob = bottom_blob.clone(); + } + } + + if (opt.use_packing_layout) + { + int elempack = layer->support_packing ? 4 : 1; + + Mat bottom_blob_packed; + convert_packing(bottom_blob, bottom_blob_packed, elempack, opt); + bottom_blob = bottom_blob_packed; + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + Mat& bottom_top_blob = bottom_blob; +#if NCNN_BENCHMARK + double start = get_current_time(); + int ret = layer->forward_inplace(bottom_top_blob, opt); + double end = get_current_time(); + benchmark(layer, bottom_top_blob, bottom_top_blob, start, end); +#else + int ret = layer->forward_inplace(bottom_top_blob, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blob + blob_mats[top_blob_index] = bottom_top_blob; + } + else + { + Mat top_blob; +#if NCNN_BENCHMARK + double start = get_current_time(); + int ret = layer->forward(bottom_blob, top_blob, opt); + double end = get_current_time(); + benchmark(layer, bottom_blob, top_blob, start, end); +#else + int ret = layer->forward(bottom_blob, top_blob, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blob + blob_mats[top_blob_index] = top_blob; + } + } + else + { + // load bottom blobs + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + + if (blob_mats[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats[bottom_blob_index].dims == 0) + { + if 
(blob_mats_gpu_image[bottom_blob_index].dims == 0) + { + // buffer to host + cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu[bottom_blob_index].release(); + } + } + else + { + // image to host + cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + } + else + { + // image to host + cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats_gpu_image[bottom_blob_index].release(); + } + } + } + } + + { + cmd.submit_and_wait(); + +#if NCNN_BENCHMARK + std::vector results(layer_index * 2); + cmd.get_query_pool_results(0, layer_index * 2, results); + for (int i=0; iinfo.timestamp_period / 1000; + fprintf(stderr, "%-24s %-30s %8.2lfus |\n", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); + } +#endif // NCNN_BENCHMARK + + cmd.reset(); + } + + std::vector bottom_blobs(layer->bottoms.size()); + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + + bottom_blobs[i] = blob_mats[bottom_blob_index]; + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blobs[i].refcount != 1) + { + bottom_blobs[i] = bottom_blobs[i].clone(); + } + } + + if (opt.use_packing_layout) + { + int elempack = layer->support_packing ? 
4 : 1; + + Mat bottom_blob_packed; + convert_packing(bottom_blobs[i], bottom_blob_packed, elempack, opt); + bottom_blobs[i] = bottom_blob_packed; + } + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; +#if NCNN_BENCHMARK + double start = get_current_time(); + int ret = layer->forward_inplace(bottom_top_blobs, opt); + double end = get_current_time(); + benchmark(layer, start, end); +#else + int ret = layer->forward_inplace(bottom_top_blobs, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats[top_blob_index] = bottom_top_blobs[i]; + } + } + else + { + std::vector top_blobs(layer->tops.size()); +#if NCNN_BENCHMARK + double start = get_current_time(); + int ret = layer->forward(bottom_blobs, top_blobs, opt); + double end = get_current_time(); + benchmark(layer, start, end); +#else + int ret = layer->forward(bottom_blobs, top_blobs, opt); +#endif // NCNN_BENCHMARK + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats[top_blob_index] = top_blobs[i]; + } + } + } + } + +// fprintf(stderr, "forward_layer %d %d %s done\n", layer->support_vulkan, layer_index, layer->name.c_str()); + + return 0; +} +#endif // NCNN_VULKAN + +Extractor::Extractor(const Net* _net, size_t blob_count) : net(_net) +{ + blob_mats.resize(blob_count); + opt = net->opt; + +#if NCNN_VULKAN + if (net->opt.use_vulkan_compute) + { + local_blob_vkallocator = 0; + local_staging_vkallocator = 0; + + blob_mats_gpu.resize(blob_count); + blob_mats_gpu_image.resize(blob_count); + } +#endif // NCNN_VULKAN +} + +Extractor::~Extractor() +{ + blob_mats.clear(); + +#if NCNN_VULKAN + if (net->opt.use_vulkan_compute) + { + blob_mats_gpu.clear(); + blob_mats_gpu_image.clear(); + + if (local_blob_vkallocator) + { + 
net->vkdev->reclaim_blob_allocator(local_blob_vkallocator); + } + if (local_staging_vkallocator) + { + net->vkdev->reclaim_staging_allocator(local_staging_vkallocator); + } + } +#endif // NCNN_VULKAN +} + +void Extractor::set_light_mode(bool enable) +{ + opt.lightmode = enable; +} + +void Extractor::set_num_threads(int num_threads) +{ + opt.num_threads = num_threads; +} + +void Extractor::set_blob_allocator(Allocator* allocator) +{ + opt.blob_allocator = allocator; +} + +void Extractor::set_workspace_allocator(Allocator* allocator) +{ + opt.workspace_allocator = allocator; +} + +#if NCNN_VULKAN +void Extractor::set_vulkan_compute(bool enable) +{ + if (net->opt.use_vulkan_compute) + { + opt.use_vulkan_compute = enable; + } + else + { + fprintf(stderr, "set_vulkan_compute failed, network use_vulkan_compute disabled\n"); + } +} + +void Extractor::set_blob_vkallocator(VkAllocator* allocator) +{ + opt.blob_vkallocator = allocator; +} + +void Extractor::set_workspace_vkallocator(VkAllocator* allocator) +{ + opt.workspace_vkallocator = allocator; +} + +void Extractor::set_staging_vkallocator(VkAllocator* allocator) +{ + opt.staging_vkallocator = allocator; +} +#endif // NCNN_VULKAN + +#if NCNN_STRING +int Extractor::input(const char* blob_name, const Mat& in) +{ + int blob_index = net->find_blob_index_by_name(blob_name); + if (blob_index == -1) return -1; return input(blob_index, in); @@ -2057,60 +2561,59 @@ int Extractor::extract(int blob_index, Mat& feat) cmd.create_query_pool(net->layers.size() * 2); #endif // NCNN_BENCHMARK - VkMat feat_gpu; - ret = extract(blob_index, feat_gpu, cmd); - - if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0) + // TODO vkimagemat for adreno + if (opt.use_image_storage) { - // unpacking - VkMat feat_gpu_unpacked_fp16; - net->packing_pack1->forward(feat_gpu, feat_gpu_unpacked_fp16, cmd, opt); + VkImageMat feat_gpu; + ret = extract(blob_index, feat_gpu, cmd); - // cast to fp32 (integrated gpu) - VkMat feat_gpu_unpacked; - if 
(opt.use_fp16_storage && net->vkdev->info.type != 0) + if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0) { - net->cast_float16_to_float32->forward(feat_gpu_unpacked_fp16, feat_gpu_unpacked, cmd, opt); - } - else - { - feat_gpu_unpacked = feat_gpu_unpacked_fp16; - } + cmd.record_download(feat_gpu, blob_mats[blob_index], opt); - // download - Mat feat_cpu_fp16; - cmd.record_download(feat_gpu_unpacked, feat_cpu_fp16, opt); - - cmd.submit_and_wait(); + cmd.submit_and_wait(); #if NCNN_BENCHMARK - std::vector results(net->layers.size() * 2); - cmd.get_query_pool_results(0, net->layers.size() * 2, results); - for (size_t i=0; ilayers.size(); i++) - { - uint64_t start = results[i*2]; - uint64_t end = results[i*2+1]; - if (start == 0 || end == 0) - continue; + std::vector results(net->layers.size() * 2); + cmd.get_query_pool_results(0, net->layers.size() * 2, results); + for (size_t i=0; ilayers.size(); i++) + { + uint64_t start = results[i*2]; + uint64_t end = results[i*2+1]; + if (start == 0 || end == 0) + continue; - double duration_us = (end - start) * net->vkdev->info.timestamp_period / 1000; - fprintf(stderr, "%-24s %-30s %8.2lfus |\n", net->layers[i]->type.c_str(), net->layers[i]->name.c_str(), duration_us); - } + double duration_us = (end - start) * net->vkdev->info.timestamp_period / 1000; + fprintf(stderr, "%-24s %-30s %8.2lfus |\n", net->layers[i]->type.c_str(), net->layers[i]->name.c_str(), duration_us); + } #endif // NCNN_BENCHMARK - - // cast to fp32 (discrete gpu) - Mat& feat_cpu = blob_mats[blob_index]; - if (opt.use_fp16_storage && net->vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(feat_cpu_fp16, feat_cpu, opt); - } - else if (feat_cpu_fp16.elempack == 4 && opt.use_fp16_packed && !opt.use_fp16_storage && net->vkdev->info.type == 0) - { - ncnn::cast_float16_to_float32(feat_cpu_fp16, feat_cpu, opt); } - else + } + else + { + VkMat feat_gpu; + ret = extract(blob_index, feat_gpu, cmd); + + if (blob_mats[blob_index].dims == 0 && 
feat_gpu.dims != 0) { - feat_cpu = feat_cpu_fp16; + cmd.record_download(feat_gpu, blob_mats[blob_index], opt); + + cmd.submit_and_wait(); + +#if NCNN_BENCHMARK + std::vector results(net->layers.size() * 2); + cmd.get_query_pool_results(0, net->layers.size() * 2, results); + for (size_t i=0; ilayers.size(); i++) + { + uint64_t start = results[i*2]; + uint64_t end = results[i*2+1]; + if (start == 0 || end == 0) + continue; + + double duration_us = (end - start) * net->vkdev->info.timestamp_period / 1000; + fprintf(stderr, "%-24s %-30s %8.2lfus |\n", net->layers[i]->type.c_str(), net->layers[i]->name.c_str(), duration_us); + } +#endif // NCNN_BENCHMARK } } } @@ -2155,6 +2658,24 @@ int Extractor::extract(const char* blob_name, VkMat& feat, VkCompute& cmd) return extract(blob_index, feat, cmd); } + +int Extractor::input(const char* blob_name, const VkImageMat& in) +{ + int blob_index = net->find_blob_index_by_name(blob_name); + if (blob_index == -1) + return -1; + + return input(blob_index, in); +} + +int Extractor::extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd) +{ + int blob_index = net->find_blob_index_by_name(blob_name); + if (blob_index == -1) + return -1; + + return extract(blob_index, feat, cmd); +} #endif // NCNN_STRING int Extractor::input(int blob_index, const VkMat& in) @@ -2180,10 +2701,50 @@ int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd) ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, cmd, opt); } + if (blob_mats_gpu[blob_index].dims == 0 && blob_mats_gpu_image[blob_index].dims != 0) + { + // image to buffer + cmd.record_image_to_buffer(blob_mats_gpu_image[blob_index], blob_mats_gpu[blob_index], opt); + } + feat = blob_mats_gpu[blob_index]; return ret; } + +int Extractor::input(int blob_index, const VkImageMat& in) +{ + if (blob_index < 0 || blob_index >= (int)blob_mats.size()) + return -1; + + blob_mats_gpu_image[blob_index] = in; + + return 0; +} + +int Extractor::extract(int blob_index, VkImageMat& 
feat, VkCompute& cmd) +{ + if (blob_index < 0 || blob_index >= (int)blob_mats.size()) + return -1; + + int ret = 0; + + if (blob_mats_gpu_image[blob_index].dims == 0) + { + int layer_index = net->blobs[blob_index].producer; + ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); + } + + if (blob_mats_gpu_image[blob_index].dims == 0 && blob_mats_gpu[blob_index].dims != 0) + { + // buffer to image + cmd.record_buffer_to_image(blob_mats_gpu[blob_index], blob_mats_gpu_image[blob_index], opt); + } + + feat = blob_mats_gpu_image[blob_index]; + + return ret; +} #endif // NCNN_VULKAN } // namespace ncnn diff --git a/src/net.h b/src/net.h index dbeb854bb..08abeb5fb 100644 --- a/src/net.h +++ b/src/net.h @@ -148,10 +148,11 @@ protected: Layer* create_custom_layer(const char* type); #endif // NCNN_STRING Layer* create_custom_layer(int index); - int forward_layer(int layer_index, std::vector& blob_mats, Option& opt) const; + int forward_layer(int layer_index, std::vector& blob_mats, const Option& opt) const; #if NCNN_VULKAN - int forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, VkCompute& cmd, Option& opt) const; + int forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, VkCompute& cmd, const Option& opt) const; + int forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, std::vector& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const; #endif // NCNN_VULKAN protected: @@ -165,12 +166,6 @@ protected: VkAllocator* weight_vkallocator; VkAllocator* weight_staging_vkallocator; - - ncnn::Layer* cast_float32_to_float16; - ncnn::Layer* cast_float16_to_float32; - ncnn::Layer* packing_pack1; - ncnn::Layer* packing_pack4; - ncnn::Layer* packing_pack8; #endif // NCNN_VULKAN }; @@ -232,6 +227,14 @@ public: // get result by blob name // return 0 if success int extract(const char* blob_name, VkMat& feat, VkCompute& cmd); + + // set input by blob 
name + // return 0 if success + int input(const char* blob_name, const VkImageMat& in); + + // get result by blob name + // return 0 if success + int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd); #endif // NCNN_STRING // set input by blob index @@ -241,6 +244,14 @@ public: // get result by blob index // return 0 if success int extract(int blob_index, VkMat& feat, VkCompute& cmd); + + // set input by blob index + // return 0 if success + int input(int blob_index, const VkImageMat& in); + + // get result by blob index + // return 0 if success + int extract(int blob_index, VkImageMat& feat, VkCompute& cmd); #endif // NCNN_VULKAN protected: @@ -257,6 +268,7 @@ private: VkAllocator* local_staging_vkallocator; std::vector blob_mats_gpu; + std::vector blob_mats_gpu_image; #endif // NCNN_VULKAN }; diff --git a/src/option.cpp b/src/option.cpp index e34d22b59..845b1c2ad 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -45,6 +45,11 @@ Option::Option() use_shader_pack8 = false; + use_image_storage = false; + use_image_fp16_packed = false; + use_image_fp16_storage = false; + use_image_fp16_arithmetic = false; + use_bf16_storage = false; } diff --git a/src/option.h b/src/option.h index ec87c5acf..4114c29de 100644 --- a/src/option.h +++ b/src/option.h @@ -93,6 +93,12 @@ public: bool use_shader_pack8; + // turn on for adreno + bool use_image_storage; + bool use_image_fp16_packed; + bool use_image_fp16_storage; + bool use_image_fp16_arithmetic; + // enable bf16 data type for storage // improve most operator performace on all arm devices, may consume more memory bool use_bf16_storage; diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 3a47a23da..1bd86bd87 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -49,7 +49,13 @@ Pipeline::~Pipeline() int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations) { - ShaderInfo si = resolve_shader_info(spv_data, spv_data_size); + ShaderInfo si; + int ret = 
resolve_shader_info(spv_data, spv_data_size, si); + if (ret != 0) + { + fprintf(stderr, "resolve_shader_info failed %d\n", ret); + return -1; + } // -3 for local_size_xyz int specialization_count_expected = si.specialization_count - 3; @@ -70,29 +76,39 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std:: // fprintf(stderr, "local_shader_module %p created\n", local_shader_module); - return create(local_shader_module, specializations, si.binding_count, si.push_constant_count); + return create(local_shader_module, si, specializations); } int Pipeline::create(int shader_type_index, const Option& opt, const std::vector& specializations) { - const ShaderInfo& si = get_shader_info(shader_type_index); - - // -3 for local_size_xyz - int specialization_count_expected = si.specialization_count - 3; - if ((int)specializations.size() != specialization_count_expected) - { - fprintf(stderr, "pipeline %d specialization count mismatch, expect %d but got %d\n", shader_type_index, specialization_count_expected, (int)specializations.size()); - return -1; - } - // ncnn_add_shader cmake macro // 0 = fp32 // 1 = fp16p // 2 = fp16pa // 3 = fp16s // 4 = fp16sa + // 5 = image + // 6 = image_fp16p + // 7 = image_fp16s + // 8 = image_fp16a - if (vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + if (opt.use_image_storage && opt.use_image_fp16_storage && opt.use_image_fp16_arithmetic) + { + shader_type_index += 8; + } + else if (opt.use_image_storage && opt.use_image_fp16_storage) + { + shader_type_index += 7; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + shader_type_index += 6; + } + else if (opt.use_image_storage) + { + shader_type_index += 5; + } + else if (vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) { shader_type_index += 4; } @@ -109,29 +125,41 @@ int Pipeline::create(int 
shader_type_index, const Option& opt, const std::vector shader_type_index += 1; } + const ShaderInfo& si = get_shader_info(shader_type_index); + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + if ((int)specializations.size() != specialization_count_expected) + { + fprintf(stderr, "pipeline %d specialization count mismatch, expect %d but got %d\n", shader_type_index, specialization_count_expected, (int)specializations.size()); + return -1; + } + if (vkdev->info.bug_local_size_spec_const) { local_shader_module = vkdev->create_shader_module(shader_type_index, local_size_x, local_size_y, local_size_z); - return create(local_shader_module, specializations, si.binding_count, si.push_constant_count); + return create(local_shader_module, si, specializations); } VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index); - return create(shader_module, specializations, si.binding_count, si.push_constant_count); + return create(shader_module, si, specializations); } -int Pipeline::create(VkShaderModule shader_module, const std::vector& specializations, int binding_count, int push_constant_count) +int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector& specializations) { - create_descriptorset_layout(binding_count); + shader_info = _shader_info; - create_pipeline_layout(push_constant_count); + create_descriptorset_layout(); + + create_pipeline_layout(); create_pipeline(shader_module, specializations); if (vkdev->info.support_VK_KHR_descriptor_update_template) { - create_descriptor_update_template(binding_count); + create_descriptor_update_template(); } return 0; @@ -222,8 +250,10 @@ void Pipeline::set_local_size_xyz(int w, int h, int c) // fprintf(stderr, "local size = %d %d %d\n", local_size_x, local_size_y, local_size_z); } -int Pipeline::create_descriptorset_layout(int binding_count) +int Pipeline::create_descriptorset_layout() { + const int binding_count = 
shader_info.binding_count; + if (binding_count == 0) { descriptorset_layout = 0; @@ -233,11 +263,27 @@ int Pipeline::create_descriptorset_layout(int binding_count) std::vector descriptorSetLayoutBindings(binding_count); for (int i=0; iimmutable_texelfetch_sampler();// we always use texelfetch + } } VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo; @@ -262,8 +308,10 @@ int Pipeline::create_descriptorset_layout(int binding_count) return 0; } -int Pipeline::create_pipeline_layout(int push_constant_count) +int Pipeline::create_pipeline_layout() { + const int push_constant_count = shader_info.push_constant_count; + VkPushConstantRange pushConstantRange; pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; pushConstantRange.offset = 0; @@ -380,8 +428,10 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector descriptorUpdateTemplateEntries(binding_count); + size_t offset = 0; for (int i=0; iinfo.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + if (opt.use_image_storage && opt.use_image_fp16_storage && opt.use_image_fp16_arithmetic) + { + shader_type_index += 8; + } + else if (opt.use_image_storage && opt.use_image_fp16_storage) + { + shader_type_index += 7; + } + else if (opt.use_image_storage && opt.use_image_fp16_packed) + { + shader_type_index += 6; + } + else if (opt.use_image_storage) + { + shader_type_index += 5; + } + else if (vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) { shader_type_index += 4; } diff --git a/src/pipeline.h b/src/pipeline.h index 7f895d83f..1600bcb0f 100644 --- a/src/pipeline.h +++ b/src/pipeline.h @@ -41,15 +41,15 @@ public: int create(int shader_type_index, const Option& opt, const std::vector& specializations); - int create(VkShaderModule shader_module, const std::vector& specializations, int binding_count, int push_constant_count); + int 
create(VkShaderModule shader_module, const ShaderInfo& si, const std::vector& specializations); void destroy(); protected: - int create_descriptorset_layout(int binding_count); - int create_pipeline_layout(int push_constant_count); + int create_descriptorset_layout(); + int create_pipeline_layout(); int create_pipeline(VkShaderModule shader_module, const std::vector& specializations); - int create_descriptor_update_template(int binding_count); + int create_descriptor_update_template(); public: const VulkanDevice* vkdev; @@ -65,6 +65,8 @@ public: VkDescriptorUpdateTemplateKHR descriptor_update_template; + ShaderInfo shader_info; + uint32_t local_size_x; uint32_t local_size_y; uint32_t local_size_z; diff --git a/tests/test_cast.cpp b/tests/test_cast.cpp index f8d2db528..a14d3171c 100644 --- a/tests/test_cast.cpp +++ b/tests/test_cast.cpp @@ -154,6 +154,7 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) opt.use_int8_storage = false; opt.use_int8_arithmetic = false; opt.use_packing_layout = true; + opt.use_image_storage = false; ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); @@ -164,8 +165,8 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) opt.workspace_vkallocator = blob_vkallocator; opt.staging_vkallocator = staging_vkallocator; - if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; ncnn::Layer* op = ncnn::create_layer("Cast"); @@ -213,7 +214,7 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) // upload ncnn::VkMat a4_gpu; - cmd.record_upload(a4_fp16, a4_gpu, opt); + cmd.record_clone(a4_fp16, a4_gpu, opt); ncnn::VkMat d4_gpu; if (op->support_inplace) @@ -228,7 +229,7 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) } // download - cmd.record_download(d4_gpu, d, opt); + 
cmd.record_clone(d4_gpu, d, opt); cmd.submit_and_wait(); @@ -267,6 +268,7 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type opt.use_int8_arithmetic = false; opt.use_packing_layout = true; opt.use_shader_pack8 = true; + opt.use_image_storage = false; ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); @@ -277,8 +279,8 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type opt.workspace_vkallocator = blob_vkallocator; opt.staging_vkallocator = staging_vkallocator; - if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; ncnn::Layer* op = ncnn::create_layer("Cast"); @@ -328,7 +330,7 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type // upload ncnn::VkMat a4_gpu; - cmd.record_upload(a4_fp16, a4_gpu, opt); + cmd.record_clone(a4_fp16, a4_gpu, opt); ncnn::VkMat d4_gpu; if (op->support_inplace) @@ -343,7 +345,7 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type } // download - cmd.record_download(d4_gpu, d, opt); + cmd.record_clone(d4_gpu, d, opt); cmd.submit_and_wait(); @@ -362,6 +364,247 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type return 0; } + +static int test_cast_gpu_image_fp16p(const ncnn::Mat& a, int type_from, int type_to) +{ + ncnn::ParamDict pd; + pd.set(0, type_from); + pd.set(1, type_to); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + opt.use_int8_inference = false; + opt.use_fp16_packed = true; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_int8_storage = false; + opt.use_int8_arithmetic = false; + opt.use_packing_layout = true; + opt.use_image_storage = true; + opt.use_image_fp16_packed = true; + opt.use_image_fp16_storage = false; + 
opt.use_image_fp16_arithmetic = false; + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + + ncnn::Layer* op = ncnn::create_layer("Cast"); + + op->vkdev = vkdev; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat a_fp16; + if (type_from == 2) + { + ncnn::cast_float32_to_float16(a, a_fp16, opt); + } + else + { + a_fp16 = a; + } + + ncnn::Mat b; + ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + + ncnn::Mat d; + + // pack + ncnn::Mat a4; + ncnn::convert_packing(a, a4, 4, opt); + + ncnn::Mat a4_fp16; + if (type_from == 2 && a4.elempack == 4) + { + ncnn::cast_float32_to_float16(a4, a4_fp16, opt); + } + else + { + a4_fp16 = a4; + } + + // forward + ncnn::VkCompute cmd(vkdev); + + // upload + ncnn::VkImageMat a4_gpu; + cmd.record_clone(a4_fp16, a4_gpu, opt); + + ncnn::VkImageMat d4_gpu; + if (op->support_inplace) + { + op->forward_inplace(a4_gpu, cmd, opt); + + d4_gpu = a4_gpu; + } + else + { + op->forward(a4_gpu, d4_gpu, cmd, opt); + } + + // download + cmd.record_clone(d4_gpu, d, opt); + + cmd.submit_and_wait(); + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); + + if (CompareMat(b, d, 0.001) != 0) 
+ { + fprintf(stderr, "test_cast_gpu_image_fp16p failed a.dims=%d a=(%d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.c, type_from, type_to); + return -1; + } + + return 0; +} + +static int test_cast_gpu_image_fp16p_pack8(const ncnn::Mat& a, int type_from, int type_to) +{ + ncnn::ParamDict pd; + pd.set(0, type_from); + pd.set(1, type_to); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + opt.use_int8_inference = false; + opt.use_fp16_packed = true; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_int8_storage = false; + opt.use_int8_arithmetic = false; + opt.use_packing_layout = true; + opt.use_shader_pack8 = true; + opt.use_image_storage = true; + opt.use_image_fp16_packed = true; + opt.use_image_fp16_storage = false; + opt.use_image_fp16_arithmetic = false; + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + + ncnn::Layer* op = ncnn::create_layer("Cast"); + + op->vkdev = vkdev; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat a_fp16; + if (type_from == 2) + { + ncnn::cast_float32_to_float16(a, a_fp16, opt); + } + else + { + a_fp16 = a; + } + + ncnn::Mat b; + ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, 
opt); + + ncnn::Mat d; + + // pack + ncnn::Mat a4; + ncnn::convert_packing(a, a4, 8, opt); + if (a4.elempack != 8) + ncnn::convert_packing(a, a4, 4, opt); + + ncnn::Mat a4_fp16; + if (type_from == 2 && (a4.elempack == 4 || a4.elempack == 8)) + { + ncnn::cast_float32_to_float16(a4, a4_fp16, opt); + } + else + { + a4_fp16 = a4; + } + + // forward + ncnn::VkCompute cmd(vkdev); + + // upload + ncnn::VkImageMat a4_gpu; + cmd.record_clone(a4_fp16, a4_gpu, opt); + + ncnn::VkImageMat d4_gpu; + if (op->support_inplace) + { + op->forward_inplace(a4_gpu, cmd, opt); + + d4_gpu = a4_gpu; + } + else + { + op->forward(a4_gpu, d4_gpu, cmd, opt); + } + + // download + cmd.record_clone(d4_gpu, d, opt); + + cmd.submit_and_wait(); + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); + + if (CompareMat(b, d, 0.001) != 0) + { + fprintf(stderr, "test_cast_gpu_image_fp16p_pack8 failed a.dims=%d a=(%d %d %d) type_from=%d type_to=%d\n", a.dims, a.w, a.h, a.c, type_from, type_to); + return -1; + } + + return 0; +} #endif // NCNN_VULKAN static int test_cast(const ncnn::Mat& a, int type_from, int type_to) @@ -372,6 +615,8 @@ static int test_cast(const ncnn::Mat& a, int type_from, int type_to) #if NCNN_VULKAN || test_cast_gpu_fp16p(a, type_from, type_to) || test_cast_gpu_fp16p_pack8(a, type_from, type_to) + || test_cast_gpu_image_fp16p(a, type_from, type_to) + || test_cast_gpu_image_fp16p_pack8(a, type_from, type_to) #endif // NCNN_VULKAN ; } diff --git a/tests/test_packing.cpp b/tests/test_packing.cpp index 02f94b9f1..95841d9b8 100644 --- a/tests/test_packing.cpp +++ b/tests/test_packing.cpp @@ -16,10 +16,56 @@ #include "layer/packing.h" -static int test_packing(const ncnn::Mat& a, int out_elempack) +static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempack) { ncnn::ParamDict pd; - pd.set(0, out_elempack);//out_elempack + pd.set(0, out_elempack); + + std::vector 
weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = false; + opt.use_int8_inference = false; + opt.use_packing_layout = false; + + ncnn::Layer* op = ncnn::create_layer("Packing"); + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat ap; + ncnn::convert_packing(a, ap, in_elempack); + + ncnn::Mat b; + ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + + ncnn::Mat c; + op->forward(ap, c, opt); + + op->destroy_pipeline(opt); + + delete op; + + if (CompareMat(b, c, 0.001) != 0) + { + fprintf(stderr, "test_packing_cpu failed a.dims=%d a=(%d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.c, in_elempack, out_elempack); + return -1; + } + + return 0; +} + +#if NCNN_VULKAN +static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack) +{ + ncnn::ParamDict pd; + pd.set(0, out_elempack); std::vector weights(0); @@ -27,23 +73,206 @@ static int test_packing(const ncnn::Mat& a, int out_elempack) opt.num_threads = 1; opt.use_vulkan_compute = true; opt.use_int8_inference = false; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_int8_storage = false; + opt.use_int8_arithmetic = false; + opt.use_packing_layout = true; + opt.use_shader_pack8 = true; + opt.use_image_storage = false; + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + + ncnn::Layer* op = ncnn::create_layer("Packing"); + + op->vkdev = vkdev; + + 
op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat ap; + ncnn::convert_packing(a, ap, in_elempack); + + ncnn::Mat b; + ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + + ncnn::Mat d; + + // forward + ncnn::VkCompute cmd(vkdev); + + // upload + ncnn::VkMat a_gpu; + cmd.record_clone(ap, a_gpu, opt); + + ncnn::VkMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + cmd.record_clone(d_gpu, d, opt); + + cmd.submit_and_wait(); + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); + + if (CompareMat(b, d, 0.001) != 0) + { + fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.c, in_elempack, out_elempack); + return -1; + } + + return 0; +} + +static int test_packing_gpu_image(const ncnn::Mat& a, int in_elempack, int out_elempack) +{ + ncnn::ParamDict pd; + pd.set(0, out_elempack); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + opt.use_int8_inference = false; + opt.use_fp16_packed = true; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_int8_storage = false; + opt.use_int8_arithmetic = false; + opt.use_packing_layout = true; + opt.use_shader_pack8 = true; + opt.use_image_storage = true; + opt.use_image_fp16_packed = false; + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if 
(!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + + ncnn::Layer* op = ncnn::create_layer("Packing"); + + op->vkdev = vkdev; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat ap; + ncnn::convert_packing(a, ap, in_elempack); + + ncnn::Mat b; + ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + + ncnn::Mat d; + + // forward + ncnn::VkCompute cmd(vkdev); + + // upload + ncnn::VkImageMat a_gpu; + cmd.record_clone(ap, a_gpu, opt); + + ncnn::VkImageMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + cmd.record_clone(d_gpu, d, opt); + + cmd.submit_and_wait(); + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); - int ret = test_layer("Packing", pd, weights, opt, a); - if (ret != 0) + if (CompareMat(b, d, 0.001) != 0) { - fprintf(stderr, "test_packing failed a.dims=%d a=(%d %d %d) out_elempack=%d\n", a.dims, a.w, a.h, a.c, out_elempack); + fprintf(stderr, "test_packing_gpu_image failed a.dims=%d a=(%d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.c, in_elempack, out_elempack); + return -1; } - return ret; + return 0; } +#endif static int test_packing_0() { ncnn::Mat a = RandomMat(3, 5, 16); return 0 - || test_packing(a, 1) - || test_packing(a, 4) + || test_packing_cpu(a, 1, 4) + || test_packing_cpu(a, 4, 1) + || test_packing_cpu(a, 1, 8) + || test_packing_cpu(a, 8, 1) + || test_packing_cpu(a, 4, 8) + || test_packing_cpu(a, 8, 4) +#if NCNN_VULKAN + || test_packing_gpu(a, 1, 4) + || test_packing_gpu(a, 4, 1) + || test_packing_gpu(a, 1, 8) + || test_packing_gpu(a, 8, 1) + || test_packing_gpu(a, 4, 8) + || test_packing_gpu(a, 8, 4) + || 
test_packing_gpu_image(a, 1, 4) + || test_packing_gpu_image(a, 4, 1) + || test_packing_gpu_image(a, 1, 8) + || test_packing_gpu_image(a, 8, 1) + || test_packing_gpu_image(a, 4, 8) + || test_packing_gpu_image(a, 8, 4) +#endif // NCNN_VULKAN ; } @@ -52,8 +281,26 @@ static int test_packing_1() ncnn::Mat a = RandomMat(3, 16); return 0 - || test_packing(a, 1) - || test_packing(a, 4) + || test_packing_cpu(a, 1, 4) + || test_packing_cpu(a, 4, 1) + || test_packing_cpu(a, 1, 8) + || test_packing_cpu(a, 8, 1) + || test_packing_cpu(a, 4, 8) + || test_packing_cpu(a, 8, 4) +#if NCNN_VULKAN + || test_packing_gpu(a, 1, 4) + || test_packing_gpu(a, 4, 1) + || test_packing_gpu(a, 1, 8) + || test_packing_gpu(a, 8, 1) + || test_packing_gpu(a, 4, 8) + || test_packing_gpu(a, 8, 4) + || test_packing_gpu_image(a, 1, 4) + || test_packing_gpu_image(a, 4, 1) + || test_packing_gpu_image(a, 1, 8) + || test_packing_gpu_image(a, 8, 1) + || test_packing_gpu_image(a, 4, 8) + || test_packing_gpu_image(a, 8, 4) +#endif // NCNN_VULKAN ; } @@ -62,8 +309,26 @@ static int test_packing_2() ncnn::Mat a = RandomMat(16); return 0 - || test_packing(a, 1) - || test_packing(a, 4) + || test_packing_cpu(a, 1, 4) + || test_packing_cpu(a, 4, 1) + || test_packing_cpu(a, 1, 8) + || test_packing_cpu(a, 8, 1) + || test_packing_cpu(a, 4, 8) + || test_packing_cpu(a, 8, 4) +#if NCNN_VULKAN + || test_packing_gpu(a, 1, 4) + || test_packing_gpu(a, 4, 1) + || test_packing_gpu(a, 1, 8) + || test_packing_gpu(a, 8, 1) + || test_packing_gpu(a, 4, 8) + || test_packing_gpu(a, 8, 4) + || test_packing_gpu_image(a, 1, 4) + || test_packing_gpu_image(a, 4, 1) + || test_packing_gpu_image(a, 1, 8) + || test_packing_gpu_image(a, 8, 1) + || test_packing_gpu_image(a, 4, 8) + || test_packing_gpu_image(a, 8, 4) +#endif // NCNN_VULKAN ; } diff --git a/tests/testutil.h b/tests/testutil.h index a54cc8bf2..a98d58615 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -213,6 +213,7 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, 
const std::vectorsupport_vulkan) opt.use_vulkan_compute = false; if (!op->support_packing) opt.use_packing_layout = false; if (!op->support_bf16_storage) opt.use_bf16_storage = false; + if (!op->support_image_storage) opt.use_image_storage = false; if (opt.use_int8_inference) opt.use_bf16_storage = false; if (opt.use_int8_inference) opt.use_packing_layout = false; @@ -220,8 +221,8 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectoracquire_blob_allocator(); ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); @@ -230,8 +231,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorinfo.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + if (!vkdev->info.support_image_fp16_arithmetic) opt.use_image_fp16_arithmetic = false; op->vkdev = vkdev; #endif // NCNN_VULKAN @@ -342,60 +348,62 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector d(top_blob_count); if (opt.use_vulkan_compute) { - // pack - std::vector a4(a.size()); - for (size_t i=0; i a_gpu(a.size()); + for (size_t i=0; i a4_fp16(a4.size()); - for (size_t i=0; i d_gpu(top_blob_count); + if (op->support_inplace) { - ncnn::cast_float32_to_float16(a4[i], a4_fp16[i], opt); + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; } else { - a4_fp16[i] = a4[i]; + op->forward(a_gpu, d_gpu, cmd, opt); } - } - // forward - ncnn::VkCompute cmd(vkdev); - - // upload - std::vector a4_fp16_gpu(a4_fp16.size()); - for (size_t i=0; i d4_fp16_gpu(top_blob_count); - if 
(op->support_inplace) - { - op->forward_inplace(a4_fp16_gpu, cmd, opt); - - d4_fp16_gpu = a4_fp16_gpu; + // download + for (size_t i=0; iforward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); - } + // upload + std::vector a_gpu(a.size()); + for (size_t i=0; i d_gpu(top_blob_count); + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + for (size_t i=0; i(typeindex, pd, weights, opt, a, top_blob_count, b, epsilon, func); + { + int ret = test_layer(typeindex, pd, weights, opt, a, top_blob_count, b, epsilon, func); + if (ret != 0) + { + fprintf(stderr, "test_layer failed gpu with shape hint\n"); + } + return ret; + } return 0; } @@ -447,6 +462,7 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_vulkan) opt.use_vulkan_compute = false; if (!op->support_packing) opt.use_packing_layout = false; if (!op->support_bf16_storage) opt.use_bf16_storage = false; + if (!op->support_image_storage) opt.use_image_storage = false; if (opt.use_int8_inference) opt.use_bf16_storage = false; if (opt.use_int8_inference) opt.use_packing_layout = false; @@ -454,8 +470,8 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectoracquire_blob_allocator(); ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); @@ -464,8 +480,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorinfo.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; + if (!vkdev->info.support_image_storage) opt.use_image_storage = false; + if (!vkdev->info.support_image_fp16_packed) opt.use_image_fp16_packed = false; + if (!vkdev->info.support_image_fp16_storage) opt.use_image_fp16_storage = false; + if 
(!vkdev->info.support_image_fp16_arithmetic) opt.use_image_fp16_arithmetic = false; op->vkdev = vkdev; #endif // NCNN_VULKAN @@ -554,49 +575,51 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_inplace) + if (opt.use_image_storage) { - op->forward_inplace(a4_fp16_gpu, cmd, opt); + // upload + ncnn::VkImageMat a_gpu; + cmd.record_upload(a, a_gpu, opt); + + ncnn::VkImageMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } - d4_fp16_gpu = a4_fp16_gpu; + // download + cmd.record_download(d_gpu, d, opt); } else { - op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); - } + // upload + ncnn::VkMat a_gpu; + cmd.record_upload(a, a_gpu, opt); + + ncnn::VkMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } - // download - cmd.record_download(d4_fp16_gpu, d, opt); + // download + cmd.record_download(d_gpu, d, opt); + } cmd.submit_and_wait(); } @@ -628,7 +651,14 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector(typeindex, pd, weights, opt, a, b, epsilon, func); + { + int ret = test_layer(typeindex, pd, weights, opt, a, b, epsilon, func); + if (ret != 0) + { + fprintf(stderr, "test_layer failed gpu with shape hint\n"); + } + return ret; + } return 0; } @@ -642,17 +672,27 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec opts[0].use_fp16_packed = false; opts[0].use_fp16_storage = false; opts[0].use_shader_pack8 = false; + opts[0].use_image_storage = false; + opts[0].use_image_fp16_packed = false; + opts[0].use_image_fp16_storage = false; opts[1] = _opt; opts[1].use_packing_layout = true; opts[1].use_fp16_packed = true; opts[1].use_fp16_storage = false; opts[1].use_shader_pack8 = true; + opts[1].use_image_storage = false; + opts[1].use_image_fp16_packed = false; + 
opts[1].use_image_fp16_storage = false; opts[2] = _opt; opts[2].use_packing_layout = true; - opts[2].use_vulkan_compute = false; + opts[2].use_vulkan_compute = false;//TODO enable me opts[2].use_fp16_packed = false; opts[2].use_fp16_storage = false; opts[2].use_bf16_storage = true; + opts[2].use_shader_pack8 = true; + opts[2].use_image_storage = true; + opts[2].use_image_fp16_packed = true; + opts[2].use_image_fp16_storage = false; for (int i = 0; i < 3; i++) { @@ -662,39 +702,39 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec std::vector a_fp16; std::vector weights_fp16; float epsilon_fp16; - if (opt.use_fp16_packed || opt.use_fp16_storage) + if (opt.use_bf16_storage) { a_fp16.resize(a.size()); for (size_t j = 0; j < a.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_float16(a[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt); + ncnn::cast_float32_to_bfloat16(a[j], tmp, opt); + ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt); } weights_fp16.resize(weights.size()); for (size_t j = 0; j < weights.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_float16(weights[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); + ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); + ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt); } epsilon_fp16 = epsilon * 100;// 0.1 } - else if (opt.use_bf16_storage) + else if (opt.use_fp16_packed || opt.use_fp16_storage || opt.use_image_fp16_storage) { a_fp16.resize(a.size()); for (size_t j = 0; j < a.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(a[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt); + ncnn::cast_float32_to_float16(a[j], tmp, opt); + ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt); } weights_fp16.resize(weights.size()); for (size_t j = 0; j < weights.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], 
opt); + ncnn::cast_float32_to_float16(weights[j], tmp, opt); + ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); } epsilon_fp16 = epsilon * 100;// 0.1 } @@ -709,7 +749,7 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func); if (ret != 0) { - fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_shader_pack8=%d use_bf16_storage=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_shader_pack8, opt.use_bf16_storage); + fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_image_fp16_packed=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_image_fp16_packed); } } @@ -725,17 +765,27 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec opts[0].use_fp16_packed = false; opts[0].use_fp16_storage = false; opts[0].use_shader_pack8 = false; + opts[0].use_image_storage = false; + opts[0].use_image_fp16_packed = false; + opts[0].use_image_fp16_storage = false; opts[1] = _opt; opts[1].use_packing_layout = true; opts[1].use_fp16_packed = true; opts[1].use_fp16_storage = false; opts[1].use_shader_pack8 = true; + opts[1].use_image_storage = false; + opts[1].use_image_fp16_packed = false; + opts[1].use_image_fp16_storage = false; opts[2] = _opt; opts[2].use_packing_layout = true; - opts[2].use_vulkan_compute = false; + opts[2].use_vulkan_compute = false;//TODO enable me opts[2].use_fp16_packed = false; opts[2].use_fp16_storage = false; opts[2].use_bf16_storage = true; + opts[2].use_shader_pack8 = true; + opts[2].use_image_storage = true; + opts[2].use_image_fp16_packed = true; + opts[2].use_image_fp16_storage = false; for (int i = 0; i < 3; i++) { @@ -745,35 
+795,35 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec ncnn::Mat a_fp16; std::vector weights_fp16; float epsilon_fp16; - if (opt.use_fp16_packed || opt.use_fp16_storage) + if (opt.use_bf16_storage) { { ncnn::Mat tmp; - ncnn::cast_float32_to_float16(a, tmp, opt); - ncnn::cast_float16_to_float32(tmp, a_fp16, opt); + ncnn::cast_float32_to_bfloat16(a, tmp, opt); + ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt); } weights_fp16.resize(weights.size()); for (size_t j = 0; j < weights.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_float16(weights[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); + ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); + ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt); } epsilon_fp16 = epsilon * 100;// 0.1 } - else if (opt.use_bf16_storage) + else if (opt.use_fp16_packed || opt.use_fp16_storage || opt.use_image_fp16_storage) { { ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(a, tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt); + ncnn::cast_float32_to_float16(a, tmp, opt); + ncnn::cast_float16_to_float32(tmp, a_fp16, opt); } weights_fp16.resize(weights.size()); for (size_t j = 0; j < weights.size(); j++) { ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt); + ncnn::cast_float32_to_float16(weights[j], tmp, opt); + ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); } epsilon_fp16 = epsilon * 100;// 0.1 } @@ -788,7 +838,7 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func); if (ret != 0) { - fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_shader_pack8=%d use_bf16_storage=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_shader_pack8, opt.use_bf16_storage); + 
fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_image_fp16_packed=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_image_fp16_packed); return ret; } }