From 17c445480feeb5df4310b1ce7a45df32ccdcb746 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 22 May 2020 10:34:35 +0800 Subject: [PATCH] runtime spir-v compilation with libglslang (#1779) --- .github/workflows/android-armv7-gpu.yml | 24 + .github/workflows/android-armv8-gpu.yml | 24 + .github/workflows/android-x64-gpu.yml | 24 + .github/workflows/android-x86-gpu.yml | 24 + .github/workflows/ios-64bit-gpu.yml | 24 + .github/workflows/linux-x64-gpu-clang.yml | 62 +++ .github/workflows/linux-x64-gpu-gcc.yml | 59 ++ .github/workflows/macos-x64-gpu.yml | 58 ++ .github/workflows/test-coverage.yml | 68 +++ .github/workflows/windows-x64-gpu-vs2019.yml | 73 +++ .gitmodules | 3 + CMakeLists.txt | 23 + glslang | 1 + src/CMakeLists.txt | 130 +++-- src/convert_ycbcr.comp | 2 +- src/gpu.cpp | 557 +++++++++++++++++++ src/gpu.h | 13 + src/pipeline.cpp | 82 ++- src/platform.h.in | 1 + 19 files changed, 1205 insertions(+), 47 deletions(-) create mode 100644 .gitmodules create mode 160000 glslang diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml index ba87783cf..034aeed00 100644 --- a/.github/workflows/android-armv7-gpu.yml +++ b/.github/workflows/android-armv7-gpu.yml @@ -22,3 +22,27 @@ jobs: run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 + + android-armv7-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: configure + run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml index 00fb1bfd6..9dd19a946 100644 --- a/.github/workflows/android-armv8-gpu.yml +++ b/.github/workflows/android-armv8-gpu.yml @@ -22,3 +22,27 @@ jobs: run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 + + android-aarch64-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: configure + run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml index c65f6de8c..4f7ac0762 100644 --- a/.github/workflows/android-x64-gpu.yml +++ b/.github/workflows/android-x64-gpu.yml @@ -22,3 +22,27 @@ jobs: run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 + + android-x86_64-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: configure + run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml index f3cc09e27..b89083e3a 100644 --- a/.github/workflows/android-x86-gpu.yml +++ b/.github/workflows/android-x86-gpu.yml @@ -22,3 +22,27 @@ jobs: run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 + + android-x86-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: configure + run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/ios-64bit-gpu.yml b/.github/workflows/ios-64bit-gpu.yml index 7f4ecabd9..d28317543 100644 --- a/.github/workflows/ios-64bit-gpu.yml +++ b/.github/workflows/ios-64bit-gpu.yml @@ -22,3 +22,27 @@ jobs: run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON .. 
- name: build run: cmake --build build -j 2 + + ios-iphone-os-gpu-online-spirv: + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "vulkansdk-macos-1.1.114.0" + key: vulkansdk-macos-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz + tar -xf vulkansdk-macos-1.1.114.0.tar.gz + rm -rf vulkansdk-macos-1.1.114.0/Applications + find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm + - name: configure + run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. 
+ - name: build + run: cmake --build build -j 2 diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml index bbf73f6ab..07912c233 100644 --- a/.github/workflows/linux-x64-gpu-clang.yml +++ b/.github/workflows/linux-x64-gpu-clang.yml @@ -60,3 +60,65 @@ jobs: run: | export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build && ctest --output-on-failure -j 2 + + linux-clang-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v1 + with: + path: swiftshader-install + key: swiftshader-linux-install-20200508 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v2 + with: + repository: google/swiftshader + path: swiftshader + ref: 2dd864470e310d173d35fa95ca3a14d216734aab + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE 
-DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 2 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: configure + env: + CC: clang + CXX: clang++ + run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 + - name: test + run: | + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 2 diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml index d51444b80..d2d785aa9 100644 --- a/.github/workflows/linux-x64-gpu-gcc.yml +++ b/.github/workflows/linux-x64-gpu-gcc.yml @@ -57,3 +57,62 @@ jobs: run: | export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build && ctest --output-on-failure -j 2 + + linux-gcc-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: update + run: sudo apt-get update + - name: protobuf + run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 
'vulkan|glslang' | xargs rm + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v1 + with: + path: swiftshader-install + key: swiftshader-linux-install-20200508 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v2 + with: + repository: google/swiftshader + path: swiftshader + ref: 2dd864470e310d173d35fa95ca3a14d216734aab + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 2 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: configure + run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. 
+ - name: build + run: cmake --build build -j 2 + - name: test + run: | + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 2 diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml index cdcbd32e5..6bb36a4b4 100644 --- a/.github/workflows/macos-x64-gpu.yml +++ b/.github/workflows/macos-x64-gpu.yml @@ -56,3 +56,61 @@ jobs: export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" cd build && ctest --output-on-failure -j 2 + + macos-clang-gpu-online-spirv: + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: protobuf + run: brew install protobuf opencv3 + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "vulkansdk-macos-1.1.114.0" + key: vulkansdk-macos-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz + tar -xf vulkansdk-macos-1.1.114.0.tar.gz + rm -rf vulkansdk-macos-1.1.114.0/Applications + find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v1 + with: + path: swiftshader-install + key: swiftshader-macos-install-20200508 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v2 + with: + repository: google/swiftshader + path: swiftshader + ref: 2dd864470e310d173d35fa95ca3a14d216734aab + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git submodule update --init --recursive + - name: swiftshader + if: 
steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 2 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install + - name: configure + run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build -j 2 + - name: test + run: | + export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 2 diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 0fc469581..36c5dd993 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -67,6 +67,74 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} file: build/lcov.info + linux-gcc-gpu-online-spirv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: lcov + run: sudo apt-get install lcov + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "1.1.114.0" + key: vulkansdk-linux-x86_64-1.1.114.0 + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz + tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz + rm -rf 1.1.114.0/source 
1.1.114.0/samples + find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v1 + with: + path: swiftshader-install + key: swiftshader-linux-install-20200508 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v2 + with: + repository: google/swiftshader + path: swiftshader + ref: 2dd864470e310d173d35fa95ca3a14d216734aab + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j 2 + mkdir $GITHUB_WORKSPACE/swiftshader-install + cp Linux/* $GITHUB_WORKSPACE/swiftshader-install + - name: configure + run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. 
+ - name: build + run: cmake --build build -j 2 + - name: test + run: | + export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 2 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov --list lcov.info + - name: codecov + uses: codecov/codecov-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: build/lcov.info + linux-gcc-avx2: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/windows-x64-gpu-vs2019.yml b/.github/workflows/windows-x64-gpu-vs2019.yml index cd4e70fa1..44fd6f640 100644 --- a/.github/workflows/windows-x64-gpu-vs2019.yml +++ b/.github/workflows/windows-x64-gpu-vs2019.yml @@ -71,3 +71,76 @@ jobs: Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests' $env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json" cd build; ctest --output-on-failure -j 2 + + windows-vs2019-gpu-online-spirv: + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v1 + with: + path: "protobuf-install" + key: protobuf-windows-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-vs2019; cd build-vs2019; cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 2 + cmake --build . 
--config Release --target install + - name: cache-vulkansdk + id: cache-vulkansdk + uses: actions/cache@v1 + with: + path: "VulkanSDK" + key: VulkanSDK-1.1.114.0-Installer + - name: vulkansdk + if: steps.cache-vulkansdk.outputs.cache-hit != 'true' + run: | + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.1.114.0/windows/VulkanSDK-1.1.114.0-Installer.exe?Human=true -OutFile VulkanSDK-1.1.114.0-Installer.exe + 7z x -aoa ./VulkanSDK-1.1.114.0-Installer.exe -oVulkanSDK + Remove-Item .\VulkanSDK\Demos, .\VulkanSDK\glslang, .\VulkanSDK\Samples, .\VulkanSDK\shaderc, .\VulkanSDK\spirv-tools, .\VulkanSDK\Third-Party, .\VulkanSDK\Tools, .\VulkanSDK\Tools32 -Recurse + - name: cache-swiftshader + id: cache-swiftshader + uses: actions/cache@v1 + with: + path: swiftshader-install + key: swiftshader-windows-install-20200508 + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + uses: actions/checkout@v2 + with: + repository: google/swiftshader + path: swiftshader + ref: 2dd864470e310d173d35fa95ca3a14d216734aab + - name: checkout-swiftshader-submodules + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + git submodule update --init --recursive + - name: swiftshader + if: steps.cache-swiftshader.outputs.cache-hit != 'true' + run: | + cd swiftshader + mkdir build-vs2019; cd build-vs2019 + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . 
--config Release -j 2 + mkdir "$env:GITHUB_WORKSPACE/swiftshader-install" + Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install" + - name: configure + run: | + $env:VULKAN_SDK="$(pwd)/VulkanSDK" + mkdir build; cd build + cmake -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON .. + - name: build + run: cmake --build build --config Release -j 2 + - name: test + run: | + Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests' + $env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json" + cd build; ctest --output-on-failure -j 2 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..6dc660ae0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "glslang"] + path = glslang + url = https://github.com/KhronosGroup/glslang diff --git a/CMakeLists.txt b/CMakeLists.txt index 79935a714..abf76d631 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,7 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON) option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON) option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF) option(NCNN_VULKAN "vulkan compute support" OFF) +option(NCNN_VULKAN_ONLINE_SPIRV "online SPIR-V module compilation" OFF) option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF) option(NCNN_AVX2 "optimize x86 platform with avx2" OFF) option(NCNN_DISABLE_PIC "disable position-independent code" OFF) @@ -64,6 +65,28 @@ if(NCNN_COVERAGE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -coverage -lgcov") endif() +if(NCNN_VULKAN_ONLINE_SPIRV) + if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/glslang/CMakeLists.txt") + message(WARNING "The submodules were not downloaded! 
NCNN_VULKAN_ONLINE_SPIRV will be turned off.") + message(WARNING "Please update submodules with \"git submodule update --init\" and try again.") + set(NCNN_VULKAN_ONLINE_SPIRV OFF) + else() + # glslang requires c++11 + set(CMAKE_CXX_STANDARD 11) + + option(BUILD_EXTERNAL "" OFF) + option(ENABLE_SPVREMAPPER "" OFF) + option(ENABLE_GLSLANG_BINARIES "" OFF) + option(ENABLE_HLSL "" OFF) + option(ENABLE_RTTI "" OFF) + option(ENABLE_EXCEPTIONS "" OFF) + option(ENABLE_OPT "" OFF) + option(ENABLE_PCH "" OFF) + option(ENABLE_CTEST "" OFF) + add_subdirectory(glslang) + endif() +endif() + add_subdirectory(src) if(NCNN_BUILD_BENCHMARK) add_subdirectory(benchmark) diff --git a/glslang b/glslang new file mode 160000 index 000000000..59216d5cd --- /dev/null +++ b/glslang @@ -0,0 +1 @@ +Subproject commit 59216d5cd87eee78dc979e30645c4c2240a7c351 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54e7be8ac..217c6f645 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,7 +3,7 @@ configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h) -if(NCNN_VULKAN) +if(NCNN_VULKAN AND NOT NCNN_VULKAN_ONLINE_SPIRV) find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH) message(STATUS "Found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}") endif() @@ -21,6 +21,7 @@ endfunction() set(ncnn_SRCS allocator.cpp + benchmark.cpp blob.cpp command.cpp cpu.cpp @@ -37,59 +38,93 @@ set(ncnn_SRCS option.cpp paramdict.cpp pipeline.cpp - benchmark.cpp ) -ncnn_src_group(ncnn_SRCS "sources") - if(ANDROID) list(APPEND ncnn_SRCS mat_pixel_android.cpp) endif() +ncnn_src_group(ncnn_SRCS "sources") + include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_spv_header.cmake) macro(ncnn_add_shader SHADER_SRC) - ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC}) - - get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME) - string(APPEND layer_shader_spv_data "#include 
\"${SHADER_SPV_HEADER_NAME}\"\n") - - get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") - string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n") - - list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) - list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) - - # generate layer_shader_type_enum file - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - 
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") - set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") - math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + if(NCNN_VULKAN_ONLINE_SPIRV) + + file(READ ${SHADER_SRC} comp_data) + + # skip leading comment + string(FIND "${comp_data}" "#version" version_start) + string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data) + + # remove whitespace + string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}") + + # text to hex + 
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt "${comp_data}") + file(READ ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt comp_data_hex HEX) + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex}) + string(FIND "${comp_data_hex}" "," tail_comma REVERSE) + string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex) + + get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) + set(SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.comp.hex.h) + file(WRITE ${SHADER_COMP_HEADER} "static const char ${SHADER_SRC_NAME_WE}_comp_data[] = {${comp_data_hex}};\n") + set_source_files_properties(${SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE) + + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_comp_data,sizeof(${SHADER_SRC_NAME_WE}_comp_data)},\n") + + get_filename_component(SHADER_COMP_HEADER_NAME ${SHADER_COMP_HEADER} NAME) + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_COMP_HEADER_NAME}) + + string(APPEND layer_shader_spv_data "#include \"${SHADER_COMP_HEADER_NAME}\"\n") + + # generate layer_shader_type_enum file + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + else() + ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC}) + + get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME) + string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n") + + get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") + 
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n") + + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) + + # generate layer_shader_type_enum file + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum 
"${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + endif() endmacro() macro(ncnn_add_layer class) @@ -303,6 +338,11 @@ endif() if(NCNN_VULKAN) find_package(Vulkan REQUIRED) target_link_libraries(ncnn PUBLIC Vulkan::Vulkan) + + if(NCNN_VULKAN_ONLINE_SPIRV) + target_include_directories(ncnn PRIVATE $) + target_link_libraries(ncnn PRIVATE glslang SPIRV) + endif() endif() if(ANDROID_NDK) diff --git a/src/convert_ycbcr.comp b/src/convert_ycbcr.comp index 57a370b75..c5dff0460 100644 --- a/src/convert_ycbcr.comp +++ b/src/convert_ycbcr.comp @@ -18,7 +18,7 @@ #extension GL_EXT_shader_16bit_storage: require #endif #if NCNN_fp16_arithmetic -#extension GL_AMD_gpu_shader_half_float: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif layout (constant_id = 0) const int w = 0; diff 
--git a/src/gpu.cpp b/src/gpu.cpp index 82fd26810..7fdc40af7 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -23,6 +23,11 @@ #include +#if NCNN_VULKAN_ONLINE_SPIRV +#include "glslang/glslang/Public/ShaderLang.h" +#include "glslang/SPIRV/GlslangToSpv.h" +#endif + #include "mat.h" #include "command.h" #include "layer_type.h" @@ -54,11 +59,19 @@ static Mutex g_default_vkdev_lock; static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0}; // precompiled spirv +#if NCNN_VULKAN_ONLINE_SPIRV +struct layer_shader_registry_entry +{ + const char* comp_data; + int comp_data_size; +}; +#else struct layer_shader_registry_entry { const uint32_t* spv_data; size_t spv_data_size; }; +#endif #include "layer_shader_spv_data.h" @@ -67,7 +80,9 @@ static const layer_shader_registry_entry layer_shader_registry[] = #include "layer_shader_registry.h" }; +#if !NCNN_VULKAN_ONLINE_SPIRV static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)]; +#endif static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); @@ -846,17 +861,25 @@ int create_gpu_instance() // the default gpu device g_default_gpu_index = find_default_vulkan_device_index(); +#if NCNN_VULKAN_ONLINE_SPIRV + glslang::InitializeProcess(); +#else // resolve shader info for (int i=0; i= layer_shader_registry_entry_count) @@ -1162,6 +1190,7 @@ VkShaderModule VulkanDevice::create_shader_module(int shader_type_index, uint32_ return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); } +#endif VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const { @@ -1601,6 +1630,7 @@ void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_el uop->forward(src, dst, cmd, opt); } +#if !NCNN_VULKAN_ONLINE_SPIRV int VulkanDevice::create_shader_module() { if (info.bug_local_size_spec_const) @@ -1706,6 +1736,7 @@ void 
VulkanDevice::destroy_shader_module() shader_modules.clear(); } +#endif int VulkanDevice::init_device_extension() { @@ -2025,6 +2056,531 @@ VulkanDevice* get_gpu_device(int device_index) return g_default_vkdev[device_index]; } +#if NCNN_VULKAN_ONLINE_SPIRV + +const TBuiltInResource default_TBuiltInResource = { + /* .MaxLights = */ 32, + /* .MaxClipPlanes = */ 6, + /* .MaxTextureUnits = */ 32, + /* .MaxTextureCoords = */ 32, + /* .MaxVertexAttribs = */ 64, + /* .MaxVertexUniformComponents = */ 4096, + /* .MaxVaryingFloats = */ 64, + /* .MaxVertexTextureImageUnits = */ 32, + /* .MaxCombinedTextureImageUnits = */ 80, + /* .MaxTextureImageUnits = */ 32, + /* .MaxFragmentUniformComponents = */ 4096, + /* .MaxDrawBuffers = */ 32, + /* .MaxVertexUniformVectors = */ 128, + /* .MaxVaryingVectors = */ 8, + /* .MaxFragmentUniformVectors = */ 16, + /* .MaxVertexOutputVectors = */ 16, + /* .MaxFragmentInputVectors = */ 15, + /* .MinProgramTexelOffset = */ -8, + /* .MaxProgramTexelOffset = */ 7, + /* .MaxClipDistances = */ 8, + /* .MaxComputeWorkGroupCountX = */ 65535, + /* .MaxComputeWorkGroupCountY = */ 65535, + /* .MaxComputeWorkGroupCountZ = */ 65535, + /* .MaxComputeWorkGroupSizeX = */ 1024, + /* .MaxComputeWorkGroupSizeY = */ 1024, + /* .MaxComputeWorkGroupSizeZ = */ 64, + /* .MaxComputeUniformComponents = */ 1024, + /* .MaxComputeTextureImageUnits = */ 16, + /* .MaxComputeImageUniforms = */ 8, + /* .MaxComputeAtomicCounters = */ 8, + /* .MaxComputeAtomicCounterBuffers = */ 1, + /* .MaxVaryingComponents = */ 60, + /* .MaxVertexOutputComponents = */ 64, + /* .MaxGeometryInputComponents = */ 64, + /* .MaxGeometryOutputComponents = */ 128, + /* .MaxFragmentInputComponents = */ 128, + /* .MaxImageUnits = */ 8, + /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8, + /* .MaxCombinedShaderOutputResources = */ 8, + /* .MaxImageSamples = */ 0, + /* .MaxVertexImageUniforms = */ 0, + /* .MaxTessControlImageUniforms = */ 0, + /* .MaxTessEvaluationImageUniforms = */ 0, + /* 
.MaxGeometryImageUniforms = */ 0, + /* .MaxFragmentImageUniforms = */ 8, + /* .MaxCombinedImageUniforms = */ 8, + /* .MaxGeometryTextureImageUnits = */ 16, + /* .MaxGeometryOutputVertices = */ 256, + /* .MaxGeometryTotalOutputComponents = */ 1024, + /* .MaxGeometryUniformComponents = */ 1024, + /* .MaxGeometryVaryingComponents = */ 64, + /* .MaxTessControlInputComponents = */ 128, + /* .MaxTessControlOutputComponents = */ 128, + /* .MaxTessControlTextureImageUnits = */ 16, + /* .MaxTessControlUniformComponents = */ 1024, + /* .MaxTessControlTotalOutputComponents = */ 4096, + /* .MaxTessEvaluationInputComponents = */ 128, + /* .MaxTessEvaluationOutputComponents = */ 128, + /* .MaxTessEvaluationTextureImageUnits = */ 16, + /* .MaxTessEvaluationUniformComponents = */ 1024, + /* .MaxTessPatchComponents = */ 120, + /* .MaxPatchVertices = */ 32, + /* .MaxTessGenLevel = */ 64, + /* .MaxViewports = */ 16, + /* .MaxVertexAtomicCounters = */ 0, + /* .MaxTessControlAtomicCounters = */ 0, + /* .MaxTessEvaluationAtomicCounters = */ 0, + /* .MaxGeometryAtomicCounters = */ 0, + /* .MaxFragmentAtomicCounters = */ 8, + /* .MaxCombinedAtomicCounters = */ 8, + /* .MaxAtomicCounterBindings = */ 1, + /* .MaxVertexAtomicCounterBuffers = */ 0, + /* .MaxTessControlAtomicCounterBuffers = */ 0, + /* .MaxTessEvaluationAtomicCounterBuffers = */ 0, + /* .MaxGeometryAtomicCounterBuffers = */ 0, + /* .MaxFragmentAtomicCounterBuffers = */ 1, + /* .MaxCombinedAtomicCounterBuffers = */ 1, + /* .MaxAtomicCounterBufferSize = */ 16384, + /* .MaxTransformFeedbackBuffers = */ 4, + /* .MaxTransformFeedbackInterleavedComponents = */ 64, + /* .MaxCullDistances = */ 8, + /* .MaxCombinedClipAndCullDistances = */ 8, + /* .MaxSamples = */ 4, + /* .maxMeshOutputVerticesNV = */ 256, + /* .maxMeshOutputPrimitivesNV = */ 512, + /* .maxMeshWorkGroupSizeX_NV = */ 32, + /* .maxMeshWorkGroupSizeY_NV = */ 1, + /* .maxMeshWorkGroupSizeZ_NV = */ 1, + /* .maxTaskWorkGroupSizeX_NV = */ 32, + /* .maxTaskWorkGroupSizeY_NV = 
*/ 1, + /* .maxTaskWorkGroupSizeZ_NV = */ 1, + /* .maxMeshViewCountNV = */ 4, + /* .maxDualSourceDrawBuffersEXT = */ 1, + + /* .limits = */ { + /* .nonInductiveForLoops = */ 1, + /* .whileLoops = */ 1, + /* .doWhileLoops = */ 1, + /* .generalUniformIndexing = */ 1, + /* .generalAttributeMatrixVectorIndexing = */ 1, + /* .generalVaryingIndexing = */ 1, + /* .generalSamplerIndexing = */ 1, + /* .generalVariableIndexing = */ 1, + /* .generalConstantMatrixVectorIndexing = */ 1, + } +}; + +int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv) +{ /* Rejects an out-of-range shader_type_index with -1, then gathers the GLSL macro definitions selected by the fp16 flags in opt. */ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + NCNN_LOGE("no such shader module %d", shader_type_index); + return -1; + } + + const char* comp_data = layer_shader_registry[shader_type_index].comp_data; + int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size; + + std::vector< std::pair<const char*, const char*> > custom_defines; /* (macro name, replacement text) pairs */ + + if (opt.use_fp16_storage) + { + custom_defines.push_back(std::make_pair("sfp", "float16_t")); + custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2")); + custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4")); + + if (opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4")); + custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4")); + } + } + else if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("sfp", "float")); + custom_defines.push_back(std::make_pair("sfpvec2", "uint")); + custom_defines.push_back(std::make_pair("sfpvec4", "uvec2")); + custom_defines.push_back(std::make_pair("sfpvec8", "uvec4")); + } + else + { + custom_defines.push_back(std::make_pair("sfp", "float")); + custom_defines.push_back(std::make_pair("sfpvec2", "vec2")); + custom_defines.push_back(std::make_pair("sfpvec4", "vec4")); + custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4")); + custom_defines.push_back(std::make_pair("sfpmat4", "mat4")); + } + + if 
(opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("afp", "float16_t")); + custom_defines.push_back(std::make_pair("afpvec2", "f16vec2")); + custom_defines.push_back(std::make_pair("afpvec4", "f16vec4")); + custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4")); + custom_defines.push_back(std::make_pair("afpmat4", "f16mat4")); + } + else + { + custom_defines.push_back(std::make_pair("afp", "float")); + custom_defines.push_back(std::make_pair("afpvec2", "vec2")); + custom_defines.push_back(std::make_pair("afpvec4", "vec4")); + custom_defines.push_back(std::make_pair("afpvec8", "mat2x4")); + custom_defines.push_back(std::make_pair("afpmat4", "mat4")); + } + + if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}")); + custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}")); + custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", 
"{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}")); + custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}")); + custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}")); + custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}")); + custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v")); + custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v")); + } + else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])")); + custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}")); + custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}")); + custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}")); + custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))")); + 
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v));}")); /* the inner statement needs its ';' like every other buffer_st* body, otherwise the expanded GLSL block is ill-formed */ + custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))")); + custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}")); + custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}")); + custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}")); + custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))")); + custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}")); + custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}")); + custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}")); + } + else if (opt.use_fp16_storage) + { + 
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])")); + custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}")); + custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}")); + custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])")); + custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}")); + custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])")); + custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}")); + custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}")); + custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}")); + custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))")); + custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}")); + custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}")); + 
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}")); + custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}")); + } + else if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}")); + custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}")); + custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])")); + custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v);}")); /* the inner statement needs its ';' like every other buffer_st* body, otherwise the expanded GLSL block is ill-formed */ + custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))")); + custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}")); + custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); 
buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}")); + custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}")); + custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))")); + custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}")); + custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}")); + custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}")); + } + else + { + custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}")); + custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}")); + custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", 
"{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}")); + custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}")); + custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]")); + custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}")); + custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}")); + custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}")); + custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}")); + custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v")); + custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v")); + } + + if (opt.use_image_storage) + { + if (opt.use_fp16_storage) + { + custom_defines.push_back(std::make_pair("imfmtc1", "r16f")); + custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f")); + custom_defines.push_back(std::make_pair("unfp", "mediump")); + } + else if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("imfmtc1", "r32f")); + custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f")); + custom_defines.push_back(std::make_pair("unfp", "mediump")); + } + else + { + custom_defines.push_back(std::make_pair("imfmtc1", "r32f")); + custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f")); + 
custom_defines.push_back(std::make_pair("unfp", "highp")); + } + + if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}")); + custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}")); + custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}")); + custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + 
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))")); + custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))")); + custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))")); + custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}")); + custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}")); + custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}")); + custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}")); + custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}")); + } + else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + 
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)")); + custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))")); + custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))")); + custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))")); + 
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))")); + custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}")); + custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}")); + custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}")); + custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}")); + custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}")); + } + else if (opt.use_fp16_storage) + { + custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); 
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))")); + custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))")); + custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))")); + custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}")); + custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}")); + custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", 
"{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}")); + custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}")); + custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}")); + } + else if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)")); + 
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))")); + custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))")); + custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))")); + custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}")); + custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}")); + custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}")); + custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}")); + custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", 
"{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}")); + } + else + { + custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r")); + custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}")); + custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)")); + custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}")); + custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", 
"{imageStore(img,p,texelFetch(tex,sp,0));}")); + custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))")); + custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))")); + custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))")); + custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}")); + custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}")); + custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}")); + custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}")); + custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}")); + custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}")); + } + } + + custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)")); + + if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1")); + } + if (opt.use_fp16_storage) + { + custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1")); + } + if (opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1")); + } + + if (opt.use_image_storage) + { + 
custom_defines.push_back(std::make_pair("NCNN_image_shader", "1")); + } + + std::string preamble; + std::vector processes; + + processes.resize(custom_defines.size()); + for (size_t i = 0; i < custom_defines.size(); i++) + { + const char* key = custom_defines[i].first; + const char* def = custom_defines[i].second; + + preamble += std::string("#define ") + key + " " + def + "\n"; + processes[i] = std::string("define-macro ") + key + "=" + def; + } + + bool compile_success = true; + + { + glslang::TShader s(EShLangCompute); + + s.setStringsWithLengths(&comp_data, &comp_data_size, 1); + + s.setPreamble(preamble.c_str()); + s.addProcesses(processes); + s.setEntryPoint("main"); + s.setSourceEntryPoint("main"); + + s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1); + s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0); + s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0); + + TBuiltInResource resources = default_TBuiltInResource; + + // although vulkan 1.1 accept glsl directly + // ncnn resolve_shader_info() only works with the intermediate spirv code + bool pr = s.parse(&resources, 100, false, EShMsgDefault); + if (!pr) + { + NCNN_LOGE("compile spir-v module failed"); + NCNN_LOGE("%s", s.getInfoLog()); + NCNN_LOGE("%s", s.getInfoDebugLog()); + + compile_success = false; + } + else + { + glslang::TIntermediate* ir = s.getIntermediate(); + glslang::GlslangToSpv(*ir, spirv); + } + } + + return compile_success ? 
0 : -1; +} +#endif + +#if !NCNN_VULKAN_ONLINE_SPIRV const ShaderInfo& get_shader_info(int shader_type_index) { if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) @@ -2035,6 +2591,7 @@ const ShaderInfo& get_shader_info(int shader_type_index) return layer_shader_infos[shader_type_index]; } +#endif int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) { diff --git a/src/gpu.h b/src/gpu.h index ae27e60c4..e90b08d11 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -173,10 +173,12 @@ public: VkDevice vkdevice() const { return device; } +#if !NCNN_VULKAN_ONLINE_SPIRV VkShaderModule get_shader_module(int shader_type_index) const; // with fixed workgroup size VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const; +#endif VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const; @@ -252,9 +254,11 @@ public: #endif // __ANDROID_API__ >= 26 protected: +#if !NCNN_VULKAN_ONLINE_SPIRV // shader management int create_shader_module(); void destroy_shader_module(); +#endif // device extension int init_device_extension(); @@ -269,7 +273,9 @@ protected: private: VkDevice device; +#if !NCNN_VULKAN_ONLINE_SPIRV std::vector shader_modules; +#endif // hardware queue mutable std::vector compute_queues; @@ -304,6 +310,11 @@ private: VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); +#if NCNN_VULKAN_ONLINE_SPIRV +// online spirv compilation +int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv); +#endif + // info from spirv class ShaderInfo { @@ -319,7 +330,9 @@ public: int binding_types[16];// 16 is large enough I think ... 
}; +#if !NCNN_VULKAN_ONLINE_SPIRV const ShaderInfo& get_shader_info(int shader_type_index); +#endif int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); } // namespace ncnn diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 085f1e82c..eac51c43b 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -79,6 +79,47 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std:: int Pipeline::create(int shader_type_index, const Option& opt, const std::vector& specializations) { +#if NCNN_VULKAN_ONLINE_SPIRV + std::vector spirv; + int retc = compile_spirv_module(shader_type_index, opt, spirv); + if (retc != 0) + { + NCNN_LOGE("compile_spirv_module failed %d", retc); + return -1; + } + + const uint32_t* spv_data = spirv.data(); + size_t spv_data_size = spirv.size() * 4; + + ShaderInfo si; + int ret = resolve_shader_info(spv_data, spv_data_size, si); + if (ret != 0) + { + NCNN_LOGE("resolve_shader_info failed %d", ret); + return -1; + } + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + if ((int)specializations.size() != specialization_count_expected) + { + NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size()); + return -1; + } + + if (vkdev->info.bug_local_size_spec_const) + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); + } + else + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); + } + +// NCNN_LOGE("local_shader_module %p created", local_shader_module); + + return create(local_shader_module, si, specializations); +#else // ncnn_add_shader cmake macro // 0 = fp32 // 1 = fp16p @@ -148,6 +189,7 @@ int Pipeline::create(int shader_type_index, const Option& opt, const std::vector VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index); return 
create(shader_module, si, specializations); +#endif } int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector& specializations) @@ -590,6 +632,44 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo int shader_type_index = LayerShaderType::convert_ycbcr; +#if NCNN_VULKAN_ONLINE_SPIRV + std::vector spirv; + int retc = compile_spirv_module(shader_type_index, opt, spirv); + if (retc != 0) + { + NCNN_LOGE("compile_spirv_module failed %d", retc); + return -1; + } + + const uint32_t* spv_data = spirv.data(); + size_t spv_data_size = spirv.size() * 4; + + ShaderInfo si; + int ret = resolve_shader_info(spv_data, spv_data_size, si); + if (ret != 0) + { + NCNN_LOGE("resolve_shader_info failed %d", ret); + return -1; + } + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + if ((int)specializations.size() != specialization_count_expected) + { + NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size()); + return -1; + } + + VkShaderModule shader_module; + if (vkdev->info.bug_local_size_spec_const) + { + shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); + } + else + { + shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); + } +#else // ncnn_add_shader cmake macro // 0 = fp32 // 1 = fp16p @@ -640,7 +720,7 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo } VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index); - +#endif create_pipeline(shader_module, specializations); if (vkdev->info.support_VK_KHR_descriptor_update_template) diff --git a/src/platform.h.in b/src/platform.h.in index 4d960b84a..4a73395da 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -23,6 +23,7 @@ #cmakedefine01 NCNN_PIXEL #cmakedefine01 NCNN_PIXEL_ROTATE #cmakedefine01 
NCNN_VULKAN +#cmakedefine01 NCNN_VULKAN_ONLINE_SPIRV #cmakedefine01 NCNN_REQUANT #cmakedefine01 NCNN_AVX2