diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml
index ba87783cf..034aeed00 100644
--- a/.github/workflows/android-armv7-gpu.yml
+++ b/.github/workflows/android-armv7-gpu.yml
@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
+
+ android-armv7-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: configure
+ run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml
index 00fb1bfd6..9dd19a946 100644
--- a/.github/workflows/android-armv8-gpu.yml
+++ b/.github/workflows/android-armv8-gpu.yml
@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
+
+ android-aarch64-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: configure
+ run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml
index c65f6de8c..4f7ac0762 100644
--- a/.github/workflows/android-x64-gpu.yml
+++ b/.github/workflows/android-x64-gpu.yml
@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
+
+ android-x86_64-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: configure
+ run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml
index f3cc09e27..b89083e3a 100644
--- a/.github/workflows/android-x86-gpu.yml
+++ b/.github/workflows/android-x86-gpu.yml
@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
+
+ android-x86-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: configure
+ run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
diff --git a/.github/workflows/ios-64bit-gpu.yml b/.github/workflows/ios-64bit-gpu.yml
index 7f4ecabd9..d28317543 100644
--- a/.github/workflows/ios-64bit-gpu.yml
+++ b/.github/workflows/ios-64bit-gpu.yml
@@ -22,3 +22,27 @@ jobs:
run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2
+
+ ios-iphone-os-gpu-online-spirv:
+ runs-on: macos-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "vulkansdk-macos-1.1.114.0"
+ key: vulkansdk-macos-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz
+ tar -xf vulkansdk-macos-1.1.114.0.tar.gz
+ rm -rf vulkansdk-macos-1.1.114.0/Applications
+ find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm
+ - name: configure
+ run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml
index bbf73f6ab..07912c233 100644
--- a/.github/workflows/linux-x64-gpu-clang.yml
+++ b/.github/workflows/linux-x64-gpu-clang.yml
@@ -60,3 +60,65 @@ jobs:
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2
+
+ linux-clang-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: update
+ run: sudo apt-get update
+ - name: protobuf
+ run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: cache-swiftshader
+ id: cache-swiftshader
+ uses: actions/cache@v1
+ with:
+ path: swiftshader-install
+ key: swiftshader-linux-install-20200508
+ - name: checkout-swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: google/swiftshader
+ path: swiftshader
+ ref: 2dd864470e310d173d35fa95ca3a14d216734aab
+ - name: checkout-swiftshader-submodules
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ git submodule update --init --recursive
+ - name: swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ mkdir -p build; cd build
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
+ cmake --build . -j 2
+ mkdir $GITHUB_WORKSPACE/swiftshader-install
+ cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
+ - name: configure
+ env:
+ CC: clang
+ CXX: clang++
+ run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
+ - name: test
+ run: |
+ export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
+ cd build && ctest --output-on-failure -j 2
diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml
index d51444b80..d2d785aa9 100644
--- a/.github/workflows/linux-x64-gpu-gcc.yml
+++ b/.github/workflows/linux-x64-gpu-gcc.yml
@@ -57,3 +57,62 @@ jobs:
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2
+
+ linux-gcc-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: update
+ run: sudo apt-get update
+ - name: protobuf
+ run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: cache-swiftshader
+ id: cache-swiftshader
+ uses: actions/cache@v1
+ with:
+ path: swiftshader-install
+ key: swiftshader-linux-install-20200508
+ - name: checkout-swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: google/swiftshader
+ path: swiftshader
+ ref: 2dd864470e310d173d35fa95ca3a14d216734aab
+ - name: checkout-swiftshader-submodules
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ git submodule update --init --recursive
+ - name: swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ mkdir -p build; cd build
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
+ cmake --build . -j 2
+ mkdir $GITHUB_WORKSPACE/swiftshader-install
+ cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
+ - name: configure
+ run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
+ - name: test
+ run: |
+ export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
+ cd build && ctest --output-on-failure -j 2
diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml
index cdcbd32e5..6bb36a4b4 100644
--- a/.github/workflows/macos-x64-gpu.yml
+++ b/.github/workflows/macos-x64-gpu.yml
@@ -56,3 +56,61 @@ jobs:
export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2
+
+ macos-clang-gpu-online-spirv:
+ runs-on: macos-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: protobuf
+ run: brew install protobuf opencv3
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "vulkansdk-macos-1.1.114.0"
+ key: vulkansdk-macos-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz
+ tar -xf vulkansdk-macos-1.1.114.0.tar.gz
+ rm -rf vulkansdk-macos-1.1.114.0/Applications
+ find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm
+ - name: cache-swiftshader
+ id: cache-swiftshader
+ uses: actions/cache@v1
+ with:
+ path: swiftshader-install
+ key: swiftshader-macos-install-20200508
+ - name: checkout-swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: google/swiftshader
+ path: swiftshader
+ ref: 2dd864470e310d173d35fa95ca3a14d216734aab
+ - name: checkout-swiftshader-submodules
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ git submodule update --init --recursive
+ - name: swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ mkdir -p build; cd build
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
+ cmake --build . -j 2
+ mkdir $GITHUB_WORKSPACE/swiftshader-install
+ cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install
+ - name: configure
+ run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build -j 2
+ - name: test
+ run: |
+ export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
+ export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
+ cd build && ctest --output-on-failure -j 2
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
index 0fc469581..36c5dd993 100644
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -67,6 +67,74 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
file: build/lcov.info
+ linux-gcc-gpu-online-spirv:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: lcov
+ run: sudo apt-get install lcov
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "1.1.114.0"
+ key: vulkansdk-linux-x86_64-1.1.114.0
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
+ rm -rf 1.1.114.0/source 1.1.114.0/samples
+ find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
+ - name: cache-swiftshader
+ id: cache-swiftshader
+ uses: actions/cache@v1
+ with:
+ path: swiftshader-install
+ key: swiftshader-linux-install-20200508
+ - name: checkout-swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: google/swiftshader
+ path: swiftshader
+ ref: 2dd864470e310d173d35fa95ca3a14d216734aab
+ - name: checkout-swiftshader-submodules
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ git submodule update --init --recursive
+ - name: swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ mkdir -p build; cd build
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
+ cmake --build . -j 2
+ mkdir $GITHUB_WORKSPACE/swiftshader-install
+ cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
+ - name: configure
+ run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+ - name: build
+ run: cmake --build build -j 2
+ - name: test
+ run: |
+ export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
+ cd build && ctest --output-on-failure -j 2
+ - name: lcov-collect
+ run: |
+ cd build
+ lcov -d ./src -c -o lcov.info
+ lcov -r lcov.info '/usr/*' -o lcov.info
+ lcov --list lcov.info
+ - name: codecov
+ uses: codecov/codecov-action@v1
+ with:
+ token: ${{ secrets.CODECOV_TOKEN }}
+ file: build/lcov.info
+
linux-gcc-avx2:
runs-on: ubuntu-latest
steps:
diff --git a/.github/workflows/windows-x64-gpu-vs2019.yml b/.github/workflows/windows-x64-gpu-vs2019.yml
index cd4e70fa1..44fd6f640 100644
--- a/.github/workflows/windows-x64-gpu-vs2019.yml
+++ b/.github/workflows/windows-x64-gpu-vs2019.yml
@@ -71,3 +71,76 @@ jobs:
Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests'
$env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json"
cd build; ctest --output-on-failure -j 2
+
+ windows-vs2019-gpu-online-spirv:
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ submodules: 'true'
+ - name: cache-protobuf
+ id: cache-protobuf
+ uses: actions/cache@v1
+ with:
+ path: "protobuf-install"
+ key: protobuf-windows-install
+ - name: protobuf
+ if: steps.cache-protobuf.outputs.cache-hit != 'true'
+ run: |
+ Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip
+ 7z x ./protobuf-3.11.2.zip
+ cd protobuf-3.11.2
+ mkdir build-vs2019; cd build-vs2019; cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
+ cmake --build . --config Release -j 2
+ cmake --build . --config Release --target install
+ - name: cache-vulkansdk
+ id: cache-vulkansdk
+ uses: actions/cache@v1
+ with:
+ path: "VulkanSDK"
+ key: VulkanSDK-1.1.114.0-Installer
+ - name: vulkansdk
+ if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
+ run: |
+ Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.1.114.0/windows/VulkanSDK-1.1.114.0-Installer.exe?Human=true -OutFile VulkanSDK-1.1.114.0-Installer.exe
+ 7z x -aoa ./VulkanSDK-1.1.114.0-Installer.exe -oVulkanSDK
+ Remove-Item .\VulkanSDK\Demos, .\VulkanSDK\glslang, .\VulkanSDK\Samples, .\VulkanSDK\shaderc, .\VulkanSDK\spirv-tools, .\VulkanSDK\Third-Party, .\VulkanSDK\Tools, .\VulkanSDK\Tools32 -Recurse
+ - name: cache-swiftshader
+ id: cache-swiftshader
+ uses: actions/cache@v1
+ with:
+ path: swiftshader-install
+ key: swiftshader-windows-install-20200508
+ - name: checkout-swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: google/swiftshader
+ path: swiftshader
+ ref: 2dd864470e310d173d35fa95ca3a14d216734aab
+ - name: checkout-swiftshader-submodules
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ git submodule update --init --recursive
+ - name: swiftshader
+ if: steps.cache-swiftshader.outputs.cache-hit != 'true'
+ run: |
+ cd swiftshader
+ mkdir build-vs2019; cd build-vs2019
+ cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
+ cmake --build . --config Release -j 2
+ mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
+ Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"
+ - name: configure
+ run: |
+ $env:VULKAN_SDK="$(pwd)/VulkanSDK"
+ mkdir build; cd build
+ cmake -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
+ - name: build
+ run: cmake --build build --config Release -j 2
+ - name: test
+ run: |
+ Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests'
+ $env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json"
+ cd build; ctest --output-on-failure -j 2
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..6dc660ae0
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "glslang"]
+ path = glslang
+ url = https://github.com/KhronosGroup/glslang
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79935a714..abf76d631 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,6 +30,7 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON)
option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
option(NCNN_VULKAN "vulkan compute support" OFF)
+option(NCNN_VULKAN_ONLINE_SPIRV "online SPIR-V module compilation" OFF)
option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF)
option(NCNN_AVX2 "optimize x86 platform with avx2" OFF)
option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
@@ -64,6 +65,28 @@ if(NCNN_COVERAGE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -coverage -lgcov")
endif()
+if(NCNN_VULKAN_ONLINE_SPIRV)
+ if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/glslang/CMakeLists.txt") # the glslang submodule has not been checked out
+ message(WARNING "The submodules were not downloaded! NCNN_VULKAN_ONLINE_SPIRV will be turned off.")
+ message(WARNING "Please update submodules with \"git submodule update --init\" and try again.")
+ set(NCNN_VULKAN_ONLINE_SPIRV OFF) # plain (non-cache) variable: overrides the option for the rest of this configure run
+ else()
+ # glslang requires c++11
+ set(CMAKE_CXX_STANDARD 11)
+
+ option(BUILD_EXTERNAL "" OFF) # NOTE(review): this and the options below presumably pre-seed glslang's own build switches; verify against the pinned glslang revision
+ option(ENABLE_SPVREMAPPER "" OFF)
+ option(ENABLE_GLSLANG_BINARIES "" OFF)
+ option(ENABLE_HLSL "" OFF)
+ option(ENABLE_RTTI "" OFF)
+ option(ENABLE_EXCEPTIONS "" OFF)
+ option(ENABLE_OPT "" OFF)
+ option(ENABLE_PCH "" OFF)
+ option(ENABLE_CTEST "" OFF)
+ add_subdirectory(glslang) # must come after the option() calls above so they are already defined when glslang is configured
+ endif()
+endif()
+
add_subdirectory(src)
if(NCNN_BUILD_BENCHMARK)
add_subdirectory(benchmark)
diff --git a/glslang b/glslang
new file mode 160000
index 000000000..59216d5cd
--- /dev/null
+++ b/glslang
@@ -0,0 +1 @@
+Subproject commit 59216d5cd87eee78dc979e30645c4c2240a7c351
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 54e7be8ac..217c6f645 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,7 +3,7 @@
configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)
-if(NCNN_VULKAN)
+if(NCNN_VULKAN AND NOT NCNN_VULKAN_ONLINE_SPIRV)
find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH)
message(STATUS "Found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}")
endif()
@@ -21,6 +21,7 @@ endfunction()
set(ncnn_SRCS
allocator.cpp
+ benchmark.cpp
blob.cpp
command.cpp
cpu.cpp
@@ -37,59 +38,93 @@ set(ncnn_SRCS
option.cpp
paramdict.cpp
pipeline.cpp
- benchmark.cpp
)
-ncnn_src_group(ncnn_SRCS "sources")
-
if(ANDROID)
list(APPEND ncnn_SRCS mat_pixel_android.cpp)
endif()
+ncnn_src_group(ncnn_SRCS "sources")
+
include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_spv_header.cmake)
macro(ncnn_add_shader SHADER_SRC)
- ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC})
-
- get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME)
- string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n")
-
- get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
- string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n")
-
- list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
- list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})
-
- # generate layer_shader_type_enum file
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
- set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
- math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ if(NCNN_VULKAN_ONLINE_SPIRV)
+
+ file(READ ${SHADER_SRC} comp_data)
+
+ # skip leading comment
+ string(FIND "${comp_data}" "#version" version_start)
+ string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data)
+
+ # remove whitespace
+ string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}")
+
+ # text to hex
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt "${comp_data}")
+ file(READ ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt comp_data_hex HEX)
+ string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex})
+ string(FIND "${comp_data_hex}" "," tail_comma REVERSE)
+ string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex)
+
+ get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
+ set(SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.comp.hex.h)
+ file(WRITE ${SHADER_COMP_HEADER} "static const char ${SHADER_SRC_NAME_WE}_comp_data[] = {${comp_data_hex}};\n")
+ set_source_files_properties(${SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE)
+
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_comp_data,sizeof(${SHADER_SRC_NAME_WE}_comp_data)},\n")
+
+ get_filename_component(SHADER_COMP_HEADER_NAME ${SHADER_COMP_HEADER} NAME)
+ list(APPEND SHADER_SPV_HEX_FILES ${SHADER_COMP_HEADER}) # full path, matching what the precompiled-SPIR-V branch appends to this list
+
+ string(APPEND layer_shader_spv_data "#include \"${SHADER_COMP_HEADER_NAME}\"\n") # the bare file name is only for the generated #include line
+
+ # generate layer_shader_type_enum file
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ else()
+ ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC})
+
+ get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME)
+ string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n")
+
+ get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
+ string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n")
+
+ list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
+ list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})
+
+ # generate layer_shader_type_enum file
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
+ math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
+ endif()
endmacro()
macro(ncnn_add_layer class)
@@ -303,6 +338,11 @@ endif()
if(NCNN_VULKAN)
find_package(Vulkan REQUIRED)
target_link_libraries(ncnn PUBLIC Vulkan::Vulkan)
+
+ if(NCNN_VULKAN_ONLINE_SPIRV) # nested inside if(NCNN_VULKAN): extra include path and libraries for this option only
+ target_include_directories(ncnn PRIVATE $) # NOTE(review): the generator expression looks truncated in this copy of the patch - restore the full $<...> argument before applying
+ target_link_libraries(ncnn PRIVATE glslang SPIRV) # links the glslang and SPIRV targets with PRIVATE visibility
+ endif()
endif()
if(ANDROID_NDK)
diff --git a/src/convert_ycbcr.comp b/src/convert_ycbcr.comp
index 57a370b75..c5dff0460 100644
--- a/src/convert_ycbcr.comp
+++ b/src/convert_ycbcr.comp
@@ -18,7 +18,7 @@
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
-#extension GL_AMD_gpu_shader_half_float: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
layout (constant_id = 0) const int w = 0;
diff --git a/src/gpu.cpp b/src/gpu.cpp
index 82fd26810..7fdc40af7 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -23,6 +23,11 @@
#include
+#if NCNN_VULKAN_ONLINE_SPIRV
+#include "glslang/glslang/Public/ShaderLang.h"
+#include "glslang/SPIRV/GlslangToSpv.h"
+#endif
+
#include "mat.h"
#include "command.h"
#include "layer_type.h"
@@ -54,11 +59,19 @@ static Mutex g_default_vkdev_lock;
static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};
// precompiled spirv
+#if NCNN_VULKAN_ONLINE_SPIRV // online mode: each registry entry holds char data (comp_*) instead of uint32_t SPIR-V words (spv_*)
+struct layer_shader_registry_entry
+{
+ const char* comp_data; // start of the shader's character data; presumably GLSL compute source - confirm against the generated layer_shader_spv_data.h
+ int comp_data_size; // size of comp_data; the unit and whether a terminator is counted are not visible here - TODO confirm
+};
+#else // precompiled layout: the struct below is unchanged by this patch
struct layer_shader_registry_entry
{
const uint32_t* spv_data;
size_t spv_data_size;
};
+#endif // NCNN_VULKAN_ONLINE_SPIRV
#include "layer_shader_spv_data.h"
@@ -67,7 +80,9 @@ static const layer_shader_registry_entry layer_shader_registry[] =
#include "layer_shader_registry.h"
};
+#if !NCNN_VULKAN_ONLINE_SPIRV
static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)];
+#endif
static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
@@ -846,17 +861,25 @@ int create_gpu_instance()
// the default gpu device
g_default_gpu_index = find_default_vulkan_device_index();
+#if NCNN_VULKAN_ONLINE_SPIRV
+ glslang::InitializeProcess();
+#else
// resolve shader info
for (int i=0; i= layer_shader_registry_entry_count)
@@ -1162,6 +1190,7 @@ VkShaderModule VulkanDevice::create_shader_module(int shader_type_index, uint32_
return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}
+#endif
VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
{
@@ -1601,6 +1630,7 @@ void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_el
uop->forward(src, dst, cmd, opt);
}
+#if !NCNN_VULKAN_ONLINE_SPIRV
int VulkanDevice::create_shader_module()
{
if (info.bug_local_size_spec_const)
@@ -1706,6 +1736,7 @@ void VulkanDevice::destroy_shader_module()
shader_modules.clear();
}
+#endif
int VulkanDevice::init_device_extension()
{
@@ -2025,6 +2056,531 @@ VulkanDevice* get_gpu_device(int device_index)
return g_default_vkdev[device_index];
}
+#if NCNN_VULKAN_ONLINE_SPIRV
+
+const TBuiltInResource default_TBuiltInResource = { // positional aggregate initializer: values must stay in TBuiltInResource declaration order; the /* .Field = */ labels are annotations only and are not checked by the compiler
+ /* .MaxLights = */ 32,
+ /* .MaxClipPlanes = */ 6,
+ /* .MaxTextureUnits = */ 32,
+ /* .MaxTextureCoords = */ 32,
+ /* .MaxVertexAttribs = */ 64,
+ /* .MaxVertexUniformComponents = */ 4096,
+ /* .MaxVaryingFloats = */ 64,
+ /* .MaxVertexTextureImageUnits = */ 32,
+ /* .MaxCombinedTextureImageUnits = */ 80,
+ /* .MaxTextureImageUnits = */ 32,
+ /* .MaxFragmentUniformComponents = */ 4096,
+ /* .MaxDrawBuffers = */ 32,
+ /* .MaxVertexUniformVectors = */ 128,
+ /* .MaxVaryingVectors = */ 8,
+ /* .MaxFragmentUniformVectors = */ 16,
+ /* .MaxVertexOutputVectors = */ 16,
+ /* .MaxFragmentInputVectors = */ 15,
+ /* .MinProgramTexelOffset = */ -8,
+ /* .MaxProgramTexelOffset = */ 7,
+ /* .MaxClipDistances = */ 8,
+ /* .MaxComputeWorkGroupCountX = */ 65535,
+ /* .MaxComputeWorkGroupCountY = */ 65535,
+ /* .MaxComputeWorkGroupCountZ = */ 65535,
+ /* .MaxComputeWorkGroupSizeX = */ 1024,
+ /* .MaxComputeWorkGroupSizeY = */ 1024,
+ /* .MaxComputeWorkGroupSizeZ = */ 64,
+ /* .MaxComputeUniformComponents = */ 1024,
+ /* .MaxComputeTextureImageUnits = */ 16,
+ /* .MaxComputeImageUniforms = */ 8,
+ /* .MaxComputeAtomicCounters = */ 8,
+ /* .MaxComputeAtomicCounterBuffers = */ 1,
+ /* .MaxVaryingComponents = */ 60,
+ /* .MaxVertexOutputComponents = */ 64,
+ /* .MaxGeometryInputComponents = */ 64,
+ /* .MaxGeometryOutputComponents = */ 128,
+ /* .MaxFragmentInputComponents = */ 128,
+ /* .MaxImageUnits = */ 8,
+ /* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8,
+ /* .MaxCombinedShaderOutputResources = */ 8,
+ /* .MaxImageSamples = */ 0,
+ /* .MaxVertexImageUniforms = */ 0,
+ /* .MaxTessControlImageUniforms = */ 0,
+ /* .MaxTessEvaluationImageUniforms = */ 0,
+ /* .MaxGeometryImageUniforms = */ 0,
+ /* .MaxFragmentImageUniforms = */ 8,
+ /* .MaxCombinedImageUniforms = */ 8,
+ /* .MaxGeometryTextureImageUnits = */ 16,
+ /* .MaxGeometryOutputVertices = */ 256,
+ /* .MaxGeometryTotalOutputComponents = */ 1024,
+ /* .MaxGeometryUniformComponents = */ 1024,
+ /* .MaxGeometryVaryingComponents = */ 64,
+ /* .MaxTessControlInputComponents = */ 128,
+ /* .MaxTessControlOutputComponents = */ 128,
+ /* .MaxTessControlTextureImageUnits = */ 16,
+ /* .MaxTessControlUniformComponents = */ 1024,
+ /* .MaxTessControlTotalOutputComponents = */ 4096,
+ /* .MaxTessEvaluationInputComponents = */ 128,
+ /* .MaxTessEvaluationOutputComponents = */ 128,
+ /* .MaxTessEvaluationTextureImageUnits = */ 16,
+ /* .MaxTessEvaluationUniformComponents = */ 1024,
+ /* .MaxTessPatchComponents = */ 120,
+ /* .MaxPatchVertices = */ 32,
+ /* .MaxTessGenLevel = */ 64,
+ /* .MaxViewports = */ 16,
+ /* .MaxVertexAtomicCounters = */ 0,
+ /* .MaxTessControlAtomicCounters = */ 0,
+ /* .MaxTessEvaluationAtomicCounters = */ 0,
+ /* .MaxGeometryAtomicCounters = */ 0,
+ /* .MaxFragmentAtomicCounters = */ 8,
+ /* .MaxCombinedAtomicCounters = */ 8,
+ /* .MaxAtomicCounterBindings = */ 1,
+ /* .MaxVertexAtomicCounterBuffers = */ 0,
+ /* .MaxTessControlAtomicCounterBuffers = */ 0,
+ /* .MaxTessEvaluationAtomicCounterBuffers = */ 0,
+ /* .MaxGeometryAtomicCounterBuffers = */ 0,
+ /* .MaxFragmentAtomicCounterBuffers = */ 1,
+ /* .MaxCombinedAtomicCounterBuffers = */ 1,
+ /* .MaxAtomicCounterBufferSize = */ 16384,
+ /* .MaxTransformFeedbackBuffers = */ 4,
+ /* .MaxTransformFeedbackInterleavedComponents = */ 64,
+ /* .MaxCullDistances = */ 8,
+ /* .MaxCombinedClipAndCullDistances = */ 8,
+ /* .MaxSamples = */ 4,
+ /* .maxMeshOutputVerticesNV = */ 256,
+ /* .maxMeshOutputPrimitivesNV = */ 512,
+ /* .maxMeshWorkGroupSizeX_NV = */ 32,
+ /* .maxMeshWorkGroupSizeY_NV = */ 1,
+ /* .maxMeshWorkGroupSizeZ_NV = */ 1,
+ /* .maxTaskWorkGroupSizeX_NV = */ 32,
+ /* .maxTaskWorkGroupSizeY_NV = */ 1,
+ /* .maxTaskWorkGroupSizeZ_NV = */ 1,
+ /* .maxMeshViewCountNV = */ 4,
+ /* .maxDualSourceDrawBuffersEXT = */ 1,
+
+ /* .limits = */ { // nested aggregate for the limits member; every flag in it is set to 1
+ /* .nonInductiveForLoops = */ 1,
+ /* .whileLoops = */ 1,
+ /* .doWhileLoops = */ 1,
+ /* .generalUniformIndexing = */ 1,
+ /* .generalAttributeMatrixVectorIndexing = */ 1,
+ /* .generalVaryingIndexing = */ 1,
+ /* .generalSamplerIndexing = */ 1,
+ /* .generalVariableIndexing = */ 1,
+ /* .generalConstantMatrixVectorIndexing = */ 1,
+ }
+};
+
+int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv)
+{
+ if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
+ {
+ NCNN_LOGE("no such shader module %d", shader_type_index);
+ return -1;
+ }
+
+ const char* comp_data = layer_shader_registry[shader_type_index].comp_data;
+ int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size;
+
+ std::vector< std::pair > custom_defines;
+
+ if (opt.use_fp16_storage)
+ {
+ custom_defines.push_back(std::make_pair("sfp", "float16_t"));
+ custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2"));
+ custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4"));
+
+ if (opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4"));
+ custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4"));
+ }
+ }
+ else if (opt.use_fp16_packed)
+ {
+ custom_defines.push_back(std::make_pair("sfp", "float"));
+ custom_defines.push_back(std::make_pair("sfpvec2", "uint"));
+ custom_defines.push_back(std::make_pair("sfpvec4", "uvec2"));
+ custom_defines.push_back(std::make_pair("sfpvec8", "uvec4"));
+ }
+ else
+ {
+ custom_defines.push_back(std::make_pair("sfp", "float"));
+ custom_defines.push_back(std::make_pair("sfpvec2", "vec2"));
+ custom_defines.push_back(std::make_pair("sfpvec4", "vec4"));
+ custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4"));
+ custom_defines.push_back(std::make_pair("sfpmat4", "mat4"));
+ }
+
+ if (opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("afp", "float16_t"));
+ custom_defines.push_back(std::make_pair("afpvec2", "f16vec2"));
+ custom_defines.push_back(std::make_pair("afpvec4", "f16vec4"));
+ custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4"));
+ custom_defines.push_back(std::make_pair("afpmat4", "f16mat4"));
+ }
+ else
+ {
+ custom_defines.push_back(std::make_pair("afp", "float"));
+ custom_defines.push_back(std::make_pair("afpvec2", "vec2"));
+ custom_defines.push_back(std::make_pair("afpvec4", "vec4"));
+ custom_defines.push_back(std::make_pair("afpvec8", "mat2x4"));
+ custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
+ }
+
+ if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
+ custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
+ custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
+ }
+ else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])"));
+ custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"));
+ custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))"));
+ custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v))}"));
+ custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"));
+ custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"));
+ custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
+ }
+ else if (opt.use_fp16_storage)
+ {
+ custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])"));
+ custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])"));
+ custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])"));
+ custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"));
+ custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"));
+ }
+ else if (opt.use_fp16_packed)
+ {
+ custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"));
+ custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"));
+ custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v)}"));
+ custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"));
+ custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"));
+ custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
+ }
+ else
+ {
+ custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
+ custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
+ custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
+ custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
+ custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
+ custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
+ custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
+ }
+
+ if (opt.use_image_storage)
+ {
+ if (opt.use_fp16_storage)
+ {
+ custom_defines.push_back(std::make_pair("imfmtc1", "r16f"));
+ custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
+ custom_defines.push_back(std::make_pair("unfp", "mediump"));
+ }
+ else if (opt.use_fp16_packed)
+ {
+ custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
+ custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
+ custom_defines.push_back(std::make_pair("unfp", "mediump"));
+ }
+ else
+ {
+ custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
+ custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f"));
+ custom_defines.push_back(std::make_pair("unfp", "highp"));
+ }
+
+ if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
+ custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
+ custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
+ custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
+ custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"));
+ custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"));
+ custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"));
+ custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
+ }
+ else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
+ custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
+ custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
+ custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
+ custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
+ }
+ else if (opt.use_fp16_storage)
+ {
+ custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
+ custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
+ custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
+ }
+ else if (opt.use_fp16_packed)
+ {
+ custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
+ custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
+ custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
+ }
+ else
+ {
+ custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
+ custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
+ custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
+ custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
+ custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
+ custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
+ custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
+ custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
+ custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
+ custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
+ custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
+ }
+ }
+
+ custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)"));
+
+ if (opt.use_fp16_packed)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
+ }
+ if (opt.use_fp16_storage)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1"));
+ }
+ if (opt.use_fp16_arithmetic)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
+ }
+
+ if (opt.use_image_storage)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_image_shader", "1"));
+ }
+
+ std::string preamble;
+ std::vector<std::string> processes;
+
+ processes.resize(custom_defines.size());
+ for (size_t i = 0; i < custom_defines.size(); i++)
+ {
+ const char* key = custom_defines[i].first;
+ const char* def = custom_defines[i].second;
+
+ preamble += std::string("#define ") + key + " " + def + "\n";
+ processes[i] = std::string("define-macro ") + key + "=" + def;
+ }
+
+ bool compile_success = true;
+
+ {
+ glslang::TShader s(EShLangCompute);
+
+ s.setStringsWithLengths(&comp_data, &comp_data_size, 1);
+
+ s.setPreamble(preamble.c_str());
+ s.addProcesses(processes);
+ s.setEntryPoint("main");
+ s.setSourceEntryPoint("main");
+
+ s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);
+ s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
+ s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0);
+
+ TBuiltInResource resources = default_TBuiltInResource;
+
+ // although vulkan 1.1 accept glsl directly
+ // ncnn resolve_shader_info() only works with the intermediate spirv code
+ bool pr = s.parse(&resources, 100, false, EShMsgDefault);
+ if (!pr)
+ {
+ NCNN_LOGE("compile spir-v module failed");
+ NCNN_LOGE("%s", s.getInfoLog());
+ NCNN_LOGE("%s", s.getInfoDebugLog());
+
+ compile_success = false;
+ }
+ else
+ {
+ glslang::TIntermediate* ir = s.getIntermediate();
+ glslang::GlslangToSpv(*ir, spirv);
+ }
+ }
+
+ return compile_success ? 0 : -1;
+}
+#endif
+
+#if !NCNN_VULKAN_ONLINE_SPIRV
const ShaderInfo& get_shader_info(int shader_type_index)
{
if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
@@ -2035,6 +2591,7 @@ const ShaderInfo& get_shader_info(int shader_type_index)
return layer_shader_infos[shader_type_index];
}
+#endif
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{
diff --git a/src/gpu.h b/src/gpu.h
index ae27e60c4..e90b08d11 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -173,10 +173,12 @@ public:
VkDevice vkdevice() const { return device; }
+#if !NCNN_VULKAN_ONLINE_SPIRV
VkShaderModule get_shader_module(int shader_type_index) const;
// with fixed workgroup size
VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+#endif
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
@@ -252,9 +254,11 @@ public:
#endif // __ANDROID_API__ >= 26
protected:
+#if !NCNN_VULKAN_ONLINE_SPIRV
// shader management
int create_shader_module();
void destroy_shader_module();
+#endif
// device extension
int init_device_extension();
@@ -269,7 +273,9 @@ protected:
private:
VkDevice device;
+#if !NCNN_VULKAN_ONLINE_SPIRV
std::vector<VkShaderModule> shader_modules;
+#endif
// hardware queue
mutable std::vector<VkQueue> compute_queues;
@@ -304,6 +310,11 @@ private:
VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+#if NCNN_VULKAN_ONLINE_SPIRV
+// online spirv compilation
+// compiles a glsl compute shader into spirv, with the NCNN_fp16_* / NCNN_image_shader
+// macros defined according to opt
+// spirv receives the generated 32-bit words
+// returns 0 on success, -1 when the shader fails to parse
+int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+#endif
+
// info from spirv
class ShaderInfo
{
@@ -319,7 +330,9 @@ public:
int binding_types[16];// 16 is large enough I think ...
};
+#if !NCNN_VULKAN_ONLINE_SPIRV
const ShaderInfo& get_shader_info(int shader_type_index);
+#endif
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
} // namespace ncnn
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 085f1e82c..eac51c43b 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -79,6 +79,47 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::
int Pipeline::create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations)
{
+#if NCNN_VULKAN_ONLINE_SPIRV
+ // online spirv: the shader is compiled for this Option when the pipeline is created
+ std::vector<uint32_t> spirv;
+ int retc = compile_spirv_module(shader_type_index, opt, spirv);
+ if (retc != 0)
+ {
+ NCNN_LOGE("compile_spirv_module failed %d", retc);
+ return -1;
+ }
+
+ // spirv is a vector of 32-bit words, the size passed on is in bytes
+ const uint32_t* spv_data = spirv.data();
+ size_t spv_data_size = spirv.size() * 4;
+
+ // binding and specialization info is read back from the generated spirv
+ ShaderInfo si;
+ int ret = resolve_shader_info(spv_data, spv_data_size, si);
+ if (ret != 0)
+ {
+ NCNN_LOGE("resolve_shader_info failed %d", ret);
+ return -1;
+ }
+
+ // -3 for local_size_xyz
+ // the caller passes every specialization constant except those three
+ int specialization_count_expected = si.specialization_count - 3;
+ if ((int)specializations.size() != specialization_count_expected)
+ {
+ NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size());
+ return -1;
+ }
+
+ // devices flagged with bug_local_size_spec_const get the local size
+ // handed to compile_shader_module explicitly
+ if (vkdev->info.bug_local_size_spec_const)
+ {
+ local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+ }
+ else
+ {
+ local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
+ }
+
+// NCNN_LOGE("local_shader_module %p created", local_shader_module);
+
+ return create(local_shader_module, si, specializations);
+#else
// ncnn_add_shader cmake macro
// 0 = fp32
// 1 = fp16p
@@ -148,6 +189,7 @@ int Pipeline::create(int shader_type_index, const Option& opt, const std::vector
VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index);
return create(shader_module, si, specializations);
+#endif
}
int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector<vk_specialization_type>& specializations)
@@ -590,6 +632,44 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo
int shader_type_index = LayerShaderType::convert_ycbcr;
+#if NCNN_VULKAN_ONLINE_SPIRV
+ // online spirv: compile the shader selected by shader_type_index for this Option
+ std::vector<uint32_t> spirv;
+ int retc = compile_spirv_module(shader_type_index, opt, spirv);
+ if (retc != 0)
+ {
+ NCNN_LOGE("compile_spirv_module failed %d", retc);
+ return -1;
+ }
+
+ // spirv is a vector of 32-bit words, the size passed on is in bytes
+ const uint32_t* spv_data = spirv.data();
+ size_t spv_data_size = spirv.size() * 4;
+
+ // binding and specialization info is read back from the generated spirv
+ ShaderInfo si;
+ int ret = resolve_shader_info(spv_data, spv_data_size, si);
+ if (ret != 0)
+ {
+ NCNN_LOGE("resolve_shader_info failed %d", ret);
+ return -1;
+ }
+
+ // -3 for local_size_xyz
+ // specializations must hold every constant except those three
+ int specialization_count_expected = si.specialization_count - 3;
+ if ((int)specializations.size() != specialization_count_expected)
+ {
+ NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size());
+ return -1;
+ }
+
+ // NOTE(review): Pipeline::create stores the compiled module in local_shader_module,
+ // here it only lives in this local variable - confirm who destroys it
+ VkShaderModule shader_module;
+ // devices flagged with bug_local_size_spec_const get the local size
+ // handed to compile_shader_module explicitly
+ if (vkdev->info.bug_local_size_spec_const)
+ {
+ shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
+ }
+ else
+ {
+ shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
+ }
+#else
// ncnn_add_shader cmake macro
// 0 = fp32
// 1 = fp16p
@@ -640,7 +720,7 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo
}
VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index);
-
+#endif
create_pipeline(shader_module, specializations);
if (vkdev->info.support_VK_KHR_descriptor_update_template)
diff --git a/src/platform.h.in b/src/platform.h.in
index 4d960b84a..4a73395da 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -23,6 +23,7 @@
#cmakedefine01 NCNN_PIXEL
#cmakedefine01 NCNN_PIXEL_ROTATE
#cmakedefine01 NCNN_VULKAN
+#cmakedefine01 NCNN_VULKAN_ONLINE_SPIRV
#cmakedefine01 NCNN_REQUANT
#cmakedefine01 NCNN_AVX2