Browse Source

runtime spir-v compilation with libglslang (#1779)

tags/20200616
nihui GitHub 6 years ago
parent
commit
17c445480f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 1205 additions and 47 deletions
  1. +24
    -0
      .github/workflows/android-armv7-gpu.yml
  2. +24
    -0
      .github/workflows/android-armv8-gpu.yml
  3. +24
    -0
      .github/workflows/android-x64-gpu.yml
  4. +24
    -0
      .github/workflows/android-x86-gpu.yml
  5. +24
    -0
      .github/workflows/ios-64bit-gpu.yml
  6. +62
    -0
      .github/workflows/linux-x64-gpu-clang.yml
  7. +59
    -0
      .github/workflows/linux-x64-gpu-gcc.yml
  8. +58
    -0
      .github/workflows/macos-x64-gpu.yml
  9. +68
    -0
      .github/workflows/test-coverage.yml
  10. +73
    -0
      .github/workflows/windows-x64-gpu-vs2019.yml
  11. +3
    -0
      .gitmodules
  12. +23
    -0
      CMakeLists.txt
  13. +1
    -0
      glslang
  14. +85
    -45
      src/CMakeLists.txt
  15. +1
    -1
      src/convert_ycbcr.comp
  16. +557
    -0
      src/gpu.cpp
  17. +13
    -0
      src/gpu.h
  18. +81
    -1
      src/pipeline.cpp
  19. +1
    -0
      src/platform.h.in

+ 24
- 0
.github/workflows/android-armv7-gpu.yml View File

@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2

android-armv7-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: configure
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2

+ 24
- 0
.github/workflows/android-armv8-gpu.yml View File

@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2

android-aarch64-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: configure
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2

+ 24
- 0
.github/workflows/android-x64-gpu.yml View File

@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2

android-x86_64-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: configure
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2

+ 24
- 0
.github/workflows/android-x86-gpu.yml View File

@@ -22,3 +22,27 @@ jobs:
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2

android-x86-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: configure
run: export PATH=`pwd`/1.1.114.0/x86_64/bin:$PATH && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2

+ 24
- 0
.github/workflows/ios-64bit-gpu.yml View File

@@ -22,3 +22,27 @@ jobs:
run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON ..
- name: build
run: cmake --build build -j 2

ios-iphone-os-gpu-online-spirv:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "vulkansdk-macos-1.1.114.0"
key: vulkansdk-macos-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz
tar -xf vulkansdk-macos-1.1.114.0.tar.gz
rm -rf vulkansdk-macos-1.1.114.0/Applications
find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm
- name: configure
run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/include -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.1.114.0/MoltenVK/iOS/dynamic/libMoltenVK.dylib -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2

+ 62
- 0
.github/workflows/linux-x64-gpu-clang.yml View File

@@ -60,3 +60,65 @@ jobs:
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

linux-clang-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: update
run: sudo apt-get update
- name: protobuf
run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: cache-swiftshader
id: cache-swiftshader
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install-20200508
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 2dd864470e310d173d35fa95ca3a14d216734aab
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
git submodule update --init --recursive
- name: swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
- name: configure
env:
CC: clang
CXX: clang++
run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2
- name: test
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

+ 59
- 0
.github/workflows/linux-x64-gpu-gcc.yml View File

@@ -57,3 +57,62 @@ jobs:
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

linux-gcc-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: update
run: sudo apt-get update
- name: protobuf
run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: cache-swiftshader
id: cache-swiftshader
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install-20200508
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 2dd864470e310d173d35fa95ca3a14d216734aab
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
git submodule update --init --recursive
- name: swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
- name: configure
run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2
- name: test
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

+ 58
- 0
.github/workflows/macos-x64-gpu.yml View File

@@ -56,3 +56,61 @@ jobs:
export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

macos-clang-gpu-online-spirv:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: protobuf
run: brew install protobuf opencv3
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "vulkansdk-macos-1.1.114.0"
key: vulkansdk-macos-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/mac/vulkansdk-macos-1.1.114.0.tar.gz?Human=true -O vulkansdk-macos-1.1.114.0.tar.gz
tar -xf vulkansdk-macos-1.1.114.0.tar.gz
rm -rf vulkansdk-macos-1.1.114.0/Applications
find vulkansdk-macos-1.1.114.0 -type f | grep -v -E 'vulkan|glslang|MoltenVK' | xargs rm
- name: cache-swiftshader
id: cache-swiftshader
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-macos-install-20200508
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 2dd864470e310d173d35fa95ca3a14d216734aab
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
git submodule update --init --recursive
- name: swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install
- name: configure
run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build -j 2
- name: test
run: |
export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2

+ 68
- 0
.github/workflows/test-coverage.yml View File

@@ -67,6 +67,74 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
file: build/lcov.info

linux-gcc-gpu-online-spirv:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: lcov
run: sudo apt-get install lcov
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "1.1.114.0"
key: vulkansdk-linux-x86_64-1.1.114.0
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
wget https://sdk.lunarg.com/sdk/download/1.1.114.0/linux/vulkansdk-linux-x86_64-1.1.114.0.tar.gz?Human=true -O vulkansdk-linux-x86_64-1.1.114.0.tar.gz
tar -xf vulkansdk-linux-x86_64-1.1.114.0.tar.gz
rm -rf 1.1.114.0/source 1.1.114.0/samples
find 1.1.114.0 -type f | grep -v -E 'vulkan|glslang' | xargs rm
- name: cache-swiftshader
id: cache-swiftshader
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-linux-install-20200508
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 2dd864470e310d173d35fa95ca3a14d216734aab
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
git submodule update --init --recursive
- name: swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
cmake --build . -j 2
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
- name: configure
run: export VULKAN_SDK=`pwd`/1.1.114.0/x86_64 && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- name: build
run: cmake --build build -j 2
- name: test
run: |
export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
cd build && ctest --output-on-failure -j 2
- name: lcov-collect
run: |
cd build
lcov -d ./src -c -o lcov.info
lcov -r lcov.info '/usr/*' -o lcov.info
lcov --list lcov.info
- name: codecov
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: build/lcov.info

linux-gcc-avx2:
runs-on: ubuntu-latest
steps:


+ 73
- 0
.github/workflows/windows-x64-gpu-vs2019.yml View File

@@ -71,3 +71,76 @@ jobs:
Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests'
$env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json"
cd build; ctest --output-on-failure -j 2

windows-vs2019-gpu-online-spirv:
runs-on: windows-latest
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- name: cache-protobuf
id: cache-protobuf
uses: actions/cache@v1
with:
path: "protobuf-install"
key: protobuf-windows-install
- name: protobuf
if: steps.cache-protobuf.outputs.cache-hit != 'true'
run: |
Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip
7z x ./protobuf-3.11.2.zip
cd protobuf-3.11.2
mkdir build-vs2019; cd build-vs2019; cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
- name: cache-vulkansdk
id: cache-vulkansdk
uses: actions/cache@v1
with:
path: "VulkanSDK"
key: VulkanSDK-1.1.114.0-Installer
- name: vulkansdk
if: steps.cache-vulkansdk.outputs.cache-hit != 'true'
run: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.1.114.0/windows/VulkanSDK-1.1.114.0-Installer.exe?Human=true -OutFile VulkanSDK-1.1.114.0-Installer.exe
7z x -aoa ./VulkanSDK-1.1.114.0-Installer.exe -oVulkanSDK
Remove-Item .\VulkanSDK\Demos, .\VulkanSDK\glslang, .\VulkanSDK\Samples, .\VulkanSDK\shaderc, .\VulkanSDK\spirv-tools, .\VulkanSDK\Third-Party, .\VulkanSDK\Tools, .\VulkanSDK\Tools32 -Recurse
- name: cache-swiftshader
id: cache-swiftshader
uses: actions/cache@v1
with:
path: swiftshader-install
key: swiftshader-windows-install-20200508
- name: checkout-swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: google/swiftshader
path: swiftshader
ref: 2dd864470e310d173d35fa95ca3a14d216734aab
- name: checkout-swiftshader-submodules
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
git submodule update --init --recursive
- name: swiftshader
if: steps.cache-swiftshader.outputs.cache-hit != 'true'
run: |
cd swiftshader
mkdir build-vs2019; cd build-vs2019
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
cmake --build . --config Release -j 2
mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"
- name: configure
run: |
$env:VULKAN_SDK="$(pwd)/VulkanSDK"
mkdir build; cd build
cmake -DProtobuf_INCLUDE_DIR="$env:GITHUB_WORKSPACE\protobuf-install\include" -DProtobuf_LIBRARIES="$env:GITHUB_WORKSPACE\protobuf-install\lib\libprotobuf.lib" -DProtobuf_PROTOC_EXECUTABLE="$env:GITHUB_WORKSPACE\protobuf-install\bin\protoc.exe" -DNCNN_VULKAN=ON -DNCNN_VULKAN_ONLINE_SPIRV=ON ..
- name: build
run: cmake --build build --config Release -j 2
- name: test
run: |
Copy-Item -Path '.\VulkanSDK\RunTimeInstaller\x64\vulkan-1.dll' -Destination 'build\tests'
$env:VK_ICD_FILENAMES="$env:GITHUB_WORKSPACE\swiftshader-install\vk_swiftshader_icd.json"
cd build; ctest --output-on-failure -j 2

+ 3
- 0
.gitmodules View File

@@ -0,0 +1,3 @@
[submodule "glslang"]
path = glslang
url = https://github.com/KhronosGroup/glslang

+ 23
- 0
CMakeLists.txt View File

@@ -30,6 +30,7 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON)
option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
option(NCNN_VULKAN "vulkan compute support" OFF)
option(NCNN_VULKAN_ONLINE_SPIRV "online SPIR-V module compilation" OFF)
option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF)
option(NCNN_AVX2 "optimize x86 platform with avx2" OFF)
option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
@@ -64,6 +65,28 @@ if(NCNN_COVERAGE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -coverage -lgcov")
endif()

if(NCNN_VULKAN_ONLINE_SPIRV)
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/glslang/CMakeLists.txt")
message(WARNING "The submodules were not downloaded! NCNN_VULKAN_ONLINE_SPIRV will be turned off.")
message(WARNING "Please update submodules with \"git submodule update --init\" and try again.")
set(NCNN_VULKAN_ONLINE_SPIRV OFF)
else()
# glslang requires c++11
set(CMAKE_CXX_STANDARD 11)

option(BUILD_EXTERNAL "" OFF)
option(ENABLE_SPVREMAPPER "" OFF)
option(ENABLE_GLSLANG_BINARIES "" OFF)
option(ENABLE_HLSL "" OFF)
option(ENABLE_RTTI "" OFF)
option(ENABLE_EXCEPTIONS "" OFF)
option(ENABLE_OPT "" OFF)
option(ENABLE_PCH "" OFF)
option(ENABLE_CTEST "" OFF)
add_subdirectory(glslang)
endif()
endif()

add_subdirectory(src)
if(NCNN_BUILD_BENCHMARK)
add_subdirectory(benchmark)


+ 1
- 0
glslang

@@ -0,0 +1 @@
Subproject commit 59216d5cd87eee78dc979e30645c4c2240a7c351

+ 85
- 45
src/CMakeLists.txt View File

@@ -3,7 +3,7 @@

configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)

if(NCNN_VULKAN)
if(NCNN_VULKAN AND NOT NCNN_VULKAN_ONLINE_SPIRV)
find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH)
message(STATUS "Found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}")
endif()
@@ -21,6 +21,7 @@ endfunction()

set(ncnn_SRCS
allocator.cpp
benchmark.cpp
blob.cpp
command.cpp
cpu.cpp
@@ -37,59 +38,93 @@ set(ncnn_SRCS
option.cpp
paramdict.cpp
pipeline.cpp
benchmark.cpp
)

ncnn_src_group(ncnn_SRCS "sources")

if(ANDROID)
list(APPEND ncnn_SRCS mat_pixel_android.cpp)
endif()

ncnn_src_group(ncnn_SRCS "sources")

include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_spv_header.cmake)

macro(ncnn_add_shader SHADER_SRC)
ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC})

get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME)
string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n")

get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n")

list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})

# generate layer_shader_type_enum file
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
if(NCNN_VULKAN_ONLINE_SPIRV)

file(READ ${SHADER_SRC} comp_data)

# skip leading comment
string(FIND "${comp_data}" "#version" version_start)
string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data)

# remove whitespace
string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}")

# text to hex
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt "${comp_data}")
file(READ ${CMAKE_CURRENT_BINARY_DIR}/text2hex.txt comp_data_hex HEX)
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex})
string(FIND "${comp_data_hex}" "," tail_comma REVERSE)
string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex)

get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
set(SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.comp.hex.h)
file(WRITE ${SHADER_COMP_HEADER} "static const char ${SHADER_SRC_NAME_WE}_comp_data[] = {${comp_data_hex}};\n")
set_source_files_properties(${SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE)

string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_comp_data,sizeof(${SHADER_SRC_NAME_WE}_comp_data)},\n")

get_filename_component(SHADER_COMP_HEADER_NAME ${SHADER_COMP_HEADER} NAME)
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_COMP_HEADER_NAME})

string(APPEND layer_shader_spv_data "#include \"${SHADER_COMP_HEADER_NAME}\"\n")

# generate layer_shader_type_enum file
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
else()
ncnn_generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC})

get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME)
string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n")

get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n")
string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n")

list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER})
list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS})

# generate layer_shader_type_enum file
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
endif()
endmacro()

macro(ncnn_add_layer class)
@@ -303,6 +338,11 @@ endif()
if(NCNN_VULKAN)
find_package(Vulkan REQUIRED)
target_link_libraries(ncnn PUBLIC Vulkan::Vulkan)

if(NCNN_VULKAN_ONLINE_SPIRV)
target_include_directories(ncnn PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../>)
target_link_libraries(ncnn PRIVATE glslang SPIRV)
endif()
endif()

if(ANDROID_NDK)


+ 1
- 1
src/convert_ycbcr.comp View File

@@ -18,7 +18,7 @@
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int w = 0;


+ 557
- 0
src/gpu.cpp View File

@@ -23,6 +23,11 @@

#include <algorithm>

#if NCNN_VULKAN_ONLINE_SPIRV
#include "glslang/glslang/Public/ShaderLang.h"
#include "glslang/SPIRV/GlslangToSpv.h"
#endif

#include "mat.h"
#include "command.h"
#include "layer_type.h"
@@ -54,11 +59,19 @@ static Mutex g_default_vkdev_lock;
static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};

// precompiled spirv
#if NCNN_VULKAN_ONLINE_SPIRV
struct layer_shader_registry_entry
{
const char* comp_data;
int comp_data_size;
};
#else
struct layer_shader_registry_entry
{
const uint32_t* spv_data;
size_t spv_data_size;
};
#endif

#include "layer_shader_spv_data.h"

@@ -67,7 +80,9 @@ static const layer_shader_registry_entry layer_shader_registry[] =
#include "layer_shader_registry.h"
};

#if !NCNN_VULKAN_ONLINE_SPIRV
static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)];
#endif

static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);

@@ -846,17 +861,25 @@ int create_gpu_instance()
// the default gpu device
g_default_gpu_index = find_default_vulkan_device_index();

#if NCNN_VULKAN_ONLINE_SPIRV
glslang::InitializeProcess();
#else
// resolve shader info
for (int i=0; i<layer_shader_registry_entry_count; i++)
{
resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size, layer_shader_infos[i]);
}
#endif

return 0;
}

void destroy_gpu_instance()
{
#if NCNN_VULKAN_ONLINE_SPIRV
glslang::FinalizeProcess();
#endif

for (int i=0; i<NCNN_MAX_GPU_COUNT; i++)
{
delete g_default_vkdev[i];
@@ -1051,7 +1074,9 @@ VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index])

init_device_extension();

#if !NCNN_VULKAN_ONLINE_SPIRV
create_shader_module();
#endif

compute_queues.resize(info.compute_queue_count);
blob_allocators.resize(info.compute_queue_count);
@@ -1133,11 +1158,14 @@ VulkanDevice::~VulkanDevice()
blob_allocators.clear();
staging_allocators.clear();

#if !NCNN_VULKAN_ONLINE_SPIRV
destroy_shader_module();
#endif

vkDestroyDevice(device, 0);
}

#if !NCNN_VULKAN_ONLINE_SPIRV
VkShaderModule VulkanDevice::get_shader_module(int shader_type_index) const
{
if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
@@ -1162,6 +1190,7 @@ VkShaderModule VulkanDevice::create_shader_module(int shader_type_index, uint32_

return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}
#endif

VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
{
@@ -1601,6 +1630,7 @@ void VulkanDevice::convert_packing(const VkImageMat& src, VkMat& dst, int dst_el
uop->forward(src, dst, cmd, opt);
}

#if !NCNN_VULKAN_ONLINE_SPIRV
int VulkanDevice::create_shader_module()
{
if (info.bug_local_size_spec_const)
@@ -1706,6 +1736,7 @@ void VulkanDevice::destroy_shader_module()

shader_modules.clear();
}
#endif

int VulkanDevice::init_device_extension()
{
@@ -2025,6 +2056,531 @@ VulkanDevice* get_gpu_device(int device_index)
return g_default_vkdev[device_index];
}

#if NCNN_VULKAN_ONLINE_SPIRV

const TBuiltInResource default_TBuiltInResource = {
/* .MaxLights = */ 32,
/* .MaxClipPlanes = */ 6,
/* .MaxTextureUnits = */ 32,
/* .MaxTextureCoords = */ 32,
/* .MaxVertexAttribs = */ 64,
/* .MaxVertexUniformComponents = */ 4096,
/* .MaxVaryingFloats = */ 64,
/* .MaxVertexTextureImageUnits = */ 32,
/* .MaxCombinedTextureImageUnits = */ 80,
/* .MaxTextureImageUnits = */ 32,
/* .MaxFragmentUniformComponents = */ 4096,
/* .MaxDrawBuffers = */ 32,
/* .MaxVertexUniformVectors = */ 128,
/* .MaxVaryingVectors = */ 8,
/* .MaxFragmentUniformVectors = */ 16,
/* .MaxVertexOutputVectors = */ 16,
/* .MaxFragmentInputVectors = */ 15,
/* .MinProgramTexelOffset = */ -8,
/* .MaxProgramTexelOffset = */ 7,
/* .MaxClipDistances = */ 8,
/* .MaxComputeWorkGroupCountX = */ 65535,
/* .MaxComputeWorkGroupCountY = */ 65535,
/* .MaxComputeWorkGroupCountZ = */ 65535,
/* .MaxComputeWorkGroupSizeX = */ 1024,
/* .MaxComputeWorkGroupSizeY = */ 1024,
/* .MaxComputeWorkGroupSizeZ = */ 64,
/* .MaxComputeUniformComponents = */ 1024,
/* .MaxComputeTextureImageUnits = */ 16,
/* .MaxComputeImageUniforms = */ 8,
/* .MaxComputeAtomicCounters = */ 8,
/* .MaxComputeAtomicCounterBuffers = */ 1,
/* .MaxVaryingComponents = */ 60,
/* .MaxVertexOutputComponents = */ 64,
/* .MaxGeometryInputComponents = */ 64,
/* .MaxGeometryOutputComponents = */ 128,
/* .MaxFragmentInputComponents = */ 128,
/* .MaxImageUnits = */ 8,
/* .MaxCombinedImageUnitsAndFragmentOutputs = */ 8,
/* .MaxCombinedShaderOutputResources = */ 8,
/* .MaxImageSamples = */ 0,
/* .MaxVertexImageUniforms = */ 0,
/* .MaxTessControlImageUniforms = */ 0,
/* .MaxTessEvaluationImageUniforms = */ 0,
/* .MaxGeometryImageUniforms = */ 0,
/* .MaxFragmentImageUniforms = */ 8,
/* .MaxCombinedImageUniforms = */ 8,
/* .MaxGeometryTextureImageUnits = */ 16,
/* .MaxGeometryOutputVertices = */ 256,
/* .MaxGeometryTotalOutputComponents = */ 1024,
/* .MaxGeometryUniformComponents = */ 1024,
/* .MaxGeometryVaryingComponents = */ 64,
/* .MaxTessControlInputComponents = */ 128,
/* .MaxTessControlOutputComponents = */ 128,
/* .MaxTessControlTextureImageUnits = */ 16,
/* .MaxTessControlUniformComponents = */ 1024,
/* .MaxTessControlTotalOutputComponents = */ 4096,
/* .MaxTessEvaluationInputComponents = */ 128,
/* .MaxTessEvaluationOutputComponents = */ 128,
/* .MaxTessEvaluationTextureImageUnits = */ 16,
/* .MaxTessEvaluationUniformComponents = */ 1024,
/* .MaxTessPatchComponents = */ 120,
/* .MaxPatchVertices = */ 32,
/* .MaxTessGenLevel = */ 64,
/* .MaxViewports = */ 16,
/* .MaxVertexAtomicCounters = */ 0,
/* .MaxTessControlAtomicCounters = */ 0,
/* .MaxTessEvaluationAtomicCounters = */ 0,
/* .MaxGeometryAtomicCounters = */ 0,
/* .MaxFragmentAtomicCounters = */ 8,
/* .MaxCombinedAtomicCounters = */ 8,
/* .MaxAtomicCounterBindings = */ 1,
/* .MaxVertexAtomicCounterBuffers = */ 0,
/* .MaxTessControlAtomicCounterBuffers = */ 0,
/* .MaxTessEvaluationAtomicCounterBuffers = */ 0,
/* .MaxGeometryAtomicCounterBuffers = */ 0,
/* .MaxFragmentAtomicCounterBuffers = */ 1,
/* .MaxCombinedAtomicCounterBuffers = */ 1,
/* .MaxAtomicCounterBufferSize = */ 16384,
/* .MaxTransformFeedbackBuffers = */ 4,
/* .MaxTransformFeedbackInterleavedComponents = */ 64,
/* .MaxCullDistances = */ 8,
/* .MaxCombinedClipAndCullDistances = */ 8,
/* .MaxSamples = */ 4,
/* .maxMeshOutputVerticesNV = */ 256,
/* .maxMeshOutputPrimitivesNV = */ 512,
/* .maxMeshWorkGroupSizeX_NV = */ 32,
/* .maxMeshWorkGroupSizeY_NV = */ 1,
/* .maxMeshWorkGroupSizeZ_NV = */ 1,
/* .maxTaskWorkGroupSizeX_NV = */ 32,
/* .maxTaskWorkGroupSizeY_NV = */ 1,
/* .maxTaskWorkGroupSizeZ_NV = */ 1,
/* .maxMeshViewCountNV = */ 4,
/* .maxDualSourceDrawBuffersEXT = */ 1,

/* .limits = */ {
/* .nonInductiveForLoops = */ 1,
/* .whileLoops = */ 1,
/* .doWhileLoops = */ 1,
/* .generalUniformIndexing = */ 1,
/* .generalAttributeMatrixVectorIndexing = */ 1,
/* .generalVaryingIndexing = */ 1,
/* .generalSamplerIndexing = */ 1,
/* .generalVariableIndexing = */ 1,
/* .generalConstantMatrixVectorIndexing = */ 1,
}
};

int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv)
{
if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
{
NCNN_LOGE("no such shader module %d", shader_type_index);
return -1;
}

const char* comp_data = layer_shader_registry[shader_type_index].comp_data;
int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size;

std::vector< std::pair<const char*, const char*> > custom_defines;

if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("sfp", "float16_t"));
custom_defines.push_back(std::make_pair("sfpvec2", "f16vec2"));
custom_defines.push_back(std::make_pair("sfpvec4", "f16vec4"));

if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfpvec8", "f16mat2x4"));
custom_defines.push_back(std::make_pair("sfpmat4", "f16mat4"));
}
}
else if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("sfp", "float"));
custom_defines.push_back(std::make_pair("sfpvec2", "uint"));
custom_defines.push_back(std::make_pair("sfpvec4", "uvec2"));
custom_defines.push_back(std::make_pair("sfpvec8", "uvec4"));
}
else
{
custom_defines.push_back(std::make_pair("sfp", "float"));
custom_defines.push_back(std::make_pair("sfpvec2", "vec2"));
custom_defines.push_back(std::make_pair("sfpvec4", "vec4"));
custom_defines.push_back(std::make_pair("sfpvec8", "mat2x4"));
custom_defines.push_back(std::make_pair("sfpmat4", "mat4"));
}

if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("afp", "float16_t"));
custom_defines.push_back(std::make_pair("afpvec2", "f16vec2"));
custom_defines.push_back(std::make_pair("afpvec4", "f16vec4"));
custom_defines.push_back(std::make_pair("afpvec8", "f16mat2x4"));
custom_defines.push_back(std::make_pair("afpmat4", "f16mat4"));
}
else
{
custom_defines.push_back(std::make_pair("afp", "float"));
custom_defines.push_back(std::make_pair("afpvec2", "vec2"));
custom_defines.push_back(std::make_pair("afpvec4", "vec4"));
custom_defines.push_back(std::make_pair("afpvec8", "mat2x4"));
custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
}

if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
}
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float16_t(buf[i])"));
custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"));
custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"));
custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"));
custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "f16vec2(unpackHalf2x16(buf[i]))"));
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(vec2(v))}"));
custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"));
custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"));
custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"));
custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"));
custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
}
else if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "float(buf[i])"));
custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"));
custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"));
custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "vec2(buf[i])"));
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"));
custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(buf[i])"));
custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"));
custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"));
custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"));
custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"));
custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"));
custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"));
}
else if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"));
custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"));
custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"));
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v)}"));
custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"));
custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"));
custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"));
custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"));
custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"));
custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"));
custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"));
}
else
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st1(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"));
custom_defines.push_back(std::make_pair("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"));
custom_defines.push_back(std::make_pair("buffer_ld2(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st2(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_ld4(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st4(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"));
custom_defines.push_back(std::make_pair("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"));
custom_defines.push_back(std::make_pair("buffer_ld8(buf,i)", "buf[i]"));
custom_defines.push_back(std::make_pair("buffer_st8(buf,i,v)", "{buf[i]=v;}"));
custom_defines.push_back(std::make_pair("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"));
custom_defines.push_back(std::make_pair("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"));
custom_defines.push_back(std::make_pair("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"));
custom_defines.push_back(std::make_pair("sfp2afpmat4(v)", "v"));
custom_defines.push_back(std::make_pair("afp2sfpmat4(v)", "v"));
}

if (opt.use_image_storage)
{
if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("imfmtc1", "r16f"));
custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
custom_defines.push_back(std::make_pair("unfp", "mediump"));
}
else if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
custom_defines.push_back(std::make_pair("imfmtc4", "rgba16f"));
custom_defines.push_back(std::make_pair("unfp", "mediump"));
}
else
{
custom_defines.push_back(std::make_pair("imfmtc1", "r32f"));
custom_defines.push_back(std::make_pair("imfmtc4", "rgba32f"));
custom_defines.push_back(std::make_pair("unfp", "highp"));
}

if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,vec4(v));}"));
custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"));
custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"));
custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"));
custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
}
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "float16_t(texelFetch(tex,p,0).r)"));
custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "f16vec4(texelFetch(tex,p,0))"));
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
}
else if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
}
else if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
}
else
{
custom_defines.push_back(std::make_pair("image1d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image2d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image3d_ld1(tex,p)", "texelFetch(tex,p,0).r"));
custom_defines.push_back(std::make_pair("image1d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image2d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image3d_st1(img,p,v)", "{vec4 _v;_v.r=v;imageStore(img,p,_v);}"));
custom_defines.push_back(std::make_pair("image1d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp1(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image2d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image3d_ld4(tex,p)", "texelFetch(tex,p,0)"));
custom_defines.push_back(std::make_pair("image1d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image2d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image3d_st4(img,p,v)", "{imageStore(img,p,v);}"));
custom_defines.push_back(std::make_pair("image1d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image3d_cp4(img,p,tex,sp)", "{imageStore(img,p,texelFetch(tex,sp,0));}"));
custom_defines.push_back(std::make_pair("image1d_ld8(tex,p)", "mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"));
custom_defines.push_back(std::make_pair("image2d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"));
custom_defines.push_back(std::make_pair("image3d_ld8(tex,p)", "mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"));
custom_defines.push_back(std::make_pair("image1d_st8(img,p,v)", "{imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"));
custom_defines.push_back(std::make_pair("image2d_st8(img,p,v)", "{imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"));
custom_defines.push_back(std::make_pair("image3d_st8(img,p,v)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"));
custom_defines.push_back(std::make_pair("image1d_cp8(img,p,tex,sp)", "{imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"));
custom_defines.push_back(std::make_pair("image2d_cp8(img,p,tex,sp)", "{imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"));
custom_defines.push_back(std::make_pair("image3d_cp8(img,p,tex,sp)", "{imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"));
}
}

custom_defines.push_back(std::make_pair("psc(x)", "(x==0?p.x:x)"));

if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
}
if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_storage", "1"));
}
if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
}

if (opt.use_image_storage)
{
custom_defines.push_back(std::make_pair("NCNN_image_shader", "1"));
}

std::string preamble;
std::vector<std::string> processes;

processes.resize(custom_defines.size());
for (size_t i = 0; i < custom_defines.size(); i++)
{
const char* key = custom_defines[i].first;
const char* def = custom_defines[i].second;

preamble += std::string("#define ") + key + " " + def + "\n";
processes[i] = std::string("define-macro ") + key + "=" + def;
}

bool compile_success = true;

{
glslang::TShader s(EShLangCompute);

s.setStringsWithLengths(&comp_data, &comp_data_size, 1);

s.setPreamble(preamble.c_str());
s.addProcesses(processes);
s.setEntryPoint("main");
s.setSourceEntryPoint("main");

s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);
s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0);

TBuiltInResource resources = default_TBuiltInResource;

// although vulkan 1.1 accept glsl directly
// ncnn resolve_shader_info() only works with the intermediate spirv code
bool pr = s.parse(&resources, 100, false, EShMsgDefault);
if (!pr)
{
NCNN_LOGE("compile spir-v module failed");
NCNN_LOGE("%s", s.getInfoLog());
NCNN_LOGE("%s", s.getInfoDebugLog());

compile_success = false;
}
else
{
glslang::TIntermediate* ir = s.getIntermediate();
glslang::GlslangToSpv(*ir, spirv);
}
}

return compile_success ? 0 : -1;
}
#endif

#if !NCNN_VULKAN_ONLINE_SPIRV
const ShaderInfo& get_shader_info(int shader_type_index)
{
if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count)
@@ -2035,6 +2591,7 @@ const ShaderInfo& get_shader_info(int shader_type_index)

return layer_shader_infos[shader_type_index];
}
#endif

int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{


+ 13
- 0
src/gpu.h View File

@@ -173,10 +173,12 @@ public:

VkDevice vkdevice() const { return device; }

#if !NCNN_VULKAN_ONLINE_SPIRV
VkShaderModule get_shader_module(int shader_type_index) const;

// with fixed workgroup size
VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
#endif

VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;

@@ -252,9 +254,11 @@ public:
#endif // __ANDROID_API__ >= 26

protected:
#if !NCNN_VULKAN_ONLINE_SPIRV
// shader management
int create_shader_module();
void destroy_shader_module();
#endif

// device extension
int init_device_extension();
@@ -269,7 +273,9 @@ protected:

private:
VkDevice device;
#if !NCNN_VULKAN_ONLINE_SPIRV
std::vector<VkShaderModule> shader_modules;
#endif

// hardware queue
mutable std::vector<VkQueue> compute_queues;
@@ -304,6 +310,11 @@ private:

VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());

#if NCNN_VULKAN_ONLINE_SPIRV
// online spirv compilation
int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
#endif

// info from spirv
class ShaderInfo
{
@@ -319,7 +330,9 @@ public:
int binding_types[16];// 16 is large enough I think ...
};

#if !NCNN_VULKAN_ONLINE_SPIRV
const ShaderInfo& get_shader_info(int shader_type_index);
#endif
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);

} // namespace ncnn


+ 81
- 1
src/pipeline.cpp View File

@@ -79,6 +79,47 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::

int Pipeline::create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations)
{
#if NCNN_VULKAN_ONLINE_SPIRV
std::vector<uint32_t> spirv;
int retc = compile_spirv_module(shader_type_index, opt, spirv);
if (retc != 0)
{
NCNN_LOGE("compile_spirv_module failed %d", retc);
return -1;
}

const uint32_t* spv_data = spirv.data();
size_t spv_data_size = spirv.size() * 4;

ShaderInfo si;
int ret = resolve_shader_info(spv_data, spv_data_size, si);
if (ret != 0)
{
NCNN_LOGE("resolve_shader_info failed %d", ret);
return -1;
}

// -3 for local_size_xyz
int specialization_count_expected = si.specialization_count - 3;
if ((int)specializations.size() != specialization_count_expected)
{
NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size());
return -1;
}

if (vkdev->info.bug_local_size_spec_const)
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}
else
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
}

// NCNN_LOGE("local_shader_module %p created", local_shader_module);

return create(local_shader_module, si, specializations);
#else
// ncnn_add_shader cmake macro
// 0 = fp32
// 1 = fp16p
@@ -148,6 +189,7 @@ int Pipeline::create(int shader_type_index, const Option& opt, const std::vector
VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index);

return create(shader_module, si, specializations);
#endif
}

int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector<vk_specialization_type>& specializations)
@@ -590,6 +632,44 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo

int shader_type_index = LayerShaderType::convert_ycbcr;

#if NCNN_VULKAN_ONLINE_SPIRV
std::vector<uint32_t> spirv;
int retc = compile_spirv_module(shader_type_index, opt, spirv);
if (retc != 0)
{
NCNN_LOGE("compile_spirv_module failed %d", retc);
return -1;
}

const uint32_t* spv_data = spirv.data();
size_t spv_data_size = spirv.size() * 4;

ShaderInfo si;
int ret = resolve_shader_info(spv_data, spv_data_size, si);
if (ret != 0)
{
NCNN_LOGE("resolve_shader_info failed %d", ret);
return -1;
}

// -3 for local_size_xyz
int specialization_count_expected = si.specialization_count - 3;
if ((int)specializations.size() != specialization_count_expected)
{
NCNN_LOGE("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size());
return -1;
}

VkShaderModule shader_module;
if (vkdev->info.bug_local_size_spec_const)
{
shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}
else
{
shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
}
#else
// ncnn_add_shader cmake macro
// 0 = fp32
// 1 = fp16p
@@ -640,7 +720,7 @@ int ImportAndroidHardwareBufferPipeline::create(VkAndroidHardwareBufferImageAllo
}

VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index);
#endif
create_pipeline(shader_module, specializations);

if (vkdev->info.support_VK_KHR_descriptor_update_template)


+ 1
- 0
src/platform.h.in View File

@@ -23,6 +23,7 @@
#cmakedefine01 NCNN_PIXEL
#cmakedefine01 NCNN_PIXEL_ROTATE
#cmakedefine01 NCNN_VULKAN
#cmakedefine01 NCNN_VULKAN_ONLINE_SPIRV
#cmakedefine01 NCNN_REQUANT
#cmakedefine01 NCNN_AVX2



Loading…
Cancel
Save