* wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continuous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader tags/20200616
| @@ -27,14 +27,14 @@ jobs: | |||
| uses: actions/cache@v1 | |||
| with: | |||
| path: swiftshader-install | |||
| key: swiftshader-linux-install | |||
| key: swiftshader-linux-install-20200426-3 | |||
| - name: checkout-swiftshader | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: google/swiftshader | |||
| path: swiftshader | |||
| ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 | |||
| ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a | |||
| - name: checkout-swiftshader-submodules | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| run: | | |||
| @@ -45,7 +45,7 @@ jobs: | |||
| run: | | |||
| cd swiftshader | |||
| mkdir -p build; cd build | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. | |||
| cmake --build . -j 2 | |||
| mkdir $GITHUB_WORKSPACE/swiftshader-install | |||
| cp Linux/* $GITHUB_WORKSPACE/swiftshader-install | |||
| @@ -27,14 +27,14 @@ jobs: | |||
| uses: actions/cache@v1 | |||
| with: | |||
| path: swiftshader-install | |||
| key: swiftshader-linux-install | |||
| key: swiftshader-linux-install-20200426-3 | |||
| - name: checkout-swiftshader | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: google/swiftshader | |||
| path: swiftshader | |||
| ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 | |||
| ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a | |||
| - name: checkout-swiftshader-submodules | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| run: | | |||
| @@ -45,7 +45,7 @@ jobs: | |||
| run: | | |||
| cd swiftshader | |||
| mkdir -p build; cd build | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. | |||
| cmake --build . -j 2 | |||
| mkdir $GITHUB_WORKSPACE/swiftshader-install | |||
| cp Linux/* $GITHUB_WORKSPACE/swiftshader-install | |||
| @@ -25,14 +25,14 @@ jobs: | |||
| uses: actions/cache@v1 | |||
| with: | |||
| path: swiftshader-install | |||
| key: swiftshader-macos-install | |||
| key: swiftshader-macos-install-20200426-3 | |||
| - name: checkout-swiftshader | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: google/swiftshader | |||
| path: swiftshader | |||
| ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 | |||
| ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a | |||
| - name: checkout-swiftshader-submodules | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| run: | | |||
| @@ -43,7 +43,7 @@ jobs: | |||
| run: | | |||
| cd swiftshader | |||
| mkdir -p build; cd build | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. | |||
| cmake --build . -j 2 | |||
| mkdir $GITHUB_WORKSPACE/swiftshader-install | |||
| cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install | |||
| @@ -51,16 +51,11 @@ jobs: | |||
| run: export VULKAN_SDK=`pwd`/vulkansdk-macos-1.1.114.0/macOS && mkdir build && cd build && cmake -DNCNN_VULKAN=ON .. | |||
| - name: build | |||
| run: cmake --build build -j 2 | |||
| # - name: test | |||
| # run: | | |||
| # find "swiftshader-install/" | |||
| # find "vulkansdk-macos-1.1.114.0/" | |||
| # export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH | |||
| # export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" | |||
| # ./vulkansdk-macos-1.1.114.0/macOS/bin/vulkaninfo | |||
| # cd build && ctest --output-on-failure -j 2 | |||
| # export VK_ICD_FILENAMES="vulkansdk-macos-1.1.114.0/macOS/etc/vulkan/icd.d/MoltenVK_icd.json" | |||
| # cd build && ctest --output-on-failure -j 2 | |||
| - name: test | |||
| run: | | |||
| export DYLD_LIBRARY_PATH="vulkansdk-macos-1.1.114.0/macOS/lib":$DYLD_LIBRARY_PATH | |||
| export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" | |||
| cd build && ctest --output-on-failure -j 2 | |||
| macos-clang-gpu-nostdio: | |||
| runs-on: macos-latest | |||
| @@ -25,14 +25,14 @@ jobs: | |||
| uses: actions/cache@v1 | |||
| with: | |||
| path: swiftshader-install | |||
| key: swiftshader-linux-install | |||
| key: swiftshader-linux-install-20200426-3 | |||
| - name: checkout-swiftshader | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: google/swiftshader | |||
| path: swiftshader | |||
| ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 | |||
| ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a | |||
| - name: checkout-swiftshader-submodules | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| run: | | |||
| @@ -43,7 +43,7 @@ jobs: | |||
| run: | | |||
| cd swiftshader | |||
| mkdir -p build; cd build | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. | |||
| cmake --build . -j 2 | |||
| mkdir $GITHUB_WORKSPACE/swiftshader-install | |||
| cp Linux/* $GITHUB_WORKSPACE/swiftshader-install | |||
| @@ -37,14 +37,14 @@ jobs: | |||
| uses: actions/cache@v1 | |||
| with: | |||
| path: swiftshader-install | |||
| key: swiftshader-windows-install | |||
| key: swiftshader-windows-install-20200426-3 | |||
| - name: checkout-swiftshader | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| uses: actions/checkout@v2 | |||
| with: | |||
| repository: google/swiftshader | |||
| path: swiftshader | |||
| ref: 59465799210b3f4962af1a9dc44a4ffecb422c10 | |||
| ref: 60aa34a990fa77553e2d9a69d34f0b3601ced66a | |||
| - name: checkout-swiftshader-submodules | |||
| if: steps.cache-swiftshader.outputs.cache-hit != 'true' | |||
| run: | | |||
| @@ -55,7 +55,7 @@ jobs: | |||
| run: | | |||
| cd swiftshader | |||
| mkdir build-vs2019; cd build-vs2019 | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DBUILD_EGL=0 -DBUILD_GLESv2=0 -DBUILD_GLES_CM=0 -DBUILD_VULKAN=1 -DBUILD_SAMPLES=0 -DBUILD_TESTS=0 -DWARNINGS_AS_ERRORS=0 .. | |||
| cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_DEFAULT_OPT_LEVEL=None -DCMAKE_BUILD_TYPE=Release .. | |||
| cmake --build . --config Release -j 2 | |||
| mkdir "$env:GITHUB_WORKSPACE/swiftshader-install" | |||
| Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install" | |||
| @@ -188,8 +188,8 @@ int main(int argc, char** argv) | |||
| g_vkdev = ncnn::get_gpu_device(gpu_device); | |||
| g_blob_vkallocator = new ncnn::VkBlobBufferAllocator(g_vkdev); | |||
| g_staging_vkallocator = new ncnn::VkStagingBufferAllocator(g_vkdev); | |||
| g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev); | |||
| g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev); | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| @@ -214,6 +214,11 @@ int main(int argc, char** argv) | |||
| opt.use_int8_storage = true; | |||
| opt.use_int8_arithmetic = true; | |||
| opt.use_packing_layout = true; | |||
| opt.use_shader_pack8 = false; | |||
| opt.use_image_storage = false; | |||
| opt.use_image_fp16_packed = true; | |||
| opt.use_image_fp16_storage = true; | |||
| opt.use_image_fp16_arithmetic = true; | |||
| ncnn::set_cpu_powersave(powersave); | |||
| @@ -184,6 +184,296 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER | |||
| ) | |||
| set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) | |||
| # image + fp32 | |||
| set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image") | |||
| set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h) | |||
| add_custom_command( | |||
| OUTPUT ${SHADER_image_SPV_HEX_FILE} | |||
| COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | |||
| ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4 | |||
| -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 | |||
| -Dimfmtc1=r32f -Dimfmtc4=rgba32f | |||
| -Dunfp=highp | |||
| "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" | |||
| "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" | |||
| "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" | |||
| "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" | |||
| "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" | |||
| "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" | |||
| "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" | |||
| "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" | |||
| "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" | |||
| "-D buffer_ld1(buf,i)=buf[i]" | |||
| "-D buffer_st1(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" | |||
| "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" | |||
| "-D buffer_ld2(buf,i)=buf[i]" | |||
| "-D buffer_st2(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_ld4(buf,i)=buf[i]" | |||
| "-D buffer_st4(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}" | |||
| "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}" | |||
| "-D buffer_ld8(buf,i)=buf[i]" | |||
| "-D buffer_st8(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" | |||
| "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" | |||
| "-D sfp2afpmat4(v)=v" | |||
| "-D afp2sfpmat4(v)=v" | |||
| "-D psc(x)=(x==0?p.x:x)" | |||
| -DNCNN_image_shader=1 | |||
| -V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC} | |||
| DEPENDS ${SHADER_SRC} | |||
| COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv" | |||
| VERBATIM | |||
| ) | |||
| set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) | |||
| # image + fp16p | |||
| set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p") | |||
| set(SHADER_image_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h) | |||
| add_custom_command( | |||
| OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE} | |||
| COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | |||
| ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 | |||
| -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 | |||
| -Dimfmtc1=r32f -Dimfmtc4=rgba16f | |||
| -Dunfp=mediump | |||
| "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" | |||
| "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" | |||
| "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" | |||
| "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" | |||
| "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" | |||
| "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" | |||
| "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" | |||
| "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" | |||
| "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" | |||
| "-D buffer_ld1(buf,i)=buf[i]" | |||
| "-D buffer_st1(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}" | |||
| "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}" | |||
| "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])" | |||
| # fp16p pack2 store: convert the vec2 to a packed half pair; the statement must end with ';' inside the braces | |||
| "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v);}" | |||
| "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))" | |||
| "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}" | |||
| "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" | |||
| "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" | |||
| "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))" | |||
| "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}" | |||
| "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" | |||
| "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" | |||
| "-D psc(x)=(x==0?p.x:x)" | |||
| -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 | |||
| -V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC} | |||
| DEPENDS ${SHADER_SRC} | |||
| COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv" | |||
| VERBATIM | |||
| ) | |||
| set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) | |||
| # image + fp16s | |||
| set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s") | |||
| set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h) | |||
| add_custom_command( | |||
| OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE} | |||
| COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | |||
| ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 | |||
| -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 | |||
| -Dimfmtc1=r16f -Dimfmtc4=rgba16f | |||
| -Dunfp=mediump | |||
| "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" | |||
| "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" | |||
| "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" | |||
| "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" | |||
| "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" | |||
| "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" | |||
| "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" | |||
| "-D image1d_st8(img,p,v)={imageStore(img,p*2,v[0]);imageStore(img,p*2+1,v[1]);}" | |||
| "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" | |||
| "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" | |||
| "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" | |||
| "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" | |||
| "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" | |||
| "-D buffer_ld1(buf,i)=float(buf[i])" | |||
| "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}" | |||
| "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}" | |||
| "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}" | |||
| "-D buffer_ld2(buf,i)=vec2(buf[i])" | |||
| "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}" | |||
| "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_ld4(buf,i)=vec4(buf[i])" | |||
| "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}" | |||
| "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" | |||
| "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}" | |||
| "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))" | |||
| "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}" | |||
| "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}" | |||
| "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}" | |||
| "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}" | |||
| "-D sfp2afpmat4(v)=v" | |||
| "-D afp2sfpmat4(v)=v" | |||
| "-D psc(x)=(x==0?p.x:x)" | |||
| -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 | |||
| -V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC} | |||
| DEPENDS ${SHADER_SRC} | |||
| COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv" | |||
| VERBATIM | |||
| ) | |||
| set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) | |||
| # image + fp16a | |||
| set(SHADER_image_fp16a_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16a") | |||
| set(SHADER_image_fp16a_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h) | |||
| add_custom_command( | |||
| OUTPUT ${SHADER_image_fp16a_SPV_HEX_FILE} | |||
| COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | |||
| ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4 | |||
| -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 | |||
| -Dimfmtc1=r16f -Dimfmtc4=rgba16f | |||
| -Dunfp=mediump | |||
| "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" | |||
| "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" | |||
| "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" | |||
| "-D image1d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" | |||
| "-D image2d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" | |||
| "-D image3d_st1(img,p,v)={vec4 _v;_v.r=float(v);imageStore(img,p,_v);}" | |||
| "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" | |||
| "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" | |||
| "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" | |||
| "-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}" | |||
| "-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}" | |||
| "-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}" | |||
| "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" | |||
| "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,p*2,0),texelFetch(tex,p*2+1,0))" | |||
| "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" | |||
| "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" | |||
| "-D image1d_st8(img,p,v)={imageStore(img,p*2,vec4(v[0]));imageStore(img,p*2+1,vec4(v[1]));}" | |||
| "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}" | |||
| "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}" | |||
| "-D image1d_cp8(img,p,tex,sp)={imageStore(img,p*2,texelFetch(tex,sp*2,0));imageStore(img,p*2+1,texelFetch(tex,sp*2+1,0));}" | |||
| "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" | |||
| "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" | |||
| "-D buffer_ld1(buf,i)=buf[i]" | |||
| "-D buffer_st1(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" | |||
| "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" | |||
| "-D buffer_ld2(buf,i)=buf[i]" | |||
| "-D buffer_st2(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_ld4(buf,i)=buf[i]" | |||
| "-D buffer_st4(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" | |||
| "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}" | |||
| "-D buffer_ld8(buf,i)=buf[i]" | |||
| "-D buffer_st8(buf,i,v)={buf[i]=v;}" | |||
| "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" | |||
| "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" | |||
| "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" | |||
| "-D sfp2afpmat4(v)=v" | |||
| "-D afp2sfpmat4(v)=v" | |||
| "-D psc(x)=(x==0?p.x:x)" | |||
| -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1 | |||
| -V -s -x -o ${SHADER_image_fp16a_SPV_HEX_FILE} ${SHADER_SRC} | |||
| DEPENDS ${SHADER_SRC} | |||
| COMMENT "Building SPIR-V module ${SHADER_image_fp16a_SRC_NAME_WE}.spv" | |||
| VERBATIM | |||
| ) | |||
| set_source_files_properties(${SHADER_image_fp16a_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) | |||
| set(LOCAL_SHADER_SPV_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h) | |||
| file(WRITE ${LOCAL_SHADER_SPV_HEADER} | |||
| @@ -192,6 +482,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER | |||
| "static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| "static const uint32_t ${SHADER_image_fp16a_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16a_SRC_NAME_WE}.spv.hex.h\"\n};\n" | |||
| ) | |||
| set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE) | |||
| @@ -202,6 +496,10 @@ function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADER | |||
| ${SHADER_fp16pa_SPV_HEX_FILE} | |||
| ${SHADER_fp16s_SPV_HEX_FILE} | |||
| ${SHADER_fp16sa_SPV_HEX_FILE} | |||
| ${SHADER_image_SPV_HEX_FILE} | |||
| ${SHADER_image_fp16p_SPV_HEX_FILE} | |||
| ${SHADER_image_fp16s_SPV_HEX_FILE} | |||
| ${SHADER_image_fp16a_SPV_HEX_FILE} | |||
| ) | |||
| set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE) | |||
| @@ -141,10 +141,10 @@ ncnn::create_gpu_instance(); | |||
| { | |||
| ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); | |||
| ncnn::VkWeightBufferAllocator g_weight_vkallocator(vkdev); | |||
| ncnn::VkBlobBufferAllocator g_blob_vkallocator(vkdev); | |||
| ncnn::VkStagingBufferAllocator g_staging_vkallocator(vkdev); | |||
| ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev); | |||
| ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); | |||
| ncnn::VkBlobAllocator g_blob_vkallocator(vkdev); | |||
| ncnn::VkStagingAllocator g_staging_vkallocator(vkdev); | |||
| ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); | |||
| // create layer | |||
| ncnn::Layer* convolution = ncnn::create_layer("Convolution"); | |||
| @@ -60,6 +60,10 @@ macro(ncnn_add_shader SHADER_SRC) | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") | |||
| string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16a_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16a_spv_data)},\n") | |||
| list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) | |||
| list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) | |||
| @@ -75,6 +79,14 @@ macro(ncnn_add_shader SHADER_SRC) | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16a = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") | |||
| math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") | |||
| endmacro() | |||
| macro(ncnn_add_layer class) | |||
| @@ -199,17 +199,54 @@ public: | |||
| int refcount; | |||
| }; | |||
| class VkImageMemory | |||
| { | |||
| public: | |||
| VkImage image; | |||
| VkImageView imageview; | |||
| // underlying info assigned by allocator | |||
| VkImageType image_type; | |||
| VkImageViewType imageview_type; | |||
| int width; | |||
| int height; | |||
| int depth; | |||
| VkFormat format; | |||
| VkDeviceMemory memory; | |||
| void* mapped_ptr; | |||
| // the base offset assigned by allocator | |||
| size_t bind_offset; | |||
| size_t bind_capacity; | |||
| // image state, modified by command functions internally | |||
| mutable VkAccessFlags access_flags; | |||
| mutable VkImageLayout image_layout; | |||
| mutable VkPipelineStageFlags stage_flags; | |||
| // in-execution state, modified by command functions internally | |||
| mutable int command_refcount; | |||
| // initialize and modified by mat | |||
| int refcount; | |||
| }; | |||
| class VkAllocator | |||
| { | |||
| public: | |||
| VkAllocator(const VulkanDevice* _vkdev); | |||
| virtual ~VkAllocator() { clear(); } | |||
| virtual void clear() {} | |||
| virtual VkBufferMemory* fastMalloc(size_t size) = 0; | |||
| virtual void fastFree(VkBufferMemory* ptr) = 0; | |||
| virtual int flush(VkBufferMemory* ptr); | |||
| virtual int invalidate(VkBufferMemory* ptr); | |||
| virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) = 0; | |||
| virtual void fastFree(VkImageMemory* ptr) = 0; | |||
| public: | |||
| const VulkanDevice* vkdev; | |||
| uint32_t memory_type_index; | |||
| @@ -219,14 +256,17 @@ public: | |||
| protected: | |||
| VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); | |||
| VkDeviceMemory allocate_memory(size_t size); | |||
| VkDeviceMemory allocate_dedicated_memory(size_t size, VkBuffer buffer); | |||
| VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image, VkBuffer buffer); | |||
| VkImage create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage); | |||
| VkImageView create_imageview(VkImageViewType type, VkImage image, VkFormat format); | |||
| }; | |||
| class VkBlobBufferAllocator : public VkAllocator | |||
| class VkBlobAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkBlobBufferAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkBlobBufferAllocator(); | |||
| VkBlobAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkBlobAllocator(); | |||
| public: | |||
| // release all budgets immediately | |||
| @@ -234,19 +274,24 @@ public: | |||
| virtual VkBufferMemory* fastMalloc(size_t size); | |||
| virtual void fastFree(VkBufferMemory* ptr); | |||
| virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); | |||
| virtual void fastFree(VkImageMemory* ptr); | |||
| private: | |||
| size_t block_size; | |||
| size_t buffer_offset_alignment; | |||
| std::vector< std::list< std::pair<size_t, size_t> > > budgets; | |||
| size_t bind_memory_offset_alignment; | |||
| std::vector< std::list< std::pair<size_t, size_t> > > buffer_budgets; | |||
| std::vector<VkBufferMemory*> buffer_blocks; | |||
| std::vector< std::list< std::pair<size_t, size_t> > > image_memory_budgets; | |||
| std::vector<VkDeviceMemory> image_memory_blocks; | |||
| }; | |||
| class VkWeightBufferAllocator : public VkAllocator | |||
| class VkWeightAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkWeightBufferAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkWeightBufferAllocator(); | |||
| VkWeightAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkWeightAllocator(); | |||
| public: | |||
| // release all blocks immediately | |||
| @@ -255,20 +300,26 @@ public: | |||
| public: | |||
| virtual VkBufferMemory* fastMalloc(size_t size); | |||
| virtual void fastFree(VkBufferMemory* ptr); | |||
| virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); | |||
| virtual void fastFree(VkImageMemory* ptr); | |||
| private: | |||
| size_t block_size; | |||
| size_t buffer_offset_alignment; | |||
| size_t bind_memory_offset_alignment; | |||
| std::vector<size_t> buffer_block_free_spaces; | |||
| std::vector<VkBufferMemory*> buffer_blocks; | |||
| std::vector<VkBufferMemory*> dedicated_buffer_blocks; | |||
| std::vector<size_t> image_memory_block_free_spaces; | |||
| std::vector<VkDeviceMemory> image_memory_blocks; | |||
| std::vector<VkDeviceMemory> dedicated_image_memory_blocks; | |||
| }; | |||
| class VkStagingBufferAllocator : public VkAllocator | |||
| class VkStagingAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkStagingBufferAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkStagingBufferAllocator(); | |||
| VkStagingAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkStagingAllocator(); | |||
| public: | |||
| // ratio range 0 ~ 1 | |||
| @@ -280,82 +331,42 @@ public: | |||
| virtual VkBufferMemory* fastMalloc(size_t size); | |||
| virtual void fastFree(VkBufferMemory* ptr); | |||
| virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); | |||
| virtual void fastFree(VkImageMemory* ptr); | |||
| private: | |||
| unsigned int size_compare_ratio;// 0~256 | |||
| std::list<VkBufferMemory*> budgets; | |||
| std::list<VkBufferMemory*> buffer_budgets; | |||
| }; | |||
| class VkWeightStagingBufferAllocator : public VkAllocator | |||
| class VkWeightStagingAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkWeightStagingBufferAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkWeightStagingBufferAllocator(); | |||
| VkWeightStagingAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkWeightStagingAllocator(); | |||
| public: | |||
| virtual VkBufferMemory* fastMalloc(size_t size); | |||
| virtual void fastFree(VkBufferMemory* ptr); | |||
| virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; } | |||
| virtual void fastFree(VkImageMemory* /*ptr*/) {} | |||
| private: | |||
| }; | |||
| class VkImageMemory | |||
| { | |||
| public: | |||
| VkImage image; | |||
| VkImageView imageview; | |||
| VkDeviceMemory memory; | |||
| // image state, modified by command functions internally | |||
| mutable VkAccessFlags access_flags; | |||
| mutable VkPipelineStageFlags stage_flags; | |||
| // initialize and modified by mat | |||
| int refcount; | |||
| }; | |||
| class VkImageAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkImageAllocator(const VulkanDevice* _vkdev); | |||
| virtual ~VkImageAllocator() { clear(); } | |||
| virtual void clear() {} | |||
| virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format) = 0; | |||
| virtual void fastFree(VkImageMemory* ptr) = 0; | |||
| protected: | |||
| virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; } | |||
| virtual void fastFree(VkBufferMemory* /*ptr*/) {} | |||
| protected: | |||
| VkImage create_image(int width, int height, VkFormat format, VkImageUsageFlags usage); | |||
| VkImageView create_imageview(VkImage image, VkFormat format); | |||
| VkDeviceMemory allocate_dedicated_memory(size_t size, VkImage image); | |||
| }; | |||
| class VkSimpleImageAllocator : public VkImageAllocator | |||
| { | |||
| public: | |||
| VkSimpleImageAllocator(const VulkanDevice* vkdev); | |||
| virtual ~VkSimpleImageAllocator(); | |||
| public: | |||
| virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format); | |||
| virtual void fastFree(VkImageMemory* ptr); | |||
| }; | |||
| #if __ANDROID_API__ >= 26 | |||
| class ImportAndroidHardwareBufferPipeline; | |||
| class VkAndroidHardwareBufferImageAllocator : public VkImageAllocator | |||
| class VkAndroidHardwareBufferImageAllocator : public VkAllocator | |||
| { | |||
| public: | |||
| VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb); | |||
| virtual ~VkAndroidHardwareBufferImageAllocator(); | |||
| public: | |||
| virtual VkImageMemory* fastMalloc(int width, int height, VkFormat format); | |||
| virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack); | |||
| virtual void fastFree(VkImageMemory* ptr); | |||
| virtual VkBufferMemory* fastMalloc(size_t /*size*/) { return 0; } | |||
| virtual void fastFree(VkBufferMemory* /*ptr*/) {} | |||
| public: | |||
| int init(); | |||
| @@ -35,18 +35,44 @@ public: | |||
| public: | |||
| void record_upload(const Mat& src, VkMat& dst, const Option& opt); | |||
| void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); | |||
| void record_download(const VkMat& src, Mat& dst, const Option& opt); | |||
| void record_download(const VkImageMat& src, Mat& dst, const Option& opt); | |||
| void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt); | |||
| void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt); | |||
| void record_clone(const Mat& src, VkMat& dst, const Option& opt); | |||
| void record_clone(const Mat& src, VkImageMat& dst, const Option& opt); | |||
| void record_clone(const VkMat& src, Mat& dst, const Option& opt); | |||
| void record_clone(const VkImageMat& src, Mat& dst, const Option& opt); | |||
| void record_clone(const VkMat& src, VkMat& dst, const Option& opt); | |||
| void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt); | |||
| void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt); | |||
| void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt); | |||
| void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); | |||
| void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); | |||
| #if NCNN_BENCHMARK | |||
| void record_write_timestamp(uint32_t query); | |||
| #endif // NCNN_BENCHMARK | |||
| #if __ANDROID_API__ >= 26 | |||
| void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst); | |||
| void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst); | |||
| #endif // __ANDROID_API__ >= 26 | |||
| int submit_and_wait(); | |||
| @@ -75,8 +101,11 @@ protected: | |||
| std::vector<VkMat> upload_staging_buffers; | |||
| std::vector<VkMat> download_post_buffers; | |||
| std::vector<Mat> download_post_mats_fp16; | |||
| std::vector<Mat> download_post_mats; | |||
| std::vector<VkImageMemory*> image_blocks_to_destroy; | |||
| // the good-old path for device without VK_KHR_push_descriptor | |||
| std::vector<VkDescriptorPool> descriptor_pools; | |||
| std::vector<VkDescriptorSet> descriptorsets; | |||
| @@ -86,6 +115,9 @@ protected: | |||
| enum | |||
| { | |||
| TYPE_copy_buffer, | |||
| TYPE_copy_image, | |||
| TYPE_copy_buffer_to_image, | |||
| TYPE_copy_image_to_buffer, | |||
| TYPE_bind_pipeline, | |||
| TYPE_bind_descriptorsets, | |||
| TYPE_push_constants, | |||
| @@ -99,6 +131,7 @@ protected: | |||
| #endif // NCNN_BENCHMARK | |||
| TYPE_post_download, | |||
| TYPE_post_cast_float16_to_float32, | |||
| }; | |||
| int type; | |||
| @@ -107,6 +140,9 @@ protected: | |||
| union | |||
| { | |||
| struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; | |||
| struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; | |||
| struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; | |||
| struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; | |||
| struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; | |||
| struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; | |||
| @@ -122,7 +158,8 @@ protected: | |||
| struct { uint32_t query; } write_timestamp; | |||
| #endif // NCNN_BENCHMARK | |||
| struct { uint32_t download_post_buffer_mat_offset; } post_download; | |||
| struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; | |||
| struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32; | |||
| }; | |||
| }; | |||
| @@ -143,6 +180,8 @@ public: | |||
| public: | |||
| void record_upload(const Mat& src, VkMat& dst, const Option& opt); | |||
| void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); | |||
| int submit_and_wait(); | |||
| protected: | |||
| @@ -34,8 +34,13 @@ layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) uniform sampler2D android_hardware_buffer_image; | |||
| #if NCNN_image_shader | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D vkmat_blob; | |||
| layout (binding = 2, imfmtc4) writeonly uniform unfp image3D vkmat_pack4_blob; | |||
| #else | |||
| layout (binding = 1) writeonly buffer vkmat_blob { sfp vkmat_blob_data[]; }; | |||
| layout (binding = 2) writeonly buffer vkmat_pack4_blob { sfpvec4 vkmat_pack4_blob_data[]; }; | |||
| #endif | |||
| void main() | |||
| { | |||
| @@ -108,51 +113,75 @@ void main() | |||
| if (type_to == 1) // PIXEL_RGB | |||
| { | |||
| #if NCNN_image_shader | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.r); | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g); | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.b); | |||
| #else | |||
| ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep; | |||
| buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.r)); | |||
| buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g)); | |||
| buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.b)); | |||
| #endif | |||
| } | |||
| if (type_to == 2) // PIXEL_BGR | |||
| { | |||
| #if NCNN_image_shader | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 0), rgb.b); | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 1), rgb.g); | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 2), rgb.r); | |||
| #else | |||
| ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep; | |||
| buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.b)); | |||
| buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g)); | |||
| buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.r)); | |||
| #endif | |||
| } | |||
| if (type_to == 3) // PIXEL_GRAY | |||
| { | |||
| int v_offset = gy * outw + gx; | |||
| // coeffs for r g b = 0.299f, 0.587f, 0.114f | |||
| float v = clamp(rgb.r * 0.299f + rgb.g * 0.587f + rgb.b * 0.114f, 0.f, 255.f); | |||
| #if NCNN_image_shader | |||
| image3d_st1(vkmat_blob, ivec3(gx, gy, 0), v); | |||
| #else | |||
| int v_offset = gy * outw + gx; | |||
| buffer_st1(vkmat_blob_data, v_offset, afp(v)); | |||
| #endif | |||
| } | |||
| if (type_to == 4) // PIXEL_RGBA | |||
| { | |||
| int v_offset = gy * outw + gx; | |||
| vec4 rgba; | |||
| rgba.rgb = rgb; | |||
| rgba.a = 255.f; | |||
| #if NCNN_image_shader | |||
| image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba); | |||
| #else | |||
| int v_offset = gy * outw + gx; | |||
| buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba)); | |||
| #endif | |||
| } | |||
| if (type_to == 5) // PIXEL_BGRA | |||
| { | |||
| int v_offset = gy * outw + gx; | |||
| vec4 rgba; | |||
| rgba.bgr = rgb; | |||
| rgba.a = 255.f; | |||
| #if NCNN_image_shader | |||
| image3d_st4(vkmat_pack4_blob, ivec3(gx, gy, 0), rgba); | |||
| #else | |||
| int v_offset = gy * outw + gx; | |||
| buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba)); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -27,6 +27,9 @@ | |||
| #include <vector> | |||
| #include "mat.h" | |||
| #include "command.h" | |||
| #include "layer_type.h" | |||
| #include "layer.h" | |||
| #if __ANDROID__ | |||
| #define ENABLE_VALIDATION_LAYER 0 | |||
| @@ -603,6 +606,10 @@ int create_gpu_instance() | |||
| gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment; | |||
| gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment; | |||
| gpu_info.non_coherent_atom_size = physicalDeviceProperties.limits.nonCoherentAtomSize; | |||
| gpu_info.buffer_image_granularity = physicalDeviceProperties.limits.bufferImageGranularity; | |||
| gpu_info.max_image_dimension_1d = physicalDeviceProperties.limits.maxImageDimension1D; | |||
| gpu_info.max_image_dimension_2d = physicalDeviceProperties.limits.maxImageDimension2D; | |||
| gpu_info.max_image_dimension_3d = physicalDeviceProperties.limits.maxImageDimension3D; | |||
| gpu_info.timestamp_period = physicalDeviceProperties.limits.timestampPeriod; | |||
| @@ -810,6 +817,54 @@ int create_gpu_instance() | |||
| gpu_info.support_fp16_arithmetic = true; | |||
| } | |||
| // check format | |||
| gpu_info.support_image_storage = false; | |||
| gpu_info.support_image_fp16_packed = false; | |||
| gpu_info.support_image_fp16_storage = false; | |||
| gpu_info.support_image_fp16_arithmetic = false; | |||
| { | |||
| VkFormatProperties r32f_formatProperties; | |||
| VkFormatProperties rgba32f_formatProperties; | |||
| vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32_SFLOAT, &r32f_formatProperties); | |||
| vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R32G32B32A32_SFLOAT, &rgba32f_formatProperties); | |||
| if ((r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) | |||
| && (r32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) | |||
| && (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) | |||
| && (rgba32f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) | |||
| gpu_info.support_image_storage = true; | |||
| } | |||
| { | |||
| VkFormatProperties rgba16f_formatProperties; | |||
| vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties); | |||
| if ((rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) | |||
| && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) | |||
| gpu_info.support_image_fp16_packed = true; | |||
| } | |||
| { | |||
| VkFormatProperties r16f_formatProperties; | |||
| VkFormatProperties rgba16f_formatProperties; | |||
| vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16_SFLOAT, &r16f_formatProperties); | |||
| vkGetPhysicalDeviceFormatProperties(physicalDevice, VK_FORMAT_R16G16B16A16_SFLOAT, &rgba16f_formatProperties); | |||
| if ((r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) | |||
| && (r16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) | |||
| && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) | |||
| && (rgba16f_formatProperties.optimalTilingFeatures & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) | |||
| gpu_info.support_image_fp16_storage = true; | |||
| } | |||
| if (gpu_info.support_fp16_arithmetic) | |||
| { | |||
| gpu_info.support_image_fp16_arithmetic = true; | |||
| } | |||
| if (physicalDeviceProperties.vendorID == 0x1ae0 && physicalDeviceProperties.deviceID == 0xc0de) | |||
| { | |||
| // swiftshader image r16f is not supported | |||
| gpu_info.support_image_fp16_storage = false; | |||
| } | |||
| fprintf(stderr, "[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName, | |||
| gpu_info.compute_queue_family_index, gpu_info.compute_queue_count, | |||
| gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count, | |||
| @@ -822,6 +877,10 @@ int create_gpu_instance() | |||
| gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, | |||
| gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); | |||
| fprintf(stderr, "[%u %s] imgfp32=%d imgfp16p=%d imgfp16s=%d imgfp16a=%d\n", i, physicalDeviceProperties.deviceName, | |||
| gpu_info.support_image_storage, gpu_info.support_image_fp16_packed, | |||
| gpu_info.support_image_fp16_storage, gpu_info.support_image_fp16_arithmetic); | |||
| gpu_info_index++; | |||
| } | |||
| @@ -833,7 +892,7 @@ int create_gpu_instance() | |||
| // resolve shader info | |||
| for (int i=0; i<layer_shader_registry_entry_count; i++) | |||
| { | |||
| layer_shader_infos[i] = resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size); | |||
| resolve_shader_info(layer_shader_registry[i].spv_data, layer_shader_registry[i].spv_data_size, layer_shader_infos[i]); | |||
| } | |||
| return 0; | |||
| @@ -1043,8 +1102,8 @@ VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index]) | |||
| for (uint32_t i = 0; i < info.compute_queue_count; i++) | |||
| { | |||
| vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]); | |||
| blob_allocators[i] = new VkBlobBufferAllocator(this); | |||
| staging_allocators[i] = new VkStagingBufferAllocator(this); | |||
| blob_allocators[i] = new VkBlobAllocator(this); | |||
| staging_allocators[i] = new VkStagingAllocator(this); | |||
| } | |||
| if (info.compute_queue_family_index != info.graphics_queue_family_index) | |||
| { | |||
| @@ -1062,10 +1121,49 @@ VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index]) | |||
| vkGetDeviceQueue(device, info.transfer_queue_family_index, i, &transfer_queues[i]); | |||
| } | |||
| } | |||
| // prepare immutable texelfetch sampler | |||
| { | |||
| VkSamplerCreateInfo samplerCreateInfo; | |||
| samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; | |||
| samplerCreateInfo.pNext = 0; | |||
| samplerCreateInfo.flags = 0; | |||
| samplerCreateInfo.magFilter = VK_FILTER_NEAREST; | |||
| samplerCreateInfo.minFilter = VK_FILTER_NEAREST; | |||
| samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; | |||
| samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; | |||
| samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; | |||
| samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; | |||
| samplerCreateInfo.mipLodBias = 0.0f; | |||
| samplerCreateInfo.anisotropyEnable = VK_FALSE; | |||
| samplerCreateInfo.maxAnisotropy = 1; | |||
| samplerCreateInfo.compareEnable = VK_FALSE; | |||
| samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER; | |||
| samplerCreateInfo.minLod = 0.0f; | |||
| samplerCreateInfo.maxLod = 0.0f; | |||
| samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; | |||
| samplerCreateInfo.unnormalizedCoordinates = VK_TRUE; | |||
| texelfetch_sampler = 0; | |||
| ret = vkCreateSampler(device, &samplerCreateInfo, 0, &texelfetch_sampler); | |||
| if (ret != VK_SUCCESS) | |||
| { | |||
| fprintf(stderr, "vkCreateSampler failed %d\n", ret); | |||
| } | |||
| } | |||
| create_utility_operator(); | |||
| } | |||
| VulkanDevice::~VulkanDevice() | |||
| { | |||
| destroy_utility_operator(); | |||
| if (texelfetch_sampler) | |||
| { | |||
| vkDestroySampler(device, texelfetch_sampler, 0); | |||
| } | |||
| for (uint32_t i = 0; i < info.compute_queue_count; i++) | |||
| { | |||
| delete blob_allocators[i]; | |||
| @@ -1436,40 +1534,69 @@ void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const | |||
| fprintf(stderr, "FATAL ERROR! reclaim_staging_allocator get wild allocator %p\n", allocator); | |||
| } | |||
| static inline bool string_ends_with_fp16p(const char* name) | |||
| const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const | |||
| { | |||
| int len = strlen(name); | |||
| if (len < 6) | |||
| return false; | |||
| return &texelfetch_sampler; | |||
| } | |||
| return memcmp(name + len - 6, "_fp16p", 6) == 0; | |||
| void VulkanDevice::cast_float32_to_float16(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; | |||
| uop_cast_float32_to_float16[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| static inline bool string_ends_with_fp16pa(const char* name) | |||
| void VulkanDevice::cast_float32_to_float16(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int len = strlen(name); | |||
| if (len < 7) | |||
| return false; | |||
| int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; | |||
| uop_cast_float32_to_float16[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| return memcmp(name + len - 7, "_fp16pa", 7) == 0; | |||
| void VulkanDevice::cast_float16_to_float32(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; | |||
| uop_cast_float16_to_float32[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| static inline bool string_ends_with_fp16s(const char* name) | |||
| void VulkanDevice::cast_float16_to_float32(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int len = strlen(name); | |||
| if (len < 6) | |||
| return false; | |||
| int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; | |||
| uop_cast_float16_to_float32[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| return memcmp(name + len - 6, "_fp16s", 6) == 0; | |||
| void VulkanDevice::packing_pack1(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; | |||
| uop_packing_pack1[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| static inline bool string_ends_with_fp16sa(const char* name) | |||
| void VulkanDevice::packing_pack1(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int len = strlen(name); | |||
| if (len < 7) | |||
| return false; | |||
| int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; | |||
| uop_packing_pack1[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| return memcmp(name + len - 7, "_fp16sa", 7) == 0; | |||
| void VulkanDevice::packing_pack4(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; | |||
| uop_packing_pack4[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| void VulkanDevice::packing_pack4(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; | |||
| uop_packing_pack4[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| void VulkanDevice::packing_pack8(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0; | |||
| uop_packing_pack8[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| void VulkanDevice::packing_pack8(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int uoi = opt.use_image_fp16_storage ? 5 : opt.use_image_fp16_packed ? 4 : 3; | |||
| uop_packing_pack8[uoi]->forward(src, dst, cmd, opt); | |||
| } | |||
| int VulkanDevice::create_shader_module() | |||
| @@ -1490,28 +1617,56 @@ int VulkanDevice::create_shader_module() | |||
| // 2 = fp16pa | |||
| // 3 = fp16s | |||
| // 4 = fp16sa | |||
| // 5 = image | |||
| // 6 = image_fp16p | |||
| // 7 = image_fp16s | |||
| // 8 = image_fp16a | |||
| if (!info.support_fp16_packed) | |||
| { | |||
| if (i % 5 == 1) | |||
| if (i % 9 == 1) | |||
| continue; | |||
| } | |||
| if (!info.support_fp16_packed || !info.support_fp16_arithmetic) | |||
| { | |||
| if (i % 5 == 2) | |||
| if (i % 9 == 2) | |||
| continue; | |||
| } | |||
| if (!info.support_fp16_storage) | |||
| { | |||
| if (i % 5 == 3) | |||
| if (i % 9 == 3) | |||
| continue; | |||
| } | |||
| if (!info.support_fp16_storage || !info.support_fp16_arithmetic) | |||
| { | |||
| if (i % 5 == 4) | |||
| if (i % 9 == 4) | |||
| continue; | |||
| } | |||
| if (!info.support_image_storage) | |||
| { | |||
| if (i % 9 == 5) | |||
| continue; | |||
| } | |||
| if (!info.support_image_storage || !info.support_image_fp16_packed) | |||
| { | |||
| if (i % 9 == 6) | |||
| continue; | |||
| } | |||
| if (!info.support_image_storage || !info.support_image_fp16_storage) | |||
| { | |||
| if (i % 9 == 7) | |||
| continue; | |||
| } | |||
| if (!info.support_image_storage || !info.support_image_fp16_storage || !info.support_image_fp16_arithmetic) | |||
| { | |||
| if (i % 9 == 8) | |||
| continue; | |||
| } | |||
| @@ -1606,6 +1761,214 @@ int VulkanDevice::init_device_extension() | |||
| return 0; | |||
| } | |||
| int VulkanDevice::create_utility_operator() | |||
| { | |||
|     // one Option per utility operator slot | |||
|     // 0 = fp32, 1 = fp16p, 2 = fp16s, 3 = image, 4 = image_fp16p, 5 = image_fp16s | |||
|     Option opt[6]; | |||
|     for (int k = 0; k < 6; k++) | |||
|     { | |||
|         opt[k].use_fp16_packed = (k == 1 || k == 2); | |||
|         opt[k].use_fp16_storage = (k == 2); | |||
|         opt[k].use_image_storage = (k >= 3); | |||
|         opt[k].use_image_fp16_packed = (k >= 4); | |||
|         opt[k].use_image_fp16_storage = (k == 5); | |||
|         opt[k].use_shader_pack8 = true; | |||
|     } | |||
|     for (int slot = 0; slot < 6; slot++) | |||
|     { | |||
|         // every slot starts empty; slots the device cannot serve stay 0 | |||
|         uop_cast_float32_to_float16[slot] = 0; | |||
|         uop_cast_float16_to_float32[slot] = 0; | |||
|         uop_packing_pack1[slot] = 0; | |||
|         uop_packing_pack4[slot] = 0; | |||
|         uop_packing_pack8[slot] = 0; | |||
|         // slot 0 (fp32 buffer) is always built, the others depend on device features | |||
|         bool supported = true; | |||
|         switch (slot) | |||
|         { | |||
|         case 1: | |||
|             supported = info.support_fp16_packed; | |||
|             break; | |||
|         case 2: | |||
|             supported = info.support_fp16_storage; | |||
|             break; | |||
|         case 3: | |||
|             supported = info.support_image_storage; | |||
|             break; | |||
|         case 4: | |||
|             supported = info.support_image_storage && info.support_image_fp16_packed; | |||
|             break; | |||
|         case 5: | |||
|             supported = info.support_image_storage && info.support_image_fp16_storage; | |||
|             break; | |||
|         default: | |||
|             break; | |||
|         } | |||
|         if (!supported) | |||
|             continue; | |||
|         // the five operators of this slot, in creation order | |||
|         ncnn::Layer** ops[5] = { | |||
|             &uop_cast_float32_to_float16[slot], | |||
|             &uop_cast_float16_to_float32[slot], | |||
|             &uop_packing_pack1[slot], | |||
|             &uop_packing_pack4[slot], | |||
|             &uop_packing_pack8[slot] | |||
|         }; | |||
|         // cast operators take params (0, 1): (1, 2) for float32 -> float16, (2, 1) for the reverse | |||
|         const int cast_params[2][2] = { { 1, 2 }, { 2, 1 } }; | |||
|         for (int c = 0; c < 2; c++) | |||
|         { | |||
|             *ops[c] = ncnn::create_layer(ncnn::LayerType::Cast); | |||
|             (*ops[c])->vkdev = this; | |||
|             ncnn::ParamDict pd; | |||
|             pd.set(0, cast_params[c][0]); | |||
|             pd.set(1, cast_params[c][1]); | |||
|             (*ops[c])->load_param(pd); | |||
|         } | |||
|         // packing operators take param 0 = 1, 4 and 8 respectively | |||
|         const int packing_params[3] = { 1, 4, 8 }; | |||
|         for (int p = 0; p < 3; p++) | |||
|         { | |||
|             ncnn::Layer*& packing = *ops[2 + p]; | |||
|             packing = ncnn::create_layer(ncnn::LayerType::Packing); | |||
|             packing->vkdev = this; | |||
|             ncnn::ParamDict pd; | |||
|             pd.set(0, packing_params[p]); | |||
|             packing->load_param(pd); | |||
|         } | |||
|         // pipelines are created only after all five layers of the slot exist | |||
|         for (int j = 0; j < 5; j++) | |||
|         { | |||
|             (*ops[j])->create_pipeline(opt[slot]); | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| void VulkanDevice::destroy_utility_operator() | |||
| { | |||
|     // rebuild the per-slot options that are passed to destroy_pipeline | |||
|     // 0 = fp32, 1 = fp16p, 2 = fp16s, 3 = image, 4 = image_fp16p, 5 = image_fp16s | |||
|     Option opt[6]; | |||
|     for (int k = 0; k < 6; k++) | |||
|     { | |||
|         opt[k].use_fp16_packed = (k == 1 || k == 2); | |||
|         opt[k].use_fp16_storage = (k == 2); | |||
|         opt[k].use_image_storage = (k >= 3); | |||
|         opt[k].use_image_fp16_packed = (k >= 4); | |||
|         opt[k].use_image_fp16_storage = (k == 5); | |||
|         opt[k].use_shader_pack8 = true; | |||
|     } | |||
|     for (int i = 0; i < 6; i++) | |||
|     { | |||
|         // slots the device does not support were never populated | |||
|         if (i == 1 && !info.support_fp16_packed) | |||
|             continue; | |||
|         if (i == 2 && !info.support_fp16_storage) | |||
|             continue; | |||
|         if (i == 3 && !info.support_image_storage) | |||
|             continue; | |||
|         if (i == 4 && (!info.support_image_storage || !info.support_image_fp16_packed)) | |||
|             continue; | |||
|         if (i == 5 && (!info.support_image_storage || !info.support_image_fp16_storage)) | |||
|             continue; | |||
|         // the five operators of a slot are created together, so an empty | |||
|         // first entry means the slot is empty or was already destroyed | |||
|         if (!uop_cast_float32_to_float16[i]) | |||
|             continue; | |||
|         uop_cast_float32_to_float16[i]->destroy_pipeline(opt[i]); | |||
|         uop_cast_float16_to_float32[i]->destroy_pipeline(opt[i]); | |||
|         uop_packing_pack1[i]->destroy_pipeline(opt[i]); | |||
|         uop_packing_pack4[i]->destroy_pipeline(opt[i]); | |||
|         uop_packing_pack8[i]->destroy_pipeline(opt[i]); | |||
|         delete uop_cast_float32_to_float16[i]; | |||
|         delete uop_cast_float16_to_float32[i]; | |||
|         delete uop_packing_pack1[i]; | |||
|         delete uop_packing_pack4[i]; | |||
|         delete uop_packing_pack8[i]; | |||
|         // reset the members so no dangling pointer survives and a repeated | |||
|         // call cannot destroy or delete the same operator twice | |||
|         uop_cast_float32_to_float16[i] = 0; | |||
|         uop_cast_float16_to_float32[i] = 0; | |||
|         uop_packing_pack1[i] = 0; | |||
|         uop_packing_pack4[i] = 0; | |||
|         uop_packing_pack8[i] = 0; | |||
|     } | |||
| } | |||
| VulkanDevice* get_gpu_device(int device_index) | |||
| { | |||
| if (device_index < 0 || device_index >= g_gpu_count) | |||
| @@ -1630,16 +1993,30 @@ const ShaderInfo& get_shader_info(int shader_type_index) | |||
| return layer_shader_infos[shader_type_index]; | |||
| } | |||
| ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size) | |||
| int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) | |||
| { | |||
| shader_info.specialization_count = 0; | |||
| shader_info.binding_count = 0; | |||
| shader_info.push_constant_count = 0; | |||
| uint32_t parameter_id = -233; | |||
| int specialization_count = 0; | |||
| int binding_count = 0; | |||
| int push_constant_count = 0; | |||
| // id -> binding_type | |||
| std::vector<int> id_types; | |||
| // binding_id -> binding_type | |||
| std::vector<int> binding_types; | |||
| const uint32_t* p = spv_data; | |||
| int bound = p[3]; | |||
| id_types.resize(bound); | |||
| // skip magic version generator bound schema | |||
| p += 5; | |||
| @@ -1668,28 +2045,86 @@ ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size) | |||
| push_constant_count++; | |||
| } | |||
| } | |||
| else if (op == 25) // OpTypeImage | |||
| { | |||
| uint32_t id = p[1]; | |||
| id_types[id] = 2; | |||
| } | |||
| else if (op == 27) // OpTypeSampledImage | |||
| { | |||
| uint32_t id = p[1]; | |||
| id_types[id] = 3; | |||
| } | |||
| else if (op == 32) // OpTypePointer | |||
| { | |||
| uint32_t id = p[1]; | |||
| uint32_t storage_class = p[2]; | |||
| uint32_t type = p[3]; | |||
| if (storage_class == 0) // UniformConstant | |||
| { | |||
| id_types[id] = id_types[type]; | |||
| } | |||
| if (storage_class == 2) // Uniform | |||
| { | |||
| id_types[id] = id_types[type]; | |||
| } | |||
| } | |||
| else if (op == 59) // OpVariable | |||
| { | |||
| uint32_t id = p[1]; | |||
| uint32_t var_id = p[2]; | |||
| uint32_t storage_class = p[3]; | |||
| if (storage_class == 0) // UniformConstant | |||
| { | |||
| id_types[var_id] = id_types[id]; | |||
| } | |||
| if (storage_class == 2) // Uniform | |||
| { | |||
| id_types[var_id] = id_types[id]; | |||
| } | |||
| } | |||
| else if (op == 71) // OpDecorate | |||
| { | |||
| uint32_t id = p[1]; | |||
| uint32_t decoration = p[2]; | |||
| uint32_t binding_id = p[3]; | |||
| if (decoration == 1) // SpecId | |||
| { | |||
| specialization_count++; | |||
| } | |||
| if (decoration == 3) // BufferBlock | |||
| { | |||
| id_types[id] = 1; | |||
| } | |||
| else if (decoration == 33) // Binding | |||
| { | |||
| binding_count++; | |||
| binding_count = std::max(binding_count, (int)binding_id + 1); | |||
| binding_types.resize(binding_count); | |||
| binding_types[binding_id] = id; | |||
| } | |||
| } | |||
| p += wordcount; | |||
| } | |||
| ShaderInfo si; | |||
| si.specialization_count = specialization_count; | |||
| si.binding_count = binding_count; | |||
| si.push_constant_count = push_constant_count; | |||
| if (binding_count > 16) | |||
| { | |||
| fprintf(stderr, "too many binding %d\n", binding_count); | |||
| return -1; | |||
| } | |||
| shader_info.specialization_count = specialization_count; | |||
| shader_info.binding_count = binding_count; | |||
| shader_info.push_constant_count = push_constant_count; | |||
| return si; | |||
| // resolve binding_types | |||
| for (int i=0; i<binding_count; i++) | |||
| { | |||
| shader_info.binding_types[i] = id_types[ binding_types[i] ]; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -100,6 +100,10 @@ public: | |||
| size_t memory_map_alignment; | |||
| size_t buffer_offset_alignment; | |||
| size_t non_coherent_atom_size; | |||
| size_t buffer_image_granularity; | |||
| uint32_t max_image_dimension_1d; | |||
| uint32_t max_image_dimension_2d; | |||
| uint32_t max_image_dimension_3d; | |||
| float timestamp_period; | |||
| // runtime | |||
| @@ -127,6 +131,12 @@ public: | |||
| bool support_int8_storage; | |||
| bool support_int8_arithmetic; | |||
| // image feature | |||
| bool support_image_storage; | |||
| bool support_image_fp16_packed; | |||
| bool support_image_fp16_storage; | |||
| bool support_image_fp16_arithmetic; | |||
| // ycbcr conversion feature | |||
| bool support_ycbcr_conversion; | |||
| @@ -154,6 +164,11 @@ public: | |||
| const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); | |||
| class VkAllocator; | |||
| class VkCompute; | |||
| class VkMat; | |||
| class VkImageMat; | |||
| class Layer; | |||
| class Option; | |||
| class VulkanDevice | |||
| { | |||
| public: | |||
| @@ -188,6 +203,21 @@ public: | |||
| VkAllocator* acquire_staging_allocator() const; | |||
| void reclaim_staging_allocator(VkAllocator* allocator) const; | |||
| // immutable sampler for texelfetch | |||
| const VkSampler* immutable_texelfetch_sampler() const; | |||
| // utility operator | |||
| void cast_float32_to_float16(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void cast_float32_to_float16(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void cast_float16_to_float32(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void cast_float16_to_float32(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack1(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack1(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack4(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack4(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack8(const VkMat& src, VkMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| void packing_pack8(const VkImageMat& src, VkImageMat& dst, VkCompute& cmd, const Option& opt) const; | |||
| // VK_KHR_bind_memory2 | |||
| PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR; | |||
| PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR; | |||
| @@ -234,6 +264,10 @@ protected: | |||
| // device extension | |||
| int init_device_extension(); | |||
| // utility operator | |||
| int create_utility_operator(); | |||
| void destroy_utility_operator(); | |||
| private: | |||
| VkDevice device; | |||
| std::vector<VkShaderModule> shader_modules; | |||
| @@ -251,6 +285,22 @@ private: | |||
| // default staging allocator for each queue | |||
| mutable std::vector<VkAllocator*> staging_allocators; | |||
| mutable Mutex staging_allocator_lock; | |||
| // nearest sampler for texelfetch | |||
| VkSampler texelfetch_sampler; | |||
| // utility operator | |||
| // 0 = fp32 | |||
| // 1 = fp16p | |||
| // 2 = fp16s | |||
| // 3 = image | |||
| // 4 = image_fp16p | |||
| // 5 = image_fp16s | |||
| ncnn::Layer* uop_cast_float32_to_float16[6]; | |||
| ncnn::Layer* uop_cast_float16_to_float32[6]; | |||
| ncnn::Layer* uop_packing_pack1[6]; | |||
| ncnn::Layer* uop_packing_pack4[6]; | |||
| ncnn::Layer* uop_packing_pack8[6]; | |||
| }; | |||
| VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); | |||
| @@ -262,10 +312,16 @@ public: | |||
| int specialization_count; | |||
| int binding_count; | |||
| int push_constant_count; | |||
| // 0 = null | |||
| // 1 = storage buffer | |||
| // 2 = storage image | |||
| // 3 = combined image sampler | |||
| int binding_types[16];// 16 is large enough I think ... | |||
| }; | |||
| const ShaderInfo& get_shader_info(int shader_type_index); | |||
| ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size); | |||
| int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); | |||
| } // namespace ncnn | |||
| @@ -39,6 +39,7 @@ Layer::Layer() | |||
| support_packing = false; | |||
| support_bf16_storage = false; | |||
| support_image_storage = false; | |||
| #if NCNN_VULKAN | |||
| vkdev = 0; | |||
| @@ -137,6 +138,30 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co | |||
| return forward_inplace(top_blob, cmd, opt); | |||
| } | |||
| int Layer::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // default multi-blob image path: only layers that run inplace are handled here | |||
|     if (support_inplace) | |||
|     { | |||
|         // one output per input, each filled through cmd.record_clone | |||
|         top_blobs.resize(bottom_blobs.size()); | |||
|         const int blob_count = (int)top_blobs.size(); | |||
|         for (int b = 0; b < blob_count; b++) | |||
|         { | |||
|             cmd.record_clone(bottom_blobs[b], top_blobs[b], opt); | |||
|         } | |||
|         // the inplace implementation then works on the outputs | |||
|         return forward_inplace(top_blobs, cmd, opt); | |||
|     } | |||
|     return -1; | |||
| } | |||
| int Layer::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // default single-blob image path: only layers that run inplace are handled here | |||
|     if (support_inplace) | |||
|     { | |||
|         // fill the output through cmd.record_clone, then run the inplace implementation on it | |||
|         cmd.record_clone(bottom_blob, top_blob, opt); | |||
|         return forward_inplace(top_blob, cmd, opt); | |||
|     } | |||
|     return -1; | |||
| } | |||
| int Layer::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { | |||
| return -1; | |||
| @@ -146,6 +171,16 @@ int Layer::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const | |||
| { | |||
| return -1; | |||
| } | |||
| int Layer::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { /* base-class default for the multi-blob image inplace path; all arguments are ignored */ | |||
| return -1; /* non-zero result, i.e. not success; the method is virtual so a layer can supply its own version */ | |||
| } | |||
| int Layer::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { /* base-class default for the single-blob image inplace path; all arguments are ignored */ | |||
| return -1; /* non-zero result, i.e. not success; the method is virtual so a layer can supply its own version */ | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| static const layer_registry_entry layer_registry[] = | |||
| @@ -73,6 +73,9 @@ public: | |||
| // accept bf16 | |||
| bool support_bf16_storage; | |||
| // shader image storage | |||
| bool support_image_storage; | |||
| public: | |||
| // implement inference | |||
| // return 0 if success | |||
| @@ -95,11 +98,21 @@ public: | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| // implement inference | |||
| // return 0 if success | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| // implement inplace inference | |||
| // return 0 if success | |||
| virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| // implement inplace inference | |||
| // return 0 if success | |||
| virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| // assigned immediately after creating this layer | |||
| const VulkanDevice* vkdev; | |||
| @@ -22,7 +22,10 @@ Input::Input() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = true; | |||
| support_vulkan = false; | |||
| support_vulkan = true; | |||
| support_packing = true; | |||
| support_bf16_storage = true; | |||
| support_image_storage = true; | |||
| } | |||
| int Input::load_param(const ParamDict& pd) | |||
| @@ -39,4 +42,16 @@ int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons | |||
| return 0; | |||
| } | |||
| #if NCNN_VULKAN | |||
| int Input::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { /* buffer-storage inplace path of the input layer */ | |||
| return 0; /* reports success without touching the blob or recording anything on cmd */ | |||
| } | |||
| int Input::forward_inplace(VkImageMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { /* image-storage inplace path of the input layer */ | |||
| return 0; /* reports success without touching the blob or recording anything on cmd */ | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| } // namespace ncnn | |||
| @@ -28,6 +28,11 @@ public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| #if NCNN_VULKAN | |||
| virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| #endif // NCNN_VULKAN | |||
| public: | |||
| int w; | |||
| int h; | |||
| @@ -23,6 +23,7 @@ Noop::Noop() | |||
| support_inplace = true; | |||
| support_vulkan = true; | |||
| support_packing = true; | |||
| support_image_storage = true; | |||
| } | |||
| int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const | |||
| @@ -35,6 +36,11 @@ int Noop::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& / | |||
| { | |||
| return 0; | |||
| } | |||
| int Noop::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { /* image-storage inplace path of the no-op layer */ | |||
| return 0; /* reports success; the blobs are left as they are and nothing is recorded on cmd */ | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| } // namespace ncnn | |||
| @@ -28,6 +28,7 @@ public: | |||
| #if NCNN_VULKAN | |||
| virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| #endif // NCNN_VULKAN | |||
| }; | |||
| @@ -25,6 +25,7 @@ Split::Split() | |||
| support_vulkan = true; | |||
| support_packing = true; | |||
| support_bf16_storage = true; | |||
| support_image_storage = true; | |||
| } | |||
| int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const | |||
| @@ -41,8 +42,6 @@ int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b | |||
| #if NCNN_VULKAN | |||
| int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { | |||
| // fprintf(stderr, "Split::forward %p\n", bottom_blobs[0].buffer()); | |||
| const VkMat& bottom_blob = bottom_blobs[0]; | |||
| for (size_t i=0; i<top_blobs.size(); i++) | |||
| { | |||
| @@ -51,6 +50,17 @@ int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& t | |||
| return 0; | |||
| } | |||
| int Split::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const | |||
| { | |||
|     // every output is assigned the first input blob; nothing is recorded on the command | |||
|     const size_t top_count = top_blobs.size(); | |||
|     for (size_t t = 0; t != top_count; ++t) | |||
|     { | |||
|         top_blobs[t] = bottom_blobs[0]; | |||
|     } | |||
|     return 0; | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| } // namespace ncnn | |||
| @@ -28,6 +28,7 @@ public: | |||
| #if NCNN_VULKAN | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| #endif // NCNN_VULKAN | |||
| public: | |||
| @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(AbsVal_vulkan) | |||
| AbsVal_vulkan::AbsVal_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_absval = 0; | |||
| pipeline_absval_pack4 = 0; | |||
| @@ -39,7 +40,19 @@ int AbsVal_vulkan::create_pipeline(const Option& opt) | |||
| if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| size_t elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| @@ -148,4 +161,28 @@ int AbsVal_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const | |||
| return 0; | |||
| } | |||
| int AbsVal_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const | |||
| { | |||
|     // pick the pipeline variant that matches the blob's element packing | |||
|     const Pipeline* pipeline = pipeline_absval; | |||
|     if (bottom_top_blob.elempack == 8) | |||
|     { | |||
|         pipeline = pipeline_absval_pack8; | |||
|     } | |||
|     else if (bottom_top_blob.elempack == 4) | |||
|     { | |||
|         pipeline = pipeline_absval_pack4; | |||
|     } | |||
|     // the same image is bound twice, at binding 0 and binding 1 | |||
|     std::vector<VkImageMat> bindings(2); | |||
|     for (size_t k = 0; k < bindings.size(); k++) | |||
|     { | |||
|         bindings[k] = bottom_top_blob; | |||
|     } | |||
|     // push constants: dims, w, h, c, then a cstep slot that is always 0 on this path | |||
|     std::vector<vk_constant_type> constants(5); | |||
|     constants[0].i = bottom_top_blob.dims; | |||
|     constants[1].i = bottom_top_blob.w; | |||
|     constants[2].i = bottom_top_blob.h; | |||
|     constants[3].i = bottom_top_blob.c; | |||
|     constants[4].i = 0; | |||
|     cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using AbsVal::forward_inplace; | |||
| virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| Pipeline* pipeline_absval; | |||
| @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Cast_vulkan) | |||
| Cast_vulkan::Cast_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_cast_fp32_to_fp16 = 0; | |||
| pipeline_cast_fp32_to_fp16_pack4 = 0; | |||
| @@ -49,7 +50,22 @@ int Cast_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -285,4 +301,102 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c | |||
| return 0; | |||
| } | |||
| int Cast_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // same source and target type: the output is simply assigned the input | |||
|     if (type_from == type_to) | |||
|     { | |||
|         top_blob = bottom_blob; | |||
|         return 0; | |||
|     } | |||
|     const int in_w = bottom_blob.w; | |||
|     const int in_h = bottom_blob.h; | |||
|     const int in_c = bottom_blob.c; | |||
|     const int in_dims = bottom_blob.dims; | |||
|     const int elempack = bottom_blob.elempack; | |||
|     // output element size; stays at the input's elemsize for target types not listed below | |||
|     size_t out_elemsize = bottom_blob.elemsize; | |||
|     switch (type_to) | |||
|     { | |||
|     case 1: | |||
|         // float32 | |||
|         out_elemsize = 4 * elempack; | |||
|         break; | |||
|     case 2: | |||
|         // float16 | |||
|         if (opt.use_image_fp16_storage) | |||
|         { | |||
|             out_elemsize = 2 * elempack; | |||
|         } | |||
|         else if (opt.use_image_fp16_packed) | |||
|         { | |||
|             // fp16 packed without fp16 storage: pack1 keeps a 4-byte element | |||
|             out_elemsize = 2 * elempack; | |||
|             if (elempack == 8) out_elemsize = 8*2u; | |||
|             if (elempack == 4) out_elemsize = 4*2u; | |||
|             if (elempack == 1) out_elemsize = 4u; | |||
|         } | |||
|         else | |||
|         { | |||
|             // fallback to fp32 :( | |||
|             out_elemsize = 4 * elempack; | |||
|         } | |||
|         break; | |||
|     case 3: | |||
|         // int8 | |||
|         out_elemsize = elempack; | |||
|         break; | |||
|     default: | |||
|         break; | |||
|     } | |||
|     // allocate the output with the input's shape and packing | |||
|     switch (in_dims) | |||
|     { | |||
|     case 1: | |||
|         top_blob.create(in_w, out_elemsize, elempack, opt.blob_vkallocator); | |||
|         break; | |||
|     case 2: | |||
|         top_blob.create(in_w, in_h, out_elemsize, elempack, opt.blob_vkallocator); | |||
|         break; | |||
|     case 3: | |||
|         top_blob.create(in_w, in_h, in_c, out_elemsize, elempack, opt.blob_vkallocator); | |||
|         break; | |||
|     default: | |||
|         break; | |||
|     } | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     // binding 0 = input image, binding 1 = output image | |||
|     std::vector<VkImageMat> bindings(2); | |||
|     bindings[0] = bottom_blob; | |||
|     bindings[1] = top_blob; | |||
|     // push constants: dims, w, h, c and a zero cstep slot for the input, then the same five for the output | |||
|     std::vector<vk_constant_type> constants(10); | |||
|     constants[0].i = bottom_blob.dims; | |||
|     constants[1].i = bottom_blob.w; | |||
|     constants[2].i = bottom_blob.h; | |||
|     constants[3].i = bottom_blob.c; | |||
|     constants[4].i = 0; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0; | |||
|     // choose the pipeline by cast direction and element packing | |||
|     const Pipeline* pipeline = 0; | |||
|     if (type_from == 1 && type_to == 2) | |||
|     { | |||
|         if (elempack == 8) | |||
|             pipeline = pipeline_cast_fp32_to_fp16_pack8; | |||
|         else if (elempack == 4) | |||
|             pipeline = pipeline_cast_fp32_to_fp16_pack4; | |||
|         else | |||
|             pipeline = pipeline_cast_fp32_to_fp16; | |||
|     } | |||
|     if (type_from == 2 && type_to == 1) | |||
|     { | |||
|         if (elempack == 8) | |||
|             pipeline = pipeline_cast_fp16_to_fp32_pack8; | |||
|         else if (elempack == 4) | |||
|             pipeline = pipeline_cast_fp16_to_fp32_pack4; | |||
|         else | |||
|             pipeline = pipeline_cast_fp16_to_fp32; | |||
|     } | |||
|     // TODO more cast type | |||
|     // NOTE(review): for any other type pair pipeline is still 0 here - confirm record_pipeline tolerates that | |||
|     cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using Cast::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| Pipeline* pipeline_cast_fp32_to_fp16; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Concat_vulkan) | |||
| Concat_vulkan::Concat_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| packing_pack4 = 0; | |||
| packing_pack8 = 0; | |||
| @@ -77,7 +78,19 @@ int Concat_vulkan::create_pipeline(const Option& opt) | |||
| } | |||
| size_t elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| @@ -761,4 +774,483 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| return 0; | |||
| } | |||
| int Concat_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { /* Image-storage (VkImageMat) concat: creates top_blobs[0] and records one dispatch per input into cmd, placing each input at a running offset along the concat axis. Returns 0 once recording is done (also when no dims/axis case below matches), or -100 when the output or workspace image comes back empty after create(). */ | |||
| int dims = bottom_blobs[0].dims; | |||
| if (dims == 1) // axis == 0 | |||
| { | |||
| // concat vector: inputs are appended one after another along w | |||
| // total length in scalar elements (w * elempack summed over inputs); elemsize/elempack end up as the minimum over all inputs | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| int top_w = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| elemsize = std::min(elemsize, bottom_blob.elemsize); | |||
| elempack = std::min(elempack, bottom_blob.elempack); | |||
| top_w += bottom_blob.w * bottom_blob.elempack; | |||
| } | |||
| int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1; /* widest packing that divides the total length; 8 only when use_shader_pack8 is set */ | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) /* in this mode the output element size is fixed per packing instead of being derived from the inputs */ | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkImageMat top_blob_unpacked = top_blob; /* target the inputs are written into; starts as a copy of top_blob and is replaced by a separate workspace image below when a repack is required */ | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| int woffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob_unpacked; | |||
| std::vector<vk_constant_type> constants(11); /* 0-4 describe the input, 5-9 the write target, 10 is the running offset along the concat axis */ | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob_unpacked.dims; | |||
| constants[6].i = top_blob_unpacked.w; | |||
| constants[7].i = top_blob_unpacked.h; | |||
| constants[8].i = top_blob_unpacked.c; | |||
| constants[9].i = 0;//top_blob_unpacked.cstep; | |||
| constants[10].i = woffset; | |||
| const Pipeline* pipeline = 0; /* selected from (input elempack, common write elempack); [b%2] alternates between two pipeline variants per input - NOTE(review): the reason for two variants is not visible here, confirm against create_pipeline */ | |||
| if (bottom_blob.elempack == 1 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack4to1[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 8) | |||
| { | |||
| pipeline = pipeline_concat_pack8[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack8to4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack8to1[b%2]; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| woffset += bottom_blob.w * bottom_blob.elempack / elempack; /* advance in units of the write target's elempack */ | |||
| } | |||
| // packing: convert the elempack-wide intermediate into the out_elempack-wide top_blob | |||
| if (elempack < out_elempack) | |||
| { | |||
| const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4; | |||
| packing->forward(top_blob_unpacked, top_blob, cmd, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| if (dims == 2 && axis == 0) | |||
| { | |||
| // concat image: inputs are stacked along h, width is taken from the first input | |||
| int w = bottom_blobs[0].w; | |||
| // total height in scalar rows (h * elempack summed over inputs); elemsize/elempack end up as the minimum over all inputs | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| int top_h = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| elemsize = std::min(elemsize, bottom_blob.elemsize); | |||
| elempack = std::min(elempack, bottom_blob.elempack); | |||
| top_h += bottom_blob.h * bottom_blob.elempack; | |||
| } | |||
| int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkImageMat top_blob_unpacked = top_blob; | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| int hoffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob_unpacked; | |||
| std::vector<vk_constant_type> constants(11); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob_unpacked.dims; | |||
| constants[6].i = top_blob_unpacked.w; | |||
| constants[7].i = top_blob_unpacked.h; | |||
| constants[8].i = top_blob_unpacked.c; | |||
| constants[9].i = 0;//top_blob_unpacked.cstep; | |||
| constants[10].i = hoffset; | |||
| const Pipeline* pipeline = 0; | |||
| if (bottom_blob.elempack == 1 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack4to1[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 8) | |||
| { | |||
| pipeline = pipeline_concat_pack8[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack8to4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack8to1[b%2]; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| hoffset += bottom_blob.h * bottom_blob.elempack / elempack; | |||
| } | |||
| // packing: convert the elempack-wide intermediate into the out_elempack-wide top_blob | |||
| if (elempack < out_elempack) | |||
| { | |||
| const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4; | |||
| packing->forward(top_blob_unpacked, top_blob, cmd, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| if (dims == 2 && axis == 1) | |||
| { | |||
| // interleave image row: inputs are placed side by side along w; h, elemsize and elempack come from the first input only, so no repack step is needed | |||
| int h = bottom_blobs[0].h; | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| // total width | |||
| int top_w = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| top_w += bottom_blob.w; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int woffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(11); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = woffset; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] | |||
| : elempack == 4 ? pipeline_concat_pack4[b%2] | |||
| : pipeline_concat[b%2]; | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| woffset += bottom_blob.w; | |||
| } | |||
| return 0; | |||
| } | |||
| if (dims == 3 && axis == 0) | |||
| { | |||
| // concat dim: inputs are stacked along the channel axis; w and h are taken from the first input | |||
| int w = bottom_blobs[0].w; | |||
| int h = bottom_blobs[0].h; | |||
| // total channels in scalar channels (c * elempack summed over inputs); elemsize/elempack end up as the minimum over all inputs | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| int top_channels = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| elemsize = std::min(elemsize, bottom_blob.elemsize); | |||
| elempack = std::min(elempack, bottom_blob.elempack); | |||
| top_channels += bottom_blob.c * bottom_blob.elempack; | |||
| } | |||
| int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkImageMat top_blob_unpacked = top_blob; | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| int coffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob_unpacked; | |||
| std::vector<vk_constant_type> constants(11); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob_unpacked.dims; | |||
| constants[6].i = top_blob_unpacked.w; | |||
| constants[7].i = top_blob_unpacked.h; | |||
| constants[8].i = top_blob_unpacked.c; | |||
| constants[9].i = 0;//top_blob_unpacked.cstep; | |||
| constants[10].i = coffset; | |||
| const Pipeline* pipeline = 0; | |||
| if (bottom_blob.elempack == 1 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 4 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack4to1[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 8) | |||
| { | |||
| pipeline = pipeline_concat_pack8[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 4) | |||
| { | |||
| pipeline = pipeline_concat_pack8to4[b%2]; | |||
| } | |||
| else if (bottom_blob.elempack == 8 && elempack == 1) | |||
| { | |||
| pipeline = pipeline_concat_pack8to1[b%2]; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| coffset += bottom_blob.c * bottom_blob.elempack / elempack; | |||
| } | |||
| // packing: convert the elempack-wide intermediate into the out_elempack-wide top_blob | |||
| if (elempack < out_elempack) | |||
| { | |||
| const Layer* packing = out_elempack == 8 ? packing_pack8 : packing_pack4; | |||
| packing->forward(top_blob_unpacked, top_blob, cmd, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| if (dims == 3 && axis == 1) | |||
| { | |||
| // interleave dim height: inputs are stacked along h; w, c, elemsize and elempack come from the first input only | |||
| int w = bottom_blobs[0].w; | |||
| int channels = bottom_blobs[0].c; | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| // total height | |||
| int top_h = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| top_h += bottom_blob.h; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int hoffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(11); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = hoffset; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] | |||
| : elempack == 4 ? pipeline_concat_pack4[b%2] | |||
| : pipeline_concat[b%2]; | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| hoffset += bottom_blob.h; | |||
| } | |||
| return 0; | |||
| } | |||
| if (dims == 3 && axis == 2) | |||
| { | |||
| // interleave dim width: inputs are placed side by side along w; h, c, elemsize and elempack come from the first input only | |||
| int h = bottom_blobs[0].h; | |||
| int channels = bottom_blobs[0].c; | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int elempack = bottom_blobs[0].elempack; | |||
| // total width | |||
| int top_w = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| top_w += bottom_blob.w; | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| int woffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[b]; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(11); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = woffset; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b%2] | |||
| : elempack == 4 ? pipeline_concat_pack4[b%2] | |||
| : pipeline_concat[b%2]; | |||
| cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); | |||
| woffset += bottom_blob.w; | |||
| } | |||
| return 0; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using Concat::forward; | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* packing_pack4; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Convolution_vulkan) | |||
| Convolution_vulkan::Convolution_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| padding = 0; | |||
| @@ -149,7 +150,22 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -210,22 +226,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); | |||
| if (opt.use_image_storage) | |||
| { | |||
| Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); | |||
| local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); | |||
| local_size_xyz_local.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| pipeline_convolution_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output)); | |||
| std::vector<vk_specialization_type> specializations(4 + 8); | |||
| specializations[0].i = bias_term; | |||
| specializations[1].i = activation_type; | |||
| specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[4 + 0].i = shape_bordered_packed.dims; | |||
| specializations[4 + 1].i = shape_bordered_packed.cstep / 4; | |||
| specializations[4 + 2].i = shape_bordered_packed.c; | |||
| specializations[4 + 3].i = shape_bordered_packed.cstep / 4; | |||
| specializations[4 + 4].i = out_shape_packed.dims; | |||
| specializations[4 + 5].i = out_shape_packed.cstep / 4; | |||
| specializations[4 + 6].i = out_shape_packed.c; | |||
| specializations[4 + 7].i = out_shape_packed.cstep / 4; | |||
| } | |||
| pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1, opt, specializations); | |||
| } | |||
| else | |||
| @@ -242,22 +257,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| pipeline_convolution_pack4_1x1s1d1 = new Pipeline(vkdev); | |||
| if (opt.use_image_storage) | |||
| { | |||
| Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); | |||
| local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); | |||
| local_size_xyz_local.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| pipeline_convolution_pack4_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_pack4_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 4)); | |||
| std::vector<vk_specialization_type> specializations(4 + 8); | |||
| specializations[0].i = bias_term; | |||
| specializations[1].i = activation_type; | |||
| specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[4 + 0].i = shape_bordered_packed.dims; | |||
| specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h; | |||
| specializations[4 + 2].i = shape_bordered_packed.c; | |||
| specializations[4 + 3].i = shape_bordered_packed.cstep; | |||
| specializations[4 + 4].i = out_shape_packed.dims; | |||
| specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h; | |||
| specializations[4 + 6].i = out_shape_packed.c; | |||
| specializations[4 + 7].i = out_shape_packed.cstep; | |||
| } | |||
| pipeline_convolution_pack4_1x1s1d1->create(LayerShaderType::convolution_pack4_1x1s1d1, opt, specializations); | |||
| } | |||
| else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) | |||
| @@ -419,22 +433,21 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| pipeline_convolution_pack8_1x1s1d1 = new Pipeline(vkdev); | |||
| if (opt.use_image_storage) | |||
| { | |||
| Mat local_size_xyz_local(4, 4, std::min(4, num_output / out_elempack), (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| local_size_xyz_local.w = std::max(1, std::min(4, (out_shape_packed.w + 1) / 2)); | |||
| local_size_xyz_local.h = std::max(1, std::min(4, (out_shape_packed.h + 1) / 2)); | |||
| local_size_xyz_local.c = std::min(4, out_shape_packed.c); | |||
| } | |||
| pipeline_convolution_pack8_1x1s1d1->set_optimal_local_size_xyz(local_size_xyz_local); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_pack8_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 8)); | |||
| std::vector<vk_specialization_type> specializations(4 + 8); | |||
| specializations[0].i = bias_term; | |||
| specializations[1].i = activation_type; | |||
| specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[4 + 0].i = shape_bordered_packed.dims; | |||
| specializations[4 + 1].i = shape_bordered_packed.w * shape_bordered_packed.h; | |||
| specializations[4 + 2].i = shape_bordered_packed.c; | |||
| specializations[4 + 3].i = shape_bordered_packed.cstep; | |||
| specializations[4 + 4].i = out_shape_packed.dims; | |||
| specializations[4 + 5].i = out_shape_packed.w * out_shape_packed.h; | |||
| specializations[4 + 6].i = out_shape_packed.c; | |||
| specializations[4 + 7].i = out_shape_packed.cstep; | |||
| } | |||
| pipeline_convolution_pack8_1x1s1d1->create(LayerShaderType::convolution_pack8_1x1s1d1, opt, specializations); | |||
| } | |||
| else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) | |||
| @@ -695,6 +708,21 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) | |||
| int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (padding) | |||
| { | |||
| padding->upload_model(cmd, opt); | |||
| } | |||
| if (winograd_padding) | |||
| { | |||
| winograd_padding->upload_model(cmd, opt); | |||
| } | |||
| if (winograd_crop) | |||
| { | |||
| winograd_crop->upload_model(cmd, opt); | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| int num_input = weight_data_size / maxk / num_output; | |||
| @@ -738,8 +766,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| } | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| @@ -862,7 +896,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt); | |||
| } | |||
| } | |||
| } | |||
| @@ -952,7 +993,14 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_pack8_tm, weight_data_gpu_pack8_tm, opt); | |||
| } | |||
| } | |||
| } | |||
| @@ -961,7 +1009,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| if (innerproduct) | |||
| @@ -1070,6 +1129,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) | |||
| { | |||
| // winograd23 | |||
| @@ -1353,7 +1413,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -1364,19 +1423,21 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| bindings[2] = weight_data_gpu; | |||
| bindings[3] = bias_term ? bias_data_gpu : bindings[2];// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w; | |||
| constants[2].i = bottom_blob_bordered.h; | |||
| constants[3].i = bottom_blob_bordered.c; | |||
| constants[4].i = bottom_blob_bordered.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = top_blob.cstep; | |||
| // record | |||
| if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.cstep / 4; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep / 4; | |||
| constants[4].i = top_blob.dims; | |||
| constants[5].i = top_blob.cstep / 4; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep / 4; | |||
| VkMat dispatcher; | |||
| dispatcher.w = top_blob.cstep / 4; | |||
| dispatcher.h = 1; | |||
| @@ -1386,16 +1447,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.dims; | |||
| constants[5].i = top_blob.w * top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = 1; | |||
| @@ -1405,16 +1456,6 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w * bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.dims; | |||
| constants[5].i = top_blob.w * top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = 1; | |||
| @@ -1424,18 +1465,484 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| else | |||
| { | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w; | |||
| constants[2].i = bottom_blob_bordered.h; | |||
| constants[3].i = bottom_blob_bordered.c; | |||
| constants[4].i = bottom_blob_bordered.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = top_blob.cstep; | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_convolution; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_convolution_pack4; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_convolution_pack1to4; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_convolution_pack4to1; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_convolution_pack8; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_convolution_pack1to8; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_convolution_pack4to8; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_convolution_pack8to4; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_convolution_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| // flattened blob, implement as InnerProduct | |||
| if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) | |||
| { | |||
| int num_input = weight_data_size / num_output; | |||
| if (bottom_blob.w * bottom_blob.elempack == num_input) | |||
| { | |||
| return innerproduct->forward(bottom_blob, top_blob, cmd, opt); | |||
| } | |||
| } | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| VkImageMat bottom_blob_bordered = bottom_blob; | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); | |||
| } | |||
| else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) | |||
| { | |||
| int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad / 2; | |||
| padding_params[1] = hpad - hpad / 2; | |||
| padding_params[2] = wpad / 2; | |||
| padding_params[3] = wpad - wpad / 2; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| } | |||
| else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) | |||
| { | |||
| int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad - hpad / 2; | |||
| padding_params[1] = hpad / 2; | |||
| padding_params[2] = wpad - wpad / 2; | |||
| padding_params[3] = wpad / 2; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| } | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| if (elempack == 4 && out_elempack == 4 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) | |||
| { | |||
| // winograd23 | |||
| int outw_bordered = (outw + 1) / 2 * 2; | |||
| int outh_bordered = (outh + 1) / 2 * 2; | |||
| int w_bordered = outw_bordered + 2; | |||
| int h_bordered = outh_bordered + 2; | |||
| int block_x = outw_bordered / 2; | |||
| int block_y = outh_bordered / 2; | |||
| // pad to 2n+2 | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = 0; | |||
| padding_params[1] = h_bordered - bottom_blob_bordered.h; | |||
| padding_params[2] = 0; | |||
| padding_params[3] = w_bordered - bottom_blob_bordered.w; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob_bordered; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| // transform input | |||
| VkImageMat bottom_tm_blob; | |||
| { | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (bottom_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = bottom_tm_blob; | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = 0;//bottom_blob_bordered.cstep; | |||
| constants[4].i = 0;//bottom_tm_blob.cstep; | |||
| constants[5].i = block_x; | |||
| constants[6].i = block_y; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = bottom_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); | |||
| } | |||
| // gemm | |||
| VkImageMat top_tm_blob; | |||
| { | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| if (top_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(3); | |||
| bindings[0] = bottom_tm_blob; | |||
| bindings[1] = top_tm_blob; | |||
| bindings[2] = weight_data_gpu_pack4_tm_image; | |||
| std::vector<vk_constant_type> constants(5); | |||
| constants[0].i = bottom_tm_blob.c; | |||
| constants[1].i = 0;//bottom_tm_blob.cstep; | |||
| constants[2].i = top_tm_blob.h; | |||
| constants[3].i = top_tm_blob.c; | |||
| constants[4].i = 0;//top_tm_blob.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = top_tm_blob.w; | |||
| dispatcher.h = (top_tm_blob.h + 3) / 4; | |||
| dispatcher.c = top_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher); | |||
| } | |||
| // transform output | |||
| VkImageMat top_blob_bordered; | |||
| { | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(3); | |||
| bindings[0] = top_tm_blob; | |||
| bindings[1] = top_blob_bordered; | |||
| bindings[2] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = top_tm_blob.c; | |||
| constants[1].i = 0;//top_tm_blob.cstep; | |||
| constants[2].i = block_x; | |||
| constants[3].i = block_y; | |||
| constants[4].i = top_blob_bordered.w; | |||
| constants[5].i = top_blob_bordered.h; | |||
| constants[6].i = 0;//top_blob_bordered.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = top_blob_bordered.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher); | |||
| } | |||
| // crop top_blob | |||
| { | |||
| VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| crop_params[0] = 0; | |||
| crop_params[1] = 0; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = outw; | |||
| crop_params[4] = outh; | |||
| crop_params[5] = num_output; | |||
| std::vector<VkImageMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkImageMat> crop_outputs(1); | |||
| winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| } | |||
| return 0; | |||
| } | |||
| if (elempack == 8 && out_elempack == 8 && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) | |||
| { | |||
| // winograd23 | |||
| int outw_bordered = (outw + 1) / 2 * 2; | |||
| int outh_bordered = (outh + 1) / 2 * 2; | |||
| int w_bordered = outw_bordered + 2; | |||
| int h_bordered = outh_bordered + 2; | |||
| int block_x = outw_bordered / 2; | |||
| int block_y = outh_bordered / 2; | |||
| // pad to 2n+2 | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = 0; | |||
| padding_params[1] = h_bordered - bottom_blob_bordered.h; | |||
| padding_params[2] = 0; | |||
| padding_params[3] = w_bordered - bottom_blob_bordered.w; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob_bordered; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| // transform input | |||
| VkImageMat bottom_tm_blob; | |||
| { | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (bottom_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = bottom_tm_blob; | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = 0;//bottom_blob_bordered.cstep; | |||
| constants[4].i = 0;//bottom_tm_blob.cstep; | |||
| constants[5].i = block_x; | |||
| constants[6].i = block_y; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = bottom_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); | |||
| } | |||
| // gemm | |||
| VkImageMat top_tm_blob; | |||
| { | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| if (top_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(3); | |||
| bindings[0] = bottom_tm_blob; | |||
| bindings[1] = top_tm_blob; | |||
| bindings[2] = weight_data_gpu_pack8_tm_image; | |||
| std::vector<vk_constant_type> constants(5); | |||
| constants[0].i = bottom_tm_blob.c; | |||
| constants[1].i = 0;//bottom_tm_blob.cstep; | |||
| constants[2].i = top_tm_blob.h; | |||
| constants[3].i = top_tm_blob.c; | |||
| constants[4].i = 0;//top_tm_blob.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = top_tm_blob.w; | |||
| dispatcher.h = (top_tm_blob.h + 3) / 4; | |||
| dispatcher.c = top_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher); | |||
| } | |||
| // transform output | |||
| VkImageMat top_blob_bordered; | |||
| { | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(3); | |||
| bindings[0] = top_tm_blob; | |||
| bindings[1] = top_blob_bordered; | |||
| bindings[2] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = top_tm_blob.c; | |||
| constants[1].i = 0;//top_tm_blob.cstep; | |||
| constants[2].i = block_x; | |||
| constants[3].i = block_y; | |||
| constants[4].i = top_blob_bordered.w; | |||
| constants[5].i = top_blob_bordered.h; | |||
| constants[6].i = 0;//top_blob_bordered.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = top_blob_bordered.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output, bindings, constants, dispatcher); | |||
| } | |||
| // crop top_blob | |||
| { | |||
| VkImageMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| crop_params[0] = 0; | |||
| crop_params[1] = 0; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = outw; | |||
| crop_params[4] = outh; | |||
| crop_params[5] = num_output; | |||
| std::vector<VkImageMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkImageMat> crop_outputs(1); | |||
| winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| } | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w; | |||
| constants[2].i = bottom_blob_bordered.h; | |||
| constants[3].i = bottom_blob_bordered.c; | |||
| constants[4].i = 0;//bottom_blob_bordered.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| // record | |||
| if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = top_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); | |||
| } | |||
| else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = top_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher); | |||
| } | |||
| else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = top_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack8_1x1s1d1, bindings, constants, dispatcher); | |||
| } | |||
| else | |||
| { | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| @@ -31,6 +31,7 @@ public: | |||
| using Convolution::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* padding; | |||
| @@ -38,6 +39,9 @@ public: | |||
| VkMat weight_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| VkImageMat weight_data_gpu_image; | |||
| VkImageMat bias_data_gpu_image; | |||
| Pipeline* pipeline_convolution; | |||
| Pipeline* pipeline_convolution_1x1s1d1; | |||
| Pipeline* pipeline_convolution_pack4; | |||
| @@ -55,12 +59,14 @@ public: | |||
| ncnn::Layer* winograd_padding; | |||
| ncnn::Layer* winograd_crop; | |||
| VkMat weight_data_gpu_pack4_tm; | |||
| VkImageMat weight_data_gpu_pack4_tm_image; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; | |||
| // pack8 winograd23 | |||
| VkMat weight_data_gpu_pack8_tm; | |||
| VkImageMat weight_data_gpu_pack8_tm_image; | |||
| Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_input; | |||
| Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_gemm; | |||
| Pipeline* pipeline_convolution_pack8_3x3s1d1_winograd23_transform_output; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(ConvolutionDepthWise_vulkan) | |||
| ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| padding = 0; | |||
| packing_unpack = 0; | |||
| @@ -106,7 +107,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -199,7 +215,22 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize_g; | |||
| size_t out_elemsize_g; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize_g = elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize_g = elempack_g * 4u; | |||
| out_elemsize_g = out_elempack_g * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize_g = elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g * 2u; | |||
| @@ -415,6 +446,21 @@ int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) | |||
| int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (padding) | |||
| { | |||
| padding->upload_model(cmd, opt); | |||
| } | |||
| if (packing_unpack) | |||
| { | |||
| packing_unpack->upload_model(cmd, opt); | |||
| } | |||
| if (packing_pack) | |||
| { | |||
| packing_pack->upload_model(cmd, opt); | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| int channels = (weight_data_size / group) / maxk / (num_output / group) * group; | |||
| @@ -430,12 +476,25 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -493,14 +552,32 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack_g); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -730,4 +807,228 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const /* VkImageMat forward pass: pads the input if requested, creates top_blob, then records either the depth-wise pipeline or a grouped pipeline (with optional unpack/pack steps) into cmd. Returns 0 after recording, or -100 when an output image cannot be created. */ | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; /* blob channel count; multiplied by elempack further down to get input elements per group */ | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; /* width spanned by the kernel once dilation is applied */ | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; /* height spanned by the kernel once dilation is applied */ | |||
| VkImageMat bottom_blob_bordered = bottom_blob; /* used unchanged when none of the padding branches below applies */ | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) /* explicit padding: run the padding layer with its output allocated from the workspace allocator */ | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); /* NOTE(review): the return value is not checked */ | |||
| } | |||
| else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) /* all four pad values are the sentinel -233: padding is computed from the input size, smaller half written first */ | |||
| { | |||
| int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; /* total horizontal padding that makes outw below equal (w - 1) / stride_w + 1 */ | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; /* total vertical padding that makes outh below equal (h - 1) / stride_h + 1 */ | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); /* four 4-byte values from the staging allocator, handed to the padding layer as its second input */ | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad / 2; /* presumably the order is top, bottom, left, right; confirm against the padding layer */ | |||
| padding_params[1] = hpad - hpad / 2; | |||
| padding_params[2] = wpad / 2; | |||
| padding_params[3] = wpad - wpad / 2; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); /* NOTE(review): the return value is not checked */ | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| } | |||
| else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) /* all four pad values are the sentinel -234: same totals as the -233 branch, but the larger half is written first */ | |||
| { | |||
| int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; | |||
| int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad - hpad / 2; | |||
| padding_params[1] = hpad / 2; | |||
| padding_params[2] = wpad - wpad / 2; | |||
| padding_params[3] = wpad / 2; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); /* NOTE(review): the return value is not checked */ | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| } | |||
| w = bottom_blob_bordered.w; /* re-read the size, which may have grown through padding */ | |||
| h = bottom_blob_bordered.h; | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; /* 8 only when use_shader_pack8 is set and num_output is a multiple of 8, otherwise 4 or 1 */ | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; /* same bytes per scalar as the input, scaled to the output packing */ | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) /* fp16-packed without fp16-storage: 2 bytes per scalar for pack8 and pack4, 4 bytes for pack1 */ | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // depth-wise: taken when the blob channel count, group and num_output agree in units of elempack; a single pipeline is recorded and the function returns | |||
| if (channels == group / elempack && group / elempack == num_output / elempack) | |||
| { | |||
| std::vector<VkImageMat> bindings(4); /* input, output, weights, bias */ | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); /* dims, w, h, c and a zeroed cstep slot for the input, then the same five for the output */ | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w; | |||
| constants[2].i = bottom_blob_bordered.h; | |||
| constants[3].i = bottom_blob_bordered.c; | |||
| constants[4].i = 0;//bottom_blob_bordered.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 | |||
| : elempack == 4 ? pipeline_convolutiondepthwise_pack4 | |||
| : pipeline_convolutiondepthwise; /* pipeline variant is selected by the input elempack */ | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| const int channels_g = channels * elempack / group; /* input elements per group */ | |||
| const int num_output_g = num_output / group; /* outputs per group */ | |||
| int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1; /* packing usable inside one group on the input side */ | |||
| int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1; /* packing usable inside one group on the output side */ | |||
| size_t out_elemsize_g = elemsize / elempack * out_elempack_g; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) /* same fp16-packed size override as for out_elemsize above */ | |||
| { | |||
| if (out_elempack_g == 8) out_elemsize_g = 8*2u; | |||
| if (out_elempack_g == 4) out_elemsize_g = 4*2u; | |||
| if (out_elempack_g == 1) out_elemsize_g = 4u; | |||
| } | |||
| // unpacking: the input goes through packing_unpack only when its elempack is larger than the per-group elempack_g | |||
| VkImageMat bottom_blob_bordered_unpacked = bottom_blob_bordered; | |||
| if (elempack > elempack_g) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_vkallocator = opt.workspace_vkallocator; | |||
| packing_unpack->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1); /* NOTE(review): the return value is not checked */ | |||
| } | |||
| VkImageMat top_blob_unpacked = top_blob; /* starts as top_blob; replaced by a workspace image when the per-group output packing is narrower than out_elempack */ | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| std::vector<VkImageMat> bindings(4); /* input, output, weights, bias */ | |||
| bindings[0] = bottom_blob_bordered_unpacked; | |||
| bindings[1] = top_blob_unpacked; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); /* same layout as in the depth-wise branch, filled from the unpacked blobs */ | |||
| constants[0].i = bottom_blob_bordered_unpacked.dims; | |||
| constants[1].i = bottom_blob_bordered_unpacked.w; | |||
| constants[2].i = bottom_blob_bordered_unpacked.h; | |||
| constants[3].i = bottom_blob_bordered_unpacked.c; | |||
| constants[4].i = 0;//bottom_blob_bordered_unpacked.cstep; | |||
| constants[5].i = top_blob_unpacked.dims; | |||
| constants[6].i = top_blob_unpacked.w; | |||
| constants[7].i = top_blob_unpacked.h; | |||
| constants[8].i = top_blob_unpacked.c; | |||
| constants[9].i = 0;//top_blob_unpacked.cstep; | |||
| const Pipeline* pipeline = 0; /* the chain below lists all nine (elempack_g, out_elempack_g) pairs drawn from 1, 4 and 8 */ | |||
| if (elempack_g == 1 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack4; | |||
| } | |||
| else if (elempack_g == 1 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack1to4; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack4to1; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack8; | |||
| } | |||
| else if (elempack_g == 1 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack1to8; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack4to8; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack8to4; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_convolutiondepthwise_group_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked); | |||
| // packing: repack into top_blob when the grouped output used a narrower packing than out_elempack, otherwise hand the result over directly | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| packing_pack->forward(top_blob_unpacked, top_blob, cmd, opt); /* NOTE(review): the return value is not checked, so 0 is returned even if this step fails */ | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_unpacked; | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -31,11 +31,15 @@ public: | |||
| using ConvolutionDepthWise::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat weight_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| VkImageMat weight_data_gpu_image; | |||
| VkImageMat bias_data_gpu_image; | |||
| ncnn::Layer* padding; | |||
| ncnn::Layer* packing_unpack; | |||
| ncnn::Layer* packing_pack; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Crop_vulkan) | |||
| Crop_vulkan::Crop_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| packing_pack1 = 0; | |||
| packing_pack4 = 0; | |||
| @@ -104,7 +105,22 @@ int Crop_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -134,7 +150,19 @@ int Crop_vulkan::create_pipeline(const Option& opt) | |||
| if (bottom_shapes.size() == 1 && shape.dims != 0 && elempack == out_elempack && elempack > offset_elempack) | |||
| { | |||
| size_t offset_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| offset_elemsize = offset_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| offset_elemsize = offset_elempack == 1 ? 4u : offset_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| offset_elemsize = offset_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| offset_elemsize = offset_elempack * 2u; | |||
| } | |||
| @@ -598,4 +626,261 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM | |||
| return 0; | |||
| } | |||
| int Crop_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int dims = bottom_blob.dims; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int _woffset, _hoffset, _coffset; | |||
| int _outw, _outh, _outc; | |||
| resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); | |||
| // TODO vec and image crop | |||
| if (dims == 3) | |||
| { | |||
| int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; | |||
| int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| // unpacking | |||
| VkImageMat bottom_blob_unpacked = bottom_blob; | |||
| if (elempack == out_elempack && elempack > offset_elempack) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_vkallocator = opt.workspace_vkallocator; | |||
| const Layer* packing = offset_elempack == 4 ? packing_pack4 : packing_pack1; | |||
| packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); | |||
| } | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob_unpacked; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(13); | |||
| constants[0].i = bottom_blob_unpacked.dims; | |||
| constants[1].i = bottom_blob_unpacked.w; | |||
| constants[2].i = bottom_blob_unpacked.h; | |||
| constants[3].i = bottom_blob_unpacked.c; | |||
| constants[4].i = 0;//bottom_blob_unpacked.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = _woffset; | |||
| constants[11].i = _hoffset; | |||
| constants[12].i = _coffset; | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop; | |||
| } | |||
| else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) | |||
| { | |||
| constants[12].i = _coffset / 4; | |||
| pipeline = pipeline_crop_pack4; | |||
| } | |||
| else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack1to4; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack1to4; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop_pack4to1; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) | |||
| { | |||
| constants[12].i = _coffset / 8; | |||
| pipeline = pipeline_crop_pack8; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack4to8; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack1to8; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack1to8; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack4to8; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack8to4; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| } | |||
| return 0; | |||
| } | |||
| int Crop_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| const VkImageMat& bottom_blob = bottom_blobs[0]; | |||
| const VkImageMat& reference_blob = bottom_blobs[1]; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int dims = bottom_blob.dims; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int _woffset, _hoffset, _coffset; | |||
| int _outw, _outh, _outc; | |||
| if (woffset == -233) | |||
| { | |||
| resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob.mapped(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); | |||
| } | |||
| else | |||
| { | |||
| resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); | |||
| } | |||
| // TODO vec and image crop | |||
| if (dims == 3) | |||
| { | |||
| int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; | |||
| int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| // unpacking | |||
| VkImageMat bottom_blob_unpacked = bottom_blob; | |||
| if (elempack == out_elempack && elempack > offset_elempack) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_vkallocator = opt.workspace_vkallocator; | |||
| const Layer* packing = offset_elempack == 4 ? packing_pack4 : packing_pack1; | |||
| packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); | |||
| } | |||
| VkImageMat& top_blob = top_blobs[0]; | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob_unpacked; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(13); | |||
| constants[0].i = bottom_blob_unpacked.dims; | |||
| constants[1].i = bottom_blob_unpacked.w; | |||
| constants[2].i = bottom_blob_unpacked.h; | |||
| constants[3].i = bottom_blob_unpacked.c; | |||
| constants[4].i = 0;//bottom_blob_unpacked.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = _woffset; | |||
| constants[11].i = _hoffset; | |||
| constants[12].i = _coffset; | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop; | |||
| } | |||
| else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) | |||
| { | |||
| constants[12].i = _coffset / 4; | |||
| pipeline = pipeline_crop_pack4; | |||
| } | |||
| else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack1to4; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack1to4; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop_pack4to1; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) | |||
| { | |||
| constants[12].i = _coffset / 8; | |||
| pipeline = pipeline_crop_pack8; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack4to8; | |||
| } | |||
| else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack1to8; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack1to8; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_crop_pack4to8; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_crop_pack8to4; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_crop_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -32,6 +32,10 @@ public: | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* packing_pack1; | |||
| ncnn::Layer* packing_pack4; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(Deconvolution_vulkan) | |||
| Deconvolution_vulkan::Deconvolution_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| crop = 0; | |||
| output_pad = 0; | |||
| @@ -130,7 +131,22 @@ int Deconvolution_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -316,6 +332,21 @@ int Deconvolution_vulkan::destroy_pipeline(const Option& opt) | |||
| int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (crop) | |||
| { | |||
| crop->upload_model(cmd, opt); | |||
| } | |||
| if (output_pad) | |||
| { | |||
| output_pad->upload_model(cmd, opt); | |||
| } | |||
| if (output_crop) | |||
| { | |||
| output_crop->upload_model(cmd, opt); | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| int num_input = weight_data_size / maxk / num_output; | |||
| @@ -376,14 +407,32 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -582,4 +631,197 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC | |||
| return 0; | |||
| } | |||
| int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| int outw = (w - 1) * stride_w + kernel_extent_w; | |||
| int outh = (h - 1) * stride_h + kernel_extent_h; | |||
| int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| VkImageMat top_blob_bordered; | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob_bordered; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob_bordered.dims; | |||
| constants[6].i = top_blob_bordered.w; | |||
| constants[7].i = top_blob_bordered.h; | |||
| constants[8].i = top_blob_bordered.c; | |||
| constants[9].i = 0;//top_blob_bordered.cstep; | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_deconvolution; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack4; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack1to4; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack4to1; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack8; | |||
| } | |||
| else if (elempack == 1 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack1to8; | |||
| } | |||
| else if (elempack == 4 && out_elempack == 8) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack4to8; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack8to4; | |||
| } | |||
| else if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| pipeline = pipeline_deconvolution_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered); | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| { | |||
| VkImageMat reference_blob; | |||
| reference_blob.dims = 2; | |||
| reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right; | |||
| reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom; | |||
| reference_blob.elempack = 1; | |||
| std::vector<VkImageMat> crop_bottom_blobs(2); | |||
| crop_bottom_blobs[0] = top_blob_bordered_adj; | |||
| crop_bottom_blobs[1] = reference_blob; | |||
| std::vector<VkImageMat> crop_top_blobs(1); | |||
| crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt); | |||
| top_blob = crop_top_blobs[0]; | |||
| } | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else if (output_w > 0 && output_h > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| { | |||
| // onnx padding=SAME_UPPER | |||
| crop_params[0] = wcut / 2; | |||
| crop_params[1] = hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234) | |||
| { | |||
| // onnx padding=SAME_LOWER | |||
| crop_params[0] = wcut - wcut / 2; | |||
| crop_params[1] = hcut - hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| std::vector<VkImageMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered_adj; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkImageMat> crop_outputs(1); | |||
| output_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| output_pad->forward(top_blob_bordered, top_blob, cmd, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -31,11 +31,15 @@ public: | |||
| using Deconvolution::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat weight_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| VkImageMat weight_data_gpu_image; | |||
| VkImageMat bias_data_gpu_image; | |||
| ncnn::Layer* crop; | |||
| ncnn::Layer* output_pad; | |||
| ncnn::Layer* output_crop; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(DeconvolutionDepthWise_vulkan) | |||
| DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| crop = 0; | |||
| output_pad = 0; | |||
| @@ -136,7 +137,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -233,7 +249,22 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize_g; | |||
| size_t out_elemsize_g; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize_g = elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize_g = elempack_g == 1 ? 4u : elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g == 1 ? 4u : out_elempack_g * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize_g = elempack_g * 4u; | |||
| out_elemsize_g = out_elempack_g * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize_g = elempack_g * 2u; | |||
| out_elemsize_g = out_elempack_g * 2u; | |||
| @@ -463,6 +494,31 @@ int DeconvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) | |||
| int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (crop) | |||
| { | |||
| crop->upload_model(cmd, opt); | |||
| } | |||
| if (output_pad) | |||
| { | |||
| output_pad->upload_model(cmd, opt); | |||
| } | |||
| if (output_crop) | |||
| { | |||
| output_crop->upload_model(cmd, opt); | |||
| } | |||
| if (packing_unpack) | |||
| { | |||
| packing_unpack->upload_model(cmd, opt); | |||
| } | |||
| if (packing_pack) | |||
| { | |||
| packing_pack->upload_model(cmd, opt); | |||
| } | |||
| const int maxk = kernel_w * kernel_h; | |||
| int channels = (weight_data_size / group) / maxk / (num_output / group) * group; | |||
| @@ -495,12 +551,25 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o | |||
| cmd.record_upload(weight_data_r2_packed, weight_data_gpu, opt); | |||
| cmd.record_upload(weight_data_r2_packed, weight_data_gpu_image, opt); | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -558,14 +627,32 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& o | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_packed_groups, weight_data_gpu, opt); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack_g); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -936,4 +1023,369 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ | |||
| return 0; | |||
| } | |||
| int DeconvolutionDepthWise_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| int outw = (w - 1) * stride_w + kernel_extent_w; | |||
| int outh = (h - 1) * stride_h + kernel_extent_h; | |||
| int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / elempack * out_elempack; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack == 8) out_elemsize = 8*2u; | |||
| if (out_elempack == 4) out_elemsize = 4*2u; | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| VkImageMat top_blob_bordered; | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| // depth-wise | |||
| if (channels == group / elempack && group / elempack == num_output / elempack) | |||
| { | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob_bordered; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob_bordered.dims; | |||
| constants[6].i = top_blob_bordered.w; | |||
| constants[7].i = top_blob_bordered.h; | |||
| constants[8].i = top_blob_bordered.c; | |||
| constants[9].i = 0;//top_blob_bordered.cstep; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_deconvolutiondepthwise_pack8 | |||
| : elempack == 4 ? pipeline_deconvolutiondepthwise_pack4 | |||
| : pipeline_deconvolutiondepthwise; | |||
| // record | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob_bordered); | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| { | |||
| VkImageMat reference_blob; | |||
| reference_blob.dims = 2; | |||
| reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right; | |||
| reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom; | |||
| reference_blob.elempack = 1; | |||
| std::vector<VkImageMat> crop_bottom_blobs(2); | |||
| crop_bottom_blobs[0] = top_blob_bordered_adj; | |||
| crop_bottom_blobs[1] = reference_blob; | |||
| std::vector<VkImageMat> crop_top_blobs(1); | |||
| crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt); | |||
| top_blob = crop_top_blobs[0]; | |||
| } | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else if (output_w > 0 && output_h > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| { | |||
| // onnx padding=SAME_UPPER | |||
| crop_params[0] = wcut / 2; | |||
| crop_params[1] = hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234) | |||
| { | |||
| // onnx padding=SAME_LOWER | |||
| crop_params[0] = wcut - wcut / 2; | |||
| crop_params[1] = hcut - hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| std::vector<VkImageMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered_adj; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkImageMat> crop_outputs(1); | |||
| output_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| output_pad->forward(top_blob_bordered, top_blob, cmd, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| const int channels_g = channels * elempack / group; | |||
| const int num_output_g = num_output / group; | |||
| int elempack_g = opt.use_shader_pack8 && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1; | |||
| int out_elempack_g = opt.use_shader_pack8 && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize_g = elemsize / elempack * out_elempack_g; | |||
| if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
| { | |||
| if (out_elempack_g == 8) out_elemsize_g = 8*2u; | |||
| if (out_elempack_g == 4) out_elemsize_g = 4*2u; | |||
| if (out_elempack_g == 1) out_elemsize_g = 4u; | |||
| } | |||
| // unpacking | |||
| VkImageMat bottom_blob_unpacked = bottom_blob; | |||
| if (elempack > elempack_g) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_vkallocator = opt.workspace_vkallocator; | |||
| packing_unpack->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); | |||
| } | |||
| VkImageMat top_blob_unpacked = top_blob_bordered; | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob_unpacked; | |||
| bindings[1] = top_blob_unpacked; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob_unpacked.dims; | |||
| constants[1].i = bottom_blob_unpacked.w; | |||
| constants[2].i = bottom_blob_unpacked.h; | |||
| constants[3].i = bottom_blob_unpacked.c; | |||
| constants[4].i = 0;//bottom_blob_unpacked.cstep; | |||
| constants[5].i = top_blob_unpacked.dims; | |||
| constants[6].i = top_blob_unpacked.w; | |||
| constants[7].i = top_blob_unpacked.h; | |||
| constants[8].i = top_blob_unpacked.c; | |||
| constants[9].i = 0;//top_blob_unpacked.cstep; | |||
| const Pipeline* pipeline = 0; | |||
| if (elempack_g == 1 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack4; | |||
| } | |||
| else if (elempack_g == 1 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack1to4; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack4to1; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack8; | |||
| } | |||
| else if (elempack_g == 1 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack1to8; | |||
| } | |||
| else if (elempack_g == 4 && out_elempack_g == 8) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack4to8; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 4) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack8to4; | |||
| } | |||
| else if (elempack_g == 8 && out_elempack_g == 1) | |||
| { | |||
| pipeline = pipeline_deconvolutiondepthwise_group_pack8to1; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked); | |||
| // packing | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| packing_pack->forward(top_blob_unpacked, top_blob_bordered, cmd, opt); | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered = top_blob_unpacked; | |||
| } | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| { | |||
| VkImageMat reference_blob; | |||
| reference_blob.dims = 2; | |||
| reference_blob.w = top_blob_bordered_adj.w - pad_left - pad_right; | |||
| reference_blob.h = top_blob_bordered_adj.h - pad_top - pad_bottom; | |||
| reference_blob.elempack = 1; | |||
| std::vector<VkImageMat> crop_bottom_blobs(2); | |||
| crop_bottom_blobs[0] = top_blob_bordered_adj; | |||
| crop_bottom_blobs[1] = reference_blob; | |||
| std::vector<VkImageMat> crop_top_blobs(1); | |||
| crop->forward(crop_bottom_blobs, crop_top_blobs, cmd, opt); | |||
| top_blob = crop_top_blobs[0]; | |||
| } | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else if (output_w > 0 && output_h > 0) | |||
| { | |||
| VkImageMat top_blob_bordered_adj = top_blob_bordered; | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| output_pad->forward(top_blob_bordered, top_blob_bordered_adj, cmd, opt_pad); | |||
| if (top_blob_bordered_adj.empty()) | |||
| return -100; | |||
| } | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkImageMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| { | |||
| // onnx padding=SAME_UPPER | |||
| crop_params[0] = wcut / 2; | |||
| crop_params[1] = hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234) | |||
| { | |||
| // onnx padding=SAME_LOWER | |||
| crop_params[0] = wcut - wcut / 2; | |||
| crop_params[1] = hcut - hcut / 2; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = top_blob_bordered_adj.w - wcut; | |||
| crop_params[4] = top_blob_bordered_adj.h - hcut; | |||
| crop_params[5] = top_blob_bordered_adj.c; | |||
| } | |||
| std::vector<VkImageMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered_adj; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkImageMat> crop_outputs(1); | |||
| output_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| outw = top_blob.w; | |||
| outh = top_blob.h; | |||
| } | |||
| else | |||
| { | |||
| if (output_pad_right > 0 || output_pad_bottom > 0) | |||
| { | |||
| output_pad->forward(top_blob_bordered, top_blob, cmd, opt); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| else | |||
| { | |||
| top_blob = top_blob_bordered; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -31,11 +31,15 @@ public: | |||
| using DeconvolutionDepthWise::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat weight_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| VkImageMat weight_data_gpu_image; | |||
| VkImageMat bias_data_gpu_image; | |||
| ncnn::Layer* crop; | |||
| ncnn::Layer* output_pad; | |||
| ncnn::Layer* output_crop; | |||
| @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Eltwise_vulkan) | |||
| Eltwise_vulkan::Eltwise_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_eltwise[0] = 0; | |||
| pipeline_eltwise[1] = 0; | |||
| @@ -42,7 +43,19 @@ int Eltwise_vulkan::create_pipeline(const Option& opt) | |||
| if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| size_t elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| @@ -207,4 +220,66 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector< | |||
| return 0; | |||
| } | |||
| int Eltwise_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage eltwise: the first dispatch combines bottom_blobs[0] and | |||
|     // bottom_blobs[1] into top_blobs[0]; each further input is then combined | |||
|     // with top_blobs[0], which is bound as both input and output. | |||
|     // Returns 0 on success, -100 when the output image cannot be created. | |||
|     const VkImageMat& first_input = bottom_blobs[0]; | |||
|     const VkImageMat& second_input = bottom_blobs[1]; | |||
|     const int elempack = first_input.elempack; | |||
|     VkImageMat& top_blob = top_blobs[0]; | |||
|     top_blob.create(first_input.w, first_input.h, first_input.c, first_input.elemsize, elempack, opt.blob_vkallocator); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     // an empty coeffs blob (w == 0) makes both coefficient constants 1 | |||
|     const bool implicit_coeffs = coeffs.w == 0; | |||
|     { | |||
|         std::vector<vk_constant_type> first_constants(5 + 2); | |||
|         first_constants[0].i = top_blob.dims; | |||
|         first_constants[1].i = top_blob.w; | |||
|         first_constants[2].i = top_blob.h; | |||
|         first_constants[3].i = top_blob.c; | |||
|         first_constants[4].i = 0;//top_blob.cstep; | |||
|         first_constants[5].f = implicit_coeffs ? 1.f : coeffs[0]; | |||
|         first_constants[6].f = implicit_coeffs ? 1.f : coeffs[1]; | |||
|         std::vector<VkImageMat> first_bindings(3); | |||
|         first_bindings[0] = first_input; | |||
|         first_bindings[1] = second_input; | |||
|         first_bindings[2] = top_blob; | |||
|         // the first dispatch always uses pipeline slot 1 | |||
|         const Pipeline* first_pipeline = 0; | |||
|         if (elempack == 8) | |||
|             first_pipeline = pipeline_eltwise_pack8[1]; | |||
|         else if (elempack == 4) | |||
|             first_pipeline = pipeline_eltwise_pack4[1]; | |||
|         else | |||
|             first_pipeline = pipeline_eltwise[1]; | |||
|         cmd.record_pipeline(first_pipeline, first_bindings, first_constants, top_blob); | |||
|     } | |||
|     for (size_t index = 2; index < bottom_blobs.size(); index++) | |||
|     { | |||
|         std::vector<vk_constant_type> step_constants(5 + 2); | |||
|         step_constants[0].i = top_blob.dims; | |||
|         step_constants[1].i = top_blob.w; | |||
|         step_constants[2].i = top_blob.h; | |||
|         step_constants[3].i = top_blob.c; | |||
|         step_constants[4].i = 0;//top_blob.cstep; | |||
|         // the running result keeps coefficient 1; only the new input is scaled | |||
|         step_constants[5].f = 1.f; | |||
|         step_constants[6].f = implicit_coeffs ? 1.f : coeffs[index]; | |||
|         std::vector<VkImageMat> step_bindings(3); | |||
|         step_bindings[0] = top_blob; | |||
|         step_bindings[1] = bottom_blobs[index]; | |||
|         step_bindings[2] = top_blob;// TODO use separated pipeline ? | |||
|         // subsequent dispatches alternate between pipeline slots 0 and 1 | |||
|         const size_t slot = index % 2; | |||
|         const Pipeline* step_pipeline = 0; | |||
|         if (elempack == 8) | |||
|             step_pipeline = pipeline_eltwise_pack8[slot]; | |||
|         else if (elempack == 4) | |||
|             step_pipeline = pipeline_eltwise_pack4[slot]; | |||
|         else | |||
|             step_pipeline = pipeline_eltwise[slot]; | |||
|         cmd.record_pipeline(step_pipeline, step_bindings, step_constants, top_blob); | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using Eltwise::forward; | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| Pipeline* pipeline_eltwise[2]; | |||
| @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Flatten_vulkan) | |||
| Flatten_vulkan::Flatten_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_flatten = 0; | |||
| pipeline_flatten_pack4 = 0; | |||
| @@ -47,7 +48,22 @@ int Flatten_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -256,4 +272,83 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| int Flatten_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage flatten: reshapes the input into a 1-D blob holding | |||
|     // w * h * c * elempack scalars, repacked with the widest packing that | |||
|     // divides that total. A 1-D input is passed through unchanged. | |||
|     // Returns 0 on success, -100 when the output image cannot be created. | |||
|     if (bottom_blob.dims == 1) | |||
|     { | |||
|         top_blob = bottom_blob; | |||
|         return 0; | |||
|     } | |||
|     const size_t elemsize = bottom_blob.elemsize; | |||
|     const int elempack = bottom_blob.elempack; | |||
|     const int total = bottom_blob.w * bottom_blob.h * bottom_blob.c * elempack; | |||
|     int out_elempack = 1; | |||
|     if (opt.use_shader_pack8 && total % 8 == 0) | |||
|         out_elempack = 8; | |||
|     else if (total % 4 == 0) | |||
|         out_elempack = 4; | |||
|     size_t out_elemsize = elemsize / elempack * out_elempack; | |||
|     if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
|     { | |||
|         // fp16-packed images: 4 bytes for pack1, 2 bytes per lane otherwise | |||
|         // (out_elempack is always 1, 4 or 8 here) | |||
|         out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
|     } | |||
|     top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     std::vector<vk_constant_type> constants(10); | |||
|     constants[0].i = bottom_blob.dims; | |||
|     constants[1].i = bottom_blob.w; | |||
|     constants[2].i = bottom_blob.h; | |||
|     constants[3].i = bottom_blob.c; | |||
|     constants[4].i = 0;//bottom_blob.cstep; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0;//top_blob.cstep; | |||
|     std::vector<VkImageMat> bindings(2); | |||
|     bindings[0] = bottom_blob; | |||
|     bindings[1] = top_blob; | |||
|     // pick the pipeline by input packing first, then by output packing; | |||
|     // combinations without a dedicated pipeline leave the pointer at 0 | |||
|     const Pipeline* pipeline = 0; | |||
|     if (elempack == 1) | |||
|     { | |||
|         if (out_elempack == 1) | |||
|             pipeline = pipeline_flatten; | |||
|         else if (out_elempack == 4) | |||
|             pipeline = pipeline_flatten_pack1to4; | |||
|         else if (out_elempack == 8) | |||
|             pipeline = pipeline_flatten_pack1to8; | |||
|     } | |||
|     else if (elempack == 4) | |||
|     { | |||
|         if (out_elempack == 4) | |||
|             pipeline = pipeline_flatten_pack4; | |||
|         else if (out_elempack == 8) | |||
|             pipeline = pipeline_flatten_pack4to8; | |||
|     } | |||
|     else if (elempack == 8) | |||
|     { | |||
|         // pack8 input uses the pack8 pipeline regardless of out_elempack | |||
|         pipeline = pipeline_flatten_pack8; | |||
|     } | |||
|     cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using Flatten::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| Pipeline* pipeline_flatten; | |||
| @@ -24,6 +24,7 @@ DEFINE_LAYER_CREATOR(InnerProduct_vulkan) | |||
| InnerProduct_vulkan::InnerProduct_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| flatten = 0; | |||
| @@ -72,7 +73,17 @@ int InnerProduct_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -269,14 +280,32 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(weight_data_packed, weight_data_gpu, opt); | |||
| } | |||
| if (bias_term) | |||
| { | |||
| Mat bias_data_packed; | |||
| convert_packing(bias_data, bias_data_packed, out_elempack); | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(bias_data_packed, bias_data_gpu, opt); | |||
| } | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| cmd.record_upload(Mat(1), bias_data_gpu_image, opt); | |||
| } | |||
| return 0; | |||
| @@ -371,4 +400,93 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo | |||
| return 0; | |||
| } | |||
| int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage inner product. The input is flattened through the nested | |||
|     // flatten layer, then one pipeline, chosen by the input/output packing, | |||
|     // fills an output of num_output / out_elempack packed elements. | |||
|     // Returns 0 on success, the flatten layer's non-zero code if flattening | |||
|     // fails, or -100 when the output image cannot be created. | |||
|     // flatten | |||
|     VkImageMat bottom_blob_flattened = bottom_blob; | |||
|     { | |||
|         Option opt_flatten = opt; | |||
|         opt_flatten.blob_vkallocator = opt.workspace_vkallocator; | |||
|         // stop on failure: the elemsize / elempack division and the bindings | |||
|         // below are only meaningful for a successfully flattened blob | |||
|         int ret = flatten->forward(bottom_blob, bottom_blob_flattened, cmd, opt_flatten); | |||
|         if (ret != 0) | |||
|             return ret; | |||
|     } | |||
|     size_t elemsize = bottom_blob_flattened.elemsize; | |||
|     int elempack = bottom_blob_flattened.elempack; | |||
|     int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; | |||
|     size_t out_elemsize = elemsize / elempack * out_elempack; | |||
|     if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
|     { | |||
|         // fp16-packed images: 4 bytes for pack1, 2 bytes per lane otherwise | |||
|         if (out_elempack == 8) out_elemsize = 8*2u; | |||
|         if (out_elempack == 4) out_elemsize = 4*2u; | |||
|         if (out_elempack == 1) out_elemsize = 4u; | |||
|     } | |||
|     top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     std::vector<VkImageMat> bindings(4); | |||
|     bindings[0] = bottom_blob_flattened; | |||
|     bindings[1] = top_blob; | |||
|     bindings[2] = weight_data_gpu_image; | |||
|     bindings[3] = bias_data_gpu_image;// TODO use dummy buffer | |||
|     std::vector<vk_constant_type> constants(10); | |||
|     constants[0].i = bottom_blob_flattened.dims; | |||
|     constants[1].i = bottom_blob_flattened.w; | |||
|     constants[2].i = bottom_blob_flattened.h; | |||
|     constants[3].i = bottom_blob_flattened.c; | |||
|     constants[4].i = 0;//bottom_blob_flattened.cstep; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0;//top_blob.cstep; | |||
|     // one pipeline per (input packing, output packing) pair | |||
|     const Pipeline* pipeline = 0; | |||
|     if (elempack == 1 && out_elempack == 1) | |||
|     { | |||
|         pipeline = pipeline_innerproduct; | |||
|     } | |||
|     else if (elempack == 4 && out_elempack == 4) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack4; | |||
|     } | |||
|     else if (elempack == 1 && out_elempack == 4) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack1to4; | |||
|     } | |||
|     else if (elempack == 4 && out_elempack == 1) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack4to1; | |||
|     } | |||
|     else if (elempack == 8 && out_elempack == 8) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack8; | |||
|     } | |||
|     else if (elempack == 1 && out_elempack == 8) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack1to8; | |||
|     } | |||
|     else if (elempack == 4 && out_elempack == 8) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack4to8; | |||
|     } | |||
|     else if (elempack == 8 && out_elempack == 4) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack8to4; | |||
|     } | |||
|     else if (elempack == 8 && out_elempack == 1) | |||
|     { | |||
|         pipeline = pipeline_innerproduct_pack8to1; | |||
|     } | |||
|     cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -31,6 +31,7 @@ public: | |||
| using InnerProduct::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* flatten; | |||
| @@ -38,6 +39,9 @@ public: | |||
| VkMat weight_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| VkImageMat weight_data_gpu_image; | |||
| VkImageMat bias_data_gpu_image; | |||
| Pipeline* pipeline_innerproduct; | |||
| Pipeline* pipeline_innerproduct_pack4; | |||
| Pipeline* pipeline_innerproduct_pack1to4; | |||
| @@ -22,6 +22,7 @@ DEFINE_LAYER_CREATOR(Packing_vulkan) | |||
| Packing_vulkan::Packing_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_packing_1to4 = 0; | |||
| pipeline_packing_4to1 = 0; | |||
| @@ -37,7 +38,19 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| @@ -284,4 +297,132 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| int Packing_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage repacking from bottom_blob.elempack to out_elempack along | |||
|     // the last axis of the blob (w for 1-D, h for 2-D, c for 3-D). | |||
|     // Returns 0 on success or pass-through, -100 when the output image | |||
|     // cannot be created. | |||
|     const int elempack = bottom_blob.elempack; | |||
|     // already in the requested packing: share the input | |||
|     if (elempack == out_elempack) | |||
|     { | |||
|         top_blob = bottom_blob; | |||
|         return 0; | |||
|     } | |||
|     const int w = bottom_blob.w; | |||
|     const int h = bottom_blob.h; | |||
|     const int channels = bottom_blob.c; | |||
|     const int dims = bottom_blob.dims; | |||
|     const size_t elemsize = bottom_blob.elemsize; | |||
|     if (!use_padding) | |||
|     { | |||
|         // identity if use_padding not allowed and the packed axis does not | |||
|         // divide evenly into out_elempack groups | |||
|         const bool uneven = (dims == 1 && w * elempack % out_elempack != 0) | |||
|                          || (dims == 2 && h * elempack % out_elempack != 0) | |||
|                          || (dims == 3 && channels * elempack % out_elempack != 0); | |||
|         if (uneven) | |||
|         { | |||
|             top_blob = bottom_blob; | |||
|             return 0; | |||
|         } | |||
|     } | |||
|     if (dims == 1 || dims == 2 || dims == 3) | |||
|     { | |||
|         size_t out_elemsize = elemsize / elempack * out_elempack; | |||
|         if (opt.use_image_fp16_packed && !opt.use_image_fp16_storage) | |||
|         { | |||
|             // fp16-packed images: 4 bytes for pack1, 2 bytes per lane for pack4/pack8 | |||
|             switch (out_elempack) | |||
|             { | |||
|             case 8: out_elemsize = 8*2u; break; | |||
|             case 4: out_elemsize = 4*2u; break; | |||
|             case 1: out_elemsize = 4u; break; | |||
|             default: break; | |||
|             } | |||
|         } | |||
|         // the packed axis length is rounded up to whole out_elempack groups | |||
|         if (dims == 1) | |||
|         { | |||
|             const int outw = (w * elempack + out_elempack - 1) / out_elempack; | |||
|             top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
|         } | |||
|         else if (dims == 2) | |||
|         { | |||
|             const int outh = (h * elempack + out_elempack - 1) / out_elempack; | |||
|             top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
|         } | |||
|         else | |||
|         { | |||
|             const int outc = (channels * elempack + out_elempack - 1) / out_elempack; | |||
|             top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
|         } | |||
|         if (top_blob.empty()) | |||
|             return -100; | |||
|     } | |||
|     std::vector<vk_constant_type> constants(10); | |||
|     constants[0].i = bottom_blob.dims; | |||
|     constants[1].i = bottom_blob.w; | |||
|     constants[2].i = bottom_blob.h; | |||
|     constants[3].i = bottom_blob.c; | |||
|     constants[4].i = 0;//bottom_blob.cstep; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0;//top_blob.cstep; | |||
|     std::vector<VkImageMat> bindings(2); | |||
|     bindings[0] = bottom_blob; | |||
|     bindings[1] = top_blob; | |||
|     // widening conversions pass top_blob as the last record_pipeline argument, | |||
|     // narrowing conversions pass bottom_blob | |||
|     if (elempack == 1) | |||
|     { | |||
|         if (out_elempack == 4) | |||
|             cmd.record_pipeline(pipeline_packing_1to4, bindings, constants, top_blob); | |||
|         else if (out_elempack == 8) | |||
|             cmd.record_pipeline(pipeline_packing_1to8, bindings, constants, top_blob); | |||
|     } | |||
|     else if (elempack == 4) | |||
|     { | |||
|         if (out_elempack == 1) | |||
|             cmd.record_pipeline(pipeline_packing_4to1, bindings, constants, bottom_blob); | |||
|         else if (out_elempack == 8) | |||
|             cmd.record_pipeline(pipeline_packing_4to8, bindings, constants, top_blob); | |||
|     } | |||
|     else if (elempack == 8) | |||
|     { | |||
|         if (out_elempack == 4) | |||
|             cmd.record_pipeline(pipeline_packing_8to4, bindings, constants, bottom_blob); | |||
|         else if (out_elempack == 1) | |||
|             cmd.record_pipeline(pipeline_packing_8to1, bindings, constants, bottom_blob); | |||
|     } | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -29,6 +29,7 @@ public: | |||
| using Packing::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| Pipeline* pipeline_packing_1to4; | |||
| @@ -23,6 +23,7 @@ DEFINE_LAYER_CREATOR(Padding_vulkan) | |||
| Padding_vulkan::Padding_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| pipeline_padding = 0; | |||
| pipeline_padding_pack4 = 0; | |||
| @@ -46,7 +47,22 @@ int Padding_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -139,14 +155,28 @@ int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) | |||
| int Padding_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
|     // Records the upload of the per-channel pad values; always returns 0. | |||
|     // The data goes to exactly one destination: the image when | |||
|     // opt.use_image_storage is set, the buffer otherwise. | |||
|     if (per_channel_pad_data_size == 0) | |||
|     { | |||
|         // no per-channel data: with image storage a one-element placeholder | |||
|         // is still uploaded into per_channel_pad_data_gpu_image | |||
|         if (opt.use_image_storage) | |||
|         { | |||
|             cmd.record_upload(Mat(1), per_channel_pad_data_gpu_image, opt); | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     int elempack = opt.use_shader_pack8 && per_channel_pad_data_size % 8 == 0 ? 8 : per_channel_pad_data_size % 4 == 0 ? 4 : 1; | |||
|     Mat per_channel_pad_data_packed; | |||
|     convert_packing(per_channel_pad_data, per_channel_pad_data_packed, elempack); | |||
|     // upload once, to the storage kind selected by opt (no extra | |||
|     // unconditional buffer upload) | |||
|     if (opt.use_image_storage) | |||
|     { | |||
|         cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu_image, opt); | |||
|     } | |||
|     else | |||
|     { | |||
|         cmd.record_upload(per_channel_pad_data_packed, per_channel_pad_data_gpu, opt); | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -271,4 +301,124 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector< | |||
| return 0; | |||
| } | |||
| int Padding_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage padding using the layer's own top/bottom/left/right | |||
|     // amounts. Returns 0 on success or pass-through, -100 when the output | |||
|     // image cannot be created. | |||
|     const bool nothing_to_pad = top == 0 && bottom == 0 && left == 0 && right == 0; | |||
|     if (nothing_to_pad) | |||
|     { | |||
|         top_blob = bottom_blob; | |||
|         return 0; | |||
|     } | |||
|     const int elempack = bottom_blob.elempack; | |||
|     // TODO vec and image padding | |||
|     const int padded_w = bottom_blob.w + left + right; | |||
|     const int padded_h = bottom_blob.h + top + bottom; | |||
|     top_blob.create(padded_w, padded_h, bottom_blob.c, bottom_blob.elemsize, elempack, opt.blob_vkallocator); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     // shape of input and output followed by the left/top offsets | |||
|     std::vector<vk_constant_type> constants(12); | |||
|     constants[0].i = bottom_blob.dims; | |||
|     constants[1].i = bottom_blob.w; | |||
|     constants[2].i = bottom_blob.h; | |||
|     constants[3].i = bottom_blob.c; | |||
|     constants[4].i = 0;//bottom_blob.cstep; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0;//top_blob.cstep; | |||
|     constants[10].i = left; | |||
|     constants[11].i = top; | |||
|     std::vector<VkImageMat> bindings(3); | |||
|     bindings[0] = bottom_blob; | |||
|     bindings[1] = top_blob; | |||
|     bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer | |||
|     const Pipeline* pipeline = 0; | |||
|     if (elempack == 8) | |||
|         pipeline = pipeline_padding_pack8; | |||
|     else if (elempack == 4) | |||
|         pipeline = pipeline_padding_pack4; | |||
|     else | |||
|         pipeline = pipeline_padding; | |||
|     cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
|     return 0; | |||
| } | |||
| int Padding_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { | |||
|     // Image-storage padding whose amounts come from the mapped contents of | |||
|     // bottom_blobs[1], read as four ints in the order top, bottom, left, right. | |||
|     // Returns 0 on success or pass-through, -100 when the output image | |||
|     // cannot be created. | |||
|     const VkImageMat& bottom_blob = bottom_blobs[0]; | |||
|     const VkImageMat& reference_blob = bottom_blobs[1]; | |||
|     VkImageMat& top_blob = top_blobs[0]; | |||
|     const int* pad_params = reference_blob.mapped(); | |||
|     const int pad_top = pad_params[0]; | |||
|     const int pad_bottom = pad_params[1]; | |||
|     const int pad_left = pad_params[2]; | |||
|     const int pad_right = pad_params[3]; | |||
|     const bool nothing_to_pad = pad_top == 0 && pad_bottom == 0 && pad_left == 0 && pad_right == 0; | |||
|     if (nothing_to_pad) | |||
|     { | |||
|         top_blob = bottom_blob; | |||
|         return 0; | |||
|     } | |||
|     const int elempack = bottom_blob.elempack; | |||
|     // TODO vec and image padding | |||
|     const int padded_w = bottom_blob.w + pad_left + pad_right; | |||
|     const int padded_h = bottom_blob.h + pad_top + pad_bottom; | |||
|     top_blob.create(padded_w, padded_h, bottom_blob.c, bottom_blob.elemsize, elempack, opt.blob_vkallocator); | |||
|     if (top_blob.empty()) | |||
|         return -100; | |||
|     // shape of input and output followed by the left/top offsets | |||
|     std::vector<vk_constant_type> constants(12); | |||
|     constants[0].i = bottom_blob.dims; | |||
|     constants[1].i = bottom_blob.w; | |||
|     constants[2].i = bottom_blob.h; | |||
|     constants[3].i = bottom_blob.c; | |||
|     constants[4].i = 0;//bottom_blob.cstep; | |||
|     constants[5].i = top_blob.dims; | |||
|     constants[6].i = top_blob.w; | |||
|     constants[7].i = top_blob.h; | |||
|     constants[8].i = top_blob.c; | |||
|     constants[9].i = 0;//top_blob.cstep; | |||
|     constants[10].i = pad_left; | |||
|     constants[11].i = pad_top; | |||
|     std::vector<VkImageMat> bindings(3); | |||
|     bindings[0] = bottom_blob; | |||
|     bindings[1] = top_blob; | |||
|     bindings[2] = per_channel_pad_data_gpu_image;// TODO use dummy buffer | |||
|     const Pipeline* pipeline = 0; | |||
|     if (elempack == 8) | |||
|         pipeline = pipeline_padding_pack8; | |||
|     else if (elempack == 4) | |||
|         pipeline = pipeline_padding_pack4; | |||
|     else | |||
|         pipeline = pipeline_padding; | |||
|     cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
|     return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -34,8 +34,13 @@ public: | |||
| virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat per_channel_pad_data_gpu; | |||
| VkImageMat per_channel_pad_data_gpu_image; | |||
| Pipeline* pipeline_padding; | |||
| Pipeline* pipeline_padding_pack4; | |||
| Pipeline* pipeline_padding_pack8; | |||
| @@ -25,6 +25,7 @@ DEFINE_LAYER_CREATOR(Pooling_vulkan) | |||
| Pooling_vulkan::Pooling_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| support_image_storage = true; | |||
| padding = 0; | |||
| pipeline_pooling = 0; | |||
| @@ -112,7 +113,22 @@ int Pooling_vulkan::create_pipeline(const Option& opt) | |||
| size_t elemsize; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage) | |||
| if (opt.use_image_storage && opt.use_image_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage && opt.use_image_fp16_packed) | |||
| { | |||
| elemsize = elempack == 1 ? 4u : elempack * 2u; | |||
| out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; | |||
| } | |||
| else if (opt.use_image_storage) | |||
| { | |||
| elemsize = elempack * 4u; | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else if (opt.use_fp16_storage) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| out_elemsize = out_elempack * 2u; | |||
| @@ -277,6 +293,16 @@ int Pooling_vulkan::destroy_pipeline(const Option& opt) | |||
| return 0; | |||
| } | |||
| // Forwards the model upload to the padding sub-layer when one is set. | |||
| // Returns 0 when there is no padding sub-layer, otherwise the status | |||
| // reported by the sub-layer's own upload_model. | |||
| int Pooling_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (!padding) | |||
| return 0; | |||
| // propagate a failed upload instead of unconditionally reporting success | |||
| return padding->upload_model(cmd, opt); | |||
| } | |||
| int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -447,4 +473,174 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| // Runs the padding sub-layer with explicit border sizes. | |||
| // The four sizes are written as ints, in the order top, bottom, left, right, | |||
| // into a staging blob that is handed to the padding layer as its second input. | |||
| // Returns -100 when the staging blob comes back empty, otherwise the status | |||
| // of padding->forward; bottom_blob_bordered is only assigned on status 0. | |||
| static int pooling_pad_image(const ncnn::Layer* padding, const VkImageMat& bottom_blob, VkImageMat& bottom_blob_bordered, int top, int bottom, int left, int right, VkCompute& cmd, const Option& opt) | |||
| { | |||
| // the padded intermediate is allocated from the workspace allocator | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkImageMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| // do not write through the mapping of a blob that failed to allocate | |||
| if (padding_param_blob.empty()) | |||
| return -100; | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = top; | |||
| padding_params[1] = bottom; | |||
| padding_params[2] = left; | |||
| padding_params[3] = right; | |||
| std::vector<VkImageMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkImageMat> padding_outputs(1); | |||
| int ret = padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| if (ret != 0) | |||
| return ret; | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| return 0; | |||
| } | |||
| // Image-storage pooling. | |||
| // For global pooling, allocates a 1-D top_blob of `channels` elements and | |||
| // records the global pooling pipeline. Otherwise pads bottom_blob according | |||
| // to pad_mode, allocates top_blob as outw x outh x channels and records the | |||
| // pooling pipeline matching elempack (8, 4, or the default one). | |||
| // Returns 0 on success, -100 when an image comes back empty, or the non-zero | |||
| // status of the padding sub-layer. | |||
| int Pooling_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| if (global_pooling) | |||
| { | |||
| top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = 0;//bottom_blob.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 | |||
| : elempack == 4 ? pipeline_pooling_global_pack4 | |||
| : pipeline_pooling_global; | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| VkImageMat bottom_blob_bordered = bottom_blob; | |||
| int wtailpad = 0; | |||
| int htailpad = 0; | |||
| if (pad_mode == 0) // full padding | |||
| { | |||
| // extend the bottom/right border so the last, partially covered window is kept | |||
| int wtail = (w + pad_left + pad_right - kernel_w) % stride_w; | |||
| int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h; | |||
| if (wtail != 0) | |||
| wtailpad = stride_w - wtail; | |||
| if (htail != 0) | |||
| htailpad = stride_h - htail; | |||
| int ret = pooling_pad_image(padding, bottom_blob, bottom_blob_bordered, pad_top, pad_bottom + htailpad, pad_left, pad_right + wtailpad, cmd, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| else if (pad_mode == 1) // valid padding | |||
| { | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| int ret = padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| else if (pad_mode == 2 || pad_mode == 3) // 2: tensorflow padding=SAME or onnx padding=SAME_UPPER, 3: onnx padding=SAME_LOWER | |||
| { | |||
| int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; | |||
| int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; | |||
| if (wpad > 0 || hpad > 0) | |||
| { | |||
| // mode 2 puts the (pad / 2) share on top/left and the remainder on bottom/right; | |||
| // mode 3 swaps the two shares | |||
| const int hfirst = hpad / 2; | |||
| const int hrest = hpad - hpad / 2; | |||
| const int wfirst = wpad / 2; | |||
| const int wrest = wpad - wpad / 2; | |||
| const bool same_upper = pad_mode == 2; | |||
| int ret = pooling_pad_image(padding, bottom_blob, bottom_blob_bordered, | |||
| same_upper ? hfirst : hrest, same_upper ? hrest : hfirst, | |||
| same_upper ? wfirst : wrest, same_upper ? wrest : wfirst, cmd, opt); | |||
| if (ret != 0) | |||
| return ret; | |||
| } | |||
| } | |||
| // the padding layer may report success yet leave no usable image behind | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| w = bottom_blob_bordered.w; | |||
| h = bottom_blob_bordered.h; | |||
| int outw = (w - kernel_w) / stride_w + 1; | |||
| int outh = (h - kernel_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(2); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(12); | |||
| constants[0].i = bottom_blob_bordered.dims; | |||
| constants[1].i = bottom_blob_bordered.w; | |||
| constants[2].i = bottom_blob_bordered.h; | |||
| constants[3].i = bottom_blob_bordered.c; | |||
| constants[4].i = 0;//bottom_blob_bordered.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0;//top_blob.cstep; | |||
| constants[10].i = wtailpad; | |||
| constants[11].i = htailpad; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 | |||
| : elempack == 4 ? pipeline_pooling_pack4 | |||
| : pipeline_pooling; | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -27,8 +27,11 @@ public: | |||
| virtual int create_pipeline(const Option& opt); | |||
| virtual int destroy_pipeline(const Option& opt); | |||
| virtual int upload_model(VkTransfer& cmd, const Option& opt); | |||
| using Pooling::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* padding; | |||
| @@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input is read through samplers, output is a separate write-only image */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; /* likewise three image declarations on binding 1; imfmtc1 and unfp are defined outside this hunk */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: one unqualified buffer on binding 0 is both loaded from and stored to */ | |||
| layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -52,11 +61,42 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| afp v; | |||
| if (psc(dims) == 1) | |||
| { | |||
| v = image1d_ld1(bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| afp v = buffer_ld1(bottom_top_blob_data, gi); | |||
| #endif | |||
| v = abs(v); | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_st1(top_blob_1d, gx, v); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy), v); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); | |||
| } | |||
| #else | |||
| buffer_st1(bottom_top_blob_data, gi, v); | |||
| #endif | |||
| } | |||
| @@ -32,7 +32,16 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input is read through samplers, output is a separate write-only image */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* output images use the imfmtc4 format macro (defined outside this hunk) */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: one unqualified sfpvec4 buffer on binding 0 is both loaded from and stored to */ | |||
| layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -52,11 +61,42 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| afpvec4 v; | |||
| if (psc(dims) == 1) | |||
| { | |||
| v = image1d_ld4(bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); | |||
| #endif | |||
| v = abs(v); | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_st4(top_blob_1d, gx, v); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_st4(top_blob_2d, ivec2(gx, gy), v); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); | |||
| } | |||
| #else | |||
| buffer_st4(bottom_top_blob_data, gi, v); | |||
| #endif | |||
| } | |||
| @@ -33,7 +33,16 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input is read through samplers, output is a separate write-only image */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* NOTE(review): still imfmtc4 although the buffer variant is sfpvec8 - presumably 8-packed data spans two texels; confirm against the image*_ld8/st8 helpers */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: one unqualified sfpvec8 buffer on binding 0 is both loaded from and stored to */ | |||
| layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -53,12 +62,43 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| afpvec8 v; | |||
| if (psc(dims) == 1) | |||
| { | |||
| v = image1d_ld8(bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); | |||
| #endif | |||
| v[0] = abs(v[0]); | |||
| v[1] = abs(v[1]); | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_st8(top_blob_1d, gx, v); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_st8(top_blob_2d, ivec2(gx, gy), v); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); | |||
| } | |||
| #else | |||
| buffer_st8(bottom_top_blob_data, gi, v); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers use the unfp precision macro, output images are fixed to r32f / highp */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, r32f) writeonly uniform highp image1D top_blob_1d; /* single-channel 32-bit float output, independent of the storage macros */ | |||
| layout (binding = 1, r32f) writeonly uniform highp image2D top_blob_2d; | |||
| layout (binding = 1, r32f) writeonly uniform highp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfp input on binding 0, write-only plain float output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -65,9 +74,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| top_blob_data[gi] = float(buffer_ld1(bottom_blob_data, v_offset)); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers use the unfp precision macro, output images are fixed to rgba32f / highp */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image1D top_blob_1d; /* four-channel 32-bit float output, independent of the storage macros */ | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d; | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfpvec4 input on binding 0, write-only plain vec4 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -65,9 +74,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| top_blob_data[gi] = vec4(buffer_ld4(bottom_blob_data, v_offset)); | |||
| #endif | |||
| } | |||
| @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers use the unfp precision macro, output images are fixed to rgba32f / highp */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image1D top_blob_1d; /* NOTE(review): four-channel format while the buffer variant stores mat2x4 - presumably two texels per element; confirm against image*_cp8 */ | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image2D top_blob_2d; | |||
| layout (binding = 1, rgba32f) writeonly uniform highp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfpvec8 input on binding 0, write-only mat2x4 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { mat2x4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -66,9 +75,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| top_blob_data[gi] = mat2x4(buffer_ld8(bottom_blob_data, v_offset)); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers are fixed to highp, output images use the unfp / imfmtc1 macros */ | |||
| layout (binding = 0) uniform highp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform highp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform highp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; /* imfmtc1 and unfp are defined outside this hunk */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only plain float input on binding 0, write-only sfp output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -65,9 +74,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp1(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| buffer_st1(top_blob_data, gi, afp(bottom_blob_data[v_offset])); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers are fixed to highp, output images use the unfp / imfmtc4 macros */ | |||
| layout (binding = 0) uniform highp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform highp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform highp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* imfmtc4 and unfp are defined outside this hunk */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only plain vec4 input on binding 0, write-only sfpvec4 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -65,9 +74,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp4(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp4(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp4(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| buffer_st4(top_blob_data, gi, afpvec4(bottom_blob_data[v_offset])); | |||
| #endif | |||
| } | |||
| @@ -39,8 +39,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: input samplers are fixed to highp, output images use the unfp / imfmtc4 macros */ | |||
| layout (binding = 0) uniform highp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform highp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform highp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* NOTE(review): imfmtc4 while the buffer variant stores sfpvec8 - presumably two texels per element; confirm against image*_cp8 */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only mat2x4 input on binding 0, write-only sfpvec8 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { mat2x4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -66,9 +75,24 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp8(top_blob_1d, gx, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| image2d_cp8(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| image3d_cp8(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| const int v_offset = gz * psc(cstep) + gy * psc(w) + gx; | |||
| buffer_st8(top_blob_data, gi, afpvec8(bottom_blob_data[v_offset])); | |||
| #endif | |||
| } | |||
| @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: sampled input on binding 0, write-only output image on binding 1 */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; /* imfmtc1 and unfp are defined outside this hunk */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfp input on binding 0, write-only sfp output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,6 +78,23 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -78,4 +104,5 @@ void main() | |||
| int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; | |||
| buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: sampled input on binding 0, write-only output image on binding 1 */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* imfmtc4 and unfp are defined outside this hunk */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfpvec4 input on binding 0, write-only sfpvec4 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,6 +78,23 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp4(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| if (axis == 0) image2d_cp4(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 1) image2d_cp4(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| if (axis == 0) image3d_cp4(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 1) image3d_cp4(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 2) image3d_cp4(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -78,4 +104,5 @@ void main() | |||
| int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; | |||
| buffer_cp4(top_blob_data, v_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -40,8 +40,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: sampled input on binding 0, write-only output image on binding 1 */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; /* output uses the one-component format macro; main() writes the four loaded components as four separate stores */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfpvec4 input on binding 0, write-only scalar sfp output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,6 +78,74 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
| if (psc(dims) == 1) | |||
| { | |||
| afpvec4 v = image1d_ld4(bottom_blob_1d, gx); | |||
| int gx4 = gx * 4 + p.offset; | |||
| image1d_st1(top_blob_1d, gx4 + 0, v.r); | |||
| image1d_st1(top_blob_1d, gx4 + 1, v.g); | |||
| image1d_st1(top_blob_1d, gx4 + 2, v.b); | |||
| image1d_st1(top_blob_1d, gx4 + 3, v.a); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 0) | |||
| { | |||
| int gy4 = gy * 4 + p.offset; | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy4 + 0), v.r); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy4 + 1), v.g); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy4 + 2), v.b); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy4 + 3), v.a); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gx4 = gx * 4 + p.offset; | |||
| image2d_st1(top_blob_2d, ivec2(gx4 + 0, gy), v.r); | |||
| image2d_st1(top_blob_2d, ivec2(gx4 + 1, gy), v.g); | |||
| image2d_st1(top_blob_2d, ivec2(gx4 + 2, gy), v.b); | |||
| image2d_st1(top_blob_2d, ivec2(gx4 + 3, gy), v.a); | |||
| } | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 0) | |||
| { | |||
| int gz4 = gz * 4 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 0), v.r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 1), v.g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 2), v.b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 3), v.a); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gy4 = gy * 4 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy4 + 0, gz), v.r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy4 + 1, gz), v.g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy4 + 2, gz), v.b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy4 + 3, gz), v.a); | |||
| } | |||
| if (axis == 2) | |||
| { | |||
| int gx4 = gx * 4 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx4 + 0, gy, gz), v.r); | |||
| image3d_st1(top_blob_3d, ivec3(gx4 + 1, gy, gz), v.g); | |||
| image3d_st1(top_blob_3d, ivec3(gx4 + 2, gy, gz), v.b); | |||
| image3d_st1(top_blob_3d, ivec3(gx4 + 3, gy, gz), v.a); | |||
| } | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -83,4 +160,5 @@ void main() | |||
| ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; | |||
| buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image variant: sampled input on binding 0, write-only output image on binding 1 */ | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; /* 1D, 2D and 3D samplers are all declared on binding 0 */ | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; /* NOTE(review): imfmtc4 while the buffer variant is sfpvec8 - presumably two texels per element; confirm against image*_cp8 */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else /* buffer variant: read-only sfpvec8 input on binding 0, write-only sfpvec8 output on binding 1 */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -70,6 +79,23 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
// Image path: copy one element with the *_cp8 helper matching psc(dims).
// The destination coordinate is the source coordinate with p.offset added to
// exactly one component, chosen by `axis`:
//   1-D: x;  2-D: axis 0 -> y, axis 1 -> x;  3-D: axis 0 -> z, 1 -> y, 2 -> x.
| if (psc(dims) == 1) | |||
| { | |||
| image1d_cp8(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| if (axis == 0) image2d_cp8(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 1) image2d_cp8(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| if (axis == 0) image3d_cp8(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 1) image3d_cp8(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 2) image3d_cp8(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -79,4 +105,5 @@ void main() | |||
| int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; | |||
| buffer_cp8(top_blob_data, v_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings. Image build: source samplers at binding 0, write-only
// destination images at binding 1 declared with the imfmtc1 format qualifier.
// Buffer build: the source buffer holds sfpvec8 elements while the destination
// buffer holds scalar sfp elements.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -70,6 +79,98 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
// Image path: load one afpvec8 `v` from the source coordinate and write its
// eight components as eight separate single-value stores. Along the dimension
// selected by psc(dims)/axis the destination index is src * 8 + p.offset + k
// for k = 0..7, in the order v[0].r, .g, .b, .a, v[1].r, .g, .b, .a; the other
// coordinates are passed through unchanged.
//   1-D: x;  2-D: axis 0 -> y, axis 1 -> x;  3-D: axis 0 -> z, 1 -> y, 2 -> x.
| if (psc(dims) == 1) | |||
| { | |||
| afpvec8 v = image1d_ld8(bottom_blob_1d, gx); | |||
| int gx8 = gx * 8 + p.offset; | |||
| image1d_st1(top_blob_1d, gx8 + 0, v[0].r); | |||
| image1d_st1(top_blob_1d, gx8 + 1, v[0].g); | |||
| image1d_st1(top_blob_1d, gx8 + 2, v[0].b); | |||
| image1d_st1(top_blob_1d, gx8 + 3, v[0].a); | |||
| image1d_st1(top_blob_1d, gx8 + 4, v[1].r); | |||
| image1d_st1(top_blob_1d, gx8 + 5, v[1].g); | |||
| image1d_st1(top_blob_1d, gx8 + 6, v[1].b); | |||
| image1d_st1(top_blob_1d, gx8 + 7, v[1].a); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 0) | |||
| { | |||
| int gy8 = gy * 8 + p.offset; | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 0), v[0].r); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 1), v[0].g); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 2), v[0].b); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 3), v[0].a); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 4), v[1].r); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 5), v[1].g); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 6), v[1].b); | |||
| image2d_st1(top_blob_2d, ivec2(gx, gy8 + 7), v[1].a); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gx8 = gx * 8 + p.offset; | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 0, gy), v[0].r); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 1, gy), v[0].g); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 2, gy), v[0].b); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 3, gy), v[0].a); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 4, gy), v[1].r); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 5, gy), v[1].g); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 6, gy), v[1].b); | |||
| image2d_st1(top_blob_2d, ivec2(gx8 + 7, gy), v[1].a); | |||
| } | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 0) | |||
| { | |||
| int gz8 = gz * 8 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 0), v[0].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 1), v[0].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 2), v[0].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 3), v[0].a); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 4), v[1].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 5), v[1].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 6), v[1].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 7), v[1].a); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gy8 = gy * 8 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 0, gz), v[0].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 1, gz), v[0].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 2, gz), v[0].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 3, gz), v[0].a); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 4, gz), v[1].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 5, gz), v[1].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 6, gz), v[1].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx, gy8 + 7, gz), v[1].a); | |||
| } | |||
| if (axis == 2) | |||
| { | |||
| int gx8 = gx * 8 + p.offset; | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 0, gy, gz), v[0].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 1, gy, gz), v[0].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 2, gy, gz), v[0].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 3, gy, gz), v[0].a); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 4, gy, gz), v[1].r); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 5, gy, gz), v[1].g); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 6, gy, gz), v[1].b); | |||
| image3d_st1(top_blob_3d, ivec3(gx8 + 7, gy, gz), v[1].a); | |||
| } | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -85,4 +186,5 @@ void main() | |||
| ivec4 vv_offset = v_offset + 4 * gxyz4[psc(dims) - 1 - axis]; | |||
| buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -41,8 +41,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings. Image build: source samplers at binding 0, write-only
// destination images at binding 1 declared with the imfmtc4 format qualifier.
// Buffer build: the source buffer holds sfpvec8 elements and the destination
// buffer holds sfpvec4 elements.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; | |||
| layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -70,6 +79,63 @@ void main() | |||
| if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) | |||
| return; | |||
| #if NCNN_image_shader | |||
// Image path: load one afpvec8 `v` and write its two halves v[0] and v[1] as
// two 4-wide stores. Along the dimension selected by psc(dims)/axis the
// destination index is src * 2 + p.offset (for v[0]) and that value + 1
// (for v[1]); the remaining coordinates are unchanged.
//   1-D: x;  2-D: axis 0 -> y, axis 1 -> x;  3-D: axis 0 -> z, 1 -> y, 2 -> x.
| if (psc(dims) == 1) | |||
| { | |||
| afpvec8 v = image1d_ld8(bottom_blob_1d, gx); | |||
| int gx2 = gx * 2 + p.offset; | |||
| image1d_st4(top_blob_1d, gx2 + 0, v[0]); | |||
| image1d_st4(top_blob_1d, gx2 + 1, v[1]); | |||
| } | |||
| else if (psc(dims) == 2) | |||
| { | |||
| afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); | |||
| if (axis == 0) | |||
| { | |||
| int gy2 = gy * 2 + p.offset; | |||
| image2d_st4(top_blob_2d, ivec2(gx, gy2 + 0), v[0]); | |||
| image2d_st4(top_blob_2d, ivec2(gx, gy2 + 1), v[1]); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gx2 = gx * 2 + p.offset; | |||
| image2d_st4(top_blob_2d, ivec2(gx2 + 0, gy), v[0]); | |||
| image2d_st4(top_blob_2d, ivec2(gx2 + 1, gy), v[1]); | |||
| } | |||
| } | |||
| else // if (psc(dims) == 3) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); | |||
| if (axis == 0) | |||
| { | |||
| int gz2 = gz * 2 + p.offset; | |||
| image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 0), v[0]); | |||
| image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 1), v[1]); | |||
| } | |||
| if (axis == 1) | |||
| { | |||
| int gy2 = gy * 2 + p.offset; | |||
| image3d_st4(top_blob_3d, ivec3(gx, gy2 + 0, gz), v[0]); | |||
| image3d_st4(top_blob_3d, ivec3(gx, gy2 + 1, gz), v[1]); | |||
| } | |||
| if (axis == 2) | |||
| { | |||
| int gx2 = gx * 2 + p.offset; | |||
| image3d_st4(top_blob_3d, ivec3(gx2 + 0, gy, gz), v[0]); | |||
| image3d_st4(top_blob_3d, ivec3(gx2 + 1, gy, gz), v[1]); | |||
| } | |||
| } | |||
| #else | |||
| const int gi = gz * psc(cstep) + gy * psc(w) + gx; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| @@ -84,4 +150,5 @@ void main() | |||
| ivec2 v_offset = v_offset_0 + ivec2(0, 1) * gxyz4[psc(dims) - 1 - axis]; | |||
| buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi); | |||
| #endif | |||
| } | |||
| @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: input and weights are sampler3D, bias is sampler1D, and the
// output is a write-only image3D with the imfmtc1 format qualifier.
// Buffer build: all four are storage buffers of scalar sfp elements.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -82,13 +89,39 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afp(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for every input layer z and every kernel tap (y, x)
// add weight(wx, z, gz) * input(sx, sy, z) to `sum`.
// sx / sy start at gx * stride_w / gy * stride_h and advance by dilation_w /
// dilation_h per tap. wx is the running tap index: it is reset to 0 for each z
// and incremented once per x iteration, so it is not reset between kernel rows.
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z)); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -106,6 +139,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -127,7 +161,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
// Write the accumulated value: the image build stores it at texel (gx, gy, gz);
// the buffer build stores it at linear index gz * outcstep + gy * outw + gx.
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -21,26 +21,40 @@ | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
// Specialization constants (layer parameters first, then input/output shape).
// NOTE(review): constant ids repeat in this span - ids 0..3 are declared twice
// with different names, shape_constant_id_offset is #defined as both 4 and 10,
// and shape ids +1..+7 each appear with two names. This looks like the removed
// and added sides of a diff shown without +/- markers; confirm which set
// (presumably the offset-10 / w,h,c,cstep one) is the current source.
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int size_4 = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep_4 = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outsize_4 = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep_4 = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: sampler3D input/weights, sampler1D bias, write-only image3D
// output with the imfmtc1 format qualifier.
// Buffer build: under NCNN_fp16_packed the input/output buffers hold vec4
// elements; the alternative branch is only partly visible here (a hunk header
// interrupts it). Weights and bias are scalar sfp buffers.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; | |||
| @@ -50,40 +64,67 @@ layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
// Push-constant block `p` carrying the input and output shape.
// NOTE(review): the member list mixes size_4 / cstep_4 / outsize_4 / outcstep_4
// with w / h / cstep / outw / outh / outcstep. Presumably both sides of a diff
// are shown without +/- markers - confirm which members are current.
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int size_4; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep_4; | |||
| int cstep; | |||
| int outdims; | |||
| int outsize_4; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep_4; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
// Map the global invocation id to output coordinates and exit early when out
// of range. Image build: x and y ids are doubled, so gx/gy name the first
// texel of a 2x2 output patch, checked against outw / outh / outc.
// Buffer build: ids are used directly and gy must be 0.
// NOTE(review): the two consecutive `if` rows in the #else branch look like the
// old and new bounds checks of a diff shown without +/- markers; confirm that
// only the `gx * 4 >= psc(outcstep)` form is current.
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 2; | |||
| int gy = int(gl_GlobalInvocationID.y) * 2; | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outsize_4) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| return; | |||
| #endif | |||
| afpvec4 sum; | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = afpvec4(image1d_ld1(bias_blob, gz)); | |||
| #else | |||
| sum = afpvec4(buffer_ld1(bias_data, gz)); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec4(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for each input layer z, fetch one weight k from
// texel (0, z, gz) and apply it to the 2x2 input patch whose first texel is
// (gx, gy, z). The four products accumulate in sum.r (gx, gy), sum.g (gx+1, gy),
// sum.b (gx, gy+1) and sum.a (gx+1, gy+1).
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); | |||
| sum.r += k * image3d_ld1(bottom_blob, ivec3(gx, gy, z)); | |||
| sum.g += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy, z)); | |||
| sum.b += k * image3d_ld1(bottom_blob, ivec3(gx, gy+1, z)); | |||
| sum.a += k * image3d_ld1(bottom_blob, ivec3(gx+1, gy+1, z)); | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c); | |||
| int v_offset = gx; | |||
| @@ -96,8 +137,9 @@ void main() | |||
| #endif | |||
| w_offset += 1; | |||
| v_offset += psc(cstep_4); | |||
| v_offset += psc(cstep) / 4; | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -119,11 +161,18 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| const int gi = gz * psc(outcstep_4) + gx; | |||
// Write the results. Image build: the four components of `sum` go to the 2x2
// patch (gx..gx+1, gy..gy+1) in layer gz, in r, g, b, a order.
// Buffer build: `sum` is written as one 4-wide element at index
// gz * outcstep + gx, by direct assignment under NCNN_fp16_packed and through
// buffer_st4 otherwise.
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum.r); | |||
| image3d_st1(top_blob, ivec3(gx+1, gy, gz), sum.g); | |||
| image3d_st1(top_blob, ivec3(gx, gy+1, gz), sum.b); | |||
| image3d_st1(top_blob, ivec3(gx+1, gy+1, gz), sum.a); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gx; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[gi] = sum; | |||
| #else | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: sampler3D input/weights, sampler1D bias, write-only image3D
// output with the imfmtc4 format qualifier.
// Buffer build: the input buffer holds scalar sfp elements; output, weights
// and bias hold sfpvec4 elements.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -82,13 +89,43 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec4(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for every input layer z and kernel tap (y, x), read
// a scalar input v at (sx, sy, z) and a 4-wide weight k at (wx, z, gz), then
// add v * k to the 4-wide `sum`.
// sx / sy start at gx * stride_w / gy * stride_h and step by the dilation;
// wx is the running tap index, reset to 0 for each z and incremented per tap.
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); | |||
| sum += v * k; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -110,6 +147,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -131,7 +169,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
// Write the 4-wide result: the image build stores it at texel (gx, gy, gz);
// the buffer build stores it at linear index gz * outcstep + gy * outw + gx.
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: sampler3D input/weights, sampler1D bias, write-only image3D
// output with the imfmtc4 format qualifier.
// Buffer build: the input buffer holds scalar sfp elements; output, weights
// and bias hold sfpvec8 elements.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,13 +90,45 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for every input layer z and kernel tap (y, x), read
// a scalar input v at (sx, sy, z) and an 8-wide weight k at (wx, z, gz). The
// product is accumulated half by half (sum[0] += v * k[0]; sum[1] += v * k[1])
// instead of as a single `sum += v * k` expression.
// sx / sy start at gx * stride_w / gy * stride_h and step by the dilation;
// wx is the running tap index, reset to 0 for each z and incremented per tap.
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); | |||
| // sum += v * k; | |||
| sum[0] += v * k[0]; | |||
| sum[1] += v * k[1]; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -113,6 +152,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -138,7 +178,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); | |||
| } | |||
// Write the 8-wide result: the image build stores it at texel (gx, gy, gz);
// the buffer build stores it at linear index gz * outcstep + gy * outw + gx.
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -49,6 +49,12 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: sampler3D input/weights, sampler1D bias, write-only image3D
// output with the imfmtc4 format qualifier.
// Buffer build: input, output and bias hold sfpvec4 elements. The weight
// buffer element type depends on the fp16 configuration: the hunk header row
// shows an sfpvec4 variant and the row after it an sfpmat4 variant (the #else
// between them is outside the visible hunk).
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| @@ -58,6 +64,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | |||
| #endif | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -87,13 +94,48 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec4(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for every input layer z and kernel tap (y, x), read
// a 4-wide input v at (sx, sy, z) and build an afpmat4 k from the four
// consecutive weight texels (wx .. wx + 3, z, gz), then add v * k to `sum`.
// sx / sy start at gx * stride_w / gy * stride_h and step by the dilation.
// wx is reset to 0 for each z and advances by 4 per tap because each tap
// consumes four weight texels.
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); | |||
| afpmat4 k = afpmat4( | |||
| image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) | |||
| ); | |||
| sum += v * k; | |||
| sx += dilation_w; | |||
| wx += 4; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -125,6 +167,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -146,7 +189,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
// Write the 4-wide result: the image build stores it at texel (gx, gy, gz);
// the buffer build stores it at linear index gz * outcstep + gy * outw + gx.
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -21,26 +21,40 @@ | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
// Specialization constants (layer parameters first, then input/output shape).
// NOTE(review): constant ids repeat in this span - ids 0..3 are declared twice
// with different names, shape_constant_id_offset is #defined as both 4 and 10,
// and shape ids +1..+7 each appear with two names. This looks like the removed
// and added sides of a diff shown without +/- markers; confirm which set
// (presumably the offset-10 / w,h,c,cstep one) is the current source.
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int size = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = input blob, 1 = output blob, 2 = weights, 3 = bias.
// Image build: sampler3D input/weights, sampler1D bias, write-only image3D
// output with the imfmtc4 format qualifier.
// Buffer build: input, output and bias hold sfpvec4 elements. The weight
// buffer element type depends on the fp16 configuration: the hunk header row
// shows an sfpvec4 variant and the row after it an sfpmat4 variant (the #else
// between them is outside the visible hunk).
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| @@ -50,28 +64,40 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | |||
| #endif | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
// Push-constant block `p` carrying the input and output shape.
// NOTE(review): the member list contains both `size` / `outsize` and
// `w`, `h` / `outw`, `outh`. Presumably the removed and added sides of a diff
// are shown without +/- markers - confirm which members are current.
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int size; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outsize; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
// Map the global invocation id to output coordinates and exit early when out
// of range. Image build: x and y ids are doubled, so gx/gy name the first
// texel of a 2x2 output patch, checked against outw / outh / outc.
// Buffer build: the x id is multiplied by 4 (each invocation covers four
// consecutive elements) and gy must be 0.
// NOTE(review): the two consecutive `if` rows in the #else branch look like the
// old and new bounds checks of a diff shown without +/- markers; confirm that
// only the `gx >= psc(outcstep)` form is current.
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 2; | |||
| int gy = int(gl_GlobalInvocationID.y) * 2; | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| return; | |||
| #endif | |||
| afpvec4 sum0; | |||
| afpvec4 sum1; | |||
| @@ -80,7 +106,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec4 b = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| afpvec4 b = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| sum2 = b; | |||
| @@ -94,6 +124,27 @@ void main() | |||
| sum3 = afpvec4(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
// Image path accumulation: for each input layer z, load the 2x2 patch of
// 4-wide inputs v0..v3 at (gx..gx+1, gy..gy+1, z) and build one afpmat4 k from
// weight texels (0..3, z, gz). The same k is applied to all four inputs and
// the results accumulate in sum0..sum3 respectively.
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(gx, gy, z)); | |||
| afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(gx+1, gy, z)); | |||
| afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(gx, gy+1, z)); | |||
| afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(gx+1, gy+1, z)); | |||
| afpmat4 k = afpmat4( | |||
| image3d_ld4(weight_blob, ivec3(0, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(1, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(2, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(3, z, gz)) | |||
| ); | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| sum2 += v2 * k; | |||
| sum3 += v3 * k; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c); | |||
| int v_offset = gx; | |||
| @@ -124,6 +175,7 @@ void main() | |||
| w_offset += 1; | |||
| v_offset += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -157,10 +209,17 @@ void main() | |||
| sum3 = afp(1.f) / (afp(1.f) + exp(-sum3)); | |||
| } | |||
// Write the four results. Image build: sum0..sum3 go to the 2x2 patch
// (gx, gy), (gx+1, gy), (gx, gy+1), (gx+1, gy+1) in layer gz.
// Buffer build: they go to four consecutive elements starting at
// gz * outcstep + gx; the 2nd..4th stores are skipped when gx + k reaches
// outcstep.
// NOTE(review): the image path performs all four stores without an equivalent
// bounds check against outw / outh - confirm that out-of-range image stores
// are acceptable for odd output sizes.
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum0); | |||
| image3d_st4(top_blob, ivec3(gx+1, gy, gz), sum1); | |||
| image3d_st4(top_blob, ivec3(gx, gy+1, gz), sum2); | |||
| image3d_st4(top_blob, ivec3(gx+1, gy+1, gz), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| buffer_st4(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); | |||
| if (gx + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, sum2); | |||
| if (gx + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, sum3); | |||
| #endif | |||
| } | |||
| @@ -33,6 +33,11 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Resource bindings: 0 = bottom_tm input, 1 = top_tm output, 2 = weight_tm.
// Image build: sampler3D input and weights, write-only image3D output with the
// imfmtc4 format qualifier.
// Buffer build: input and output hold sfpvec4 elements. The weight buffer
// element type depends on the fp16 configuration: the hunk header row shows an
// sfpvec4 variant and the #else branch an sfpmat4 variant.
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_tm_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| @@ -41,6 +46,7 @@ layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; | |||
| #else | |||
| layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; | |||
| #endif | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -66,6 +72,29 @@ void main() | |||
| afpvec4 sum2 = afpvec4(0.f); | |||
| afpvec4 sum3 = afpvec4(0.f); | |||
| #if NCNN_image_shader | |||
| int wx = gx * 4; | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); | |||
| afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z)); | |||
| afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); | |||
| afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); | |||
| afpmat4 k = afpmat4( | |||
| image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), | |||
| image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), | |||
| image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), | |||
| image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) | |||
| ); | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| sum2 += v2 * k; | |||
| sum3 += v3 * k; | |||
| } | |||
| #else | |||
| int v_offset = gy * 16 + gx; | |||
| int w_offset = gz * psc(c) * 16 + gx; | |||
| @@ -96,11 +125,19 @@ void main() | |||
| v_offset += psc(cstep); | |||
| w_offset += 16; | |||
| } | |||
| #endif | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); | |||
| image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); | |||
| image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); | |||
| image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gy * 16 + gx; | |||
| buffer_st4(top_tm_blob_data, gi + 0, sum0); | |||
| if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1); | |||
| if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2); | |||
| if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3); | |||
| #endif | |||
| } | |||
| @@ -36,8 +36,13 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -62,6 +67,30 @@ void main() | |||
| return; | |||
| // load 4x4 input tile; the image path reads texels (gx * 2 + 0..3, gy * 2 + 0..3, gz), so tiles of neighbouring gx / gy overlap by 2 texels along that axis | |||
| #if NCNN_image_shader | |||
| int sx = gx * 2; | |||
| int sy = gy * 2; | |||
| afpvec4 v00 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); | |||
| afpvec4 v01 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); | |||
| afpvec4 v02 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); | |||
| afpvec4 v03 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 0, gz)); | |||
| afpvec4 v10 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); | |||
| afpvec4 v11 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); | |||
| afpvec4 v12 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); | |||
| afpvec4 v13 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 1, gz)); | |||
| afpvec4 v20 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); | |||
| afpvec4 v21 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); | |||
| afpvec4 v22 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); | |||
| afpvec4 v23 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 2, gz)); | |||
| afpvec4 v30 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 3, gz)); | |||
| afpvec4 v31 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 3, gz)); | |||
| afpvec4 v32 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 3, gz)); | |||
| afpvec4 v33 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 3, gz)); | |||
| #else | |||
| int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; | |||
| ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); | |||
| @@ -84,6 +113,7 @@ void main() | |||
| afpvec4 v31 = buffer_ld4(bottom_blob_data, v_offset.a + 1); | |||
| afpvec4 v32 = buffer_ld4(bottom_blob_data, v_offset.a + 2); | |||
| afpvec4 v33 = buffer_ld4(bottom_blob_data, v_offset.a + 3); | |||
| #endif | |||
| // const float itm[4][4] = { | |||
| // {1.0f, 0.0f, -1.0f, 0.0f}, | |||
| @@ -134,6 +164,26 @@ void main() | |||
| v33 = m33 - m31; | |||
| // store 16 transformed values of this tile; the image path writes v00..v33 in row-major order to x = 0..15 of row y = gy * p.block_x + gx in slice gz | |||
| #if NCNN_image_shader | |||
| int y = gy * p.block_x + gx; | |||
| image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); | |||
| image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); | |||
| image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); | |||
| image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); | |||
| image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10); | |||
| image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11); | |||
| image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12); | |||
| image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13); | |||
| image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20); | |||
| image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21); | |||
| image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22); | |||
| image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23); | |||
| image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30); | |||
| image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31); | |||
| image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32); | |||
| image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33); | |||
| #else | |||
| int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; | |||
| buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); | |||
| @@ -152,4 +202,5 @@ void main() | |||
| buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31); | |||
| buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32); | |||
| buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33); | |||
| #endif | |||
| } | |||
| @@ -41,9 +41,15 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D top_tm_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler1D bias_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -68,6 +74,26 @@ void main() | |||
| return; | |||
| // load 16 values of one tile; the image path reads v00..v33 in row-major order from x = 0..15 of row sy = gy * p.block_x + gx in slice gz | |||
| #if NCNN_image_shader | |||
| int sy = gy * p.block_x + gx; | |||
| afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); | |||
| afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); | |||
| afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); | |||
| afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); | |||
| afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); | |||
| afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); | |||
| afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); | |||
| afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); | |||
| afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); | |||
| afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); | |||
| afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); | |||
| afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); | |||
| afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); | |||
| afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); | |||
| afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); | |||
| afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); | |||
| #else | |||
| int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; | |||
| afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); | |||
| @@ -86,6 +112,7 @@ void main() | |||
| afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); | |||
| afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); | |||
| afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); | |||
| #endif | |||
| // const float itm[2][4] = { | |||
| // {1.0f, 1.0f, 1.0f, 0.0f}, | |||
| @@ -105,7 +132,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| const afpvec4 bias_value = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| const afpvec4 bias_value = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| v00 = bias_value + m00 + m01 + m02; | |||
| v10 = bias_value + m10 + m11 + m12; | |||
| @@ -155,6 +186,15 @@ void main() | |||
| } | |||
| // store 2x2 output block; the image path writes v<row><col> to texel (gx * 2 + col, gy * 2 + row, gz) for row, col in 0..1 | |||
| #if NCNN_image_shader | |||
| int x = gx * 2; | |||
| int y = gy * 2; | |||
| image3d_st4(top_blob, ivec3(x, y, gz), v00); | |||
| image3d_st4(top_blob, ivec3(x + 1, y, gz), v01); | |||
| image3d_st4(top_blob, ivec3(x, y + 1, gz), v10); | |||
| image3d_st4(top_blob, ivec3(x + 1, y + 1, gz), v11); | |||
| #else | |||
| int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; | |||
| int v_offset_1 = v_offset_0 + psc(outw); | |||
| @@ -162,4 +202,5 @@ void main() | |||
| buffer_st4(top_blob_data, v_offset_0 + 1, v01); | |||
| buffer_st4(top_blob_data, v_offset_1 + 0, v10); | |||
| buffer_st4(top_blob_data, v_offset_1 + 1, v11); | |||
| #endif | |||
| } | |||
| @@ -49,10 +49,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -82,13 +89,43 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afp(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); | |||
| sum += dot(v, k); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -110,6 +147,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -131,7 +169,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,13 +90,58 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); | |||
| afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); | |||
| afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); | |||
| afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); | |||
| afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); | |||
| afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); | |||
| // sum += v * k; written out per lane: output lane i (0..7) accumulates dot(v, k<i>), lanes 0..3 live in sum[0] and lanes 4..7 in sum[1] | |||
| sum[0].r += dot(v, k0); | |||
| sum[0].g += dot(v, k1); | |||
| sum[0].b += dot(v, k2); | |||
| sum[0].a += dot(v, k3); | |||
| sum[1].r += dot(v, k4); | |||
| sum[1].g += dot(v, k5); | |||
| sum[1].b += dot(v, k6); | |||
| sum[1].a += dot(v, k7); | |||
| sx += dilation_w; | |||
| wx += 8; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -126,6 +178,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -151,7 +204,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,13 +90,58 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); | |||
| afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); | |||
| afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); | |||
| afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); | |||
| afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); | |||
| // sum += v * k; written out per lane: output lane i (0..7) accumulates dot(v[0], k<i>[0]) + dot(v[1], k<i>[1]), lanes 0..3 live in sum[0] and lanes 4..7 in sum[1] | |||
| sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); | |||
| sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); | |||
| sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); | |||
| sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); | |||
| sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); | |||
| sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); | |||
| sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); | |||
| sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); | |||
| sx += dilation_w; | |||
| wx += 8; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -126,6 +178,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -151,7 +204,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -22,52 +22,78 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int size = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outsize = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int size; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outsize; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 2; | |||
| int gy = int(gl_GlobalInvocationID.y) * 2; | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outsize) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| return; | |||
| #endif | |||
| afpvec8 sum0; | |||
| afpvec8 sum1; | |||
| @@ -76,7 +102,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec8 b = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| afpvec8 b = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| sum2 = b; | |||
| @@ -90,6 +120,61 @@ void main() | |||
| sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(gx, gy, z)); | |||
| afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(gx+1, gy, z)); | |||
| afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(gx, gy+1, z)); | |||
| afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(gx+1, gy+1, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); | |||
| afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); | |||
| afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); | |||
| afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); | |||
| afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); | |||
| // sum += v * k, written out per lane: lane i (0..7) of sum0 accumulates dot(v0[0], k<i>[0]) + dot(v0[1], k<i>[1]); the same pattern follows for sum1..sum3 with v1..v3 | |||
| sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); | |||
| sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); | |||
| sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); | |||
| sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); | |||
| sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); | |||
| sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); | |||
| sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); | |||
| sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); | |||
| sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); | |||
| sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); | |||
| sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); | |||
| sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); | |||
| sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); | |||
| sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); | |||
| sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); | |||
| sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); | |||
| sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); | |||
| sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); | |||
| sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); | |||
| sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); | |||
| sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); | |||
| sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); | |||
| sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); | |||
| sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); | |||
| sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); | |||
| sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); | |||
| sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); | |||
| sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); | |||
| sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); | |||
| sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); | |||
| sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); | |||
| sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * 8; | |||
| int v_offset = gx; | |||
| @@ -149,6 +234,7 @@ void main() | |||
| w_offset += 8; | |||
| v_offset += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -198,10 +284,17 @@ void main() | |||
| sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1])); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum0); | |||
| image3d_st8(top_blob, ivec3(gx+1, gy, gz), sum1); | |||
| image3d_st8(top_blob, ivec3(gx, gy+1, gz), sum2); | |||
| image3d_st8(top_blob, ivec3(gx+1, gy+1, gz), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| buffer_st8(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); | |||
| if (gx + 2 < psc(outcstep)) buffer_st8(top_blob_data, gi + 2, sum2); | |||
| if (gx + 3 < psc(outcstep)) buffer_st8(top_blob_data, gi + 3, sum3); | |||
| #endif | |||
| } | |||
| @@ -34,9 +34,15 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_tm_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -62,6 +68,63 @@ void main() | |||
| afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| #if NCNN_image_shader | |||
| int wx = gx * 8; | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); | |||
| afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); | |||
| afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); | |||
| afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); | |||
| afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); | |||
| afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); | |||
| afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); | |||
| afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); | |||
| // sum += v * k, written out per lane: lane i (0..7) of sum0 accumulates dot(v0[0], k<i>[0]) + dot(v0[1], k<i>[1]); the same pattern follows for sum1..sum3 with v1..v3 | |||
| sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); | |||
| sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); | |||
| sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); | |||
| sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); | |||
| sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); | |||
| sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); | |||
| sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); | |||
| sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); | |||
| sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); | |||
| sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); | |||
| sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); | |||
| sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); | |||
| sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); | |||
| sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); | |||
| sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); | |||
| sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); | |||
| sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); | |||
| sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); | |||
| sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); | |||
| sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); | |||
| sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); | |||
| sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); | |||
| sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); | |||
| sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); | |||
| sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); | |||
| sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); | |||
| sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); | |||
| sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); | |||
| sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); | |||
| sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); | |||
| sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); | |||
| sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); | |||
| } | |||
| #else | |||
| int v_offset = gy * 16 + gx; | |||
| int w_offset = (gz * psc(c) * 16 + gx) * 8; | |||
| @@ -121,11 +184,19 @@ void main() | |||
| v_offset += psc(cstep); | |||
| w_offset += 16 * 8; | |||
| } | |||
| #endif | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); | |||
| image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); | |||
| image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); | |||
| image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gy * 16 + gx; | |||
| buffer_st8(top_tm_blob_data, gi + 0, sum0); | |||
| if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1); | |||
| if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2); | |||
| if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3); | |||
| #endif | |||
| } | |||
| @@ -37,8 +37,13 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -63,6 +68,30 @@ void main() | |||
| return; | |||
| // load 4x4 | |||
| #if NCNN_image_shader | |||
| int sx = gx * 2; | |||
| int sy = gy * 2; | |||
| afpvec8 v00 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); | |||
| afpvec8 v01 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz)); | |||
| afpvec8 v02 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); | |||
| afpvec8 v03 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 0, gz)); | |||
| afpvec8 v10 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); | |||
| afpvec8 v11 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); | |||
| afpvec8 v12 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); | |||
| afpvec8 v13 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 1, gz)); | |||
| afpvec8 v20 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); | |||
| afpvec8 v21 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); | |||
| afpvec8 v22 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); | |||
| afpvec8 v23 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 2, gz)); | |||
| afpvec8 v30 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 3, gz)); | |||
| afpvec8 v31 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 3, gz)); | |||
| afpvec8 v32 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 3, gz)); | |||
| afpvec8 v33 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 3, gz)); | |||
| #else | |||
| int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; | |||
| ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); | |||
| @@ -85,6 +114,7 @@ void main() | |||
| afpvec8 v31 = buffer_ld8(bottom_blob_data, v_offset.a + 1); | |||
| afpvec8 v32 = buffer_ld8(bottom_blob_data, v_offset.a + 2); | |||
| afpvec8 v33 = buffer_ld8(bottom_blob_data, v_offset.a + 3); | |||
| #endif | |||
| // const float itm[4][4] = { | |||
| // {1.0f, 0.0f, -1.0f, 0.0f}, | |||
| @@ -135,6 +165,26 @@ void main() | |||
| v33 = m33 - m31; | |||
| // store 16 | |||
| #if NCNN_image_shader | |||
| int y = gy * p.block_x + gx; | |||
| image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); | |||
| image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); | |||
| image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); | |||
| image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); | |||
| image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10); | |||
| image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11); | |||
| image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12); | |||
| image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13); | |||
| image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20); | |||
| image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21); | |||
| image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22); | |||
| image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23); | |||
| image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30); | |||
| image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31); | |||
| image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32); | |||
| image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33); | |||
| #else | |||
| int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; | |||
| buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); | |||
| @@ -153,4 +203,5 @@ void main() | |||
| buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31); | |||
| buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32); | |||
| buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33); | |||
| #endif | |||
| } | |||
| @@ -42,9 +42,15 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D top_tm_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler1D bias_blob; | |||
| #else | |||
| layout (binding = 0) readonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,6 +75,26 @@ void main() | |||
| return; | |||
| // load 16 | |||
| #if NCNN_image_shader | |||
| int sy = gy * p.block_x + gx; | |||
| afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); | |||
| afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); | |||
| afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz)); | |||
| afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); | |||
| afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); | |||
| afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); | |||
| afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); | |||
| afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); | |||
| afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); | |||
| afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); | |||
| afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); | |||
| afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); | |||
| afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); | |||
| afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); | |||
| afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); | |||
| afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); | |||
| #else | |||
| int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; | |||
| afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); | |||
| @@ -87,6 +113,7 @@ void main() | |||
| afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); | |||
| afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); | |||
| afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); | |||
| #endif | |||
| // const float itm[2][4] = { | |||
| // {1.0f, 1.0f, 1.0f, 0.0f}, | |||
| @@ -106,7 +133,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| const afpvec8 bias_value = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| const afpvec8 bias_value = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| v00 = bias_value + m00 + m01 + m02; | |||
| v10 = bias_value + m10 + m11 + m12; | |||
| @@ -172,6 +203,15 @@ void main() | |||
| } | |||
| // store 2x2 | |||
| #if NCNN_image_shader | |||
| int x = gx * 2; | |||
| int y = gy * 2; | |||
| image3d_st8(top_blob, ivec3(x, y, gz), v00); | |||
| image3d_st8(top_blob, ivec3(x + 1, y, gz), v01); | |||
| image3d_st8(top_blob, ivec3(x, y + 1, gz), v10); | |||
| image3d_st8(top_blob, ivec3(x + 1, y + 1, gz), v11); | |||
| #else | |||
| int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; | |||
| int v_offset_1 = v_offset_0 + psc(outw); | |||
| @@ -179,4 +219,5 @@ void main() | |||
| buffer_st8(top_blob_data, v_offset_0 + 1, v01); | |||
| buffer_st8(top_blob_data, v_offset_1 + 0, v10); | |||
| buffer_st8(top_blob_data, v_offset_1 + 1, v11); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,13 +90,44 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afp(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); | |||
| // sum += dot(v, k); | |||
| sum += dot(v[0], k[0]) + dot(v[1], k[1]); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -112,6 +150,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -133,7 +172,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,13 +90,50 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| sum = afpvec4(0.f); | |||
| } | |||
| #if NCNN_image_shader | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); | |||
| // sum += v * k | |||
| sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); | |||
| sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); | |||
| sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); | |||
| sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); | |||
| sx += dilation_w; | |||
| wx += 4; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * psc(c) * kernel_w * kernel_h; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -118,6 +162,7 @@ void main() | |||
| w_offset += kernel_w; | |||
| } | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -139,7 +184,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler2D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,7 +90,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -91,6 +102,25 @@ void main() | |||
| } | |||
| // depth-wise convolution | |||
| #if NCNN_image_shader | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * kernel_w * kernel_h; | |||
| int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; | |||
| @@ -104,6 +134,7 @@ void main() | |||
| v_offset += dilation_h * psc(w); | |||
| w_offset += kernel_w; | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -125,7 +156,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,7 +90,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -97,6 +108,32 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -117,6 +154,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -138,7 +176,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,7 +90,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -97,6 +108,36 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); | |||
| sum += v * k; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -121,6 +162,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -142,7 +184,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -98,6 +109,38 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); | |||
| // sum += v * k; | |||
| sum[0] += v * k[0]; | |||
| sum[1] += v * k[1]; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -124,6 +167,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -149,7 +193,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,6 +50,12 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| @@ -59,6 +65,7 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | |||
| #endif | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -88,7 +95,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -102,6 +113,41 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpmat4 k = afpmat4( | |||
| image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) | |||
| ); | |||
| sum += v * k; | |||
| sx += dilation_w; | |||
| wx += 4; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -136,6 +182,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -157,7 +204,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,7 +90,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -97,6 +108,36 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); | |||
| sum += dot(v, k); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -121,6 +162,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -142,7 +184,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -98,6 +109,51 @@ void main() | |||
| // group id | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; | |||
| int wx = 0; | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); | |||
| afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); | |||
| afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); | |||
| afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); | |||
| afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); | |||
| afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); | |||
| // sum += v * k; | |||
| sum[0].r += dot(v, k0); | |||
| sum[0].g += dot(v, k1); | |||
| sum[0].b += dot(v, k2); | |||
| sum[0].a += dot(v, k3); | |||
| sum[1].r += dot(v, k4); | |||
| sum[1].g += dot(v, k5); | |||
| sum[1].b += dot(v, k6); | |||
| sum[1].a += dot(v, k7); | |||
| sx += dilation_w; | |||
| wx += 8; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -137,6 +193,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -162,7 +219,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; | |||
| #else // NCNN_image_shader | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); /* accumulator starts from the bias entry at index gz */ | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -98,6 +109,51 @@ void main() | |||
| // group id: output index gz divided by num_output_g | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; /* first input z-slice of group gg; advanced by one per z iteration */ | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; /* first sampled input row for output row gy */ | |||
| int wx = 0; /* weight x-coordinate; restarts for every z and runs across all kernel taps */ | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; /* first sampled input column for output column gx */ | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); /* k0..k7: eight consecutive weight texels for this tap, one per component of sum */ | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); | |||
| afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); | |||
| afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); | |||
| afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); | |||
| afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); | |||
| // sum += v * k: component i of sum accumulates dot(v, ki), taken over the [0] and [1] halves | |||
| sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); | |||
| sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); | |||
| sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); | |||
| sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); | |||
| sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); | |||
| sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); | |||
| sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); | |||
| sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); | |||
| sx += dilation_w; | |||
| wx += 8; /* step past the eight weight texels consumed by this tap */ | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader: storage-buffer path; most of its body is elided by the diff hunk boundary below | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -137,6 +193,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -162,7 +219,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); /* 1 / (1 + exp(-x)) applied to the [1] half */ | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); /* image path: output addressed by 3D coordinate */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat index built from outcstep and outw */ | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the four resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc1 and unfp are macros defined outside this view */ | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; /* binding 2: weights, read through a 3D sampler */ | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; /* binding 3: bias, read through a 1D sampler */ | |||
| #else // NCNN_image_shader: storage-buffer variant; sfpvec8 elements in, sfp elements for output and bias | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld1(bias_blob, gz); /* accumulator starts from the bias entry at index gz */ | |||
| #else | |||
| sum = buffer_ld1(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -98,6 +109,37 @@ void main() | |||
| // group id: output index gz divided by num_output_g | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; /* first input z-slice of group gg; advanced by one per z iteration */ | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; /* first sampled input row for output row gy */ | |||
| int wx = 0; /* weight x-coordinate; restarts for every z and runs across all kernel taps */ | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; /* first sampled input column for output column gx */ | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); /* one weight texel per tap */ | |||
| // sum += dot(v, k), evaluated as two dot products over the [0] and [1] halves | |||
| sum += dot(v[0], k[0]) + dot(v[1], k[1]); | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader: storage-buffer path; most of its body is elided by the diff hunk boundary below | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -123,6 +165,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -144,7 +187,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); /* 1 / (1 + exp(-x)) */ | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(gx, gy, gz), sum); /* image path: output addressed by 3D coordinate */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat index built from outcstep and outw */ | |||
| buffer_st1(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the four resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc4 and unfp are macros defined outside this view */ | |||
| layout (binding = 2) uniform unfp sampler3D weight_blob; /* binding 2: weights, read through a 3D sampler */ | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; /* binding 3: bias, read through a 1D sampler */ | |||
| #else // NCNN_image_shader: storage-buffer variant; sfpvec8 elements in, sfpvec4 elements for output and bias | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); /* accumulator starts from the bias entry at index gz */ | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -98,6 +109,43 @@ void main() | |||
| // group id: output index gz divided by num_output_g | |||
| const int gg = gz / num_output_g; | |||
| #if NCNN_image_shader | |||
| int sz = gg * channels_g; /* first input z-slice of group gg; advanced by one per z iteration */ | |||
| for (int z = 0; z < channels_g; z++) | |||
| { | |||
| int sy = gy * stride_h; /* first sampled input row for output row gy */ | |||
| int wx = 0; /* weight x-coordinate; restarts for every z and runs across all kernel taps */ | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; /* first sampled input column for output column gx */ | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); /* k0..k3: four consecutive weight texels for this tap, one per component of sum */ | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); | |||
| // sum += v * k: component i of sum accumulates dot(v, ki), taken over the [0] and [1] halves | |||
| sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); | |||
| sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); | |||
| sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); | |||
| sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); | |||
| sx += dilation_w; | |||
| wx += 4; /* step past the four weight texels consumed by this tap */ | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| sz += 1; | |||
| } | |||
| #else // NCNN_image_shader: storage-buffer path; most of its body is elided by the diff hunk boundary below | |||
| int w_offset = gz * channels_g * kernel_w * kernel_h; | |||
| int v_offset_0 = gg * channels_g * psc(cstep); | |||
| @@ -129,6 +177,7 @@ void main() | |||
| v_offset_0 += psc(cstep); | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -150,7 +199,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); /* 1 / (1 + exp(-x)) */ | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); /* image path: output addressed by 3D coordinate */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat index built from outcstep and outw */ | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -50,10 +50,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the four resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc4 and unfp are macros defined outside this view */ | |||
| layout (binding = 2) uniform unfp sampler2D weight_blob; /* binding 2: weights, read through a 2D sampler */ | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; /* binding 3: bias, read through a 1D sampler */ | |||
| #else // NCNN_image_shader: the same four binding slots declared as storage buffers of sfpvec4 | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -83,7 +90,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld4(bias_blob, gz); /* accumulator starts from the bias entry at index gz */ | |||
| #else | |||
| sum = buffer_ld4(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -91,6 +102,29 @@ void main() | |||
| } | |||
| // depth-wise convolution: input and weights are both addressed with the output index gz | |||
| #if NCNN_image_shader | |||
| int sy = gy * stride_h; /* first sampled input row for output row gy */ | |||
| int wx = 0; /* weight x-coordinate; runs across all kernel taps */ | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; /* first sampled input column for output column gx */ | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); | |||
| afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz)); /* weight texel at column wx, row gz */ | |||
| sum += v * k; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| #else // NCNN_image_shader: storage-buffer path; part of its body is elided by the diff hunk boundary below | |||
| int w_offset = gz * kernel_w * kernel_h; | |||
| int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; | |||
| @@ -108,6 +142,7 @@ void main() | |||
| v_offset += dilation_h * psc(w); | |||
| w_offset += kernel_w; | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -129,7 +164,11 @@ void main() | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); /* 1 / (1 + exp(-x)) */ | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), sum); /* image path: output addressed by 3D coordinate */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat index built from outcstep and outw */ | |||
| buffer_st4(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -51,10 +51,17 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the four resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc4 and unfp are macros defined outside this view */ | |||
| layout (binding = 2) uniform unfp sampler2D weight_blob; /* binding 2: weights, read through a 2D sampler */ | |||
| layout (binding = 3) uniform unfp sampler1D bias_blob; /* binding 3: bias, read through a 1D sampler */ | |||
| #else // NCNN_image_shader: the same four binding slots declared as storage buffers of sfpvec8 | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; | |||
| layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| #endif // NCNN_image_shader | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -84,7 +91,11 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = image1d_ld8(bias_blob, gz); /* accumulator starts from the bias entry at index gz */ | |||
| #else | |||
| sum = buffer_ld8(bias_data, gz); | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -92,6 +103,31 @@ void main() | |||
| } | |||
| // depth-wise convolution: input and weights are both addressed with the output index gz | |||
| #if NCNN_image_shader | |||
| int sy = gy * stride_h; /* first sampled input row for output row gy */ | |||
| int wx = 0; /* weight x-coordinate; runs across all kernel taps */ | |||
| for (int y = 0; y < kernel_h; y++) | |||
| { | |||
| int sx = gx * stride_w; /* first sampled input column for output column gx */ | |||
| for (int x = 0; x < kernel_w; x++) | |||
| { | |||
| afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); | |||
| afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz)); /* weight texel at column wx, row gz */ | |||
| // sum += v * k, written out separately for the [0] and [1] halves | |||
| sum[0] += v[0] * k[0]; | |||
| sum[1] += v[1] * k[1]; | |||
| sx += dilation_w; | |||
| wx += 1; | |||
| } | |||
| sy += dilation_h; | |||
| } | |||
| #else // NCNN_image_shader: storage-buffer path; part of its body is elided by the diff hunk boundary below | |||
| int w_offset = gz * kernel_w * kernel_h; | |||
| int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; | |||
| @@ -111,6 +147,7 @@ void main() | |||
| v_offset += dilation_h * psc(w); | |||
| w_offset += kernel_w; | |||
| } | |||
| #endif // NCNN_image_shader | |||
| if (activation_type == 1) | |||
| { | |||
| @@ -136,7 +173,11 @@ void main() | |||
| sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); /* 1 / (1 + exp(-x)) applied to the [1] half */ | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), sum); /* image path: output addressed by 3D coordinate */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat index built from outcstep and outw */ | |||
| buffer_st8(top_blob_data, gi, sum); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the two resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc1 and unfp are macros defined outside this view */ | |||
| #else /* storage-buffer variant: sfp elements on both sides */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,13 +74,17 @@ void main() | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* NOTE(review): the same declaration reappears in the #else branch below; this looks like the pre-change line of the diff shown without its '-' marker - confirm */ | |||
| int x = gx + p.woffset; /* source coordinate = output coordinate shifted by the push-constant offsets */ | |||
| int y = gy + p.hoffset; | |||
| int z = gz + p.coffset; | |||
| #if NCNN_image_shader | |||
| image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); /* image path: copy one element from (x, y, z) to (gx, gy, gz) */ | |||
| #else | |||
| const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat output index built from outcstep and outw */ | |||
| int v_offset = z * psc(cstep) + y * psc(w) + x; /* flat source index built from cstep and w */ | |||
| buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); | |||
| #endif | |||
| } | |||
| @@ -38,8 +38,13 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the two resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc4 and unfp are macros defined outside this view */ | |||
| #else /* storage-buffer variant: sfp elements in, sfpvec4 elements out */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -69,12 +74,23 @@ void main() | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* NOTE(review): the same declaration reappears in the #else branch below; this looks like the pre-change line of the diff shown without its '-' marker - confirm */ | |||
| int x = gx + p.woffset; /* source coordinate = output coordinate shifted by the push-constant offsets */ | |||
| int y = gy + p.hoffset; | |||
| int z = gz * 4 + p.coffset; /* each output z covers four consecutive source z-slices */ | |||
| #if NCNN_image_shader | |||
| afpvec4 v; /* gathers source slices z+0..z+3 into the r, g, b, a components */ | |||
| v.r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); | |||
| v.g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); | |||
| v.b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); | |||
| v.a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); | |||
| image3d_st4(top_blob, ivec3(gx, gy, gz), v); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat output index built from outcstep and outw */ | |||
| ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); /* four source indices, one cstep apart */ | |||
| buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); | |||
| #endif | |||
| } | |||
| @@ -39,8 +39,13 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_image_shader /* image-storage variant of the two resource bindings */ | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; /* binding 0: read through a 3D sampler */ | |||
| layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; /* binding 1: write-only 3D image; imfmtc4 and unfp are macros defined outside this view */ | |||
| #else /* storage-buffer variant: sfp elements in, sfpvec8 elements out */ | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -70,13 +75,28 @@ void main() | |||
| if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) | |||
| return; | |||
| int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* NOTE(review): the same declaration reappears in the #else branch below; this looks like the pre-change line of the diff shown without its '-' marker - confirm */ | |||
| int x = gx + p.woffset; /* source coordinate = output coordinate shifted by the push-constant offsets */ | |||
| int y = gy + p.hoffset; | |||
| int z = gz * 8 + p.coffset; /* each output z covers eight consecutive source z-slices */ | |||
| #if NCNN_image_shader | |||
| afpvec8 v; /* gathers source slices z+0..z+7: four into v[0], four into v[1] */ | |||
| v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); | |||
| v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); | |||
| v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); | |||
| v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); | |||
| v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 4)); | |||
| v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 5)); | |||
| v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 6)); | |||
| v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 7)); | |||
| image3d_st8(top_blob, ivec3(gx, gy, gz), v); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gy * psc(outw) + gx; /* buffer path: flat output index built from outcstep and outw */ | |||
| ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); /* first four source indices, one cstep apart */ | |||
| ivec4 vv_offset = v_offset + 4 * psc(cstep); /* next four source indices */ | |||
| buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); | |||
| #endif | |||
| } | |||