You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

testutil.h 32 kB

6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
LSTM arm/x86 + fp16 innerproduct arm (#1881) * added fp16 weight storage version * Small changes * Fixed fp16 weight storage layers * fix innerproduct * fix loop error * Fix windows build. Disable fp 16 conversion when detecting int8 weights. Implement requested changes. * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * Update option.cpp Set fp16 storage based on vulkan being used or not. * added ability for storing state in lstm layer * added avx lstm * added arm lstm * fix innerproduct activation location and add 4 parallel channel version * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * revert arm file * commit before switch * implement requested changes * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * More x86 optimized implementations of common layers. Added LSTM layers for arm and x86 + a ctest to verify the layer accuracy Added fp16 innerproduct for arm * fix non avx build * Add fp16 arm compiler and cpu checks. Remove statefullness from LSTM implementation. * Fix build check for fp16 arm * Bypass lstm_fp16 if not supported * Build order was incorrect * fix std::min missing in windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * attempting to fix gnu build by enabling: -mfp16-format=ieee to fix the missing __fp16 type * remove double "fix" * Specify ieee fp16 format * implement requested changes * fix arm non-fp16 build * fix arm lstm * Restyled/pull 1881 (#15) * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle Co-authored-by: Restyled.io <commits@restyled.io> * Check blob size on arm lstm * fix styling Co-authored-by: Restyled.io <commits@restyled.io>
5 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
adreno image shader + fp16 + fp16a (#1714) * wip * wip * fix * image and imageview can not be destroyed until command execution ends * fast copy path for tightly packed data * wip * texture load works * 1d 3d image * record clone image, multiple commands share one image reference * upload download image * layer forward accept vkimagemat * vkimagemat graph works * staging vkimagemat for passing dynamic parameters, macro for fp32+image shader, padding image shader * vkimagemat elemsize * convolution test pass * conv1x1s1 image shader * fast staging image allocator from host memory, pooling image shader * convolutiondepthwise image shader * innerproduct image shader * packing image shader * crop deconvolution image shader * resolve spirv binding types * image fp16 and fp16a, cast image shader * eltwise image shader * wip * absval image shader * deconvolutiondepthwise image shader * concat image shader, squeezenet works * noop split image shader * uniform precision hint * layer support_image_storage * wip * vulkan device utility operator * command is storage and packing option aware * fallback to cpu on image allocation failed, mobilenetssd works * flatten image shader, enable more test * ci test * check imgfp32 imgfp16 imgfp16a features * fix ci test * fix ci test * upgrade swiftshader * wip * opt aggressive * imgfp16p * opt none * convolution winograd image shader * fix flush range, fast copy path for continous buffer * minor fix * fix innerproduct * wip ... * wip * cast fix * packing test * wip * image fp16p is fp16p * wip * silence * more line info * code clean * softmax image shader
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #ifndef TESTUTIL_H
  15. #define TESTUTIL_H
  16. #include "layer.h"
  17. #include "mat.h"
  18. #include "prng.h"
  19. #include <algorithm>
  20. #include <math.h>
  21. #include <stdio.h>
  22. #if NCNN_VULKAN
  23. #include "command.h"
  24. #include "gpu.h"
  25. #endif // NCNN_VULKAN
  26. static struct prng_rand_t g_prng_rand_state;
  27. #define SRAND(seed) prng_srand(seed, &g_prng_rand_state)
  28. #define RAND() prng_rand(&g_prng_rand_state)
  29. static float RandomFloat(float a = -1.5f, float b = 1.5f)
  30. {
  31. float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX;
  32. float diff = b - a;
  33. float r = random * diff;
  34. return a + r;
  35. }
  36. static void Randomize(ncnn::Mat& m, float a = -1.5f, float b = 1.5f)
  37. {
  38. for (size_t i = 0; i < m.total(); i++)
  39. {
  40. m[i] = RandomFloat(a, b);
  41. }
  42. }
  43. static ncnn::Mat RandomMat(int w)
  44. {
  45. ncnn::Mat m(w);
  46. Randomize(m);
  47. return m;
  48. }
  49. static ncnn::Mat RandomMat(int w, int h)
  50. {
  51. ncnn::Mat m(w, h);
  52. Randomize(m);
  53. return m;
  54. }
  55. static ncnn::Mat RandomMat(int w, int h, int c)
  56. {
  57. ncnn::Mat m(w, h, c);
  58. Randomize(m);
  59. return m;
  60. }
  61. static bool NearlyEqual(float a, float b, float epsilon)
  62. {
  63. if (a == b)
  64. return true;
  65. float diff = fabs(a - b);
  66. if (diff <= epsilon)
  67. return true;
  68. // relative error
  69. return diff < epsilon * std::max(fabs(a), fabs(b));
  70. }
  71. static int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001)
  72. {
  73. #define CHECK_MEMBER(m) \
  74. if (a.m != b.m) \
  75. { \
  76. fprintf(stderr, #m " not match expect %d but got %d\n", (int)a.m, (int)b.m); \
  77. return -1; \
  78. }
  79. CHECK_MEMBER(dims)
  80. CHECK_MEMBER(w)
  81. CHECK_MEMBER(h)
  82. CHECK_MEMBER(c)
  83. CHECK_MEMBER(elemsize)
  84. CHECK_MEMBER(elempack)
  85. #undef CHECK_MEMBER
  86. for (int q = 0; q < a.c; q++)
  87. {
  88. const ncnn::Mat ma = a.channel(q);
  89. const ncnn::Mat mb = b.channel(q);
  90. for (int i = 0; i < a.h; i++)
  91. {
  92. const float* pa = ma.row(i);
  93. const float* pb = mb.row(i);
  94. for (int j = 0; j < a.w; j++)
  95. {
  96. if (!NearlyEqual(pa[j], pb[j], epsilon))
  97. {
  98. fprintf(stderr, "value not match at c:%d h:%d w:%d expect %f but got %f\n", q, i, j, pa[j], pb[j]);
  99. return -1;
  100. }
  101. }
  102. }
  103. }
  104. return 0;
  105. }
  106. static int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001)
  107. {
  108. if (a.elempack != 1)
  109. {
  110. ncnn::Mat a1;
  111. ncnn::convert_packing(a, a1, 1);
  112. return CompareMat(a1, b, epsilon);
  113. }
  114. if (b.elempack != 1)
  115. {
  116. ncnn::Mat b1;
  117. ncnn::convert_packing(b, b1, 1);
  118. return CompareMat(a, b1, epsilon);
  119. }
  120. if (a.elemsize == 2u)
  121. {
  122. ncnn::Mat a32;
  123. cast_float16_to_float32(a, a32);
  124. return CompareMat(a32, b, epsilon);
  125. }
  126. if (a.elemsize == 1u)
  127. {
  128. ncnn::Mat a32;
  129. cast_int8_to_float32(a, a32);
  130. return CompareMat(a32, b, epsilon);
  131. }
  132. if (b.elemsize == 2u)
  133. {
  134. ncnn::Mat b32;
  135. cast_float16_to_float32(b, b32);
  136. return CompareMat(a, b32, epsilon);
  137. }
  138. if (b.elemsize == 1u)
  139. {
  140. ncnn::Mat b32;
  141. cast_int8_to_float32(b, b32);
  142. return CompareMat(a, b32, epsilon);
  143. }
  144. return Compare(a, b, epsilon);
  145. }
  146. static int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon = 0.001)
  147. {
  148. if (a.size() != b.size())
  149. {
  150. fprintf(stderr, "output blob count not match %zu %zu\n", a.size(), b.size());
  151. return -1;
  152. }
  153. for (size_t i = 0; i < a.size(); i++)
  154. {
  155. if (CompareMat(a[i], b[i], epsilon))
  156. {
  157. fprintf(stderr, "output blob %zu not match\n", i);
  158. return -1;
  159. }
  160. }
  161. return 0;
  162. }
  163. template<typename T>
  164. int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(T*))
  165. {
  166. ncnn::Layer* op = ncnn::create_layer(typeindex);
  167. if (func)
  168. {
  169. (*func)((T*)op);
  170. }
  171. op->load_param(pd);
  172. if (op->one_blob_only && a.size() != 1)
  173. {
  174. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  175. delete op;
  176. return -1;
  177. }
  178. ncnn::ModelBinFromMatArray mb(weights.data());
  179. op->load_model(mb);
  180. ncnn::Option opt;
  181. opt.num_threads = 1;
  182. opt.use_packing_layout = false;
  183. opt.use_fp16_packed = false;
  184. opt.use_fp16_storage = false;
  185. opt.use_fp16_arithmetic = false;
  186. opt.use_shader_pack8 = false;
  187. opt.use_image_storage = false;
  188. opt.use_bf16_storage = false;
  189. opt.use_vulkan_compute = false;
  190. opt.use_weight_fp16_storage = false;
  191. op->create_pipeline(opt);
  192. b.resize(top_blob_count);
  193. if (op->support_inplace)
  194. {
  195. for (size_t i = 0; i < a.size(); i++)
  196. {
  197. b[i] = a[i].clone();
  198. }
  199. ((T*)op)->T::forward_inplace(b, opt);
  200. }
  201. else
  202. {
  203. ((T*)op)->T::forward(a, b, opt);
  204. }
  205. op->destroy_pipeline(opt);
  206. delete op;
  207. return 0;
  208. }
  209. template<typename T>
  210. int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, const std::vector<ncnn::Mat>& top_shapes, void (*func)(T*))
  211. {
  212. ncnn::Layer* op = ncnn::create_layer(typeindex);
  213. if (func)
  214. {
  215. (*func)((T*)op);
  216. }
  217. if (!top_shapes.empty())
  218. {
  219. op->bottom_shapes = a;
  220. op->top_shapes = top_shapes;
  221. }
  222. op->load_param(pd);
  223. if (op->one_blob_only && a.size() != 1)
  224. {
  225. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  226. delete op;
  227. return -1;
  228. }
  229. ncnn::ModelBinFromMatArray mb(weights.data());
  230. op->load_model(mb);
  231. ncnn::Option opt = _opt;
  232. opt.num_threads = 1;
  233. opt.use_vulkan_compute = false;
  234. if (!op->support_packing) opt.use_packing_layout = false;
  235. if (!op->support_bf16_storage) opt.use_bf16_storage = false;
  236. if (!op->support_fp16_storage) opt.use_fp16_storage = false;
  237. if (!op->support_weight_fp16_storage) opt.use_weight_fp16_storage = false;
  238. if (op->use_int8_inference)
  239. {
  240. opt.use_bf16_storage = false;
  241. opt.use_fp16_storage = false;
  242. opt.use_packing_layout = false;
  243. }
  244. op->create_pipeline(opt);
  245. std::vector<ncnn::Mat> a4(a.size());
  246. if (opt.use_packing_layout)
  247. {
  248. for (size_t i = 0; i < a.size(); i++)
  249. {
  250. #if NCNN_AVX2
  251. ncnn::convert_packing(a[i], a4[i], 8, opt);
  252. #else
  253. ncnn::convert_packing(a[i], a4[i], 4, opt);
  254. #endif
  255. }
  256. }
  257. else
  258. {
  259. a4 = a;
  260. }
  261. if (opt.use_bf16_storage)
  262. {
  263. for (size_t i = 0; i < a4.size(); i++)
  264. {
  265. ncnn::Mat a_bf16;
  266. ncnn::cast_float32_to_bfloat16(a4[i], a_bf16, opt);
  267. a4[i] = a_bf16;
  268. }
  269. }
  270. else if (opt.use_fp16_storage)
  271. {
  272. for (size_t i = 0; i < a4.size(); i++)
  273. {
  274. ncnn::Mat a_fp16;
  275. ncnn::cast_float32_to_float16(a4[i], a_fp16, opt);
  276. a4[i] = a_fp16;
  277. }
  278. }
  279. c.resize(top_blob_count);
  280. if (op->support_inplace)
  281. {
  282. for (size_t i = 0; i < a4.size(); i++)
  283. {
  284. c[i] = a4[i].clone();
  285. }
  286. op->forward_inplace(c, opt);
  287. }
  288. else
  289. {
  290. op->forward(a4, c, opt);
  291. }
  292. if (opt.use_bf16_storage)
  293. {
  294. for (size_t i = 0; i < c.size(); i++)
  295. {
  296. ncnn::Mat c_fp32;
  297. ncnn::cast_bfloat16_to_float32(c[i], c_fp32, opt);
  298. c[i] = c_fp32;
  299. }
  300. }
  301. else if (opt.use_fp16_storage)
  302. {
  303. for (size_t i = 0; i < c.size(); i++)
  304. {
  305. ncnn::Mat c_fp32;
  306. ncnn::cast_float16_to_float32(c[i], c_fp32, opt);
  307. c[i] = c_fp32;
  308. }
  309. }
  310. op->destroy_pipeline(opt);
  311. delete op;
  312. return 0;
  313. }
  314. #if NCNN_VULKAN
  315. template<typename T>
  316. int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& d, const std::vector<ncnn::Mat>& top_shapes, void (*func)(T*))
  317. {
  318. ncnn::Layer* op = ncnn::create_layer(typeindex);
  319. if (!op->support_vulkan)
  320. {
  321. delete op;
  322. return 233;
  323. }
  324. ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
  325. op->vkdev = vkdev;
  326. if (func)
  327. {
  328. (*func)((T*)op);
  329. }
  330. if (!top_shapes.empty())
  331. {
  332. op->bottom_shapes = a;
  333. op->top_shapes = top_shapes;
  334. }
  335. op->load_param(pd);
  336. if (op->one_blob_only && a.size() != 1)
  337. {
  338. fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
  339. delete op;
  340. return -1;
  341. }
  342. ncnn::ModelBinFromMatArray mb(weights.data());
  343. op->load_model(mb);
  344. if (op->use_int8_inference)
  345. {
  346. // NOTE skip int8 on gpu
  347. delete op;
  348. return 233;
  349. }
  350. ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
  351. ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);
  352. ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
  353. ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
  354. ncnn::Option opt = _opt;
  355. opt.num_threads = 1;
  356. opt.use_vulkan_compute = true;
  357. if (!op->support_packing) opt.use_packing_layout = false;
  358. if (!op->support_bf16_storage) opt.use_bf16_storage = false;
  359. if (!op->support_image_storage) opt.use_image_storage = false;
  360. if (!op->support_weight_fp16_storage) opt.use_weight_fp16_storage = false;
  361. #if __APPLE__
  362. opt.use_image_storage = false;
  363. #endif
  364. opt.blob_vkallocator = blob_vkallocator;
  365. opt.workspace_vkallocator = blob_vkallocator;
  366. opt.staging_vkallocator = staging_vkallocator;
  367. if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false;
  368. if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false;
  369. if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false;
  370. // FIXME fp16a may produce large error
  371. opt.use_fp16_arithmetic = false;
  372. op->create_pipeline(opt);
  373. {
  374. ncnn::VkTransfer cmd(vkdev);
  375. ncnn::Option opt_upload = opt;
  376. opt_upload.blob_vkallocator = &g_weight_vkallocator;
  377. opt_upload.workspace_vkallocator = &g_weight_vkallocator;
  378. opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;
  379. op->upload_model(cmd, opt_upload);
  380. cmd.submit_and_wait();
  381. }
  382. d.resize(top_blob_count);
  383. {
  384. // forward
  385. ncnn::VkCompute cmd(vkdev);
  386. if (opt.use_image_storage)
  387. {
  388. // upload
  389. std::vector<ncnn::VkImageMat> a_gpu(a.size());
  390. for (size_t i = 0; i < a_gpu.size(); i++)
  391. {
  392. cmd.record_upload(a[i], a_gpu[i], opt);
  393. }
  394. std::vector<ncnn::VkImageMat> d_gpu(top_blob_count);
  395. if (op->support_inplace)
  396. {
  397. op->forward_inplace(a_gpu, cmd, opt);
  398. d_gpu = a_gpu;
  399. }
  400. else
  401. {
  402. op->forward(a_gpu, d_gpu, cmd, opt);
  403. }
  404. // download
  405. for (size_t i = 0; i < d_gpu.size(); i++)
  406. {
  407. cmd.record_download(d_gpu[i], d[i], opt);
  408. }
  409. }
  410. else
  411. {
  412. // upload
  413. std::vector<ncnn::VkMat> a_gpu(a.size());
  414. for (size_t i = 0; i < a_gpu.size(); i++)
  415. {
  416. cmd.record_upload(a[i], a_gpu[i], opt);
  417. }
  418. std::vector<ncnn::VkMat> d_gpu(top_blob_count);
  419. if (op->support_inplace)
  420. {
  421. op->forward_inplace(a_gpu, cmd, opt);
  422. d_gpu = a_gpu;
  423. }
  424. else
  425. {
  426. op->forward(a_gpu, d_gpu, cmd, opt);
  427. }
  428. // download
  429. for (size_t i = 0; i < d_gpu.size(); i++)
  430. {
  431. cmd.record_download(d_gpu[i], d[i], opt);
  432. }
  433. }
  434. cmd.submit_and_wait();
  435. }
  436. op->destroy_pipeline(opt);
  437. delete op;
  438. vkdev->reclaim_blob_allocator(blob_vkallocator);
  439. vkdev->reclaim_staging_allocator(staging_vkallocator);
  440. g_weight_vkallocator.clear();
  441. g_weight_staging_vkallocator.clear();
  442. return 0;
  443. }
  444. #endif // NCNN_VULKAN
  445. template<typename T>
  446. int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, const std::vector<ncnn::Mat>& top_shapes = std::vector<ncnn::Mat>(), float epsilon = 0.001, void (*func)(T*) = 0)
  447. {
  448. // naive
  449. std::vector<ncnn::Mat> b;
  450. {
  451. int ret = test_layer_naive(typeindex, pd, weights, a, top_blob_count, b, func);
  452. if (ret != 0)
  453. {
  454. fprintf(stderr, "test_layer_naive failed\n");
  455. return -1;
  456. }
  457. }
  458. // cpu
  459. {
  460. std::vector<ncnn::Mat> c;
  461. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, std::vector<ncnn::Mat>(), func);
  462. if (ret != 0 || CompareMat(b, c, epsilon) != 0)
  463. {
  464. fprintf(stderr, "test_layer_cpu failed\n");
  465. return -1;
  466. }
  467. }
  468. // cpu shape hint
  469. {
  470. std::vector<ncnn::Mat> c;
  471. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, b, func);
  472. if (ret != 0 || CompareMat(b, c, epsilon) != 0)
  473. {
  474. fprintf(stderr, "test_layer_cpu failed with shape hint\n");
  475. return -1;
  476. }
  477. }
  478. #if NCNN_VULKAN
  479. // gpu
  480. {
  481. std::vector<ncnn::Mat> d;
  482. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, std::vector<ncnn::Mat>(), func);
  483. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  484. {
  485. fprintf(stderr, "test_layer_gpu failed\n");
  486. return -1;
  487. }
  488. }
  489. // gpu shape hint
  490. {
  491. std::vector<ncnn::Mat> d;
  492. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, b, func);
  493. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  494. {
  495. fprintf(stderr, "test_layer_gpu failed with shape hint\n");
  496. return -1;
  497. }
  498. }
  499. #endif // NCNN_VULKAN
  500. return 0;
  501. }
  502. template<typename T>
  503. int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(T*))
  504. {
  505. ncnn::Layer* op = ncnn::create_layer(typeindex);
  506. if (func)
  507. {
  508. (*func)((T*)op);
  509. }
  510. op->load_param(pd);
  511. ncnn::ModelBinFromMatArray mb(weights.data());
  512. op->load_model(mb);
  513. ncnn::Option opt;
  514. opt.num_threads = 1;
  515. opt.use_packing_layout = false;
  516. opt.use_fp16_packed = false;
  517. opt.use_fp16_storage = false;
  518. opt.use_fp16_arithmetic = false;
  519. opt.use_shader_pack8 = false;
  520. opt.use_image_storage = false;
  521. opt.use_bf16_storage = false;
  522. opt.use_vulkan_compute = false;
  523. opt.use_weight_fp16_storage = false;
  524. op->create_pipeline(opt);
  525. if (op->support_inplace)
  526. {
  527. b = a.clone();
  528. ((T*)op)->T::forward_inplace(b, opt);
  529. }
  530. else
  531. {
  532. ((T*)op)->T::forward(a, b, opt);
  533. }
  534. op->destroy_pipeline(opt);
  535. delete op;
  536. return 0;
  537. }
  538. template<typename T>
  539. int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(T*))
  540. {
  541. ncnn::Layer* op = ncnn::create_layer(typeindex);
  542. if (func)
  543. {
  544. (*func)((T*)op);
  545. }
  546. if (top_shape.dims)
  547. {
  548. op->bottom_shapes.resize(1);
  549. op->top_shapes.resize(1);
  550. op->bottom_shapes[0] = a;
  551. op->top_shapes[0] = top_shape;
  552. }
  553. op->load_param(pd);
  554. ncnn::ModelBinFromMatArray mb(weights.data());
  555. op->load_model(mb);
  556. ncnn::Option opt = _opt;
  557. opt.num_threads = 1;
  558. opt.use_vulkan_compute = false;
  559. if (!op->support_packing) opt.use_packing_layout = false;
  560. if (!op->support_bf16_storage) opt.use_bf16_storage = false;
  561. if (!op->support_fp16_storage) opt.use_fp16_storage = false;
  562. if (!op->support_weight_fp16_storage) opt.use_weight_fp16_storage = false;
  563. if (op->use_int8_inference)
  564. {
  565. opt.use_bf16_storage = false;
  566. opt.use_fp16_storage = false;
  567. opt.use_packing_layout = false;
  568. }
  569. op->create_pipeline(opt);
  570. ncnn::Mat a4;
  571. if (opt.use_packing_layout)
  572. {
  573. #if NCNN_AVX2
  574. ncnn::convert_packing(a, a4, 8, opt);
  575. #else
  576. ncnn::convert_packing(a, a4, 4, opt);
  577. #endif
  578. }
  579. else
  580. {
  581. a4 = a;
  582. }
  583. if (opt.use_bf16_storage)
  584. {
  585. ncnn::Mat a_bf16;
  586. ncnn::cast_float32_to_bfloat16(a4, a_bf16, opt);
  587. a4 = a_bf16;
  588. }
  589. else if (opt.use_fp16_storage)
  590. {
  591. ncnn::Mat a_fp16;
  592. ncnn::cast_float32_to_float16(a4, a_fp16, opt);
  593. a4 = a_fp16;
  594. }
  595. if (op->support_inplace)
  596. {
  597. c = a4.clone();
  598. op->forward_inplace(c, opt);
  599. }
  600. else
  601. {
  602. op->forward(a4, c, opt);
  603. }
  604. if (opt.use_bf16_storage)
  605. {
  606. ncnn::Mat c_fp32;
  607. ncnn::cast_bfloat16_to_float32(c, c_fp32, opt);
  608. c = c_fp32;
  609. }
  610. else if (opt.use_fp16_storage)
  611. {
  612. ncnn::Mat c_fp32;
  613. ncnn::cast_float16_to_float32(c, c_fp32, opt);
  614. c = c_fp32;
  615. }
  616. op->destroy_pipeline(opt);
  617. delete op;
  618. return 0;
  619. }
  620. #if NCNN_VULKAN
  621. template<typename T>
  622. int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(T*))
  623. {
  624. ncnn::Layer* op = ncnn::create_layer(typeindex);
  625. if (!op->support_vulkan)
  626. {
  627. delete op;
  628. return 233;
  629. }
  630. ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
  631. op->vkdev = vkdev;
  632. if (func)
  633. {
  634. (*func)((T*)op);
  635. }
  636. if (top_shape.dims)
  637. {
  638. op->bottom_shapes.resize(1);
  639. op->top_shapes.resize(1);
  640. op->bottom_shapes[0] = a;
  641. op->top_shapes[0] = top_shape;
  642. }
  643. op->load_param(pd);
  644. ncnn::ModelBinFromMatArray mb(weights.data());
  645. op->load_model(mb);
  646. if (op->use_int8_inference)
  647. {
  648. // NOTE skip int8 on gpu
  649. delete op;
  650. return 233;
  651. }
  652. ncnn::VkWeightAllocator g_weight_vkallocator(vkdev);
  653. ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev);
  654. ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
  655. ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
  656. ncnn::Option opt = _opt;
  657. opt.num_threads = 1;
  658. opt.use_vulkan_compute = true;
  659. if (!op->support_packing) opt.use_packing_layout = false;
  660. if (!op->support_bf16_storage) opt.use_bf16_storage = false;
  661. if (!op->support_image_storage) opt.use_image_storage = false;
  662. if (!op->support_weight_fp16_storage) opt.use_weight_fp16_storage = false;
  663. #if __APPLE__
  664. opt.use_image_storage = false;
  665. #endif
  666. opt.blob_vkallocator = blob_vkallocator;
  667. opt.workspace_vkallocator = blob_vkallocator;
  668. opt.staging_vkallocator = staging_vkallocator;
  669. if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false;
  670. if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false;
  671. if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false;
  672. // FIXME fp16a may produce large error
  673. opt.use_fp16_arithmetic = false;
  674. op->create_pipeline(opt);
  675. {
  676. ncnn::VkTransfer cmd(vkdev);
  677. ncnn::Option opt_upload = opt;
  678. opt_upload.blob_vkallocator = &g_weight_vkallocator;
  679. opt_upload.workspace_vkallocator = &g_weight_vkallocator;
  680. opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;
  681. op->upload_model(cmd, opt_upload);
  682. cmd.submit_and_wait();
  683. }
  684. {
  685. // forward
  686. ncnn::VkCompute cmd(vkdev);
  687. if (opt.use_image_storage)
  688. {
  689. // upload
  690. ncnn::VkImageMat a_gpu;
  691. cmd.record_upload(a, a_gpu, opt);
  692. ncnn::VkImageMat d_gpu;
  693. if (op->support_inplace)
  694. {
  695. op->forward_inplace(a_gpu, cmd, opt);
  696. d_gpu = a_gpu;
  697. }
  698. else
  699. {
  700. op->forward(a_gpu, d_gpu, cmd, opt);
  701. }
  702. // download
  703. cmd.record_download(d_gpu, d, opt);
  704. }
  705. else
  706. {
  707. // upload
  708. ncnn::VkMat a_gpu;
  709. cmd.record_upload(a, a_gpu, opt);
  710. ncnn::VkMat d_gpu;
  711. if (op->support_inplace)
  712. {
  713. op->forward_inplace(a_gpu, cmd, opt);
  714. d_gpu = a_gpu;
  715. }
  716. else
  717. {
  718. op->forward(a_gpu, d_gpu, cmd, opt);
  719. }
  720. // download
  721. cmd.record_download(d_gpu, d, opt);
  722. }
  723. cmd.submit_and_wait();
  724. }
  725. op->destroy_pipeline(opt);
  726. delete op;
  727. vkdev->reclaim_blob_allocator(blob_vkallocator);
  728. vkdev->reclaim_staging_allocator(staging_vkallocator);
  729. g_weight_vkallocator.clear();
  730. g_weight_staging_vkallocator.clear();
  731. return 0;
  732. }
  733. #endif // NCNN_VULKAN
  734. template<typename T>
  735. int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape = ncnn::Mat(), float epsilon = 0.001, void (*func)(T*) = 0)
  736. {
  737. // naive
  738. ncnn::Mat b;
  739. {
  740. int ret = test_layer_naive(typeindex, pd, weights, a, b, func);
  741. if (ret != 0)
  742. {
  743. fprintf(stderr, "test_layer_naive failed\n");
  744. return -1;
  745. }
  746. }
  747. // cpu
  748. {
  749. ncnn::Mat c;
  750. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, ncnn::Mat(), func);
  751. if (ret != 0 || CompareMat(b, c, epsilon) != 0)
  752. {
  753. fprintf(stderr, "test_layer_cpu failed\n");
  754. return -1;
  755. }
  756. }
  757. // cpu shape hint
  758. {
  759. ncnn::Mat c;
  760. int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, b, func);
  761. if (ret != 0 || CompareMat(b, c, epsilon) != 0)
  762. {
  763. fprintf(stderr, "test_layer_cpu failed with shape hint\n");
  764. return -1;
  765. }
  766. }
  767. #if NCNN_VULKAN
  768. // gpu
  769. {
  770. ncnn::Mat d;
  771. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, ncnn::Mat(), func);
  772. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  773. {
  774. fprintf(stderr, "test_layer_gpu failed\n");
  775. return -1;
  776. }
  777. }
  778. // gpu shape hint
  779. {
  780. ncnn::Mat d;
  781. int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, b, func);
  782. if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
  783. {
  784. fprintf(stderr, "test_layer_gpu failed with shape hint\n");
  785. return -1;
  786. }
  787. }
  788. #endif // NCNN_VULKAN
  789. return 0;
  790. }
  791. template<typename T>
  792. int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(T*) = 0)
  793. {
  794. ncnn::Option opts[4];
  795. opts[0].use_packing_layout = false;
  796. opts[0].use_fp16_packed = false;
  797. opts[0].use_fp16_storage = false;
  798. opts[0].use_fp16_arithmetic = false;
  799. opts[0].use_bf16_storage = false;
  800. opts[0].use_shader_pack8 = false;
  801. opts[0].use_image_storage = false;
  802. opts[0].use_weight_fp16_storage = false;
  803. opts[1].use_packing_layout = true;
  804. opts[1].use_fp16_packed = true;
  805. opts[1].use_fp16_storage = false;
  806. opts[1].use_fp16_arithmetic = false;
  807. opts[1].use_bf16_storage = false;
  808. opts[1].use_shader_pack8 = true;
  809. opts[1].use_image_storage = false;
  810. opts[1].use_weight_fp16_storage = false;
  811. opts[2].use_packing_layout = true;
  812. opts[2].use_fp16_packed = true;
  813. opts[2].use_fp16_storage = true;
  814. opts[2].use_fp16_arithmetic = false;
  815. opts[2].use_bf16_storage = true;
  816. opts[2].use_shader_pack8 = true;
  817. opts[2].use_image_storage = true;
  818. opts[2].use_weight_fp16_storage = true;
  819. opts[3].use_packing_layout = true;
  820. opts[3].use_fp16_packed = true;
  821. opts[3].use_fp16_storage = true;
  822. opts[3].use_fp16_arithmetic = true;
  823. opts[3].use_bf16_storage = false;
  824. opts[3].use_shader_pack8 = true;
  825. opts[3].use_image_storage = true;
  826. opts[3].use_weight_fp16_storage = true;
  827. for (int i = 0; i < 4; i++)
  828. {
  829. const ncnn::Option& opt = opts[i];
  830. // fp16 representation
  831. std::vector<ncnn::Mat> a_fp16;
  832. std::vector<ncnn::Mat> weights_fp16;
  833. float epsilon_fp16;
  834. if (opt.use_bf16_storage)
  835. {
  836. a_fp16.resize(a.size());
  837. for (size_t j = 0; j < a.size(); j++)
  838. {
  839. ncnn::Mat tmp;
  840. ncnn::cast_float32_to_bfloat16(a[j], tmp, opt);
  841. ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt);
  842. }
  843. weights_fp16.resize(weights.size());
  844. for (size_t j = 0; j < weights.size(); j++)
  845. {
  846. ncnn::Mat tmp;
  847. ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
  848. ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
  849. }
  850. epsilon_fp16 = epsilon * 100; // 0.1
  851. }
  852. else if (opt.use_fp16_packed || opt.use_fp16_storage)
  853. {
  854. a_fp16.resize(a.size());
  855. for (size_t j = 0; j < a.size(); j++)
  856. {
  857. ncnn::Mat tmp;
  858. ncnn::cast_float32_to_float16(a[j], tmp, opt);
  859. ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt);
  860. }
  861. weights_fp16.resize(weights.size());
  862. for (size_t j = 0; j < weights.size(); j++)
  863. {
  864. ncnn::Mat tmp;
  865. ncnn::cast_float32_to_float16(weights[j], tmp, opt);
  866. ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
  867. }
  868. epsilon_fp16 = epsilon * 100; // 0.1
  869. }
  870. else
  871. {
  872. a_fp16 = a;
  873. weights_fp16 = weights;
  874. epsilon_fp16 = epsilon;
  875. }
  876. if (opt.use_fp16_arithmetic)
  877. {
  878. epsilon_fp16 = epsilon * 500; // 0.5
  879. }
  880. std::vector<ncnn::Mat> top_shapes;
  881. int ret = test_layer<T>(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func);
  882. if (ret != 0)
  883. {
  884. fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage);
  885. return ret;
  886. }
  887. }
  888. return 0;
  889. }
  890. template<typename T>
  891. int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(T*) = 0)
  892. {
  893. ncnn::Option opts[4];
  894. opts[0].use_packing_layout = false;
  895. opts[0].use_fp16_packed = false;
  896. opts[0].use_fp16_storage = false;
  897. opts[0].use_fp16_arithmetic = false;
  898. opts[0].use_bf16_storage = false;
  899. opts[0].use_shader_pack8 = false;
  900. opts[0].use_image_storage = false;
  901. opts[0].use_weight_fp16_storage = false;
  902. opts[1].use_packing_layout = true;
  903. opts[1].use_fp16_packed = true;
  904. opts[1].use_fp16_storage = false;
  905. opts[1].use_fp16_arithmetic = false;
  906. opts[1].use_bf16_storage = false;
  907. opts[1].use_shader_pack8 = true;
  908. opts[1].use_image_storage = false;
  909. opts[1].use_weight_fp16_storage = false;
  910. opts[2].use_packing_layout = true;
  911. opts[2].use_fp16_packed = true;
  912. opts[2].use_fp16_storage = true;
  913. opts[2].use_fp16_arithmetic = false;
  914. opts[2].use_bf16_storage = true;
  915. opts[2].use_shader_pack8 = true;
  916. opts[2].use_image_storage = true;
  917. opts[2].use_weight_fp16_storage = true;
  918. opts[3].use_packing_layout = true;
  919. opts[3].use_fp16_packed = true;
  920. opts[3].use_fp16_storage = true;
  921. opts[3].use_fp16_arithmetic = true;
  922. opts[3].use_bf16_storage = false;
  923. opts[3].use_shader_pack8 = true;
  924. opts[3].use_image_storage = true;
  925. opts[3].use_weight_fp16_storage = true;
  926. for (int i = 0; i < 4; i++)
  927. {
  928. const ncnn::Option& opt = opts[i];
  929. // fp16 representation
  930. ncnn::Mat a_fp16;
  931. std::vector<ncnn::Mat> weights_fp16;
  932. float epsilon_fp16;
  933. if (opt.use_bf16_storage)
  934. {
  935. {
  936. ncnn::Mat tmp;
  937. ncnn::cast_float32_to_bfloat16(a, tmp, opt);
  938. ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt);
  939. }
  940. weights_fp16.resize(weights.size());
  941. for (size_t j = 0; j < weights.size(); j++)
  942. {
  943. ncnn::Mat tmp;
  944. ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
  945. ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
  946. }
  947. epsilon_fp16 = epsilon * 100; // 0.1
  948. }
  949. else if (opt.use_fp16_packed || opt.use_fp16_storage)
  950. {
  951. {
  952. ncnn::Mat tmp;
  953. ncnn::cast_float32_to_float16(a, tmp, opt);
  954. ncnn::cast_float16_to_float32(tmp, a_fp16, opt);
  955. }
  956. weights_fp16.resize(weights.size());
  957. for (size_t j = 0; j < weights.size(); j++)
  958. {
  959. ncnn::Mat tmp;
  960. ncnn::cast_float32_to_float16(weights[j], tmp, opt);
  961. ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
  962. }
  963. epsilon_fp16 = epsilon * 100; // 0.1
  964. }
  965. else
  966. {
  967. a_fp16 = a;
  968. weights_fp16 = weights;
  969. epsilon_fp16 = epsilon;
  970. }
  971. if (opt.use_fp16_arithmetic)
  972. {
  973. epsilon_fp16 = epsilon * 500; // 0.5
  974. }
  975. ncnn::Mat top_shape;
  976. int ret = test_layer<T>(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func);
  977. if (ret != 0)
  978. {
  979. fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage);
  980. return ret;
  981. }
  982. }
  983. return 0;
  984. }
  985. #endif // TESTUTIL_H