You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolution_x86.cpp 52 kB

X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
5 years ago
5 years ago
5 years ago
5 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution_x86.h"
  15. #if __SSE2__
  16. #include <emmintrin.h>
  17. #if __SSSE3__
  18. #include <tmmintrin.h>
  19. #if __SSE4_1__
  20. #include <smmintrin.h>
  21. #if __AVX__
  22. #include <immintrin.h>
  23. #endif
  24. #endif // __SSE4_1__
  25. #endif // __SSSE3__
  26. #endif // __SSE2__
  27. #include "x86_activation.h"
  28. #include "x86_usability.h"
  29. #include "benchmark.h"
  30. #include "cpu.h"
  31. #include "layer_type.h"
  32. namespace ncnn {
  33. #include "convolution_3x3.h"
  34. #include "convolution_5x5.h"
  35. #include "convolution_3x3_winograd.h"
  36. #include "convolution_packed.h"
  37. #if NCNN_INT8
  38. #include "convolution_3x3_int8.h"
  39. #include "convolution_packed_int8.h"
  40. #include "convolution_im2col_gemm_int8.h"
  41. #include "convolution_3x3_winograd_int8.h"
  42. #endif // NCNN_INT8
  43. #if __SSE2__
  44. #include "convolution_3x3_pack1to4.h"
  45. #if __AVX__
  46. #include "convolution_3x3_pack1to8.h"
  47. #include "convolution_3x3_pack8to1.h"
  48. #include "convolution_3x3_pack8.h"
  49. #include "convolution_2x2_pack8.h"
  50. #if __AVX512F__
  51. #include "convolution_3x3_pack16to1.h"
  52. #endif // __AVX512F__
  53. #endif // __AVX__
  54. #endif // __SSE2__
  55. Convolution_x86::Convolution_x86()
  56. {
  57. #if __SSE2__
  58. support_packing = true;
  59. #endif // __SSE2__
  60. activation = 0;
  61. nT = 0;
  62. convolution_dilation1 = 0;
  63. gemm = 0;
  64. }
  65. static void convolution_transform_kernel_packed_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
  66. {
  67. const int maxk = kernel_w * kernel_h;
  68. // src = kw-kh-inch-outch
  69. // dst = pb-pa-kw-kh-inch/pa-outch/pb
  70. {
  71. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  72. weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
  73. for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
  74. {
  75. float* g00 = weight_data_tm.channel(q / out_elempack);
  76. for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
  77. {
  78. for (int k = 0; k < maxk; k++)
  79. {
  80. for (int i = 0; i < elempack; i++)
  81. {
  82. for (int j = 0; j < out_elempack; j++)
  83. {
  84. const float* k00 = weight_data_r2.channel(q + j).row(p + i);
  85. g00[0] = k00[k];
  86. g00++;
  87. }
  88. }
  89. }
  90. }
  91. }
  92. }
  93. }
  // True when lo <= v <= hi.
  static bool winograd63_in_range(int v, int lo, int hi)
  {
      return v >= lo && v <= hi;
  }

  // Table lookup: should a 3x3 stride-1 convolution with this shape use the
  // winograd63 kernel?
  //
  // num_input / num_output  channel counts of the layer
  // w / h                   input width and height; only min(w, h) is examined
  //
  // Returns false for every shape outside the profiled table.
  static bool test_prefer_winograd63(int num_input, int num_output, int w, int h)
  {
      // winograd selection strategy (profiled on i7-7700 single thread)
      const int side = std::min(w, h);

      // Only 8 <= channels < 64 on both sides has any winning entry.
      if (num_input >= 64 || num_input < 8)
          return false;
      if (num_output >= 64 || num_output < 8)
          return false;

      if (num_input >= 32)
      {
          if (num_output >= 32)
          {
              return winograd63_in_range(side, 11, 14)
                     || winograd63_in_range(side, 19, 20)
                     || winograd63_in_range(side, 23, 44)
                     || winograd63_in_range(side, 47, 56)
                     || winograd63_in_range(side, 63, 130);
          }
          if (num_output >= 16)
          {
              return winograd63_in_range(side, 13, 14)
                     || winograd63_in_range(side, 19, 20)
                     || winograd63_in_range(side, 23, 38)
                     || winograd63_in_range(side, 43, 44)
                     || winograd63_in_range(side, 47, 140);
          }
          // 8 <= num_output < 16
          return winograd63_in_range(side, 11, 14)
                 || winograd63_in_range(side, 19, 20)
                 || winograd63_in_range(side, 31, 38)
                 || winograd63_in_range(side, 43, 44)
                 || winograd63_in_range(side, 55, 162);
      }

      if (num_input >= 16)
      {
          if (num_output >= 32)
          {
              return winograd63_in_range(side, 11, 14)
                     || winograd63_in_range(side, 19, 20)
                     || winograd63_in_range(side, 23, 44)
                     || winograd63_in_range(side, 47, 92)
                     || winograd63_in_range(side, 95, 188);
          }
          if (num_output >= 16)
          {
              return winograd63_in_range(side, 11, 14)
                     || winograd63_in_range(side, 27, 38)
                     || winograd63_in_range(side, 43, 44)
                     || winograd63_in_range(side, 47, 74)
                     || winograd63_in_range(side, 81, 110)
                     || winograd63_in_range(side, 117, 170)
                     || winograd63_in_range(side, 177, 182);
          }
          // 8 <= num_output < 16
          return winograd63_in_range(side, 19, 20)
                 || winograd63_in_range(side, 33, 38)
                 || winograd63_in_range(side, 43, 44)
                 || winograd63_in_range(side, 47, 128)
                 || winograd63_in_range(side, 155, 210);
      }

      // 8 <= num_input < 16
      if (num_output >= 32)
      {
          return winograd63_in_range(side, 7, 14)
                 || winograd63_in_range(side, 17, 20)
                 || winograd63_in_range(side, 23, 26)
                 || winograd63_in_range(side, 31, 38)
                 || winograd63_in_range(side, 43, 162);
      }
      if (num_output >= 16)
      {
          return winograd63_in_range(side, 31, 32)
                 || winograd63_in_range(side, 39, 44)
                 || winograd63_in_range(side, 47, 212);
      }
      // 8 <= num_output < 16 never prefers winograd63 here
      return false;
  }
  // True when lo <= v <= hi.
  static bool winograd23_in_range(int v, int lo, int hi)
  {
      return v >= lo && v <= hi;
  }

  // Table lookup: should a 3x3 stride-1 convolution with this shape use the
  // winograd23 kernel?
  //
  // num_input / num_output  channel counts of the layer
  // w / h                   input width and height; only min(w, h) is examined
  //
  // Rows are ordered from the largest channel threshold down; the first row
  // whose threshold is met decides the answer.
  static bool test_prefer_winograd23(int num_input, int num_output, int w, int h)
  {
      const int side = std::min(w, h);

      // No entry exists below 8 channels on either side.
      if (num_input < 8 || num_output < 8)
          return false;

      if (num_input >= 512)
      {
          if (num_output >= 512) return winograd23_in_range(side, 3, 14);
          if (num_output >= 256) return winograd23_in_range(side, 3, 14);
          if (num_output >= 128) return winograd23_in_range(side, 3, 14);
          if (num_output >= 64) return winograd23_in_range(side, 3, 8) || winograd23_in_range(side, 11, 12);
          if (num_output >= 32) return winograd23_in_range(side, 3, 8);
          if (num_output >= 16) return winograd23_in_range(side, 3, 8);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 6);
      }

      if (num_input >= 256)
      {
          if (num_output >= 512) return winograd23_in_range(side, 3, 14);
          if (num_output >= 256) return winograd23_in_range(side, 3, 14);
          if (num_output >= 128) return winograd23_in_range(side, 3, 12);
          if (num_output >= 64) return winograd23_in_range(side, 3, 4);
          if (num_output >= 32) return winograd23_in_range(side, 3, 8);
          if (num_output >= 16) return winograd23_in_range(side, 3, 8);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 6);
      }

      if (num_input >= 128)
      {
          if (num_output >= 512) return winograd23_in_range(side, 3, 14);
          if (num_output >= 256) return winograd23_in_range(side, 3, 8) || winograd23_in_range(side, 11, 12);
          if (num_output >= 128) return winograd23_in_range(side, 3, 10);
          if (num_output >= 64) return winograd23_in_range(side, 3, 8);
          if (num_output >= 32) return winograd23_in_range(side, 3, 10);
          if (num_output >= 16) return winograd23_in_range(side, 3, 6);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 6);
      }

      if (num_input >= 64)
      {
          if (num_output >= 512) return winograd23_in_range(side, 3, 8) || winograd23_in_range(side, 11, 12) || winograd23_in_range(side, 15, 20);
          if (num_output >= 256) return winograd23_in_range(side, 7, 8);
          if (num_output >= 128) return winograd23_in_range(side, 3, 8) || winograd23_in_range(side, 19, 22);
          if (num_output >= 64) return winograd23_in_range(side, 3, 12);
          if (num_output >= 32) return winograd23_in_range(side, 3, 12);
          if (num_output >= 16) return winograd23_in_range(side, 3, 12);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 12);
      }

      if (num_input >= 32)
      {
          if (num_output >= 512) return winograd23_in_range(side, 3, 6) || winograd23_in_range(side, 11, 12);
          if (num_output >= 256) return winograd23_in_range(side, 3, 6) || winograd23_in_range(side, 11, 12);
          if (num_output >= 128) return winograd23_in_range(side, 3, 4) || winograd23_in_range(side, 7, 16);
          if (num_output >= 64) return winograd23_in_range(side, 3, 8);
          if (num_output >= 32) return winograd23_in_range(side, 7, 8);
          if (num_output >= 16) return winograd23_in_range(side, 7, 8);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 10);
      }

      if (num_input >= 16)
      {
          if (num_output >= 512) return winograd23_in_range(side, 11, 12);
          if (num_output >= 256) return winograd23_in_range(side, 3, 12);
          if (num_output >= 128)
          {
              return winograd23_in_range(side, 3, 6)
                     || winograd23_in_range(side, 9, 18);
          }
          if (num_output >= 64)
          {
              return winograd23_in_range(side, 3, 4)
                     || winograd23_in_range(side, 7, 8)
                     || winograd23_in_range(side, 11, 12)
                     || winograd23_in_range(side, 15, 18);
          }
          if (num_output >= 32)
          {
              return winograd23_in_range(side, 3, 4)
                     || winograd23_in_range(side, 9, 10);
          }
          if (num_output >= 16) return winograd23_in_range(side, 3, 10);
          // 8 <= num_output < 16
          return winograd23_in_range(side, 3, 8)
                 || winograd23_in_range(side, 11, 12);
      }

      // 8 <= num_input < 16
      if (num_output >= 128) return false;
      if (num_output >= 64)
      {
          return winograd23_in_range(side, 3, 4)
                 || winograd23_in_range(side, 7, 14)
                 || winograd23_in_range(side, 47, 48);
      }
      if (num_output >= 32)
      {
          return winograd23_in_range(side, 3, 6)
                 || winograd23_in_range(side, 15, 16);
      }
      if (num_output >= 16)
      {
          return winograd23_in_range(side, 3, 6)
                 || winograd23_in_range(side, 9, 14)
                 || winograd23_in_range(side, 47, 212);
      }
      // 8 <= num_output < 16: always preferred, whatever the spatial size
      return true;
  }
  251. int Convolution_x86::create_pipeline(const Option& opt)
  252. {
  253. if (dynamic_weight)
  254. return 0;
  255. activation = create_activation_layer(activation_type, activation_params, opt);
  256. nT = opt.num_threads;
  257. #if NCNN_INT8
  258. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  259. {
  260. return create_pipeline_int8_x86(opt);
  261. }
  262. #endif
  263. int kernel_size = kernel_w * kernel_h;
  264. int num_input = weight_data_size / kernel_size / num_output;
  265. if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  266. {
  267. convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
  268. // set param
  269. ncnn::ParamDict pd;
  270. pd.set(0, num_output); // num_output
  271. pd.set(1, kernel_w);
  272. pd.set(11, kernel_h);
  273. pd.set(2, 1);
  274. pd.set(12, 1);
  275. pd.set(3, 1); // stride_w
  276. pd.set(13, 1); // stride_h
  277. pd.set(4, 0); // pad_w
  278. pd.set(14, 0); // pad_h
  279. pd.set(5, bias_term);
  280. pd.set(6, weight_data_size);
  281. convolution_dilation1->load_param(pd);
  282. // set weights
  283. if (bias_term)
  284. {
  285. ncnn::Mat weights[2];
  286. weights[0] = weight_data;
  287. weights[1] = bias_data;
  288. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  289. }
  290. else
  291. {
  292. ncnn::Mat weights[1];
  293. weights[0] = weight_data;
  294. convolution_dilation1->load_model(ModelBinFromMatArray(weights));
  295. }
  296. convolution_dilation1->create_pipeline(opt);
  297. if (opt.lightmode)
  298. weight_data.release();
  299. return 0;
  300. }
  301. int elempack = 1;
  302. int out_elempack = 1;
  303. #if __SSE2__
  304. if (opt.use_packing_layout)
  305. {
  306. #if __AVX512F__
  307. elempack = num_input % 16 == 0 ? 16 : num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
  308. out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  309. #elif __AVX__
  310. elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
  311. out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  312. #else
  313. elempack = num_input % 4 == 0 ? 4 : 1;
  314. out_elempack = num_output % 4 == 0 ? 4 : 1;
  315. #endif
  316. }
  317. #endif // __SSE2__
  318. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);
  319. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  320. {
  321. if ((bottom_shapes.empty() || bottom_shapes[0].w == 0 || bottom_shapes[0].h == 0) && (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0))
  322. {
  323. // dynamic shape
  324. if ((opt.use_winograd63_convolution) && (num_input <= 32 && num_output <= 32))
  325. conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
  326. else if (opt.use_winograd43_convolution)
  327. conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
  328. else
  329. conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
  330. }
  331. else
  332. {
  333. int w;
  334. int h;
  335. if (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0)
  336. {
  337. w = bottom_shapes[0].w;
  338. h = bottom_shapes[0].h;
  339. // make padding
  340. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  341. {
  342. w += pad_left + pad_right;
  343. h += pad_top + pad_bottom;
  344. }
  345. else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  346. || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234))
  347. {
  348. // tensorflow padding=SAME or onnx padding=SAME_UPPER/SAME_LOWER
  349. w += 2;
  350. h += 2;
  351. }
  352. }
  353. else
  354. {
  355. w = top_shapes[0].w + 2;
  356. h = top_shapes[0].h + 2;
  357. }
  358. bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
  359. bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
  360. bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;
  361. if (prefer_winograd23 && !opt.use_winograd23_convolution)
  362. {
  363. // f23 fallback to f43
  364. prefer_winograd23 = false;
  365. prefer_winograd43 = true;
  366. }
  367. if (prefer_winograd63 && !opt.use_winograd63_convolution)
  368. {
  369. // f63 fallback to f43
  370. prefer_winograd63 = false;
  371. prefer_winograd43 = true;
  372. }
  373. if (prefer_winograd43 && !opt.use_winograd43_convolution)
  374. {
  375. // f43 fallback to f63 or f23
  376. prefer_winograd43 = false;
  377. if (opt.use_winograd63_convolution)
  378. {
  379. prefer_winograd63 = true;
  380. }
  381. else
  382. {
  383. prefer_winograd23 = true;
  384. }
  385. }
  386. if (prefer_winograd23)
  387. {
  388. conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
  389. }
  390. else if (prefer_winograd43)
  391. {
  392. conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
  393. }
  394. else if (prefer_winograd63)
  395. {
  396. conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
  397. }
  398. else
  399. {
  400. // should never reach here
  401. }
  402. }
  403. if (opt.lightmode)
  404. weight_data.release();
  405. return 0;
  406. }
  407. int l2_cache_size = get_cpu_level2_cache_size();
  408. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * (int)sizeof(float) * 2 > l2_cache_size || (num_input > 16 || num_output > 16);
  409. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  410. {
  411. const int maxk = kernel_w * kernel_h;
  412. gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
  413. ncnn::ParamDict pd;
  414. pd.set(2, 0); // transA
  415. pd.set(3, 0); // transB
  416. pd.set(4, 1); // constantA
  417. pd.set(5, 0); // constantB
  418. pd.set(6, 1); // constantC
  419. pd.set(7, num_output); // M = outch
  420. pd.set(8, 0); // N = size
  421. pd.set(9, maxk * num_input); // K = maxk*inch
  422. pd.set(10, bias_term ? 1 : -1); // constant_broadcast_type_C = (M)
  423. pd.set(11, 1); // output_N1M
  424. gemm->load_param(pd);
  425. // maxk-inch-outch to pa-maxk-inch/pa-outch
  426. Mat tmp;
  427. {
  428. Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
  429. tmp.create(maxk * num_input, num_output);
  430. for (int q = 0; q < num_output; q += 1)
  431. {
  432. float* g00 = tmp.row(q);
  433. for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
  434. {
  435. for (int k = 0; k < maxk; k++)
  436. {
  437. for (int i = 0; i < elempack; i++)
  438. {
  439. const float* k00 = weight_data_r2.channel(q).row(p + i);
  440. g00[0] = k00[k];
  441. g00++;
  442. }
  443. }
  444. }
  445. }
  446. }
  447. if (bias_term)
  448. {
  449. ncnn::Mat weights[2];
  450. weights[0] = tmp;
  451. weights[1] = bias_data;
  452. gemm->load_model(ModelBinFromMatArray(weights));
  453. }
  454. else
  455. {
  456. ncnn::Mat weights[1];
  457. weights[0] = tmp;
  458. gemm->load_model(ModelBinFromMatArray(weights));
  459. }
  460. gemm->create_pipeline(opt);
  461. }
  462. else
  463. {
  464. if ((elempack == 16 && out_elempack == 1 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  465. || (elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  466. || (elempack == 8 && out_elempack == 8 && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  467. || (elempack == 1 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  468. || (elempack == 1 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  469. || (elempack == 8 && out_elempack == 1 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  470. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  471. || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
  472. {
  473. convolution_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
  474. }
  475. else
  476. {
  477. convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
  478. }
  479. }
  480. if (opt.lightmode)
  481. weight_data.release();
  482. return 0;
  483. }
  484. int Convolution_x86::destroy_pipeline(const Option& opt)
  485. {
  486. if (activation)
  487. {
  488. activation->destroy_pipeline(opt);
  489. delete activation;
  490. activation = 0;
  491. }
  492. if (convolution_dilation1)
  493. {
  494. convolution_dilation1->destroy_pipeline(opt);
  495. delete convolution_dilation1;
  496. convolution_dilation1 = 0;
  497. }
  498. if (gemm)
  499. {
  500. gemm->destroy_pipeline(opt);
  501. delete gemm;
  502. gemm = 0;
  503. }
  504. return 0;
  505. }
  506. int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  507. {
  508. #if NCNN_INT8
  509. if (opt.use_int8_inference && int8_scale_term)
  510. {
  511. return forward_int8_x86(bottom_blob, top_blob, opt);
  512. }
  513. #endif
  514. // flattened blob, implement as InnerProduct
  515. if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
  516. {
  517. Mat bottom_blob_3d;
  518. if (bottom_blob.elemsize % 16 == 0)
  519. {
  520. bottom_blob_3d = bottom_blob;
  521. bottom_blob_3d.dims = 3;
  522. bottom_blob_3d.w = 1;
  523. bottom_blob_3d.h = 1;
  524. bottom_blob_3d.c = bottom_blob.w;
  525. bottom_blob_3d.cstep = 1;
  526. }
  527. else
  528. {
  529. bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
  530. }
  531. Mat top_blob_3d;
  532. int ret = forward(bottom_blob_3d, top_blob_3d, opt);
  533. if (ret != 0)
  534. return ret;
  535. if (top_blob_3d.elemsize % 16 == 0)
  536. {
  537. top_blob = top_blob_3d;
  538. top_blob.dims = 1;
  539. top_blob.w = top_blob_3d.c;
  540. top_blob.h = 1;
  541. top_blob.c = 1;
  542. bottom_blob_3d.cstep = top_blob_3d.c;
  543. }
  544. else
  545. {
  546. top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
  547. }
  548. return 0;
  549. }
  550. int w = bottom_blob.w;
  551. int h = bottom_blob.h;
  552. int channels = bottom_blob.c;
  553. size_t elemsize = bottom_blob.elemsize;
  554. int elempack = bottom_blob.elempack;
  555. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  556. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  557. Mat bottom_blob_bordered;
  558. make_padding(bottom_blob, bottom_blob_bordered, opt);
  559. if (bottom_blob_bordered.empty())
  560. return -100;
  561. w = bottom_blob_bordered.w;
  562. h = bottom_blob_bordered.h;
  563. int outw = (w - kernel_extent_w) / stride_w + 1;
  564. int outh = (h - kernel_extent_h) / stride_h + 1;
  565. int out_elempack = 1;
  566. #if __SSE2__
  567. if (opt.use_packing_layout)
  568. {
  569. #if __AVX512F__
  570. out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  571. #elif __AVX__
  572. out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  573. #else
  574. out_elempack = num_output % 4 == 0 ? 4 : 1;
  575. #endif
  576. }
  577. #endif // __SSE2__
  578. size_t out_elemsize = elemsize / elempack * out_elempack;
  579. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  580. if (top_blob.empty())
  581. return -100;
  582. if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
  583. {
  584. if (outw >= dilation_w && outh >= dilation_h)
  585. {
  586. return forwardDilation_x86(bottom_blob_bordered, top_blob, opt);
  587. }
  588. }
  589. const int num_input = channels * elempack;
  590. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);
  591. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  592. {
  593. bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
  594. bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
  595. bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;
  596. if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
  597. {
  598. // f23 fallback to f43
  599. prefer_winograd23 = false;
  600. prefer_winograd43 = true;
  601. }
  602. if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
  603. {
  604. // f63 fallback to f43
  605. prefer_winograd63 = false;
  606. prefer_winograd43 = true;
  607. }
  608. if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
  609. {
  610. // f43 fallback to f63 or f23
  611. prefer_winograd43 = false;
  612. if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
  613. {
  614. prefer_winograd63 = true;
  615. }
  616. else
  617. {
  618. prefer_winograd23 = true;
  619. }
  620. }
  621. int _nT = nT ? nT : opt.num_threads;
  622. if (nT != 0 && opt.num_threads != nT)
  623. {
  624. // force num_threads the same as in create_pipeline
  625. // so we could use pre-packed A/B from the same tile config
  626. NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
  627. }
  628. if (prefer_winograd23)
  629. {
  630. conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
  631. }
  632. else if (prefer_winograd43)
  633. {
  634. conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
  635. }
  636. else if (prefer_winograd63)
  637. {
  638. conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
  639. }
  640. else
  641. {
  642. // should never reach here
  643. }
  644. if (activation)
  645. {
  646. activation->forward_inplace(top_blob, opt);
  647. }
  648. return 0;
  649. }
  650. int l2_cache_size = get_cpu_level2_cache_size();
  651. bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * (int)sizeof(float) * 2 > l2_cache_size || (num_input > 16 || num_output > 16);
  652. if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
  653. {
  654. // im2col
  655. Mat bottom_im2col;
  656. if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  657. {
  658. bottom_im2col = bottom_blob_bordered;
  659. bottom_im2col.w = w * h;
  660. bottom_im2col.h = 1;
  661. }
  662. else if (kernel_w == 1 && kernel_h == 1)
  663. {
  664. const int size = outw * outh;
  665. bottom_im2col.create(size, channels, elemsize, elempack, opt.workspace_allocator);
  666. if (bottom_im2col.empty())
  667. return -100;
  668. const int gap = (w * stride_h - outw * stride_w) * elempack;
  669. #if __SSE2__
  670. #if __AVX__
  671. #if __AVX512F__
  672. if (elempack == 16)
  673. {
  674. #pragma omp parallel for num_threads(opt.num_threads)
  675. for (int p = 0; p < channels; p++)
  676. {
  677. const float* sptr = bottom_blob_bordered.channel(p);
  678. float* ptr = bottom_im2col.row(p);
  679. for (int i = 0; i < outh; i++)
  680. {
  681. for (int j = 0; j < outw; j++)
  682. {
  683. __m512 _val = _mm512_load_ps(sptr);
  684. _mm512_store_ps(ptr, _val);
  685. sptr += stride_w * 16;
  686. ptr += 16;
  687. }
  688. sptr += gap;
  689. }
  690. }
  691. }
  692. #endif // __AVX512F__
  693. if (elempack == 8)
  694. {
  695. #pragma omp parallel for num_threads(opt.num_threads)
  696. for (int p = 0; p < channels; p++)
  697. {
  698. const float* sptr = bottom_blob_bordered.channel(p);
  699. float* ptr = bottom_im2col.row(p);
  700. for (int i = 0; i < outh; i++)
  701. {
  702. for (int j = 0; j < outw; j++)
  703. {
  704. __m256 _val = _mm256_load_ps(sptr);
  705. _mm256_store_ps(ptr, _val);
  706. sptr += stride_w * 8;
  707. ptr += 8;
  708. }
  709. sptr += gap;
  710. }
  711. }
  712. }
  713. #endif // __AVX__
  714. if (elempack == 4)
  715. {
  716. #pragma omp parallel for num_threads(opt.num_threads)
  717. for (int p = 0; p < channels; p++)
  718. {
  719. const float* sptr = bottom_blob_bordered.channel(p);
  720. float* ptr = bottom_im2col.row(p);
  721. for (int i = 0; i < outh; i++)
  722. {
  723. for (int j = 0; j < outw; j++)
  724. {
  725. __m128 _val = _mm_load_ps(sptr);
  726. _mm_store_ps(ptr, _val);
  727. sptr += stride_w * 4;
  728. ptr += 4;
  729. }
  730. sptr += gap;
  731. }
  732. }
  733. }
  734. #endif // __SSE2__
  735. if (elempack == 1)
  736. {
  737. #pragma omp parallel for num_threads(opt.num_threads)
  738. for (int p = 0; p < channels; p++)
  739. {
  740. const float* sptr = bottom_blob_bordered.channel(p);
  741. float* ptr = bottom_im2col.row(p);
  742. for (int i = 0; i < outh; i++)
  743. {
  744. for (int j = 0; j < outw; j++)
  745. {
  746. ptr[0] = sptr[0];
  747. sptr += stride_w;
  748. ptr += 1;
  749. }
  750. sptr += gap;
  751. }
  752. }
  753. }
  754. }
  755. else
  756. {
  757. const int size = outw * outh;
  758. const int maxk = kernel_w * kernel_h;
  759. bottom_im2col.create(size, maxk * channels, elemsize, elempack, opt.workspace_allocator);
  760. if (bottom_im2col.empty())
  761. return -100;
  762. const int gap = (w * stride_h - outw * stride_w) * elempack;
  763. #if __SSE2__
  764. #if __AVX__
  765. #if __AVX512F__
  766. if (elempack == 16)
  767. {
  768. #pragma omp parallel for num_threads(opt.num_threads)
  769. for (int p = 0; p < channels; p++)
  770. {
  771. const Mat img = bottom_blob_bordered.channel(p);
  772. float* ptr = bottom_im2col.row(p * maxk);
  773. for (int u = 0; u < kernel_h; u++)
  774. {
  775. for (int v = 0; v < kernel_w; v++)
  776. {
  777. const float* sptr = img.row(dilation_h * u) + dilation_w * v * 16;
  778. for (int i = 0; i < outh; i++)
  779. {
  780. for (int j = 0; j < outw; j++)
  781. {
  782. __m512 _val = _mm512_load_ps(sptr);
  783. _mm512_store_ps(ptr, _val);
  784. sptr += stride_w * 16;
  785. ptr += 16;
  786. }
  787. sptr += gap;
  788. }
  789. }
  790. }
  791. }
  792. }
  793. #endif // __AVX512F__
  794. if (elempack == 8)
  795. {
  796. #pragma omp parallel for num_threads(opt.num_threads)
  797. for (int p = 0; p < channels; p++)
  798. {
  799. const Mat img = bottom_blob_bordered.channel(p);
  800. float* ptr = bottom_im2col.row(p * maxk);
  801. for (int u = 0; u < kernel_h; u++)
  802. {
  803. for (int v = 0; v < kernel_w; v++)
  804. {
  805. const float* sptr = img.row(dilation_h * u) + dilation_w * v * 8;
  806. for (int i = 0; i < outh; i++)
  807. {
  808. for (int j = 0; j < outw; j++)
  809. {
  810. __m256 _val = _mm256_load_ps(sptr);
  811. _mm256_store_ps(ptr, _val);
  812. sptr += stride_w * 8;
  813. ptr += 8;
  814. }
  815. sptr += gap;
  816. }
  817. }
  818. }
  819. }
  820. }
  821. #endif // __AVX__
  822. if (elempack == 4)
  823. {
  824. #pragma omp parallel for num_threads(opt.num_threads)
  825. for (int p = 0; p < channels; p++)
  826. {
  827. const Mat img = bottom_blob_bordered.channel(p);
  828. float* ptr = bottom_im2col.row(p * maxk);
  829. for (int u = 0; u < kernel_h; u++)
  830. {
  831. for (int v = 0; v < kernel_w; v++)
  832. {
  833. const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4;
  834. for (int i = 0; i < outh; i++)
  835. {
  836. for (int j = 0; j < outw; j++)
  837. {
  838. __m128 _val = _mm_load_ps(sptr);
  839. _mm_store_ps(ptr, _val);
  840. sptr += stride_w * 4;
  841. ptr += 4;
  842. }
  843. sptr += gap;
  844. }
  845. }
  846. }
  847. }
  848. }
  849. #endif // __SSE2__
  850. if (elempack == 1)
  851. {
  852. #pragma omp parallel for num_threads(opt.num_threads)
  853. for (int p = 0; p < channels; p++)
  854. {
  855. const Mat img = bottom_blob_bordered.channel(p);
  856. float* ptr = bottom_im2col.row(p * maxk);
  857. for (int u = 0; u < kernel_h; u++)
  858. {
  859. for (int v = 0; v < kernel_w; v++)
  860. {
  861. const float* sptr = img.row(dilation_h * u) + dilation_w * v;
  862. for (int i = 0; i < outh; i++)
  863. {
  864. for (int j = 0; j < outw; j++)
  865. {
  866. ptr[0] = sptr[0];
  867. sptr += stride_w;
  868. ptr += 1;
  869. }
  870. sptr += gap;
  871. }
  872. }
  873. }
  874. }
  875. }
  876. }
  877. // sgemm
  878. {
  879. top_blob.w = outw * outh;
  880. top_blob.h = 1;
  881. }
  882. Option opt_b = opt;
  883. opt_b.blob_allocator = top_blob.allocator;
  884. gemm->forward(bottom_im2col, top_blob, opt_b);
  885. {
  886. top_blob.w = outw;
  887. top_blob.h = outh;
  888. }
  889. if (activation)
  890. {
  891. activation->forward_inplace(top_blob, opt);
  892. }
  893. }
  894. else
  895. {
  896. #if __SSE2__
  897. #if __AVX__
  898. #if __AVX512F__
  899. if (elempack == 16 && out_elempack == 1)
  900. {
  901. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  902. {
  903. conv3x3s1_pack16to1_avx512(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  904. if (activation)
  905. {
  906. activation->forward_inplace(top_blob, opt);
  907. }
  908. return 0;
  909. }
  910. }
  911. #endif // __AVX512F__
  912. if (elempack == 8 && out_elempack == 8)
  913. {
  914. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  915. {
  916. conv3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  917. if (activation)
  918. {
  919. activation->forward_inplace(top_blob, opt);
  920. }
  921. return 0;
  922. }
  923. if (kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  924. {
  925. conv2x2s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  926. if (activation)
  927. {
  928. activation->forward_inplace(top_blob, opt);
  929. }
  930. return 0;
  931. }
  932. }
  933. if (elempack == 1 && out_elempack == 8)
  934. {
  935. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  936. {
  937. conv3x3s1_pack1to8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  938. if (activation)
  939. {
  940. activation->forward_inplace(top_blob, opt);
  941. }
  942. return 0;
  943. }
  944. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  945. {
  946. conv3x3s2_pack1to8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  947. if (activation)
  948. {
  949. activation->forward_inplace(top_blob, opt);
  950. }
  951. return 0;
  952. }
  953. }
  954. if (elempack == 8 && out_elempack == 1)
  955. {
  956. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  957. {
  958. conv3x3s1_pack8to1_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  959. if (activation)
  960. {
  961. activation->forward_inplace(top_blob, opt);
  962. }
  963. return 0;
  964. }
  965. }
  966. #endif // __AVX__
  967. if (elempack == 1 && out_elempack == 4)
  968. {
  969. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  970. {
  971. conv3x3s1_pack1to4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  972. if (activation)
  973. {
  974. activation->forward_inplace(top_blob, opt);
  975. }
  976. return 0;
  977. }
  978. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  979. {
  980. conv3x3s2_pack1to4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  981. if (activation)
  982. {
  983. activation->forward_inplace(top_blob, opt);
  984. }
  985. return 0;
  986. }
  987. }
  988. #endif // __SSE2__
  989. convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
  990. }
  991. return 0;
  992. }
  993. int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
  994. {
  995. const Mat& bottom_blob = bottom_blobs[0];
  996. const Mat& _weight_data = bottom_blobs[1];
  997. Mat& top_blob = top_blobs[0];
  998. const int _kernel_w = _weight_data.w;
  999. const int _kernel_h = _weight_data.h;
  1000. const int _num_output = _weight_data.c * _weight_data.elempack;
  1001. Mat weight_data_flattened;
  1002. flatten(_weight_data, weight_data_flattened, opt);
  1003. if (weight_data_flattened.empty())
  1004. return -100;
  1005. // weight_data_flattened as pack1
  1006. weight_data_flattened.w *= weight_data_flattened.elempack;
  1007. weight_data_flattened.elemsize /= weight_data_flattened.elempack;
  1008. weight_data_flattened.elempack = 1;
  1009. Mat bias_data_flattened;
  1010. if (bias_term)
  1011. {
  1012. const Mat& _bias_data = bottom_blobs[2];
  1013. flatten(_bias_data, bias_data_flattened, opt);
  1014. if (bias_data_flattened.empty())
  1015. return -100;
  1016. // bias_data_flattened as pack1
  1017. bias_data_flattened.w *= bias_data_flattened.elempack;
  1018. bias_data_flattened.elemsize /= bias_data_flattened.elempack;
  1019. bias_data_flattened.elempack = 1;
  1020. }
  1021. ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
  1022. ncnn::ParamDict pd;
  1023. pd.set(0, _num_output);
  1024. pd.set(1, _kernel_w);
  1025. pd.set(11, _kernel_h);
  1026. pd.set(2, dilation_w);
  1027. pd.set(12, dilation_h);
  1028. pd.set(3, stride_w);
  1029. pd.set(13, stride_h);
  1030. pd.set(4, pad_left);
  1031. pd.set(15, pad_right);
  1032. pd.set(14, pad_top);
  1033. pd.set(16, pad_bottom);
  1034. pd.set(18, pad_value);
  1035. pd.set(5, bias_term);
  1036. pd.set(6, weight_data_flattened.w);
  1037. pd.set(8, int8_scale_term);
  1038. pd.set(9, activation_type);
  1039. pd.set(10, activation_params);
  1040. op->load_param(pd);
  1041. ncnn::Mat weights[2];
  1042. weights[0] = weight_data_flattened;
  1043. weights[1] = bias_data_flattened;
  1044. op->load_model(ncnn::ModelBinFromMatArray(weights));
  1045. op->create_pipeline(opt);
  1046. op->forward(bottom_blob, top_blob, opt);
  1047. op->destroy_pipeline(opt);
  1048. delete op;
  1049. return 0;
  1050. }
  1051. #if NCNN_INT8
  1052. int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
  1053. {
  1054. const int maxk = kernel_w * kernel_h;
  1055. const int num_input = weight_data_size / maxk / num_output;
  1056. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8);
  1057. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1058. {
  1059. if (opt.use_winograd43_convolution)
  1060. conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt);
  1061. else
  1062. conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt);
  1063. }
  1064. else if (opt.use_sgemm_convolution)
  1065. {
  1066. convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
  1067. }
  1068. else
  1069. {
  1070. convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
  1071. }
  1072. scale_in_data.create(num_output);
  1073. for (int p = 0; p < num_output; p++)
  1074. {
  1075. // requantize and relu
  1076. float scale_in;
  1077. if (weight_data_int8_scales[p] == 0)
  1078. scale_in = 0;
  1079. else
  1080. scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
  1081. scale_in_data[p] = scale_in;
  1082. }
  1083. if (opt.lightmode)
  1084. weight_data.release();
  1085. return 0;
  1086. }
  1087. int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1088. {
  1089. int elembits = bottom_blob.elembits();
  1090. Mat bottom_blob_int8 = bottom_blob;
  1091. if (elembits != 8)
  1092. {
  1093. Option opt_q = opt;
  1094. opt_q.blob_allocator = opt.workspace_allocator;
  1095. quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
  1096. }
  1097. // NCNN_LOGE("Convolution_x86 input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);
  1098. Mat bottom_blob_bordered;
  1099. make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
  1100. if (bottom_blob_bordered.empty())
  1101. return -100;
  1102. int w = bottom_blob_bordered.w;
  1103. int h = bottom_blob_bordered.h;
  1104. int channels = bottom_blob_bordered.c;
  1105. int elempack = bottom_blob_bordered.elempack;
  1106. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  1107. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  1108. int outw = (w - kernel_extent_w) / stride_w + 1;
  1109. int outh = (h - kernel_extent_h) / stride_h + 1;
  1110. bool use_int8_requantize = int8_scale_term > 100;
  1111. int out_elempack = 1;
  1112. #if __SSE2__
  1113. if (opt.use_packing_layout)
  1114. {
  1115. if (use_int8_requantize)
  1116. out_elempack = num_output % 8 == 0 ? 8 : 1;
  1117. else
  1118. out_elempack = num_output % 4 == 0 ? 4 : 1;
  1119. }
  1120. #endif // __SSE2__
  1121. size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
  1122. // NCNN_LOGE("forward_int8_x86 %d %d %d %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack);
  1123. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  1124. if (top_blob.empty())
  1125. return -100;
  1126. const int num_input = channels * elempack;
  1127. int out_elempack_int32 = 1;
  1128. #if __SSE2__
  1129. if (opt.use_packing_layout)
  1130. {
  1131. out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
  1132. }
  1133. #endif // __SSE2__
  1134. Mat top_blob_int32;
  1135. top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
  1136. if (top_blob_int32.empty())
  1137. return -100;
  1138. bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8);
  1139. int _nT = nT ? nT : opt.num_threads;
  1140. if (nT != 0 && opt.num_threads != nT)
  1141. {
  1142. // force num_threads the same as in create_pipeline
  1143. // so we could use pre-packed A/B from the same tile config
  1144. NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
  1145. }
  1146. if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  1147. {
  1148. if (opt.use_winograd43_convolution && !weight_winograd43_data.empty())
  1149. conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt);
  1150. else
  1151. conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt);
  1152. }
  1153. else if (opt.use_sgemm_convolution)
  1154. {
  1155. convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
  1156. }
  1157. else
  1158. {
  1159. convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
  1160. }
  1161. if (use_int8_requantize)
  1162. {
  1163. requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
  1164. }
  1165. else
  1166. {
  1167. dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
  1168. if (activation)
  1169. {
  1170. activation->forward_inplace(top_blob, opt);
  1171. }
  1172. }
  1173. return 0;
  1174. }
  1175. #endif // NCNN_INT8
  1176. int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  1177. {
  1178. int w = bottom_blob.w;
  1179. int h = bottom_blob.h;
  1180. size_t elemsize = bottom_blob.elemsize;
  1181. const int kernel_size = kernel_w;
  1182. const int stride = stride_w;
  1183. const int dilation = dilation_w;
  1184. const int kernel_extent = dilation * (kernel_size - 1) + 1;
  1185. int outw = (w - kernel_extent) / stride + 1;
  1186. int outh = (h - kernel_extent) / stride + 1;
  1187. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  1188. if (top_blob.empty())
  1189. return -100;
  1190. // Make (dilation * dilation) batches
  1191. Mat inner_bottom_blob;
  1192. Mat inner_top_blob;
  1193. for (int x = 0; x < dilation; x++)
  1194. {
  1195. for (int y = 0; y < dilation; y++)
  1196. {
  1197. int inner_w = (w - y + dilation - 1) / dilation;
  1198. int inner_h = (h - x + dilation - 1) / dilation;
  1199. int inner_outw = (inner_w - kernel_size) / stride + 1;
  1200. int inner_outh = (inner_h - kernel_size) / stride + 1;
  1201. inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
  1202. if (inner_bottom_blob.empty())
  1203. return -100;
  1204. inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
  1205. if (inner_top_blob.empty())
  1206. return -100;
  1207. #pragma omp parallel for num_threads(opt.num_threads)
  1208. for (int c = 0; c < bottom_blob.c; c++)
  1209. {
  1210. float* outptr = inner_bottom_blob.channel(c);
  1211. for (int i = 0; i < inner_h; i++)
  1212. {
  1213. const float* ptr = (const float*)bottom_blob.channel(c) + dilation * i * w + x * w + y;
  1214. for (int j = 0; j < inner_w; j++)
  1215. {
  1216. outptr[j] = ptr[j * dilation];
  1217. }
  1218. outptr += inner_w;
  1219. }
  1220. }
  1221. Option opt_g = opt;
  1222. opt_g.blob_allocator = inner_top_blob.allocator;
  1223. convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
  1224. #pragma omp parallel for num_threads(opt.num_threads)
  1225. for (int c = 0; c < num_output; c++)
  1226. {
  1227. float* outptr = (float*)top_blob.channel(c) + x * outw + y;
  1228. for (int i = 0; i < inner_outh; i++)
  1229. {
  1230. const float* ptr = (const float*)inner_top_blob.channel(c) + i * inner_outw;
  1231. for (int j = 0; j < inner_outw; j++)
  1232. {
  1233. outptr[j * dilation] = ptr[j];
  1234. }
  1235. outptr += dilation * outw;
  1236. }
  1237. }
  1238. }
  1239. }
  1240. if (activation)
  1241. {
  1242. activation->forward_inplace(top_blob, opt);
  1243. }
  1244. return 0;
  1245. }
  1246. } // namespace ncnn