You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

convolutiondepthwise_x86.cpp 44 kB

X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise_x86.h"
  15. #if __SSE2__
  16. #include <emmintrin.h>
  17. #if __AVX__
  18. #include <immintrin.h>
  19. #endif
  20. #endif // __SSE2__
  21. #include "x86_activation.h"
  22. #include "x86_usability.h"
  23. #include "layer_type.h"
  24. namespace ncnn {
  25. #if __SSE2__
  26. #include "convolutiondepthwise_3x3_pack4.h"
  27. #include "convolutiondepthwise_5x5_pack4.h"
  28. #if __AVX__
  29. #include "convolutiondepthwise_3x3_pack8.h"
  30. #include "convolutiondepthwise_5x5_pack8.h"
  31. #if __AVX512F__
  32. #include "convolutiondepthwise_3x3_pack16.h"
  33. #include "convolutiondepthwise_5x5_pack16.h"
  34. #endif // __AVX512F__
  35. #endif // __AVX__
  36. #endif // __SSE2__
  37. #include "convolutiondepthwise_3x3.h"
  38. #if NCNN_INT8
  39. #include "convolutiondepthwise_3x3_int8.h"
  40. #endif // NCNN_INT8
  41. ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
  42. {
  43. #if __SSE2__
  44. support_packing = true;
  45. #endif // __SSE2__
  46. activation = 0;
  47. }
  48. int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
  49. {
  50. if (dynamic_weight)
  51. return 0;
  52. activation = create_activation_layer(activation_type, activation_params, opt);
  53. #if NCNN_INT8
  54. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  55. {
  56. return create_pipeline_int8_x86(opt);
  57. }
  58. #endif
  59. const int maxk = kernel_w * kernel_h;
  60. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  61. // depth-wise
  62. if (channels == group && group == num_output)
  63. {
  64. int elempack = 1;
  65. #if __SSE2__
  66. if (opt.use_packing_layout)
  67. {
  68. #if __AVX512F__
  69. elempack = channels % 16 == 0 ? 16 : channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
  70. #elif __AVX__
  71. elempack = channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
  72. #else
  73. elempack = channels % 4 == 0 ? 4 : 1;
  74. #endif
  75. }
  76. #endif // __SSE2__
  77. #if __SSE2__
  78. #if __AVX__
  79. // pack16
  80. #if __AVX512F__
  81. if (elempack == 16)
  82. {
  83. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  84. convert_packing(weight_data_r2, weight_data_tm, 16, opt);
  85. }
  86. #endif // __AVX512F__
  87. // pack8
  88. if (elempack == 8)
  89. {
  90. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  91. convert_packing(weight_data_r2, weight_data_tm, 8, opt);
  92. }
  93. #endif // __AVX__
  94. // pack4
  95. if (elempack == 4)
  96. {
  97. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  98. convert_packing(weight_data_r2, weight_data_tm, 4, opt);
  99. }
  100. #endif // __SSE2__
  101. if (elempack == 1)
  102. {
  103. // depth-wise specific
  104. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  105. {
  106. weight_data_tm = weight_data;
  107. }
  108. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  109. {
  110. weight_data_tm = weight_data;
  111. }
  112. else
  113. {
  114. create_group_ops(opt);
  115. }
  116. }
  117. if (opt.lightmode)
  118. {
  119. weight_data.release();
  120. }
  121. return 0;
  122. }
  123. // group convolution
  124. create_group_ops(opt);
  125. if (opt.lightmode)
  126. {
  127. weight_data.release();
  128. }
  129. return 0;
  130. }
  131. int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
  132. {
  133. // create Convolution op for each group
  134. const int maxk = kernel_w * kernel_h;
  135. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  136. for (int i = 0; i < (int)group_ops.size(); i++)
  137. delete group_ops[i];
  138. group_ops.clear();
  139. const int channels_g = channels / group;
  140. const int num_output_g = num_output / group;
  141. group_ops.resize(group);
  142. for (int g = 0; g < group; g++)
  143. {
  144. Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
  145. Mat bias_data_g;
  146. if (bias_term)
  147. bias_data_g = bias_data.range(num_output_g * g, num_output_g);
  148. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  149. // set param
  150. ncnn::ParamDict pd;
  151. pd.set(0, num_output_g); // num_output
  152. pd.set(1, kernel_w);
  153. pd.set(11, kernel_h);
  154. pd.set(2, dilation_w);
  155. pd.set(12, dilation_h);
  156. pd.set(3, stride_w);
  157. pd.set(13, stride_h);
  158. pd.set(4, 0); // pad_w
  159. pd.set(14, 0); // pad_h
  160. pd.set(5, bias_term);
  161. pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
  162. pd.set(8, int8_scale_term);
  163. pd.set(9, activation_type);
  164. pd.set(10, activation_params);
  165. op->load_param(pd);
  166. // set weights
  167. if (bias_term)
  168. {
  169. ncnn::Mat weights[5];
  170. weights[0] = weight_data_g;
  171. weights[1] = bias_data_g;
  172. #if NCNN_INT8
  173. if (int8_scale_term)
  174. {
  175. Mat weight_data_int8_scales_g(num_output_g);
  176. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  177. weights[2] = weight_data_int8_scales_g;
  178. weights[3] = bottom_blob_int8_scales.range(g, 1);
  179. }
  180. if (int8_scale_term > 100)
  181. {
  182. weights[4] = top_blob_int8_scales.range(g, 1);
  183. }
  184. #endif
  185. op->load_model(ModelBinFromMatArray(weights));
  186. }
  187. else
  188. {
  189. ncnn::Mat weights[4];
  190. weights[0] = weight_data_g;
  191. #if NCNN_INT8
  192. if (int8_scale_term)
  193. {
  194. Mat weight_data_int8_scales_g(num_output_g);
  195. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  196. weights[1] = weight_data_int8_scales_g;
  197. weights[2] = bottom_blob_int8_scales.range(g, 1);
  198. }
  199. if (int8_scale_term > 100)
  200. {
  201. weights[3] = top_blob_int8_scales.range(g, 1);
  202. }
  203. #endif
  204. op->load_model(ModelBinFromMatArray(weights));
  205. }
  206. op->create_pipeline(opt);
  207. group_ops[g] = op;
  208. }
  209. return 0;
  210. }
  211. int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
  212. {
  213. if (activation)
  214. {
  215. activation->destroy_pipeline(opt);
  216. delete activation;
  217. activation = 0;
  218. }
  219. for (int i = 0; i < (int)group_ops.size(); i++)
  220. {
  221. group_ops[i]->destroy_pipeline(opt);
  222. delete group_ops[i];
  223. }
  224. group_ops.clear();
  225. return 0;
  226. }
  227. int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  228. {
  229. #if NCNN_INT8
  230. if (opt.use_int8_inference && int8_scale_term)
  231. {
  232. return forward_int8_x86(bottom_blob, top_blob, opt);
  233. }
  234. #endif
  235. int w = bottom_blob.w;
  236. int h = bottom_blob.h;
  237. int channels = bottom_blob.c;
  238. size_t elemsize = bottom_blob.elemsize;
  239. int elempack = bottom_blob.elempack;
  240. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  241. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  242. Mat bottom_blob_bordered;
  243. make_padding(bottom_blob, bottom_blob_bordered, opt);
  244. if (bottom_blob_bordered.empty())
  245. return -100;
  246. w = bottom_blob_bordered.w;
  247. h = bottom_blob_bordered.h;
  248. int outw = (w - kernel_extent_w) / stride_w + 1;
  249. int outh = (h - kernel_extent_h) / stride_h + 1;
  250. int out_elempack = 1;
  251. #if __SSE2__
  252. if (opt.use_packing_layout)
  253. {
  254. #if __AVX512F__
  255. out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  256. #elif __AVX__
  257. out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  258. #else
  259. out_elempack = num_output % 4 == 0 ? 4 : 1;
  260. #endif
  261. }
  262. #endif // __SSE2__
  263. size_t out_elemsize = elemsize / elempack * out_elempack;
  264. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  265. if (top_blob.empty())
  266. return -100;
  267. // depth-wise
  268. if (channels * elempack == group && group == num_output)
  269. {
  270. #if __SSE2__
  271. #if __AVX__
  272. #if __AVX512F__
  273. if (elempack == 16)
  274. {
  275. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  276. {
  277. convdw3x3s1_pack16_avx512(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  278. if (activation)
  279. {
  280. activation->forward_inplace(top_blob, opt);
  281. }
  282. return 0;
  283. }
  284. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  285. {
  286. convdw3x3s2_pack16_avx512(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  287. if (activation)
  288. {
  289. activation->forward_inplace(top_blob, opt);
  290. }
  291. return 0;
  292. }
  293. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  294. {
  295. convdw5x5s1_pack16_avx512(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  296. if (activation)
  297. {
  298. activation->forward_inplace(top_blob, opt);
  299. }
  300. return 0;
  301. }
  302. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  303. {
  304. convdw5x5s2_pack16_avx512(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  305. if (activation)
  306. {
  307. activation->forward_inplace(top_blob, opt);
  308. }
  309. return 0;
  310. }
  311. else
  312. {
  313. const int maxk = kernel_w * kernel_h;
  314. // kernel offsets
  315. std::vector<int> _space_ofs(maxk);
  316. int* space_ofs = &_space_ofs[0];
  317. {
  318. int p1 = 0;
  319. int p2 = 0;
  320. int gap = w * dilation_h - kernel_w * dilation_w;
  321. for (int i = 0; i < kernel_h; i++)
  322. {
  323. for (int j = 0; j < kernel_w; j++)
  324. {
  325. space_ofs[p1] = p2;
  326. p1++;
  327. p2 += dilation_w;
  328. }
  329. p2 += gap;
  330. }
  331. }
  332. #pragma omp parallel for num_threads(opt.num_threads)
  333. for (int g = 0; g < channels; g++)
  334. {
  335. float* outptr = top_blob.channel(g);
  336. const float* kptr = (const float*)weight_data_tm + maxk * g * 16;
  337. const Mat m = bottom_blob_bordered.channel(g);
  338. for (int i = 0; i < outh; i++)
  339. {
  340. for (int j = 0; j < outw; j++)
  341. {
  342. __m512 _sum = _mm512_set1_ps(0.f);
  343. if (bias_term)
  344. {
  345. _sum = _mm512_loadu_ps(((const float*)bias_data) + g * 16);
  346. }
  347. const float* sptr = m.row(i * stride_h) + j * stride_w * 16;
  348. for (int k = 0; k < maxk; k++)
  349. {
  350. __m512 _val = _mm512_loadu_ps(sptr + space_ofs[k] * 16);
  351. __m512 _w = _mm512_loadu_ps(kptr + k * 16);
  352. _sum = _mm512_fmadd_ps(_val, _w, _sum);
  353. }
  354. _mm512_storeu_ps(outptr, _sum);
  355. outptr += 16;
  356. }
  357. }
  358. }
  359. if (activation)
  360. {
  361. activation->forward_inplace(top_blob, opt);
  362. }
  363. return 0;
  364. }
  365. }
  366. #endif // __AVX512F__
  367. if (elempack == 8)
  368. {
  369. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  370. {
  371. convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  372. if (activation)
  373. {
  374. activation->forward_inplace(top_blob, opt);
  375. }
  376. return 0;
  377. }
  378. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  379. {
  380. convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  381. if (activation)
  382. {
  383. activation->forward_inplace(top_blob, opt);
  384. }
  385. return 0;
  386. }
  387. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  388. {
  389. convdw5x5s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  390. if (activation)
  391. {
  392. activation->forward_inplace(top_blob, opt);
  393. }
  394. return 0;
  395. }
  396. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  397. {
  398. convdw5x5s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  399. if (activation)
  400. {
  401. activation->forward_inplace(top_blob, opt);
  402. }
  403. return 0;
  404. }
  405. else
  406. {
  407. const int maxk = kernel_w * kernel_h;
  408. // kernel offsets
  409. std::vector<int> _space_ofs(maxk);
  410. int* space_ofs = &_space_ofs[0];
  411. {
  412. int p1 = 0;
  413. int p2 = 0;
  414. int gap = w * dilation_h - kernel_w * dilation_w;
  415. for (int i = 0; i < kernel_h; i++)
  416. {
  417. for (int j = 0; j < kernel_w; j++)
  418. {
  419. space_ofs[p1] = p2;
  420. p1++;
  421. p2 += dilation_w;
  422. }
  423. p2 += gap;
  424. }
  425. }
  426. #pragma omp parallel for num_threads(opt.num_threads)
  427. for (int g = 0; g < channels; g++)
  428. {
  429. float* outptr = top_blob.channel(g);
  430. const float* kptr = (const float*)weight_data_tm + maxk * g * 8;
  431. const Mat m = bottom_blob_bordered.channel(g);
  432. for (int i = 0; i < outh; i++)
  433. {
  434. for (int j = 0; j < outw; j++)
  435. {
  436. __m256 _sum = _mm256_set1_ps(0.f);
  437. if (bias_term)
  438. {
  439. _sum = _mm256_loadu_ps(((const float*)bias_data) + g * 8);
  440. }
  441. const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
  442. for (int k = 0; k < maxk; k++)
  443. {
  444. __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
  445. __m256 _w = _mm256_loadu_ps(kptr + k * 8);
  446. _sum = _mm256_comp_fmadd_ps(_val, _w, _sum);
  447. }
  448. _mm256_storeu_ps(outptr + j * 8, _sum);
  449. }
  450. outptr += outw * 8;
  451. }
  452. }
  453. if (activation)
  454. {
  455. activation->forward_inplace(top_blob, opt);
  456. }
  457. return 0;
  458. }
  459. }
  460. #endif // __AVX__
  461. if (elempack == 4)
  462. {
  463. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  464. {
  465. convdw3x3s1_pack4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  466. if (activation)
  467. {
  468. activation->forward_inplace(top_blob, opt);
  469. }
  470. return 0;
  471. }
  472. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  473. {
  474. convdw3x3s2_pack4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  475. if (activation)
  476. {
  477. activation->forward_inplace(top_blob, opt);
  478. }
  479. return 0;
  480. }
  481. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  482. {
  483. convdw5x5s1_pack4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  484. if (activation)
  485. {
  486. activation->forward_inplace(top_blob, opt);
  487. }
  488. return 0;
  489. }
  490. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  491. {
  492. convdw5x5s2_pack4_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  493. if (activation)
  494. {
  495. activation->forward_inplace(top_blob, opt);
  496. }
  497. return 0;
  498. }
  499. {
  500. const int maxk = kernel_w * kernel_h;
  501. // kernel offsets
  502. std::vector<int> _space_ofs(maxk);
  503. int* space_ofs = &_space_ofs[0];
  504. {
  505. int p1 = 0;
  506. int p2 = 0;
  507. int gap = w * dilation_h - kernel_w * dilation_w;
  508. for (int i = 0; i < kernel_h; i++)
  509. {
  510. for (int j = 0; j < kernel_w; j++)
  511. {
  512. space_ofs[p1] = p2;
  513. p1++;
  514. p2 += dilation_w;
  515. }
  516. p2 += gap;
  517. }
  518. }
  519. #pragma omp parallel for num_threads(opt.num_threads)
  520. for (int g = 0; g < channels; g++)
  521. {
  522. float* outptr = top_blob.channel(g);
  523. const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
  524. const Mat m = bottom_blob_bordered.channel(g);
  525. for (int i = 0; i < outh; i++)
  526. {
  527. for (int j = 0; j < outw; j++)
  528. {
  529. __m128 _sum = _mm_set1_ps(0.f);
  530. if (bias_term)
  531. {
  532. _sum = _mm_loadu_ps(((const float*)bias_data) + g * 4);
  533. }
  534. const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
  535. for (int k = 0; k < maxk; k++)
  536. {
  537. __m128 _val = _mm_loadu_ps(sptr + space_ofs[k] * 4);
  538. __m128 _w = _mm_loadu_ps(kptr + k * 4);
  539. _sum = _mm_add_ps(_mm_mul_ps(_val, _w), _sum);
  540. }
  541. _sum = activation_sse(_sum, activation_type, activation_params);
  542. _mm_storeu_ps(outptr + j * 4, _sum);
  543. }
  544. outptr += outw * 4;
  545. }
  546. }
  547. return 0;
  548. }
  549. }
  550. #endif // __SSE2__
  551. if (elempack == 1)
  552. {
  553. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  554. {
  555. convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  556. if (activation)
  557. {
  558. activation->forward_inplace(top_blob, opt);
  559. }
  560. return 0;
  561. }
  562. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  563. {
  564. convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
  565. if (activation)
  566. {
  567. activation->forward_inplace(top_blob, opt);
  568. }
  569. return 0;
  570. }
  571. }
  572. }
  573. // group convolution
  574. const int channels_g = channels * elempack / group;
  575. const int num_output_g = num_output / group;
  576. int g_elempack = 1;
  577. int out_g_elempack = 1;
  578. #if __SSE2__
  579. if (opt.use_packing_layout)
  580. {
  581. #if __AVX512F__
  582. g_elempack = channels_g % 16 == 0 ? 16 : channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
  583. out_g_elempack = num_output_g % 16 == 0 ? 16 : num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
  584. #elif __AVX__
  585. g_elempack = channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
  586. out_g_elempack = num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
  587. #else
  588. g_elempack = channels_g % 4 == 0 ? 4 : 1;
  589. out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
  590. #endif
  591. }
  592. #endif // __SSE2__
  593. // unpacking
  594. Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  595. if (elempack > g_elempack)
  596. {
  597. Option opt_p = opt;
  598. opt_p.blob_allocator = opt.workspace_allocator;
  599. convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
  600. }
  601. Mat top_blob_unpacked = top_blob;
  602. if (out_g_elempack < out_elempack)
  603. {
  604. top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
  605. if (top_blob_unpacked.empty())
  606. return -100;
  607. }
  608. for (int g = 0; g < group; g++)
  609. {
  610. const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  611. Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  612. const ncnn::Layer* op = group_ops[g];
  613. Option opt_g = opt;
  614. opt_g.blob_allocator = top_blob_unpacked.allocator;
  615. // forward
  616. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  617. }
  618. // packing
  619. if (out_g_elempack < out_elempack)
  620. {
  621. convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
  622. }
  623. else
  624. {
  625. top_blob = top_blob_unpacked;
  626. }
  627. return 0;
  628. }
  629. int ConvolutionDepthWise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
  630. {
  631. const Mat& bottom_blob = bottom_blobs[0];
  632. const Mat& _weight_data = bottom_blobs[1];
  633. Mat& top_blob = top_blobs[0];
  634. const int _kernel_w = _weight_data.w;
  635. const int _kernel_h = _weight_data.h;
  636. const int _num_output = _weight_data.c * _weight_data.elempack;
  637. Mat weight_data_flattened;
  638. flatten(_weight_data, weight_data_flattened, opt);
  639. if (weight_data_flattened.empty())
  640. return -100;
  641. // weight_data_flattened as pack1
  642. weight_data_flattened.w *= weight_data_flattened.elempack;
  643. weight_data_flattened.elemsize /= weight_data_flattened.elempack;
  644. weight_data_flattened.elempack = 1;
  645. Mat bias_data_flattened;
  646. if (bias_term)
  647. {
  648. const Mat& _bias_data = bottom_blobs[2];
  649. flatten(_bias_data, bias_data_flattened, opt);
  650. if (bias_data_flattened.empty())
  651. return -100;
  652. // bias_data_flattened as pack1
  653. bias_data_flattened.w *= bias_data_flattened.elempack;
  654. bias_data_flattened.elemsize /= bias_data_flattened.elempack;
  655. bias_data_flattened.elempack = 1;
  656. }
  657. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
  658. ncnn::ParamDict pd;
  659. pd.set(0, _num_output);
  660. pd.set(1, _kernel_w);
  661. pd.set(11, _kernel_h);
  662. pd.set(2, dilation_w);
  663. pd.set(12, dilation_h);
  664. pd.set(3, stride_w);
  665. pd.set(13, stride_h);
  666. pd.set(4, pad_left);
  667. pd.set(15, pad_right);
  668. pd.set(14, pad_top);
  669. pd.set(16, pad_bottom);
  670. pd.set(18, pad_value);
  671. pd.set(5, bias_term);
  672. pd.set(6, weight_data_flattened.w);
  673. pd.set(7, group);
  674. pd.set(8, int8_scale_term);
  675. pd.set(9, activation_type);
  676. pd.set(10, activation_params);
  677. op->load_param(pd);
  678. ncnn::Mat weights[2];
  679. weights[0] = weight_data_flattened;
  680. weights[1] = bias_data_flattened;
  681. op->load_model(ncnn::ModelBinFromMatArray(weights));
  682. op->create_pipeline(opt);
  683. op->forward(bottom_blob, top_blob, opt);
  684. op->destroy_pipeline(opt);
  685. delete op;
  686. return 0;
  687. }
  688. #if NCNN_INT8
  689. int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
  690. {
  691. const int maxk = kernel_w * kernel_h;
  692. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  693. // depth-wise
  694. if (channels == group && group == num_output)
  695. {
  696. int elempack = 1;
  697. #if __SSE2__
  698. if (opt.use_packing_layout)
  699. {
  700. elempack = channels % 8 == 0 ? 8 : 1;
  701. }
  702. #endif // __SSE2__
  703. if (elempack == 8)
  704. {
  705. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  706. convert_packing(weight_data_r2, weight_data_tm, 8, opt);
  707. }
  708. if (elempack == 1)
  709. {
  710. weight_data_tm = weight_data;
  711. }
  712. return 0;
  713. }
  714. // group convolution
  715. create_group_ops(opt);
  716. if (opt.lightmode)
  717. {
  718. weight_data.release();
  719. }
  720. return 0;
  721. }
  722. int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  723. {
  724. int w = bottom_blob.w;
  725. int h = bottom_blob.h;
  726. int channels = bottom_blob.c;
  727. int elempack = bottom_blob.elempack;
  728. int elembits = bottom_blob.elembits();
  729. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  730. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  731. Mat bottom_blob_int8 = bottom_blob;
  732. if (elembits != 8)
  733. {
  734. const int channels_g = channels * elempack / group;
  735. Mat scales(channels * elempack);
  736. {
  737. float* ps = scales;
  738. for (int g = 0; g < group; g++)
  739. {
  740. float scale = bottom_blob_int8_scales[g];
  741. for (int q = 0; q < channels_g; q++)
  742. {
  743. *ps++ = scale;
  744. }
  745. }
  746. }
  747. Option opt_q = opt;
  748. opt_q.blob_allocator = opt.workspace_allocator;
  749. quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
  750. }
  751. Mat bottom_blob_bordered;
  752. make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
  753. if (bottom_blob_bordered.empty())
  754. return -100;
  755. w = bottom_blob_bordered.w;
  756. h = bottom_blob_bordered.h;
  757. channels = bottom_blob_bordered.c;
  758. elempack = bottom_blob_bordered.elempack;
  759. int outw = (w - kernel_extent_w) / stride_w + 1;
  760. int outh = (h - kernel_extent_h) / stride_h + 1;
  761. // depth-wise
  762. if (channels * elempack == group && group == num_output)
  763. {
  764. int out_elempack = 1;
  765. #if __SSE2__
  766. if (opt.use_packing_layout)
  767. {
  768. out_elempack = num_output % 8 == 0 ? 8 : 1;
  769. }
  770. #endif // __SSE2__
  771. bool use_int8_requantize = int8_scale_term > 100;
  772. size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
  773. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  774. if (top_blob.empty())
  775. return -100;
  776. #if __SSE2__
  777. if (elempack == 8)
  778. {
  779. {
  780. const int maxk = kernel_w * kernel_h;
  781. // kernel offsets
  782. std::vector<int> _space_ofs(maxk);
  783. int* space_ofs = &_space_ofs[0];
  784. {
  785. int p1 = 0;
  786. int p2 = 0;
  787. int gap = w * dilation_h - kernel_w * dilation_w;
  788. for (int i = 0; i < kernel_h; i++)
  789. {
  790. for (int j = 0; j < kernel_w; j++)
  791. {
  792. space_ofs[p1] = p2;
  793. p1++;
  794. p2 += dilation_w;
  795. }
  796. p2 += gap;
  797. }
  798. }
  799. #pragma omp parallel for num_threads(opt.num_threads)
  800. for (int g = 0; g < channels; g++)
  801. {
  802. signed char* outptr_s8 = top_blob.channel(g);
  803. float* outptr_f32 = top_blob.channel(g);
  804. const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8;
  805. const Mat m = bottom_blob_bordered.channel(g);
  806. for (int i = 0; i < outh; i++)
  807. {
  808. for (int j = 0; j < outw; j++)
  809. {
  810. __m128i _sum0 = _mm_setzero_si128();
  811. __m128i _sum1 = _mm_setzero_si128();
  812. const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;
  813. for (int k = 0; k < maxk; k++)
  814. {
  815. // TODO use _mm_cvtepi8_epi16 on sse4.1
  816. __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
  817. _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));
  818. __m128i _w = _mm_loadl_epi64((const __m128i*)(kptr + k * 8));
  819. _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));
  820. __m128i _sl = _mm_mullo_epi16(_val, _w);
  821. __m128i _sh = _mm_mulhi_epi16(_val, _w);
  822. __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
  823. __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);
  824. _sum0 = _mm_add_epi32(_sum0, _s0);
  825. _sum1 = _mm_add_epi32(_sum1, _s1);
  826. }
  827. __m128 _scale_in0;
  828. __m128 _scale_in1;
  829. {
  830. __m128 _bottom_blob_int8_scales0 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8);
  831. __m128 _bottom_blob_int8_scales1 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8 + 4);
  832. __m128 _weight_data_int8_scales0 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8);
  833. __m128 _weight_data_int8_scales1 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8 + 4);
  834. _scale_in0 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
  835. _scale_in1 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales1, _weight_data_int8_scales1));
  836. __m128 _m0 = _mm_cmpneq_ps(_weight_data_int8_scales0, _mm_setzero_ps());
  837. __m128 _m1 = _mm_cmpneq_ps(_weight_data_int8_scales1, _mm_setzero_ps());
  838. _scale_in0 = _mm_and_ps(_scale_in0, _m0);
  839. _scale_in1 = _mm_and_ps(_scale_in1, _m1);
  840. }
  841. __m128 _sumfp32_0 = _mm_mul_ps(_mm_cvtepi32_ps(_sum0), _scale_in0);
  842. __m128 _sumfp32_1 = _mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale_in1);
  843. if (bias_term)
  844. {
  845. __m128 _bias0 = _mm_loadu_ps((const float*)bias_data + g * 8);
  846. __m128 _bias1 = _mm_loadu_ps((const float*)bias_data + g * 8 + 4);
  847. _sumfp32_0 = _mm_add_ps(_sumfp32_0, _bias0);
  848. _sumfp32_1 = _mm_add_ps(_sumfp32_1, _bias1);
  849. }
  850. _sumfp32_0 = activation_sse(_sumfp32_0, activation_type, activation_params);
  851. _sumfp32_1 = activation_sse(_sumfp32_1, activation_type, activation_params);
  852. if (use_int8_requantize)
  853. {
  854. // requantize and relu
  855. __m128 _scale_out0 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8);
  856. __m128 _scale_out1 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8 + 4);
  857. _sumfp32_0 = _mm_mul_ps(_sumfp32_0, _scale_out0);
  858. _sumfp32_1 = _mm_mul_ps(_sumfp32_1, _scale_out1);
  859. int64_t _sum8 = float2int8_sse(_sumfp32_0, _sumfp32_1);
  860. *(int64_t*)outptr_s8 = _sum8;
  861. outptr_s8 += 8;
  862. }
  863. else
  864. {
  865. // dequantize and relu
  866. _mm_storeu_ps(outptr_f32, _sumfp32_0);
  867. _mm_storeu_ps(outptr_f32 + 4, _sumfp32_1);
  868. outptr_f32 += 8;
  869. }
  870. }
  871. }
  872. }
  873. }
  874. }
  875. #endif // __SSE2__
  876. if (elempack == 1)
  877. {
  878. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && (activation_type == 0 || activation_type == 1))
  879. {
  880. if (use_int8_requantize)
  881. {
  882. std::vector<float> requantize_scales;
  883. for (int g = 0; g < group; g++)
  884. {
  885. float scale_in;
  886. if (weight_data_int8_scales[g] == 0)
  887. scale_in = 0;
  888. else
  889. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  890. float scale_out = top_blob_int8_scales[g];
  891. requantize_scales.push_back(scale_in);
  892. requantize_scales.push_back(scale_out);
  893. }
  894. convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt);
  895. }
  896. else
  897. {
  898. std::vector<float> dequantize_scales;
  899. for (int g = 0; g < group; g++)
  900. {
  901. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  902. dequantize_scales.push_back(top_rescale);
  903. }
  904. convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, dequantize_scales, opt);
  905. }
  906. if (activation)
  907. {
  908. activation->forward_inplace(top_blob, opt);
  909. }
  910. }
  911. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
  912. {
  913. if (use_int8_requantize)
  914. {
  915. std::vector<float> requantize_scales;
  916. for (int g = 0; g < group; g++)
  917. {
  918. float scale_in;
  919. if (weight_data_int8_scales[g] == 0)
  920. scale_in = 0;
  921. else
  922. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  923. float scale_out = top_blob_int8_scales[g];
  924. requantize_scales.push_back(scale_in);
  925. requantize_scales.push_back(scale_out);
  926. }
  927. convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt);
  928. }
  929. else
  930. {
  931. std::vector<float> dequantize_scales;
  932. for (int g = 0; g < group; g++)
  933. {
  934. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  935. dequantize_scales.push_back(top_rescale);
  936. }
  937. convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, dequantize_scales, opt);
  938. }
  939. if (activation)
  940. {
  941. activation->forward_inplace(top_blob, opt);
  942. }
  943. }
  944. else
  945. {
  946. const int maxk = kernel_w * kernel_h;
  947. // kernel offsets
  948. std::vector<int> _space_ofs(maxk);
  949. int* space_ofs = &_space_ofs[0];
  950. {
  951. int p1 = 0;
  952. int p2 = 0;
  953. int gap = w * dilation_h - kernel_w * dilation_w;
  954. for (int i = 0; i < kernel_h; i++)
  955. {
  956. for (int j = 0; j < kernel_w; j++)
  957. {
  958. space_ofs[p1] = p2;
  959. p1++;
  960. p2 += dilation_w;
  961. }
  962. p2 += gap;
  963. }
  964. }
  965. #pragma omp parallel for num_threads(opt.num_threads)
  966. for (int g = 0; g < group; g++)
  967. {
  968. signed char* outptr_s8 = top_blob.channel(g);
  969. float* outptr_f32 = top_blob.channel(g);
  970. const signed char* kptr = (const signed char*)weight_data_tm + maxk * g;
  971. const Mat m = bottom_blob_bordered.channel(g);
  972. for (int i = 0; i < outh; i++)
  973. {
  974. for (int j = 0; j < outw; j++)
  975. {
  976. int sum = 0;
  977. const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;
  978. for (int k = 0; k < maxk; k++)
  979. {
  980. signed char val = sptr[space_ofs[k]];
  981. signed char w = kptr[k];
  982. sum += val * w;
  983. }
  984. float scale_in;
  985. if (weight_data_int8_scales[g] == 0)
  986. scale_in = 0;
  987. else
  988. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  989. float sumfp32 = sum * scale_in;
  990. if (bias_term)
  991. sumfp32 += bias_data[g];
  992. sumfp32 = activation_ss(sumfp32, activation_type, activation_params);
  993. if (use_int8_requantize)
  994. {
  995. // requantize
  996. float scale_out = top_blob_int8_scales[g];
  997. signed char sums8 = float2int8(sumfp32 * scale_out);
  998. outptr_s8[0] = sums8;
  999. outptr_s8 += 1;
  1000. }
  1001. else
  1002. {
  1003. // dequantize
  1004. outptr_f32[0] = sumfp32;
  1005. outptr_f32 += 1;
  1006. }
  1007. }
  1008. }
  1009. }
  1010. }
  1011. }
  1012. return 0;
  1013. }
  1014. bool use_int8_requantize = int8_scale_term > 100;
  1015. int out_elempack = 1;
  1016. #if __SSE2__
  1017. if (opt.use_packing_layout)
  1018. {
  1019. if (use_int8_requantize)
  1020. out_elempack = num_output % 8 == 0 ? 8 : 1;
  1021. else
  1022. out_elempack = num_output % 4 == 0 ? 4 : 1;
  1023. }
  1024. #endif // __SSE2__
  1025. size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
  1026. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  1027. if (top_blob.empty())
  1028. return -100;
  1029. // group convolution
  1030. const int channels_g = channels * elempack / group;
  1031. const int num_output_g = num_output / group;
  1032. int g_elempack = 1;
  1033. int out_g_elempack = 1;
  1034. #if __SSE2__
  1035. if (opt.use_packing_layout)
  1036. {
  1037. g_elempack = channels_g % 8 == 0 ? 8 : 1;
  1038. if (use_int8_requantize)
  1039. out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
  1040. else
  1041. out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
  1042. }
  1043. #endif // __SSE2__
  1044. // unpacking
  1045. Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  1046. if (elempack > g_elempack)
  1047. {
  1048. Option opt_p = opt;
  1049. opt_p.blob_allocator = opt.workspace_allocator;
  1050. convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
  1051. }
  1052. Mat top_blob_unpacked = top_blob;
  1053. if (out_g_elempack < out_elempack)
  1054. {
  1055. top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
  1056. if (top_blob_unpacked.empty())
  1057. return -100;
  1058. }
  1059. #pragma omp parallel for num_threads(opt.num_threads)
  1060. for (int g = 0; g < group; g++)
  1061. {
  1062. const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  1063. Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  1064. const ncnn::Layer* op = group_ops[g];
  1065. Option opt_g = opt;
  1066. opt_g.blob_allocator = top_blob.allocator;
  1067. // forward
  1068. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  1069. }
  1070. // packing
  1071. if (out_g_elempack < out_elempack)
  1072. {
  1073. convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
  1074. }
  1075. else
  1076. {
  1077. top_blob = top_blob_unpacked;
  1078. }
  1079. return 0;
  1080. }
  1081. #endif // NCNN_INT8
  1082. } // namespace ncnn