You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolutiondepthwise_x86.cpp 21 kB

X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
7 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #if __AVX__
  15. #include "avx_activation.h"
  16. #include "avx_usability.h"
  17. #endif
  18. #include "convolutiondepthwise_x86.h"
  19. #include "layer_type.h"
  20. namespace ncnn {
  21. #ifdef __AVX__
  22. #include "convolutiondepthwise_3x3_pack8_fp16.h"
  23. #include "convolutiondepthwise_3x3_pack8.h"
  24. #include "convolutiondepthwise_5x5_pack8.h"
  25. #endif
  26. #include "convolutiondepthwise_3x3.h"
  27. #include "convolutiondepthwise_3x3_int8.h"
  28. ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
  29. {
  30. #ifdef __AVX__
  31. support_packing = true;
  32. support_weight_fp16_storage = true;
  33. #endif
  34. activation = 0;
  35. }
  36. int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
  37. {
  38. if (activation_type == 1)
  39. {
  40. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  41. ncnn::ParamDict pd;
  42. activation->load_param(pd);
  43. }
  44. else if (activation_type == 2)
  45. {
  46. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  47. ncnn::ParamDict pd;
  48. pd.set(0, activation_params[0]); // slope
  49. activation->load_param(pd);
  50. }
  51. else if (activation_type == 3)
  52. {
  53. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  54. ncnn::ParamDict pd;
  55. if (use_int8_requantize)
  56. {
  57. pd.set(0, activation_params[0] * top_blob_int8_scale); // min
  58. pd.set(1, activation_params[1] * top_blob_int8_scale); // max
  59. }
  60. else
  61. {
  62. pd.set(0, activation_params[0]); // min
  63. pd.set(1, activation_params[1]); // max
  64. }
  65. activation->load_param(pd);
  66. }
  67. else if (activation_type == 4)
  68. {
  69. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  70. ncnn::ParamDict pd;
  71. activation->load_param(pd);
  72. }
  73. else if (activation_type == 5)
  74. {
  75. activation = ncnn::create_layer(ncnn::LayerType::Mish);
  76. ncnn::ParamDict pd;
  77. activation->load_param(pd);
  78. }
  79. else if (activation_type == 5)
  80. {
  81. activation = ncnn::create_layer(ncnn::LayerType::Mish);
  82. ncnn::ParamDict pd;
  83. activation->load_param(pd);
  84. }
  85. if (activation)
  86. {
  87. activation->create_pipeline(opt);
  88. }
  89. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  90. {
  91. support_packing = false;
  92. }
  93. // create Convolution op for each group
  94. const int maxk = kernel_w * kernel_h;
  95. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  96. for (int i = 0; i < (int)group_ops.size(); i++)
  97. delete group_ops[i];
  98. group_ops.clear();
  99. if (channels == group && group == num_output)
  100. {
  101. int elempack = (support_packing && opt.use_packing_layout && channels % 8 == 0) ? 8 : 1;
  102. #if __AVX__
  103. // pack8
  104. if (elempack == 8)
  105. {
  106. if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  107. {
  108. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  109. Mat weight_data_tmp;
  110. convert_packing(weight_data_r2, weight_data_tmp, 8);
  111. ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_pack8, opt);
  112. return 0;
  113. }
  114. if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  115. {
  116. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  117. Mat weight_data_tmp;
  118. convert_packing(weight_data_r2, weight_data_tmp, 8);
  119. ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_pack8, opt);
  120. return 0;
  121. }
  122. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  123. convert_packing(weight_data_r2, weight_data_pack8, 8);
  124. return 0;
  125. }
  126. #endif // __AVX__
  127. // depth-wise specific
  128. // special path for both int8 and fp32
  129. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  130. {
  131. return 0;
  132. }
  133. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  134. {
  135. return 0;
  136. }
  137. }
  138. const int channels_g = channels / group;
  139. const int num_output_g = num_output / group;
  140. group_ops.resize(group);
  141. for (int g = 0; g < group; g++)
  142. {
  143. Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
  144. Mat bias_data_g;
  145. if (bias_term)
  146. bias_data_g = bias_data.range(num_output_g * g, num_output_g);
  147. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  148. // FIXME
  149. // ((ncnn::Convolution*)op)->use_int8_requantize = use_int8_requantize;
  150. // set param
  151. ncnn::ParamDict pd;
  152. pd.set(0, num_output_g); // num_output
  153. pd.set(1, kernel_w);
  154. pd.set(11, kernel_h);
  155. pd.set(2, dilation_w);
  156. pd.set(12, dilation_h);
  157. pd.set(3, stride_w);
  158. pd.set(13, stride_h);
  159. pd.set(4, 0); // pad_w
  160. pd.set(14, 0); // pad_h
  161. pd.set(5, bias_term);
  162. pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
  163. pd.set(8, int8_scale_term);
  164. pd.set(9, activation_type);
  165. pd.set(10, activation_params);
  166. op->load_param(pd);
  167. // set weights
  168. if (bias_term)
  169. {
  170. ncnn::Mat weights[4];
  171. weights[0] = weight_data_g;
  172. weights[1] = bias_data_g;
  173. if (int8_scale_term)
  174. {
  175. Mat weight_data_int8_scales_g(num_output_g);
  176. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  177. weights[2] = weight_data_int8_scales_g;
  178. weights[3] = bottom_blob_int8_scales.range(g, 1);
  179. }
  180. op->load_model(ModelBinFromMatArray(weights));
  181. }
  182. else
  183. {
  184. ncnn::Mat weights[3];
  185. weights[0] = weight_data_g;
  186. if (int8_scale_term)
  187. {
  188. Mat weight_data_int8_scales_g(num_output_g);
  189. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  190. weights[1] = weight_data_int8_scales_g;
  191. weights[2] = bottom_blob_int8_scales.range(g, 1);
  192. }
  193. op->load_model(ModelBinFromMatArray(weights));
  194. }
  195. op->create_pipeline(opt);
  196. group_ops[g] = op;
  197. }
  198. return 0;
  199. }
  200. int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
  201. {
  202. if (activation)
  203. {
  204. activation->destroy_pipeline(opt);
  205. delete activation;
  206. activation = 0;
  207. }
  208. for (int i = 0; i < (int)group_ops.size(); i++)
  209. {
  210. group_ops[i]->destroy_pipeline(opt);
  211. delete group_ops[i];
  212. }
  213. group_ops.clear();
  214. return 0;
  215. }
  216. int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  217. {
  218. // convolv with NxN kernel
  219. // value = value + bias
  220. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  221. {
  222. return forward_int8_x86(bottom_blob, top_blob, opt);
  223. }
  224. int w = bottom_blob.w;
  225. int h = bottom_blob.h;
  226. int channels = bottom_blob.c;
  227. size_t elemsize = bottom_blob.elemsize;
  228. int elempack = bottom_blob.elempack;
  229. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  230. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  231. Mat bottom_blob_bordered;
  232. make_padding(bottom_blob, bottom_blob_bordered, opt);
  233. if (bottom_blob_bordered.empty())
  234. return -100;
  235. w = bottom_blob_bordered.w;
  236. h = bottom_blob_bordered.h;
  237. int outw = (w - kernel_extent_w) / stride_w + 1;
  238. int outh = (h - kernel_extent_h) / stride_h + 1;
  239. int out_elempack = (support_packing && opt.use_packing_layout && num_output % 8 == 0) ? 8 : 1;
  240. size_t out_elemsize = elemsize / elempack * out_elempack;
  241. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  242. if (top_blob.empty())
  243. return -100;
  244. // fprintf(stderr, "Depthwise kernel %d x %d elempack=%d group=%d channels = %d stride = %d x %d \n",kernel_w,kernel_h,elempack,group,channels,stride_w,stride_h );
  245. // depth-wise
  246. if (channels * elempack == group && group == num_output)
  247. {
  248. #if __AVX__
  249. if (elempack == 8)
  250. {
  251. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  252. {
  253. if (opt.use_weight_fp16_storage)
  254. {
  255. convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  256. }
  257. else
  258. {
  259. convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  260. }
  261. if (activation)
  262. {
  263. activation->forward_inplace(top_blob, opt);
  264. }
  265. return 0;
  266. }
  267. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  268. {
  269. if (opt.use_weight_fp16_storage)
  270. {
  271. convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  272. }
  273. else
  274. {
  275. convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  276. }
  277. if (activation)
  278. {
  279. activation->forward_inplace(top_blob, opt);
  280. }
  281. return 0;
  282. }
  283. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  284. {
  285. convdw5x5s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  286. if (activation)
  287. {
  288. activation->forward_inplace(top_blob, opt);
  289. }
  290. return 0;
  291. }
  292. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  293. {
  294. convdw5x5s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
  295. if (activation)
  296. {
  297. activation->forward_inplace(top_blob, opt);
  298. }
  299. return 0;
  300. }
  301. else
  302. {
  303. const int maxk = kernel_w * kernel_h;
  304. // kernel offsets
  305. std::vector<int> _space_ofs(maxk);
  306. int* space_ofs = &_space_ofs[0];
  307. {
  308. int p1 = 0;
  309. int p2 = 0;
  310. int gap = w * dilation_h - kernel_w * dilation_w;
  311. for (int i = 0; i < kernel_h; i++)
  312. {
  313. for (int j = 0; j < kernel_w; j++)
  314. {
  315. space_ofs[p1] = p2;
  316. p1++;
  317. p2 += dilation_w;
  318. }
  319. p2 += gap;
  320. }
  321. }
  322. #pragma omp parallel for num_threads(opt.num_threads)
  323. for (int g = 0; g < channels; g++)
  324. {
  325. float* outptr = top_blob.channel(g);
  326. const float* kptr = (const float*)weight_data_pack8 + maxk * g * 8;
  327. const Mat m = bottom_blob_bordered.channel(g);
  328. for (int i = 0; i < outh; i++)
  329. {
  330. for (int j = 0; j < outw; j++)
  331. {
  332. __m256 _sum = _mm256_set1_ps(0.f);
  333. if (bias_term)
  334. {
  335. _sum = _mm256_loadu_ps(((const float*)bias_data) + g * 8);
  336. }
  337. const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
  338. for (int k = 0; k < maxk; k++)
  339. {
  340. __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
  341. __m256 _w = _mm256_loadu_ps(kptr + k * 8);
  342. _sum = _mm256_fmadd_ps(_val, _w, _sum);
  343. }
  344. _sum = activation_ps(_sum, activation_type, activation_params);
  345. _mm256_storeu_ps(outptr + j * 8, _sum);
  346. }
  347. outptr += outw * 8;
  348. }
  349. }
  350. return 0;
  351. }
  352. }
  353. #endif // __AVX__
  354. if (elempack == 1)
  355. {
  356. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  357. {
  358. convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  359. if (activation)
  360. {
  361. activation->forward_inplace(top_blob, opt);
  362. }
  363. return 0;
  364. }
  365. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  366. {
  367. convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  368. if (activation)
  369. {
  370. activation->forward_inplace(top_blob, opt);
  371. }
  372. return 0;
  373. }
  374. }
  375. }
  376. // group convolution
  377. const int channels_g = channels * elempack / group;
  378. const int num_output_g = num_output / group;
  379. int g_elempack = (support_packing && opt.use_packing_layout && channels_g % 8 == 0) ? 8 : 1;
  380. int out_g_elempack = (support_packing && opt.use_packing_layout && num_output_g % 8 == 0) ? 8 : 1;
  381. // unpacking
  382. Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  383. if (elempack == 8 && g_elempack == 1)
  384. {
  385. Option opt_p = opt;
  386. opt_p.blob_allocator = opt.workspace_allocator;
  387. convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
  388. }
  389. Mat top_blob_unpacked = top_blob;
  390. if (out_g_elempack == 1 && out_elempack == 8)
  391. {
  392. top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
  393. if (top_blob_unpacked.empty())
  394. return -100;
  395. }
  396. for (int g = 0; g < group; g++)
  397. {
  398. const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  399. Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  400. const ncnn::Layer* op = group_ops[g];
  401. Option opt_g = opt;
  402. opt_g.blob_allocator = top_blob_unpacked.allocator;
  403. // forward
  404. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  405. }
  406. // packing
  407. if (out_g_elempack == 1 && out_elempack == 8)
  408. {
  409. convert_packing(top_blob_unpacked, top_blob, 8, opt);
  410. }
  411. else
  412. {
  413. top_blob = top_blob_unpacked;
  414. }
  415. return 0;
  416. }
  417. int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  418. {
  419. int w = bottom_blob.w;
  420. int h = bottom_blob.h;
  421. int channels = bottom_blob.c;
  422. size_t elemsize = bottom_blob.elemsize;
  423. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  424. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  425. Mat bottom_blob_unbordered = bottom_blob;
  426. if (elemsize != 1)
  427. {
  428. bottom_blob_unbordered.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  429. if (bottom_blob_unbordered.empty())
  430. return -100;
  431. const int channels_g = channels / group;
  432. // quantize, scale and round to nearest
  433. #pragma omp parallel for num_threads(opt.num_threads)
  434. for (int g = 0; g < group; g++)
  435. {
  436. Option opt_g = opt;
  437. opt_g.num_threads = 1;
  438. opt_g.blob_allocator = bottom_blob_unbordered.allocator;
  439. const Mat bottom_blob_g = bottom_blob.channel_range(channels_g * g, channels_g);
  440. Mat bottom_blob_int8_g = bottom_blob_unbordered.channel_range(channels_g * g, channels_g);
  441. quantize_float32_to_int8(bottom_blob_g, bottom_blob_int8_g, bottom_blob_int8_scales[g], opt_g);
  442. }
  443. }
  444. Mat bottom_blob_bordered;
  445. make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
  446. if (bottom_blob_bordered.empty())
  447. return -100;
  448. w = bottom_blob_bordered.w;
  449. h = bottom_blob_bordered.h;
  450. int outw = (w - kernel_extent_w) / stride_w + 1;
  451. int outh = (h - kernel_extent_h) / stride_h + 1;
  452. // int8
  453. size_t out_elemsize = use_int8_requantize ? 1u : 4u;
  454. top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
  455. if (top_blob.empty())
  456. return -100;
  457. // depth-wise
  458. if (channels == group && group == num_output)
  459. {
  460. if (use_int8_requantize)
  461. {
  462. std::vector<float> requantize_scales;
  463. for (int g = 0; g < group; g++)
  464. {
  465. float scale_in;
  466. if (weight_data_int8_scales[g] == 0)
  467. scale_in = 0;
  468. else
  469. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  470. float scale_out = top_blob_int8_scale;
  471. requantize_scales.push_back(scale_in);
  472. requantize_scales.push_back(scale_out);
  473. }
  474. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  475. {
  476. convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  477. if (activation)
  478. {
  479. activation->forward_inplace(top_blob, opt);
  480. }
  481. return 0;
  482. }
  483. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  484. {
  485. convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  486. if (activation)
  487. {
  488. activation->forward_inplace(top_blob, opt);
  489. }
  490. return 0;
  491. }
  492. }
  493. else
  494. {
  495. std::vector<float> dequantize_scales;
  496. for (int g = 0; g < group; g++)
  497. {
  498. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  499. dequantize_scales.push_back(top_rescale);
  500. }
  501. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
  502. {
  503. convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  504. if (activation)
  505. {
  506. activation->forward_inplace(top_blob, opt);
  507. }
  508. return 0;
  509. }
  510. if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
  511. {
  512. convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  513. if (activation)
  514. {
  515. activation->forward_inplace(top_blob, opt);
  516. }
  517. return 0;
  518. }
  519. }
  520. }
  521. // group convolution
  522. const int channels_g = channels / group;
  523. const int num_output_g = num_output / group;
  524. #pragma omp parallel for num_threads(opt.num_threads)
  525. for (int g = 0; g < group; g++)
  526. {
  527. const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
  528. Mat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
  529. const ncnn::Layer* op = group_ops[g];
  530. Option opt_g = opt;
  531. opt_g.blob_allocator = top_blob.allocator;
  532. // forward
  533. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  534. }
  535. return 0;
  536. }
  537. } // namespace ncnn