You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolutiondepthwise_x86.cpp 37 kB

X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
7 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
X86 Elempack 8 AVX implementations. (#1853) * added avx implementations of FC and Max pool * Specify AVX2 * Small fixes and using Fused avx activations * fix type casting * fixing some CI errors * Fix code format * fix pooling test * remove vector typedef * More compile fixes * remove vector typedef * set c++ version to 17 * Force c++ 17 * Fixing mathfun * Try and workaround typedef issues * typefix * Remove typedef * switch to static inline * attempting to fix msvc bug * Verified MSVX FIX * Fixing clang build * commit before switch * More avx and packing implementation * Fix ctest * starting the depthwise pack 8 implementation * Unrolled loop * add depthwise pack 8 implementations * Working 1x1 pack 8 implementation added * revert incorrect changes * added conact elempack 8 * more elempack enabled layers added and started on the conversion of the winograd pack4 conv 3x3 * Added code formatting * fix styling * Unroll loops * unrolling loops * Added more elempac layers for mobilenet v3 * revert commit * fix code style * remove arm neon references * remove pack4 references * More cleanup * added packing avx code * fixing linux build ctests * remove usage of aligned loads * More aligned mem ops removed * Cleanup, revert some files and remove not working winograd and shufflechannel implementation * add stackoverflow referal * Fix windows build * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * implement requested chaanges * remove reshape * revert arm file change * Restyled by clang-format * Restyled by astyle * Restyled by clang-format * Restyled by astyle * fix unterminated directive Co-authored-by: Restyled.io <commits@restyled.io>
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolutiondepthwise_x86.h"
  15. #if __SSE2__
  16. #include <emmintrin.h>
  17. #if __AVX__
  18. #include <immintrin.h>
  19. #endif
  20. #endif // __SSE2__
  21. #include "x86_activation.h"
  22. #include "x86_usability.h"
  23. #include "layer_type.h"
  24. namespace ncnn {
  25. #if __SSE2__
  26. #if __AVX__
  27. #include "convolutiondepthwise_3x3_pack8_fp16.h"
  28. #include "convolutiondepthwise_3x3_pack8.h"
  29. #include "convolutiondepthwise_5x5_pack8.h"
  30. #endif
  31. #endif // __SSE2__
  32. #include "convolutiondepthwise_3x3.h"
  33. #if NCNN_INT8
  34. #include "convolutiondepthwise_3x3_int8.h"
  35. #endif // NCNN_INT8
  36. ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
  37. {
  38. #if __SSE2__
  39. support_packing = true;
  40. #if __AVX__
  41. support_weight_fp16_storage = true;
  42. #endif
  43. #endif // __SSE2__
  44. activation = 0;
  45. }
  46. int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
  47. {
  48. if (activation_type == 1)
  49. {
  50. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  51. ncnn::ParamDict pd;
  52. activation->load_param(pd);
  53. }
  54. else if (activation_type == 2)
  55. {
  56. activation = ncnn::create_layer(ncnn::LayerType::ReLU);
  57. ncnn::ParamDict pd;
  58. pd.set(0, activation_params[0]); // slope
  59. activation->load_param(pd);
  60. }
  61. else if (activation_type == 3)
  62. {
  63. activation = ncnn::create_layer(ncnn::LayerType::Clip);
  64. ncnn::ParamDict pd;
  65. pd.set(0, activation_params[0]); // min
  66. pd.set(1, activation_params[1]); // max
  67. activation->load_param(pd);
  68. }
  69. else if (activation_type == 4)
  70. {
  71. activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
  72. ncnn::ParamDict pd;
  73. activation->load_param(pd);
  74. }
  75. else if (activation_type == 5)
  76. {
  77. activation = ncnn::create_layer(ncnn::LayerType::Mish);
  78. ncnn::ParamDict pd;
  79. activation->load_param(pd);
  80. }
  81. else if (activation_type == 5)
  82. {
  83. activation = ncnn::create_layer(ncnn::LayerType::Mish);
  84. ncnn::ParamDict pd;
  85. activation->load_param(pd);
  86. }
  87. if (activation)
  88. {
  89. activation->create_pipeline(opt);
  90. }
  91. #if NCNN_INT8
  92. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  93. {
  94. return create_pipeline_int8_x86(opt);
  95. }
  96. #endif
  97. const int maxk = kernel_w * kernel_h;
  98. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  99. // depth-wise
  100. if (channels == group && group == num_output)
  101. {
  102. int elempack = 1;
  103. #if __SSE2__
  104. if (opt.use_packing_layout)
  105. {
  106. #if __AVX__
  107. elempack = channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
  108. #else
  109. elempack = channels % 4 == 0 ? 4 : 1;
  110. #endif
  111. }
  112. #endif // __SSE2__
  113. #if __SSE2__
  114. #if __AVX__
  115. // pack8
  116. if (elempack == 8)
  117. {
  118. if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  119. {
  120. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  121. Mat weight_data_tmp;
  122. convert_packing(weight_data_r2, weight_data_tmp, 8);
  123. ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_packed, opt);
  124. return 0;
  125. }
  126. if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  127. {
  128. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  129. Mat weight_data_tmp;
  130. convert_packing(weight_data_r2, weight_data_tmp, 8);
  131. ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_packed, opt);
  132. return 0;
  133. }
  134. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  135. convert_packing(weight_data_r2, weight_data_packed, 8);
  136. return 0;
  137. }
  138. #endif // __AVX__
  139. // pack4
  140. if (elempack == 4)
  141. {
  142. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  143. convert_packing(weight_data_r2, weight_data_packed, 4);
  144. return 0;
  145. }
  146. #endif // __SSE2__
  147. if (elempack == 1)
  148. {
  149. // depth-wise specific
  150. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  151. {
  152. return 0;
  153. }
  154. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  155. {
  156. return 0;
  157. }
  158. }
  159. }
  160. // group convolution
  161. create_group_ops(opt);
  162. return 0;
  163. }
  164. int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
  165. {
  166. // create Convolution op for each group
  167. const int maxk = kernel_w * kernel_h;
  168. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  169. for (int i = 0; i < (int)group_ops.size(); i++)
  170. delete group_ops[i];
  171. group_ops.clear();
  172. const int channels_g = channels / group;
  173. const int num_output_g = num_output / group;
  174. group_ops.resize(group);
  175. for (int g = 0; g < group; g++)
  176. {
  177. Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
  178. Mat bias_data_g;
  179. if (bias_term)
  180. bias_data_g = bias_data.range(num_output_g * g, num_output_g);
  181. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
  182. // set param
  183. ncnn::ParamDict pd;
  184. pd.set(0, num_output_g); // num_output
  185. pd.set(1, kernel_w);
  186. pd.set(11, kernel_h);
  187. pd.set(2, dilation_w);
  188. pd.set(12, dilation_h);
  189. pd.set(3, stride_w);
  190. pd.set(13, stride_h);
  191. pd.set(4, 0); // pad_w
  192. pd.set(14, 0); // pad_h
  193. pd.set(5, bias_term);
  194. pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
  195. pd.set(8, int8_scale_term);
  196. pd.set(9, activation_type);
  197. pd.set(10, activation_params);
  198. op->load_param(pd);
  199. // set weights
  200. if (bias_term)
  201. {
  202. ncnn::Mat weights[5];
  203. weights[0] = weight_data_g;
  204. weights[1] = bias_data_g;
  205. #if NCNN_INT8
  206. if (int8_scale_term)
  207. {
  208. Mat weight_data_int8_scales_g(num_output_g);
  209. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  210. weights[2] = weight_data_int8_scales_g;
  211. weights[3] = bottom_blob_int8_scales.range(g, 1);
  212. }
  213. if (int8_scale_term > 100)
  214. {
  215. weights[4] = top_blob_int8_scales.range(g, 1);
  216. }
  217. #endif
  218. op->load_model(ModelBinFromMatArray(weights));
  219. }
  220. else
  221. {
  222. ncnn::Mat weights[4];
  223. weights[0] = weight_data_g;
  224. #if NCNN_INT8
  225. if (int8_scale_term)
  226. {
  227. Mat weight_data_int8_scales_g(num_output_g);
  228. weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
  229. weights[1] = weight_data_int8_scales_g;
  230. weights[2] = bottom_blob_int8_scales.range(g, 1);
  231. }
  232. if (int8_scale_term > 100)
  233. {
  234. weights[3] = top_blob_int8_scales.range(g, 1);
  235. }
  236. #endif
  237. op->load_model(ModelBinFromMatArray(weights));
  238. }
  239. op->create_pipeline(opt);
  240. group_ops[g] = op;
  241. }
  242. return 0;
  243. }
  244. int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
  245. {
  246. if (activation)
  247. {
  248. activation->destroy_pipeline(opt);
  249. delete activation;
  250. activation = 0;
  251. }
  252. for (int i = 0; i < (int)group_ops.size(); i++)
  253. {
  254. group_ops[i]->destroy_pipeline(opt);
  255. delete group_ops[i];
  256. }
  257. group_ops.clear();
  258. return 0;
  259. }
  260. int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  261. {
  262. #if NCNN_INT8
  263. if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
  264. {
  265. return forward_int8_x86(bottom_blob, top_blob, opt);
  266. }
  267. #endif
  268. int w = bottom_blob.w;
  269. int h = bottom_blob.h;
  270. int channels = bottom_blob.c;
  271. size_t elemsize = bottom_blob.elemsize;
  272. int elempack = bottom_blob.elempack;
  273. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  274. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  275. Mat bottom_blob_bordered;
  276. make_padding(bottom_blob, bottom_blob_bordered, opt);
  277. if (bottom_blob_bordered.empty())
  278. return -100;
  279. w = bottom_blob_bordered.w;
  280. h = bottom_blob_bordered.h;
  281. int outw = (w - kernel_extent_w) / stride_w + 1;
  282. int outh = (h - kernel_extent_h) / stride_h + 1;
  283. int out_elempack = 1;
  284. #if __SSE2__
  285. if (opt.use_packing_layout)
  286. {
  287. #if __AVX__
  288. out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
  289. #else
  290. out_elempack = num_output % 4 == 0 ? 4 : 1;
  291. #endif
  292. }
  293. #endif // __SSE2__
  294. size_t out_elemsize = elemsize / elempack * out_elempack;
  295. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  296. if (top_blob.empty())
  297. return -100;
  298. // fprintf(stderr, "Depthwise kernel %d x %d elempack=%d group=%d channels = %d stride = %d x %d \n",kernel_w,kernel_h,elempack,group,channels,stride_w,stride_h );
  299. // depth-wise
  300. if (channels * elempack == group && group == num_output)
  301. {
  302. #if __SSE2__
  303. #if __AVX__
  304. if (elempack == 8)
  305. {
  306. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  307. {
  308. if (opt.use_weight_fp16_storage)
  309. {
  310. convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  311. }
  312. else
  313. {
  314. convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  315. }
  316. if (activation)
  317. {
  318. activation->forward_inplace(top_blob, opt);
  319. }
  320. return 0;
  321. }
  322. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  323. {
  324. if (opt.use_weight_fp16_storage)
  325. {
  326. convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  327. }
  328. else
  329. {
  330. convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  331. }
  332. if (activation)
  333. {
  334. activation->forward_inplace(top_blob, opt);
  335. }
  336. return 0;
  337. }
  338. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  339. {
  340. convdw5x5s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  341. if (activation)
  342. {
  343. activation->forward_inplace(top_blob, opt);
  344. }
  345. return 0;
  346. }
  347. if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  348. {
  349. convdw5x5s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
  350. if (activation)
  351. {
  352. activation->forward_inplace(top_blob, opt);
  353. }
  354. return 0;
  355. }
  356. else
  357. {
  358. const int maxk = kernel_w * kernel_h;
  359. // kernel offsets
  360. std::vector<int> _space_ofs(maxk);
  361. int* space_ofs = &_space_ofs[0];
  362. {
  363. int p1 = 0;
  364. int p2 = 0;
  365. int gap = w * dilation_h - kernel_w * dilation_w;
  366. for (int i = 0; i < kernel_h; i++)
  367. {
  368. for (int j = 0; j < kernel_w; j++)
  369. {
  370. space_ofs[p1] = p2;
  371. p1++;
  372. p2 += dilation_w;
  373. }
  374. p2 += gap;
  375. }
  376. }
  377. #pragma omp parallel for num_threads(opt.num_threads)
  378. for (int g = 0; g < channels; g++)
  379. {
  380. float* outptr = top_blob.channel(g);
  381. const float* kptr = (const float*)weight_data_packed + maxk * g * 8;
  382. const Mat m = bottom_blob_bordered.channel(g);
  383. for (int i = 0; i < outh; i++)
  384. {
  385. for (int j = 0; j < outw; j++)
  386. {
  387. __m256 _sum = _mm256_set1_ps(0.f);
  388. if (bias_term)
  389. {
  390. _sum = _mm256_loadu_ps(((const float*)bias_data) + g * 8);
  391. }
  392. const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
  393. for (int k = 0; k < maxk; k++)
  394. {
  395. __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
  396. __m256 _w = _mm256_loadu_ps(kptr + k * 8);
  397. _sum = _mm256_fmadd_ps(_val, _w, _sum);
  398. }
  399. _sum = activation_avx(_sum, activation_type, activation_params);
  400. _mm256_storeu_ps(outptr + j * 8, _sum);
  401. }
  402. outptr += outw * 8;
  403. }
  404. }
  405. return 0;
  406. }
  407. }
  408. #endif // __AVX__
  409. if (elempack == 4)
  410. {
  411. {
  412. const int maxk = kernel_w * kernel_h;
  413. // kernel offsets
  414. std::vector<int> _space_ofs(maxk);
  415. int* space_ofs = &_space_ofs[0];
  416. {
  417. int p1 = 0;
  418. int p2 = 0;
  419. int gap = w * dilation_h - kernel_w * dilation_w;
  420. for (int i = 0; i < kernel_h; i++)
  421. {
  422. for (int j = 0; j < kernel_w; j++)
  423. {
  424. space_ofs[p1] = p2;
  425. p1++;
  426. p2 += dilation_w;
  427. }
  428. p2 += gap;
  429. }
  430. }
  431. #pragma omp parallel for num_threads(opt.num_threads)
  432. for (int g = 0; g < channels; g++)
  433. {
  434. float* outptr = top_blob.channel(g);
  435. const float* kptr = (const float*)weight_data_packed + maxk * g * 4;
  436. const Mat m = bottom_blob_bordered.channel(g);
  437. for (int i = 0; i < outh; i++)
  438. {
  439. for (int j = 0; j < outw; j++)
  440. {
  441. __m128 _sum = _mm_set1_ps(0.f);
  442. if (bias_term)
  443. {
  444. _sum = _mm_loadu_ps(((const float*)bias_data) + g * 4);
  445. }
  446. const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
  447. for (int k = 0; k < maxk; k++)
  448. {
  449. __m128 _val = _mm_loadu_ps(sptr + space_ofs[k] * 4);
  450. __m128 _w = _mm_loadu_ps(kptr + k * 4);
  451. _sum = _mm_add_ps(_mm_mul_ps(_val, _w), _sum);
  452. }
  453. _sum = activation_sse(_sum, activation_type, activation_params);
  454. _mm_storeu_ps(outptr + j * 4, _sum);
  455. }
  456. outptr += outw * 4;
  457. }
  458. }
  459. return 0;
  460. }
  461. }
  462. #endif // __SSE2__
  463. if (elempack == 1)
  464. {
  465. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
  466. {
  467. convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  468. if (activation)
  469. {
  470. activation->forward_inplace(top_blob, opt);
  471. }
  472. return 0;
  473. }
  474. if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
  475. {
  476. convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
  477. if (activation)
  478. {
  479. activation->forward_inplace(top_blob, opt);
  480. }
  481. return 0;
  482. }
  483. }
  484. }
  485. // group convolution
  486. const int channels_g = channels * elempack / group;
  487. const int num_output_g = num_output / group;
  488. int g_elempack = 1;
  489. int out_g_elempack = 1;
  490. #if __SSE2__
  491. if (opt.use_packing_layout)
  492. {
  493. #if __AVX__
  494. g_elempack = channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
  495. out_g_elempack = num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
  496. #else
  497. g_elempack = channels_g % 4 == 0 ? 4 : 1;
  498. out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
  499. #endif
  500. }
  501. #endif // __SSE2__
  502. // unpacking
  503. Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  504. if (elempack > g_elempack)
  505. {
  506. Option opt_p = opt;
  507. opt_p.blob_allocator = opt.workspace_allocator;
  508. convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
  509. }
  510. Mat top_blob_unpacked = top_blob;
  511. if (out_g_elempack < out_elempack)
  512. {
  513. top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
  514. if (top_blob_unpacked.empty())
  515. return -100;
  516. }
  517. for (int g = 0; g < group; g++)
  518. {
  519. const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  520. Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  521. const ncnn::Layer* op = group_ops[g];
  522. Option opt_g = opt;
  523. opt_g.blob_allocator = top_blob_unpacked.allocator;
  524. // forward
  525. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  526. }
  527. // packing
  528. if (out_g_elempack < out_elempack)
  529. {
  530. convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
  531. }
  532. else
  533. {
  534. top_blob = top_blob_unpacked;
  535. }
  536. return 0;
  537. }
  538. #if NCNN_INT8
  539. int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
  540. {
  541. const int maxk = kernel_w * kernel_h;
  542. int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
  543. // depth-wise
  544. if (channels == group && group == num_output)
  545. {
  546. int elempack = 1;
  547. #if __SSE2__
  548. if (opt.use_packing_layout)
  549. {
  550. elempack = channels % 8 == 0 ? 8 : 1;
  551. }
  552. #endif // __SSE2__
  553. if (elempack == 8)
  554. {
  555. Mat weight_data_r2 = weight_data.reshape(maxk, group);
  556. convert_packing(weight_data_r2, weight_data_int8, 8, opt);
  557. }
  558. return 0;
  559. }
  560. // group convolution
  561. create_group_ops(opt);
  562. return 0;
  563. }
  564. int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  565. {
  566. int w = bottom_blob.w;
  567. int h = bottom_blob.h;
  568. int channels = bottom_blob.c;
  569. size_t elemsize = bottom_blob.elemsize;
  570. int elempack = bottom_blob.elempack;
  571. int elembits = bottom_blob.elembits();
  572. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  573. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  574. Mat bottom_blob_int8 = bottom_blob;
  575. if (elembits != 8)
  576. {
  577. const int channels_g = channels * elempack / group;
  578. Mat scales(channels * elempack);
  579. {
  580. float* ps = scales;
  581. for (int g = 0; g < group; g++)
  582. {
  583. float scale = bottom_blob_int8_scales[g];
  584. for (int q = 0; q < channels_g; q++)
  585. {
  586. *ps++ = scale;
  587. }
  588. }
  589. }
  590. Option opt_q = opt;
  591. opt_q.blob_allocator = opt.workspace_allocator;
  592. quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
  593. }
  594. Mat bottom_blob_bordered;
  595. make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
  596. if (bottom_blob_bordered.empty())
  597. return -100;
  598. w = bottom_blob_bordered.w;
  599. h = bottom_blob_bordered.h;
  600. channels = bottom_blob_bordered.c;
  601. elempack = bottom_blob_bordered.elempack;
  602. int outw = (w - kernel_extent_w) / stride_w + 1;
  603. int outh = (h - kernel_extent_h) / stride_h + 1;
  604. int out_elempack = 1;
  605. #if __SSE2__
  606. if (opt.use_packing_layout)
  607. {
  608. out_elempack = num_output % 8 == 0 ? 8 : 1;
  609. }
  610. #endif // __SSE2__
  611. bool use_int8_requantize = int8_scale_term > 100;
  612. size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
  613. top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
  614. if (top_blob.empty())
  615. return -100;
  616. // depth-wise
  617. if (channels * elempack == group && group == num_output)
  618. {
  619. #if __SSE2__
  620. if (elempack == 8)
  621. {
  622. {
  623. const int maxk = kernel_w * kernel_h;
  624. // kernel offsets
  625. std::vector<int> _space_ofs(maxk);
  626. int* space_ofs = &_space_ofs[0];
  627. {
  628. int p1 = 0;
  629. int p2 = 0;
  630. int gap = w * dilation_h - kernel_w * dilation_w;
  631. for (int i = 0; i < kernel_h; i++)
  632. {
  633. for (int j = 0; j < kernel_w; j++)
  634. {
  635. space_ofs[p1] = p2;
  636. p1++;
  637. p2 += dilation_w;
  638. }
  639. p2 += gap;
  640. }
  641. }
  642. #pragma omp parallel for num_threads(opt.num_threads)
  643. for (int g = 0; g < channels; g++)
  644. {
  645. signed char* outptr_s8 = top_blob.channel(g);
  646. float* outptr_f32 = top_blob.channel(g);
  647. const signed char* kptr = (const signed char*)weight_data_int8 + maxk * g * 8;
  648. const Mat m = bottom_blob_bordered.channel(g);
  649. for (int i = 0; i < outh; i++)
  650. {
  651. for (int j = 0; j < outw; j++)
  652. {
  653. __m128i _sum0 = _mm_setzero_si128();
  654. __m128i _sum1 = _mm_setzero_si128();
  655. const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;
  656. for (int k = 0; k < maxk; k++)
  657. {
  658. // TODO use _mm_cvtepi8_epi16 on sse4.1
  659. __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
  660. _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));
  661. __m128i _w = _mm_loadl_epi64((const __m128i*)(kptr + k * 8));
  662. _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));
  663. __m128i _sl = _mm_mullo_epi16(_val, _w);
  664. __m128i _sh = _mm_mulhi_epi16(_val, _w);
  665. __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
  666. __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);
  667. _sum0 = _mm_add_epi32(_sum0, _s0);
  668. _sum1 = _mm_add_epi32(_sum1, _s1);
  669. }
  670. __m128 _scale_in0;
  671. __m128 _scale_in1;
  672. {
  673. __m128 _bottom_blob_int8_scales0 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8);
  674. __m128 _bottom_blob_int8_scales1 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8 + 4);
  675. __m128 _weight_data_int8_scales0 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8);
  676. __m128 _weight_data_int8_scales1 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8 + 4);
  677. _scale_in0 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
  678. _scale_in1 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales1, _weight_data_int8_scales1));
  679. __m128 _m0 = _mm_cmpneq_ps(_weight_data_int8_scales0, _mm_setzero_ps());
  680. __m128 _m1 = _mm_cmpneq_ps(_weight_data_int8_scales1, _mm_setzero_ps());
  681. _scale_in0 = _mm_and_ps(_scale_in0, _m0);
  682. _scale_in1 = _mm_and_ps(_scale_in1, _m1);
  683. }
  684. __m128 _sumfp32_0 = _mm_mul_ps(_mm_cvtepi32_ps(_sum0), _scale_in0);
  685. __m128 _sumfp32_1 = _mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale_in1);
  686. if (bias_term)
  687. {
  688. __m128 _bias0 = _mm_loadu_ps((const float*)bias_data + g * 8);
  689. __m128 _bias1 = _mm_loadu_ps((const float*)bias_data + g * 8 + 4);
  690. _sumfp32_0 = _mm_add_ps(_sumfp32_0, _bias0);
  691. _sumfp32_1 = _mm_add_ps(_sumfp32_1, _bias1);
  692. }
  693. _sumfp32_0 = activation_sse(_sumfp32_0, activation_type, activation_params);
  694. _sumfp32_1 = activation_sse(_sumfp32_1, activation_type, activation_params);
  695. if (use_int8_requantize)
  696. {
  697. // requantize and relu
  698. __m128 _scale_out0 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8);
  699. __m128 _scale_out1 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8 + 4);
  700. _sumfp32_0 = _mm_mul_ps(_sumfp32_0, _scale_out0);
  701. _sumfp32_1 = _mm_mul_ps(_sumfp32_1, _scale_out1);
  702. int64_t _sum8 = float2int8_sse(_sumfp32_0, _sumfp32_1);
  703. *(int64_t*)outptr_s8 = _sum8;
  704. outptr_s8 += 8;
  705. }
  706. else
  707. {
  708. // dequantize and relu
  709. _mm_storeu_ps(outptr_f32, _sumfp32_0);
  710. _mm_storeu_ps(outptr_f32 + 4, _sumfp32_1);
  711. outptr_f32 += 8;
  712. }
  713. }
  714. }
  715. }
  716. }
  717. }
  718. #endif // __SSE2__
  719. if (elempack == 1)
  720. {
  721. if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && (activation_type == 0 || activation_type == 1))
  722. {
  723. if (use_int8_requantize)
  724. {
  725. std::vector<float> requantize_scales;
  726. for (int g = 0; g < group; g++)
  727. {
  728. float scale_in;
  729. if (weight_data_int8_scales[g] == 0)
  730. scale_in = 0;
  731. else
  732. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  733. float scale_out = top_blob_int8_scales[g];
  734. requantize_scales.push_back(scale_in);
  735. requantize_scales.push_back(scale_out);
  736. }
  737. convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  738. }
  739. else
  740. {
  741. std::vector<float> dequantize_scales;
  742. for (int g = 0; g < group; g++)
  743. {
  744. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  745. dequantize_scales.push_back(top_rescale);
  746. }
  747. convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  748. }
  749. if (activation)
  750. {
  751. activation->forward_inplace(top_blob, opt);
  752. }
  753. }
  754. else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
  755. {
  756. if (use_int8_requantize)
  757. {
  758. std::vector<float> requantize_scales;
  759. for (int g = 0; g < group; g++)
  760. {
  761. float scale_in;
  762. if (weight_data_int8_scales[g] == 0)
  763. scale_in = 0;
  764. else
  765. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  766. float scale_out = top_blob_int8_scales[g];
  767. requantize_scales.push_back(scale_in);
  768. requantize_scales.push_back(scale_out);
  769. }
  770. convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
  771. }
  772. else
  773. {
  774. std::vector<float> dequantize_scales;
  775. for (int g = 0; g < group; g++)
  776. {
  777. float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  778. dequantize_scales.push_back(top_rescale);
  779. }
  780. convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
  781. }
  782. if (activation)
  783. {
  784. activation->forward_inplace(top_blob, opt);
  785. }
  786. }
  787. else
  788. {
  789. const int maxk = kernel_w * kernel_h;
  790. // kernel offsets
  791. std::vector<int> _space_ofs(maxk);
  792. int* space_ofs = &_space_ofs[0];
  793. {
  794. int p1 = 0;
  795. int p2 = 0;
  796. int gap = w * dilation_h - kernel_w * dilation_w;
  797. for (int i = 0; i < kernel_h; i++)
  798. {
  799. for (int j = 0; j < kernel_w; j++)
  800. {
  801. space_ofs[p1] = p2;
  802. p1++;
  803. p2 += dilation_w;
  804. }
  805. p2 += gap;
  806. }
  807. }
  808. #pragma omp parallel for num_threads(opt.num_threads)
  809. for (int g = 0; g < group; g++)
  810. {
  811. signed char* outptr_s8 = top_blob.channel(g);
  812. float* outptr_f32 = top_blob.channel(g);
  813. const signed char* kptr = (const signed char*)weight_data + maxk * g;
  814. const Mat m = bottom_blob_bordered.channel(g);
  815. for (int i = 0; i < outh; i++)
  816. {
  817. for (int j = 0; j < outw; j++)
  818. {
  819. int sum = 0;
  820. const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;
  821. for (int k = 0; k < maxk; k++)
  822. {
  823. signed char val = sptr[space_ofs[k]];
  824. signed char w = kptr[k];
  825. sum += val * w;
  826. }
  827. float scale_in;
  828. if (weight_data_int8_scales[g] == 0)
  829. scale_in = 0;
  830. else
  831. scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
  832. float sumfp32 = sum * scale_in;
  833. if (bias_term)
  834. sumfp32 += bias_data[g];
  835. sumfp32 = activation_ss(sumfp32, activation_type, activation_params);
  836. if (use_int8_requantize)
  837. {
  838. // requantize
  839. float scale_out = top_blob_int8_scales[g];
  840. signed char sums8 = float2int8(sumfp32 * scale_out);
  841. outptr_s8[0] = sums8;
  842. outptr_s8 += 1;
  843. }
  844. else
  845. {
  846. // dequantize
  847. outptr_f32[0] = sumfp32;
  848. outptr_f32 += 1;
  849. }
  850. }
  851. }
  852. }
  853. }
  854. }
  855. return 0;
  856. }
  857. // group convolution
  858. const int channels_g = channels * elempack / group;
  859. const int num_output_g = num_output / group;
  860. int g_elempack = 1;
  861. int out_g_elempack = 1;
  862. #if __SSE2__
  863. if (opt.use_packing_layout)
  864. {
  865. g_elempack = channels_g % 8 == 0 ? 8 : 1;
  866. out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
  867. }
  868. #endif // __SSE2__
  869. // unpacking
  870. Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
  871. if (elempack > g_elempack)
  872. {
  873. Option opt_p = opt;
  874. opt_p.blob_allocator = opt.workspace_allocator;
  875. convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
  876. }
  877. Mat top_blob_unpacked = top_blob;
  878. if (out_g_elempack < out_elempack)
  879. {
  880. top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
  881. if (top_blob_unpacked.empty())
  882. return -100;
  883. }
  884. #pragma omp parallel for num_threads(opt.num_threads)
  885. for (int g = 0; g < group; g++)
  886. {
  887. const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
  888. Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);
  889. const ncnn::Layer* op = group_ops[g];
  890. Option opt_g = opt;
  891. opt_g.blob_allocator = top_blob.allocator;
  892. // forward
  893. op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
  894. }
  895. // packing
  896. if (out_g_elempack < out_elempack)
  897. {
  898. convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
  899. }
  900. else
  901. {
  902. top_blob = top_blob_unpacked;
  903. }
  904. return 0;
  905. }
  906. #endif // NCNN_INT8
  907. } // namespace ncnn