diff --git a/src/layer/arm/concat_arm.cpp b/src/layer/arm/concat_arm.cpp index 9c55cc98d..afd0ae0b2 100644 --- a/src/layer/arm/concat_arm.cpp +++ b/src/layer/arm/concat_arm.cpp @@ -27,49 +27,11 @@ Concat_arm::Concat_arm() #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC support_fp16_storage = true; #endif - - packing_pack4 = 0; #endif // __ARM_NEON support_bf16_storage = true; } -int Concat_arm::create_pipeline(const Option& opt) -{ -#if __ARM_NEON - if (opt.use_packing_layout) - { - packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing); - - ncnn::ParamDict pd; - pd.set(0, 4); - - packing_pack4->load_param(pd); - - packing_pack4->create_pipeline(opt); - } -#endif // __ARM_NEON - - return 0; -} - -int Concat_arm::destroy_pipeline(const Option& opt) -{ -#if __ARM_NEON - if (opt.use_packing_layout) - { - if (packing_pack4) - { - packing_pack4->destroy_pipeline(opt); - delete packing_pack4; - packing_pack4 = 0; - } - } -#endif // __ARM_NEON - - return 0; -} - int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { int elembits = bottom_blobs[0].elembits(); @@ -84,678 +46,782 @@ int Concat_arm::forward(const std::vector& bottom_blobs, std::vector& int dims = bottom_blobs[0].dims; -#if __ARM_NEON - if (opt.use_packing_layout) + if (dims == 1) // axis == 0 { - if (dims == 1) // axis == 0 + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat vector - // total length - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_w = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w * bottom_blob.elempack; - } + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } - int out_elempack = top_w % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - float* outptr = top_blob; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - - const float* ptr = bottom_blob; - memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + float* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - outptr += bottom_blob.w * bottom_blob.elempack; - } + const float* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); - return 0; + outptr += bottom_blob.w * bottom_blob.elempack; } + } - if (dims == 2 && axis == 0) + if (dims == 2 && axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat image - int w = bottom_blobs[0].w; + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } - // total height - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_h = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - elemsize = std::min(elemsize, bottom_blob.elemsize); - elempack = std::min(elempack, bottom_blob.elempack); - top_h += bottom_blob.h * bottom_blob.elempack; - } + int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - int out_elempack = top_h % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) return -100; + } - Mat top_blob_unpacked = top_blob; - if (elempack == 1 && out_elempack == 4) - { - top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } + float* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - float* outptr = top_blob_unpacked; - for (size_t b = 0; b < bottom_blobs.size(); b++) + if (bottom_blob.elempack == 4 && elempack == 1) { - const Mat& bottom_blob = bottom_blobs[b]; - - if (bottom_blob.elempack == 4 && elempack == 1) + for (int i = 0; i < bottom_blob.h; i++) { - for (int i = 0; i < bottom_blob.h; i++) - { - const float* r0 = bottom_blob.row(i); - - float* outptr0 = outptr; - float* outptr1 = outptr + w; - float* outptr2 = outptr + w * 2; - float* outptr3 = outptr + w * 3; + const float* r0 = bottom_blob.row(i); - for (int j = 0; j < w; j++) - { - *outptr0++ = r0[0]; - *outptr1++ = r0[1]; - *outptr2++ = r0[2]; - *outptr3++ = r0[3]; + float* outptr0 = outptr; + float* outptr1 = outptr + w; + float* outptr2 = outptr + w * 2; + float* outptr3 = outptr + w * 3; - r0 += 4; - } + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; - outptr += w * 4; + r0 += 4; } - } - else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) - { - int size = w * bottom_blob.h; - - const float* ptr = bottom_blob; - memcpy(outptr, ptr, size * bottom_blob.elemsize); - outptr += size * bottom_blob.elempack; + outptr += w * 4; } } - - // packing - if (elempack == 1 && out_elempack == 4) + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) { - packing_pack4->forward(top_blob_unpacked, top_blob, opt); + int size = w * bottom_blob.h; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; } + } - return 0; + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); } + } - if (dims == 2 && axis == 1) + if (dims == 2 && axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave image row - int h = bottom_blobs[0].h; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - // total width - int top_w = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* outptr = top_blob.row(i); for (size_t b = 0; b < bottom_blobs.size(); b++) { const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w; - } - - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - float* outptr = top_blob.row(i); - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - const float* ptr = bottom_blob.row(i); - memcpy(outptr, ptr, bottom_blob.w * elemsize); + const float* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); - outptr += bottom_blob.w * elempack; - } + outptr += bottom_blob.w * elempack; } - - return 0; } + } - if (dims == 3 && axis == 0) + if (dims == 3 && axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat dim - int w = bottom_blobs[0].w; - int h = bottom_blobs[0].h; + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } - // total channels - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_channels = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - elemsize = std::min(elemsize, bottom_blob.elemsize); - elempack = std::min(elempack, bottom_blob.elempack); - top_channels += bottom_blob.c * bottom_blob.elempack; - } + int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - int out_elempack = top_channels % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) return -100; + } - Mat top_blob_unpacked = top_blob; - if (elempack == 1 && out_elempack == 4) - { - top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - int p = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) + if (bottom_blob.elempack == 4 && elempack == 1) { - const Mat& bottom_blob = bottom_blobs[b]; + int size = bottom_blob.w * bottom_blob.h; - if (bottom_blob.elempack == 4 && elempack == 1) + for (int q = 0; q < bottom_blob.c; q++) { - int size = bottom_blob.w * bottom_blob.h; - - for (int q = 0; q < bottom_blob.c; q++) - { - const float* r0 = bottom_blob.channel(q); - - float* outptr0 = top_blob_unpacked.channel(p); - float* outptr1 = top_blob_unpacked.channel(p + 1); - float* outptr2 = top_blob_unpacked.channel(p + 2); - float* outptr3 = top_blob_unpacked.channel(p + 3); + const float* r0 = bottom_blob.channel(q); - for (int i = 0; i < size; i++) - { - *outptr0++ = r0[0]; - *outptr1++ = r0[1]; - *outptr2++ = r0[2]; - *outptr3++ = r0[3]; + float* outptr0 = top_blob_unpacked.channel(p); + float* outptr1 = top_blob_unpacked.channel(p + 1); + float* outptr2 = top_blob_unpacked.channel(p + 2); + float* outptr3 = top_blob_unpacked.channel(p + 3); - r0 += 4; - } + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; - p += 4; + r0 += 4; } - } - else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) - { - int size = bottom_blob.total(); - const float* ptr = bottom_blob; - float* outptr = top_blob_unpacked.channel(p); - memcpy(outptr, ptr, size * bottom_blob.elemsize); - - p += bottom_blob.c; + p += 4; } } - - // packing - if (elempack == 1 && out_elempack == 4) + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) { - packing_pack4->forward(top_blob_unpacked, top_blob, opt); + int size = bottom_blob.total(); + + const float* ptr = bottom_blob; + float* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; } + } - return 0; + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); } + } - if (dims == 3 && axis == 1) + if (dims == 3 && axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave dim height - int w = bottom_blobs[0].w; - int channels = bottom_blobs[0].c; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } - // total height - int top_h = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_h += bottom_blob.h; - } + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (size_t b = 0; b < bottom_blobs.size(); b++) { - float* outptr = top_blob.channel(q); - - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + const Mat& bottom_blob = bottom_blobs[b]; - int size = bottom_blob.w * bottom_blob.h; + int size = bottom_blob.w * bottom_blob.h; - const float* ptr = bottom_blob.channel(q); - memcpy(outptr, ptr, size * elemsize); + const float* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); - outptr += size * elempack; - } + outptr += size * elempack; } - - return 0; } + } - if (dims == 3 && axis == 2) + if (dims == 3 && axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave dim width - int h = bottom_blobs[0].h; - int channels = bottom_blobs[0].c; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } - // total height - int top_w = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w; - } + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < h; i++) { - float* outptr = top_blob.channel(q); - - for (int i = 0; i < h; i++) + for (size_t b = 0; b < bottom_blobs.size(); b++) { - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + const Mat& bottom_blob = bottom_blobs[b]; - const float* ptr = bottom_blob.channel(q).row(i); - memcpy(outptr, ptr, bottom_blob.w * elemsize); + const float* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); - outptr += bottom_blob.w * elempack; - } + outptr += bottom_blob.w * elempack; } } - - return 0; } + } - } // opt.use_packing_layout -#endif // __ARM_NEON - - return Concat::forward(bottom_blobs, top_blobs, opt); + return 0; } int Concat_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { int dims = bottom_blobs[0].dims; -#if __ARM_NEON - if (opt.use_packing_layout) + if (dims == 1) // axis == 0 { - if (dims == 1) // axis == 0 + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat vector - // total length - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_w = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w * bottom_blob.elempack; - } - - int out_elempack = top_w % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = opt.use_fp16_arithmetic && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; - unsigned short* outptr = top_blob; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const unsigned short* ptr = bottom_blob; - memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + unsigned short* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - outptr += bottom_blob.w * bottom_blob.elempack; - } + const unsigned short* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); - return 0; + outptr += bottom_blob.w * bottom_blob.elempack; } + } - if (dims == 2 && axis == 0) + if (dims == 2 && axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat image - int w = bottom_blobs[0].w; + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } - // total height - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_h = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - elemsize = std::min(elemsize, bottom_blob.elemsize); - elempack = std::min(elempack, bottom_blob.elempack); - top_h += bottom_blob.h * bottom_blob.elempack; - } + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = opt.use_fp16_arithmetic && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; - int out_elempack = top_h % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) return -100; + } - Mat top_blob_unpacked = top_blob; - if (elempack == 1 && out_elempack == 4) - { - top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } + unsigned short* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - unsigned short* outptr = top_blob_unpacked; - for (size_t b = 0; b < bottom_blobs.size(); b++) +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (bottom_blob.elempack == 8 && elempack == 4) { - const Mat& bottom_blob = bottom_blobs[b]; - - if (bottom_blob.elempack == 4 && elempack == 1) + for (int i = 0; i < bottom_blob.h; i++) { - for (int i = 0; i < bottom_blob.h; i++) - { - const unsigned short* r0 = bottom_blob.row(i); + const unsigned short* r0 = bottom_blob.row(i); - unsigned short* outptr0 = outptr; - unsigned short* outptr1 = outptr + w; - unsigned short* outptr2 = outptr + w * 2; - unsigned short* outptr3 = outptr + w * 3; + unsigned short* outptr0 = outptr; + unsigned short* outptr1 = outptr + w * 4; - for (int j = 0; j < w; j++) - { - *outptr0++ = r0[0]; - *outptr1++ = r0[1]; - *outptr2++ = r0[2]; - *outptr3++ = r0[3]; - - r0 += 4; - } + for (int j = 0; j < w; j++) + { + outptr0[0] = r0[0]; + outptr0[1] = r0[1]; + outptr0[2] = r0[2]; + outptr0[3] = r0[3]; + outptr1[0] = r0[4]; + outptr1[1] = r0[5]; + outptr1[2] = r0[6]; + outptr1[3] = r0[7]; + + outptr0 += 4; + outptr1 += 4; + r0 += 8; + } - outptr += w * 4; + outptr += w * 8; + } + } + if (bottom_blob.elempack == 8 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) + { + const unsigned short* r0 = bottom_blob.row(i); + + unsigned short* outptr0 = outptr; + unsigned short* outptr1 = outptr + w; + unsigned short* outptr2 = outptr + w * 2; + unsigned short* outptr3 = outptr + w * 3; + unsigned short* outptr4 = outptr + w * 4; + unsigned short* outptr5 = outptr + w * 5; + unsigned short* outptr6 = outptr + w * 6; + unsigned short* outptr7 = outptr + w * 7; + + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; } + + outptr += w * 8; } - else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (bottom_blob.elempack == 4 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) { - int size = w * bottom_blob.h; + const unsigned short* r0 = bottom_blob.row(i); - const unsigned short* ptr = bottom_blob; - memcpy(outptr, ptr, size * bottom_blob.elemsize); + unsigned short* outptr0 = outptr; + unsigned short* outptr1 = outptr + w; + unsigned short* outptr2 = outptr + w * 2; + unsigned short* outptr3 = outptr + w * 3; - outptr += size * bottom_blob.elempack; + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + outptr += w * 4; } } - - // packing - if (elempack == 1 && out_elempack == 4) + if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8 { - packing_pack4->forward(top_blob_unpacked, top_blob, opt); + int size = w * bottom_blob.h; + + const unsigned short* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; } + } - return 0; + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); } + } - if (dims == 2 && axis == 1) + if (dims == 2 && axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave image row - int h = bottom_blobs[0].h; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - // total width - int top_w = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + unsigned short* outptr = top_blob.row(i); for (size_t b = 0; b < bottom_blobs.size(); b++) { const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w; - } - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - unsigned short* outptr = top_blob.row(i); - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + const unsigned short* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); - const unsigned short* ptr = bottom_blob.row(i); - memcpy(outptr, ptr, bottom_blob.w * elemsize); - - outptr += bottom_blob.w * elempack; - } + outptr += bottom_blob.w * elempack; } - - return 0; } + } - if (dims == 3 && axis == 0) + if (dims == 3 && axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // concat dim - int w = bottom_blobs[0].w; - int h = bottom_blobs[0].h; + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } - // total channels - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; - int top_channels = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - elemsize = std::min(elemsize, bottom_blob.elemsize); - elempack = std::min(elempack, bottom_blob.elempack); - top_channels += bottom_blob.c * bottom_blob.elempack; - } + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = opt.use_fp16_arithmetic && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; - int out_elempack = top_channels % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) return -100; + } - Mat top_blob_unpacked = top_blob; - if (elempack == 1 && out_elempack == 4) - { - top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; - int p = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (bottom_blob.elempack == 8 && elempack == 4) { - const Mat& bottom_blob = bottom_blobs[b]; + int size = bottom_blob.w * bottom_blob.h; - if (bottom_blob.elempack == 4 && elempack == 1) + for (int q = 0; q < bottom_blob.c; q++) { - int size = bottom_blob.w * bottom_blob.h; + const unsigned short* r0 = bottom_blob.channel(q); - for (int q = 0; q < bottom_blob.c; q++) - { - const unsigned short* r0 = bottom_blob.channel(q); + unsigned short* outptr0 = top_blob_unpacked.channel(p); + unsigned short* outptr1 = top_blob_unpacked.channel(p + 1); - unsigned short* outptr0 = top_blob_unpacked.channel(p); - unsigned short* outptr1 = top_blob_unpacked.channel(p + 1); - unsigned short* outptr2 = top_blob_unpacked.channel(p + 2); - unsigned short* outptr3 = top_blob_unpacked.channel(p + 3); - - for (int i = 0; i < size; i++) - { - *outptr0++ = r0[0]; - *outptr1++ = r0[1]; - *outptr2++ = r0[2]; - *outptr3++ = r0[3]; + for (int i = 0; i < size; i++) + { + outptr0[0] = r0[0]; + outptr0[1] = r0[1]; + outptr0[2] = r0[2]; + outptr0[3] = r0[3]; + outptr1[0] = r0[4]; + outptr1[1] = r0[5]; + outptr1[2] = r0[6]; + outptr1[3] = r0[7]; + + outptr0 += 4; + outptr1 += 4; + r0 += 8; + } - r0 += 4; - } + p += 2; + } + } + if (bottom_blob.elempack == 8 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; - p += 4; + for (int q = 0; q < bottom_blob.c; q++) + { + const unsigned short* r0 = bottom_blob.channel(q); + + unsigned short* outptr0 = top_blob_unpacked.channel(p); + unsigned short* outptr1 = top_blob_unpacked.channel(p + 1); + unsigned short* outptr2 = top_blob_unpacked.channel(p + 2); + unsigned short* outptr3 = top_blob_unpacked.channel(p + 3); + unsigned short* outptr4 = top_blob_unpacked.channel(p + 4); + unsigned short* outptr5 = top_blob_unpacked.channel(p + 5); + unsigned short* outptr6 = top_blob_unpacked.channel(p + 6); + unsigned short* outptr7 = top_blob_unpacked.channel(p + 7); + + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; } + + p += 8; } - else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + } +#endif + if (bottom_blob.elempack == 4 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; + + for (int q = 0; q < bottom_blob.c; q++) { - int size = bottom_blob.total(); + const unsigned short* r0 = bottom_blob.channel(q); - const unsigned short* ptr = bottom_blob; - unsigned short* outptr = top_blob_unpacked.channel(p); - memcpy(outptr, ptr, size * bottom_blob.elemsize); + unsigned short* outptr0 = top_blob_unpacked.channel(p); + unsigned short* outptr1 = top_blob_unpacked.channel(p + 1); + unsigned short* outptr2 = top_blob_unpacked.channel(p + 2); + unsigned short* outptr3 = top_blob_unpacked.channel(p + 3); - p += bottom_blob.c; + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + p += 4; } } - - // packing - if (elempack == 1 && out_elempack == 4) + if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8 { - packing_pack4->forward(top_blob_unpacked, top_blob, opt); + int size = bottom_blob.total(); + + const unsigned short* ptr = bottom_blob; + unsigned short* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; } + } - return 0; + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); } + } - if (dims == 3 && axis == 1) + if (dims == 3 && axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave dim height - int w = bottom_blobs[0].w; - int channels = bottom_blobs[0].c; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } - // total height - int top_h = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_h += bottom_blob.h; - } + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* outptr = top_blob.channel(q); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (size_t b = 0; b < bottom_blobs.size(); b++) { - unsigned short* outptr = top_blob.channel(q); - - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + const Mat& bottom_blob = bottom_blobs[b]; - int size = bottom_blob.w * bottom_blob.h; + int size = bottom_blob.w * bottom_blob.h; - const unsigned short* ptr = bottom_blob.channel(q); - memcpy(outptr, ptr, size * elemsize); + const unsigned short* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); - outptr += size * elempack; - } + outptr += size * elempack; } - - return 0; } + } - if (dims == 3 && axis == 2) + if (dims == 3 && axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) { - // interleave dim width - int h = bottom_blobs[0].h; - int channels = bottom_blobs[0].c; - size_t elemsize = bottom_blobs[0].elemsize; - int elempack = bottom_blobs[0].elempack; + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } - // total height - int top_w = 0; - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; - top_w += bottom_blob.w; - } + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - Mat& top_blob = top_blobs[0]; - top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* outptr = top_blob.channel(q); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < h; i++) { - unsigned short* outptr = top_blob.channel(q); - - for (int i = 0; i < h; i++) + for (size_t b = 0; b < bottom_blobs.size(); b++) { - for (size_t b = 0; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob = bottom_blobs[b]; + const Mat& bottom_blob = bottom_blobs[b]; - const unsigned short* ptr = bottom_blob.channel(q).row(i); - memcpy(outptr, ptr, bottom_blob.w * elemsize); + const unsigned short* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); - outptr += bottom_blob.w * elempack; - } + outptr += bottom_blob.w * elempack; } } - - return 0; } + } - } // opt.use_packing_layout -#endif // __ARM_NEON - - return Concat::forward(bottom_blobs, top_blobs, opt); + return 0; } } // namespace ncnn diff --git a/src/layer/arm/concat_arm.h b/src/layer/arm/concat_arm.h index c42eae68d..c09dfa275 100644 --- a/src/layer/arm/concat_arm.h +++ b/src/layer/arm/concat_arm.h @@ -24,16 +24,10 @@ class Concat_arm : virtual public Concat public: Concat_arm(); - virtual int create_pipeline(const Option& opt); - virtual int destroy_pipeline(const Option& opt); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: int forward_bf16s_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - -public: - ncnn::Layer* packing_pack4; }; } // namespace ncnn diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp index bd4b12d0a..88fdabc99 100644 --- a/src/layer/arm/slice_arm.cpp +++ b/src/layer/arm/slice_arm.cpp @@ -29,49 +29,11 @@ Slice_arm::Slice_arm() #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC support_fp16_storage = true; #endif - - packing_pack1 = 0; #endif // __ARM_NEON support_bf16_storage = true; } -int Slice_arm::create_pipeline(const Option& opt) -{ -#if __ARM_NEON - if (opt.use_packing_layout) - { - packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing); - - ncnn::ParamDict pd; - pd.set(0, 1); - - packing_pack1->load_param(pd); - - packing_pack1->create_pipeline(opt); - } -#endif // __ARM_NEON - - return 0; -} - -int Slice_arm::destroy_pipeline(const Option& opt) -{ -#if __ARM_NEON - if (opt.use_packing_layout) - { - if (packing_pack1) - { - packing_pack1->destroy_pipeline(opt); - delete packing_pack1; - packing_pack1 = 0; - } - } -#endif // __ARM_NEON - - return 0; -} - int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { int elembits = bottom_blobs[0].elembits(); @@ -90,346 +52,328 @@ int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& t int elempack = bottom_blob.elempack; const int* slices_ptr = slices; -#if __ARM_NEON - if (opt.use_packing_layout) + if (dims == 1) // axis == 0 { - if (dims == 1) // axis == 0 + // slice vector + int w = bottom_blob.w * elempack; + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) { - // slice vector - int w = bottom_blob.w * elempack; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); - } + slice = (w - q) / (top_blobs.size() - i); + } - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + int out_elempack = opt.use_packing_layout && slice % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[i]; - top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const float* ptr = (const float*)bottom_blob + q; - float* outptr = top_blob; - memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); + const float* ptr = (const float*)bottom_blob + q; + float* outptr = top_blob; + memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); - q += slice; - } - - return 0; + q += slice; } + } - if (dims == 2 && axis == 0) - { - // slice image height - int w = bottom_blob.w; - int h = bottom_blob.h * elempack; + if (dims == 2 && axis == 0) + { + // slice image height + int w = bottom_blob.w; + int h = bottom_blob.h * elempack; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (h - q) / (top_blobs.size() - i); - } + slice = (h - q) / (top_blobs.size() - i); + } - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + int out_elempack = opt.use_packing_layout && slice % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - q += slice; - } + q += slice; + } - size_t out_elemsize = top_blobs[0].elemsize; - int out_elempack = top_blobs[0].elempack; - for (size_t i = 0; i < top_blobs.size(); i++) - { - out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); - out_elempack = std::min(out_elempack, top_blobs[i].elempack); - } + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } - Mat bottom_blob_unpacked = bottom_blob; - if (elempack == 4 && out_elempack == 1) - { - packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt); - } + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } - const float* ptr = bottom_blob_unpacked; - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + const float* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; - if (out_elempack == 1 && top_blob.elempack == 4) + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) { - for (int j = 0; j < top_blob.h; j++) - { - const float* r0 = ptr; - const float* r1 = ptr + w; - const float* r2 = ptr + w * 2; - const float* r3 = ptr + w * 3; - - float* outptr0 = top_blob.row(j); + const float* r0 = ptr; + const float* r1 = ptr + w; + const float* r2 = ptr + w * 2; + const float* r3 = ptr + w * 3; - for (int j = 0; j < w; j++) - { - outptr0[0] = *r0++; - outptr0[1] = *r1++; - outptr0[2] = *r2++; - outptr0[3] = *r3++; + float* outptr0 = top_blob.row(j); - outptr0 += 4; - } + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; - ptr += w * 4; + outptr0 += 4; } - } - else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) - { - int size = w * top_blob.h; - float* outptr = top_blob; - memcpy(outptr, ptr, size * top_blob.elemsize); - - ptr += size * top_blob.elempack; + ptr += w * 4; } } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = w * top_blob.h; + + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); - return 0; + ptr += size * top_blob.elempack; + } } + } - if (dims == 2 && axis == 1) - { - // slice image width - int w = bottom_blob.w; - int h = bottom_blob.h; + if (dims == 2 && axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); - } + slice = (w - q) / (top_blobs.size() - i); + } - Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - q += slice; - } + q += slice; + } - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.row(j); + for (size_t i = 0; i < top_blobs.size(); i++) { - const float* ptr = bottom_blob.row(j); - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + Mat& top_blob = top_blobs[i]; - float* outptr = top_blob.row(j); - memcpy(outptr, ptr, top_blob.w * elemsize); + float* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); - ptr += top_blob.w * elempack; - } + ptr += top_blob.w * elempack; } - - return 0; } + } - if (dims == 3 && axis == 0) - { - // slice dim channel - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c * elempack; + if (dims == 3 && axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (channels - q) / (top_blobs.size() - i); - } + slice = (channels - q) / (top_blobs.size() - i); + } - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + int out_elempack = opt.use_packing_layout && slice % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[i]; - top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - q += slice; - } + q += slice; + } - size_t out_elemsize = top_blobs[0].elemsize; - int out_elempack = top_blobs[0].elempack; - for (size_t i = 0; i < top_blobs.size(); i++) - { - out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); - out_elempack = std::min(out_elempack, top_blobs[i].elempack); - } + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } - Mat bottom_blob_unpacked = bottom_blob; - if (elempack == 4 && out_elempack == 1) - { - packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt); - } + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } - int p = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) { - Mat& top_blob = top_blobs[i]; + int size = top_blob.w * top_blob.h; - if (out_elempack == 1 && top_blob.elempack == 4) + for (int q = 0; q < top_blob.c; q++) { - int size = top_blob.w * top_blob.h; + const float* r0 = bottom_blob_unpacked.channel(p); + const float* r1 = bottom_blob_unpacked.channel(p + 1); + const float* r2 = bottom_blob_unpacked.channel(p + 2); + const float* r3 = bottom_blob_unpacked.channel(p + 3); - for (int q = 0; q < top_blob.c; q++) + float* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) { - const float* r0 = bottom_blob_unpacked.channel(p); - const float* r1 = bottom_blob_unpacked.channel(p + 1); - const float* r2 = bottom_blob_unpacked.channel(p + 2); - const float* r3 = bottom_blob_unpacked.channel(p + 3); + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; - float* outptr0 = top_blob.channel(q); + outptr0 += 4; + } - for (int j = 0; j < size; j++) - { - outptr0[0] = *r0++; - outptr0[1] = *r1++; - outptr0[2] = *r2++; - outptr0[3] = *r3++; + p += 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = top_blob.total(); - outptr0 += 4; - } + const float* ptr = bottom_blob_unpacked.channel(p); + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); - p += 4; - } - } - else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) - { - int size = top_blob.total(); + p += top_blob.c; + } + } + } - const float* ptr = bottom_blob_unpacked.channel(p); - float* outptr = top_blob; - memcpy(outptr, ptr, size * top_blob.elemsize); + if (dims == 3 && axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; - p += top_blob.c; - } + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); } - return 0; + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; } - if (dims == 3 && axis == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) { - // slice dim height - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; + const float* ptr = bottom_blob.channel(p); - int q = 0; for (size_t i = 0; i < top_blobs.size(); i++) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (h - q) / (top_blobs.size() - i); - } - Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - q += slice; - } + int size = top_blob.w * top_blob.h; - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < channels; p++) - { - const float* ptr = bottom_blob.channel(p); + float* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; - - int size = top_blob.w * top_blob.h; - - float* outptr = top_blob.channel(p); - memcpy(outptr, ptr, size * elemsize); - - ptr += size * elempack; - } + ptr += size * elempack; } - - return 0; } + } - if (dims == 3 && axis == 2) - { - // slice dim width - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; + if (dims == 3 && axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); - } + slice = (w - q) / (top_blobs.size() - i); + } - Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - q += slice; - } + q += slice; + } - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < channels; p++) - { - const float* ptr = bottom_blob.channel(p); + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); - for (int j = 0; j < h; j++) + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) { - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + Mat& top_blob = top_blobs[i]; - float* outptr = top_blob.channel(p).row(j); - memcpy(outptr, ptr, top_blob.w * elemsize); + float* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); - ptr += top_blob.w * elempack; - } + ptr += top_blob.w * elempack; } } - - return 0; } + } - } // opt.use_packing_layout -#endif // __ARM_NEON - - return Slice::forward(bottom_blobs, top_blobs, opt); + return 0; } int Slice_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -440,346 +384,464 @@ int Slice_arm::forward_bf16s_fp16s(const std::vector& bottom_blobs, std::ve int elempack = bottom_blob.elempack; const int* slices_ptr = slices; -#if __ARM_NEON - if (opt.use_packing_layout) + if (dims == 1) // axis == 0 { - if (dims == 1) // axis == 0 + // slice vector + int w = bottom_blob.w * elempack; + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) { - // slice vector - int w = bottom_blob.w * elempack; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); - } + slice = (w - q) / (top_blobs.size() - i); + } - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; + int out_elempack = opt.use_packing_layout && slice % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[i]; - top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + Mat& top_blob = top_blobs[i]; + top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const unsigned short* ptr = (const unsigned short*)bottom_blob + q; - unsigned short* outptr = top_blob; - memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); + const unsigned short* ptr = (const unsigned short*)bottom_blob + q; + unsigned short* outptr = top_blob; + memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); - q += slice; - } - - return 0; + q += slice; } + } - if (dims == 2 && axis == 0) - { - // slice image height - int w = bottom_blob.w; - int h = bottom_blob.h * elempack; + if (dims == 2 && axis == 0) + { + // slice image height + int w = bottom_blob.w; + int h = bottom_blob.h * elempack; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (h - q) / (top_blobs.size() - i); - } - - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - q += slice; + slice = (h - q) / (top_blobs.size() - i); } - size_t out_elemsize = top_blobs[0].elemsize; - int out_elempack = top_blobs[0].elempack; - for (size_t i = 0; i < top_blobs.size(); i++) + int out_elempack = 1; + if (opt.use_packing_layout) { - out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); - out_elempack = std::min(out_elempack, top_blobs[i].elempack); + out_elempack = opt.use_fp16_arithmetic && slice % 8 == 0 ? 8 : slice % 4 == 0 ? 4 : 1; } + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat bottom_blob_unpacked = bottom_blob; - if (elempack == 4 && out_elempack == 1) - { - packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt); - } + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - const unsigned short* ptr = bottom_blob_unpacked; - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + q += slice; + } - if (out_elempack == 1 && top_blob.elempack == 4) - { - for (int j = 0; j < top_blob.h; j++) - { - const unsigned short* r0 = ptr; - const unsigned short* r1 = ptr + w; - const unsigned short* r2 = ptr + w * 2; - const unsigned short* r3 = ptr + w * 3; + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } - unsigned short* outptr0 = top_blob.row(j); + const unsigned short* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; - for (int j = 0; j < w; j++) - { - outptr0[0] = *r0++; - outptr0[1] = *r1++; - outptr0[2] = *r2++; - outptr0[3] = *r3++; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (out_elempack == 4 && top_blob.elempack == 8) + { + for (int j = 0; j < top_blob.h; j++) + { + const unsigned short* r0 = ptr; + const unsigned short* r1 = ptr + w * 4; - outptr0 += 4; - } + unsigned short* outptr0 = top_blob.row(j); - ptr += w * 4; + for (int j = 0; j < w; j++) + { + outptr0[0] = r0[0]; + outptr0[1] = r0[1]; + outptr0[2] = r0[2]; + outptr0[3] = r0[3]; + outptr0[4] = r1[0]; + outptr0[5] = r1[1]; + outptr0[6] = r1[2]; + outptr0[7] = r1[3]; + + r0 += 4; + r1 += 4; + outptr0 += 8; } + + ptr += w * 8; } - else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + } + if (out_elempack == 1 && top_blob.elempack == 8) + { + for (int j = 0; j < top_blob.h; j++) { - int size = w * top_blob.h; - - unsigned short* outptr = top_blob; - memcpy(outptr, ptr, size * top_blob.elemsize); + const unsigned short* r0 = ptr; + const unsigned short* r1 = ptr + w; + const unsigned short* r2 = ptr + w * 2; + const unsigned short* r3 = ptr + w * 3; + const unsigned short* r4 = ptr + w * 4; + const unsigned short* r5 = ptr + w * 5; + const unsigned short* r6 = ptr + w * 6; + const unsigned short* r7 = ptr + w * 7; + + unsigned short* outptr0 = top_blob.row(j); + + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + outptr0[4] = *r4++; + outptr0[5] = *r5++; + outptr0[6] = *r6++; + outptr0[7] = *r7++; + + outptr0 += 8; + } - ptr += size * top_blob.elempack; + ptr += w * 8; } } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) + { + const unsigned short* r0 = ptr; + const unsigned short* r1 = ptr + w; + const unsigned short* r2 = ptr + w * 2; + const unsigned short* r3 = ptr + w * 3; - return 0; - } + unsigned short* outptr0 = top_blob.row(j); - if (dims == 2 && axis == 1) - { - // slice image width - int w = bottom_blob.w; - int h = bottom_blob.h; + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) - { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); + outptr0 += 4; + } + + ptr += w * 4; } + } + if (out_elempack == top_blob.elempack) // 1-1 4-4 8-8 + { + int size = w * top_blob.h; - Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + unsigned short* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); - q += slice; + ptr += size * top_blob.elempack; } + } + } - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - const unsigned short* ptr = bottom_blob.row(j); - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; - - unsigned short* outptr = top_blob.row(j); - memcpy(outptr, ptr, top_blob.w * elemsize); + if (dims == 2 && axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; - ptr += top_blob.w * elempack; - } + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); } - return 0; + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; } - if (dims == 3 && axis == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) { - // slice dim channel - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c * elempack; - - int q = 0; + const unsigned short* ptr = bottom_blob.row(j); for (size_t i = 0; i < top_blobs.size(); i++) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (channels - q) / (top_blobs.size() - i); - } - - int out_elempack = slice % 4 == 0 ? 4 : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& top_blob = top_blobs[i]; - top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - q += slice; + unsigned short* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; } + } + } - size_t out_elemsize = top_blobs[0].elemsize; - int out_elempack = top_blobs[0].elempack; - for (size_t i = 0; i < top_blobs.size(); i++) + if (dims == 3 && axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); - out_elempack = std::min(out_elempack, top_blobs[i].elempack); + slice = (channels - q) / (top_blobs.size() - i); } - Mat bottom_blob_unpacked = bottom_blob; - if (elempack == 4 && out_elempack == 1) + int out_elempack = 1; + if (opt.use_packing_layout) { - packing_pack1->forward(bottom_blob, bottom_blob_unpacked, opt); + out_elempack = opt.use_fp16_arithmetic && slice % 8 == 0 ? 8 : slice % 4 == 0 ? 4 : 1; } + size_t out_elemsize = elemsize / elempack * out_elempack; - int p = 0; - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - if (out_elempack == 1 && top_blob.elempack == 4) - { - int size = top_blob.w * top_blob.h; + q += slice; + } - for (int q = 0; q < top_blob.c; q++) - { - const unsigned short* r0 = bottom_blob_unpacked.channel(p); - const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1); - const unsigned short* r2 = bottom_blob_unpacked.channel(p + 2); - const unsigned short* r3 = bottom_blob_unpacked.channel(p + 3); + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } - unsigned short* outptr0 = top_blob.channel(q); + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } - for (int j = 0; j < size; j++) - { - outptr0[0] = *r0++; - outptr0[1] = *r1++; - outptr0[2] = *r2++; - outptr0[3] = *r3++; + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; - outptr0 += 4; - } +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (out_elempack == 4 && top_blob.elempack == 8) + { + int size = top_blob.w * top_blob.h; - p += 4; - } - } - else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + for (int q = 0; q < top_blob.c; q++) { - int size = top_blob.total(); + const unsigned short* r0 = bottom_blob_unpacked.channel(p); + const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1); + + unsigned short* outptr0 = top_blob.channel(q); - const unsigned short* ptr = bottom_blob_unpacked.channel(p); - unsigned short* outptr = top_blob; - memcpy(outptr, ptr, size * top_blob.elemsize); + for (int j = 0; j < size; j++) + { + outptr0[0] = r0[0]; + outptr0[1] = r0[1]; + outptr0[2] = r0[2]; + outptr0[3] = r0[3]; + outptr0[4] = r1[0]; + outptr0[5] = r1[1]; + outptr0[6] = r1[2]; + outptr0[7] = r1[3]; + + r0 += 4; + r1 += 4; + outptr0 += 8; + } - p += top_blob.c; + p += 2; } } + if (out_elempack == 1 && top_blob.elempack == 8) + { + int size = top_blob.w * top_blob.h; - return 0; - } - - if (dims == 3 && axis == 1) - { - // slice dim height - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; + for (int q = 0; q < top_blob.c; q++) + { + const unsigned short* r0 = bottom_blob_unpacked.channel(p); + const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1); + const unsigned short* r2 = bottom_blob_unpacked.channel(p + 2); + const unsigned short* r3 = bottom_blob_unpacked.channel(p + 3); + const unsigned short* r4 = bottom_blob_unpacked.channel(p + 4); + const unsigned short* r5 = bottom_blob_unpacked.channel(p + 5); + const unsigned short* r6 = bottom_blob_unpacked.channel(p + 6); + const unsigned short* r7 = bottom_blob_unpacked.channel(p + 7); + + unsigned short* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + outptr0[4] = *r4++; + outptr0[5] = *r5++; + outptr0[6] = *r6++; + outptr0[7] = *r7++; + + outptr0 += 8; + } - int q = 0; - for (size_t i = 0; i < top_blobs.size(); i++) + p += 8; + } + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (out_elempack == 1 && top_blob.elempack == 4) { - int slice = slices_ptr[i]; - if (slice == -233) + int size = top_blob.w * top_blob.h; + + for (int q = 0; q < top_blob.c; q++) { - slice = (h - q) / (top_blobs.size() - i); - } + const unsigned short* r0 = bottom_blob_unpacked.channel(p); + const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1); + const unsigned short* r2 = bottom_blob_unpacked.channel(p + 2); + const unsigned short* r3 = bottom_blob_unpacked.channel(p + 3); - Mat& top_blob = top_blobs[i]; - top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; + unsigned short* outptr0 = top_blob.channel(q); - q += slice; - } + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < channels; p++) + p += 4; + } + } + if (out_elempack == top_blob.elempack) // 1-1 4-4 8-8 { - const unsigned short* ptr = bottom_blob.channel(p); + int size = top_blob.total(); - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + const unsigned short* ptr = bottom_blob_unpacked.channel(p); + unsigned short* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); - int size = top_blob.w * top_blob.h; + p += top_blob.c; + } + } + } - unsigned short* outptr = top_blob.channel(p); - memcpy(outptr, ptr, size * elemsize); + if (dims == 3 && axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; - ptr += size * elempack; - } + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); } - return 0; + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; } - if (dims == 3 && axis == 2) + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) { - // slice dim width - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; + const unsigned short* ptr = bottom_blob.channel(p); - int q = 0; for (size_t i = 0; i < top_blobs.size(); i++) { - int slice = slices_ptr[i]; - if (slice == -233) - { - slice = (w - q) / (top_blobs.size() - i); - } - Mat& top_blob = top_blobs[i]; - top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - q += slice; + int size = top_blob.w * top_blob.h; + + unsigned short* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); + + ptr += size * elempack; } + } + } + + if (dims == 3 && axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < channels; p++) + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) { - const unsigned short* ptr = bottom_blob.channel(p); + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } - for (int j = 0; j < h; j++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const unsigned short* ptr = bottom_blob.channel(p); + + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) { - for (size_t i = 0; i < top_blobs.size(); i++) - { - Mat& top_blob = top_blobs[i]; + Mat& top_blob = top_blobs[i]; - unsigned short* outptr = top_blob.channel(p).row(j); - memcpy(outptr, ptr, top_blob.w * elemsize); + unsigned short* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); - ptr += top_blob.w * elempack; - } + ptr += top_blob.w * elempack; } } - - return 0; } + } - } // opt.use_packing_layout -#endif // __ARM_NEON - - return Slice::forward(bottom_blobs, top_blobs, opt); + return 0; } } // namespace ncnn diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h index 4faa21c70..50da56743 100644 --- a/src/layer/arm/slice_arm.h +++ b/src/layer/arm/slice_arm.h @@ -24,16 +24,10 @@ class Slice_arm : virtual public Slice public: Slice_arm(); - virtual int create_pipeline(const Option& opt); - virtual int destroy_pipeline(const Option& opt); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: int forward_bf16s_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - -public: - ncnn::Layer* packing_pack1; }; } // namespace ncnn