// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "reduction.h" #include #include #include namespace ncnn { Reduction::Reduction() { one_blob_only = true; support_inplace = false; } int Reduction::load_param(const ParamDict& pd) { operation = pd.get(0, 0); reduce_all = pd.get(1, 1); coeff = pd.get(2, 1.f); axes = pd.get(3, Mat()); keepdims = pd.get(4, 0); return 0; } template static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool reduce_h, bool reduce_c, const Option& opt) { Op op; Op2 op2; size_t elemsize = a.elemsize; int dims = a.dims; if (dims == 1) { int w = a.w; b.create(1, elemsize, opt.blob_allocator); const float* ptr = a; float sum = v0; for (int i = 0; i < w; i++) { sum = op(sum, ptr[i]); } b[0] = sum; return 0; } if (dims == 2) { int w = a.w; int h = a.h; if (reduce_w && reduce_h) { // w h -> X X b.create(1, elemsize, opt.blob_allocator); Mat sums(h, elemsize, opt.workspace_allocator); if (sums.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { const float* ptr = a.row(i); float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } sums[i] = sum; } float sum = v0; for (int i = 0; i < h; i++) { sum = op2(sum, sums[i]); } b[0] = sum; return 0; } if (reduce_w && !reduce_h) { // w h -> X h b.create(h, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { const float* ptr = a.row(i); float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } b[i] = sum; } return 0; } if (!reduce_w && reduce_h) { // w h -> w X b.create(w, elemsize, opt.blob_allocator); b.fill(v0); for (int i = 0; i < h; i++) { const float* ptr = a.row(i); for (int j = 0; j < w; j++) { b[j] = op(b[j], ptr[j]); } } return 0; } } if (dims == 3) { int w = a.w; int h = a.h; int channels = a.c; int size = w * h; if (reduce_w && reduce_h && reduce_c) { // w h c -> X X X b.create(1, elemsize, opt.blob_allocator); Mat sums(channels, elemsize, opt.workspace_allocator); if (sums.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float sum = v0; for (int i = 0; i < size; i++) { sum = op(sum, ptr[i]); } sums[q] = sum; } float sum = v0; for (int i = 0; i < channels; i++) { sum = op2(sum, sums[i]); } b[0] = sum; return 0; } if (reduce_w && reduce_h && !reduce_c) { // w h c -> X X c b.create(channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float sum = v0; for (int i = 0; i < size; i++) { sum = op(sum, ptr[i]); } b[q] = sum; } return 0; } if (reduce_w && !reduce_h && !reduce_c) { // w h c -> X h c b.create(h, channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = b.row(q); for (int i = 0; i < h; i++) { float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } outptr[i] = sum; ptr += w; } } return 0; } if (reduce_w && !reduce_h && reduce_c) { // w h c -> X h X b.create(h, elemsize, opt.blob_allocator); Mat mins(1, h, channels, elemsize, opt.workspace_allocator); if (mins.empty()) return -100; mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } mins_ptr[i] = sum; ptr += w; } } b.fill(v0); for (int q = 0; q < channels; q++) { const float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { b[i] = op2(b[i], mins_ptr[i]); } } return 0; } if (!reduce_w && reduce_h && reduce_c) { // w h c -> w X X b.create(w, elemsize, opt.blob_allocator); Mat mins(w, 1, channels, elemsize, opt.workspace_allocator); if (mins.empty()) return -100; mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { mins_ptr[j] = op(mins_ptr[j], ptr[j]); } ptr += w; } } b.fill(v0); for (int q = 0; q < channels; q++) { const float* mins_ptr = mins.channel(q); for (int j = 0; j < w; j++) { b[j] = op2(b[j], mins_ptr[j]); } } return 0; } if (!reduce_w && !reduce_h && reduce_c) { // w h c -> w h X b.create(w, h, elemsize, opt.blob_allocator); b.fill(v0); for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); for (int i = 0; i < size; i++) { b[i] = op(b[i], ptr[i]); } } return 0; } if (!reduce_w && reduce_h && !reduce_c) { // w h c -> w X c b.create(w, channels, elemsize, opt.blob_allocator); b.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = b.row(q); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { outptr[j] = op(outptr[j], ptr[j]); } ptr += w; } } return 0; } } return 0; } template static int reduction_op_keepdims(const Mat& a, Mat& b, float v0, bool reduce_w, bool reduce_h, bool reduce_c, const Option& opt) { Op op; Op2 op2; size_t elemsize = a.elemsize; int dims = a.dims; if (dims == 1) { int w = a.w; b.create(1, elemsize, opt.blob_allocator); const float* ptr = a; float sum = v0; for (int i = 0; i < w; i++) { sum = op(sum, ptr[i]); } b[0] = sum; return 0; } if (dims == 2) { int w = a.w; int h = a.h; if (reduce_w && reduce_h) { // w h -> 1 1 b.create(1, 1, elemsize, opt.blob_allocator); Mat sums(h, elemsize, opt.workspace_allocator); if (sums.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { const float* ptr = a.row(i); float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } sums[i] = sum; } float sum = v0; for (int i = 0; i < h; i++) { sum = op2(sum, sums[i]); } b[0] = sum; return 0; } if (reduce_w && !reduce_h) { // w h -> 1 h b.create(1, h, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) { const float* ptr = a.row(i); float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } b[i] = sum; } return 0; } if (!reduce_w && reduce_h) { // w h -> w 1 b.create(w, 1, elemsize, opt.blob_allocator); b.fill(v0); for (int i = 0; i < h; i++) { const float* ptr = a.row(i); for (int j = 0; j < w; j++) { b[j] = op(b[j], ptr[j]); } } return 0; } } if (dims == 3) { int w = a.w; int h = a.h; int channels = a.c; int size = w * h; if (reduce_w && reduce_h && reduce_c) { // w h c -> 1 1 1 b.create(1, 1, 1, elemsize, opt.blob_allocator); Mat sums(channels, elemsize, opt.workspace_allocator); if (sums.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float sum = v0; for (int i = 0; i < size; i++) { sum = op(sum, ptr[i]); } sums[q] = sum; } float sum = v0; for (int i = 0; i < channels; i++) { sum = op2(sum, sums[i]); } b[0] = sum; return 0; } if (reduce_w && reduce_h && !reduce_c) { // w h c -> 1 1 c b.create(1, 1, channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = b.channel(q); float sum = v0; for (int i = 0; i < size; i++) { sum = op(sum, ptr[i]); } outptr[0] = sum; } return 0; } if (reduce_w && !reduce_h && !reduce_c) { // w h c -> 1 h c b.create(1, h, channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = b.channel(q); for (int i = 0; i < h; i++) { float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } outptr[i] = sum; ptr += w; } } return 0; } if (reduce_w && !reduce_h && reduce_c) { // w h c -> 1 h 1 b.create(1, h, 1, elemsize, opt.blob_allocator); Mat mins(1, h, channels, elemsize, opt.workspace_allocator); if (mins.empty()) return -100; mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { float sum = v0; for (int j = 0; j < w; j++) { sum = op(sum, ptr[j]); } mins_ptr[i] = sum; ptr += w; } } b.fill(v0); for (int q = 0; q < channels; q++) { const float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { b[i] = op2(b[i], mins_ptr[i]); } } return 0; } if (!reduce_w && reduce_h && reduce_c) { // w h c -> w 1 1 b.create(w, 1, 1, elemsize, opt.blob_allocator); Mat mins(w, 1, channels, elemsize, opt.workspace_allocator); if (mins.empty()) return -100; mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* mins_ptr = mins.channel(q); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { mins_ptr[j] = op(mins_ptr[j], ptr[j]); } ptr += w; } } b.fill(v0); for (int q = 0; q < channels; q++) { const float* mins_ptr = mins.channel(q); for (int j = 0; j < w; j++) { b[j] = op2(b[j], mins_ptr[j]); } } return 0; } if (!reduce_w && !reduce_h && reduce_c) { // w h c -> w h 1 b.create(w, h, 1, elemsize, opt.blob_allocator); b.fill(v0); for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); for (int i = 0; i < size; i++) { b[i] = op(b[i], ptr[i]); } } return 0; } if (!reduce_w && reduce_h && !reduce_c) { // w h c -> w 1 c b.create(w, 1, channels, elemsize, opt.blob_allocator); b.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = b.channel(q); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { outptr[j] = op(outptr[j], ptr[j]); } ptr += w; } } return 0; } } return 0; } template static int reduction_post_process(Mat& a, float coeff, const Option& opt) { MathOp mathop; int dims = a.dims; if (dims == 1) { int w = a.w; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < w; i++) a[i] = mathop(a[i]) * coeff; } else if (dims == 2) { int size = a.w * a.h; #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < size; i++) a[i] = mathop(a[i]) * coeff; } else if (dims == 3) { int c = a.c; int size = a.w * a.h; if (c == 1) { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < size; i++) a[i] = mathop(a[i]) * coeff; } else { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { float* outptr = a.channel(q); for (int i = 0; i < size; i++) outptr[i] = mathop(outptr[i]) * coeff; } } } return 0; } template static int reduction(const Mat& a, Mat& b, float v0, bool reduce_w, bool reduce_h, bool reduce_c, bool post_process, float coeff, int keepdims, const Option& opt) { int ret; if (keepdims) ret = reduction_op_keepdims(a, b, v0, reduce_w, reduce_h, reduce_c, opt); else ret = reduction_op(a, b, v0, reduce_w, reduce_h, reduce_c, opt); if (ret != 0) return -100; if (post_process || fabs(coeff - 1.f) > FLT_EPSILON) { ret = reduction_post_process(b, coeff, opt); if (ret != 0) return -100; } return ret; } template struct post_process_identity { T operator()(const T& x) const { return x; } }; template struct post_process_sqrt { T operator()(const T& x) const { return static_cast(sqrt(x)); } }; template struct post_process_log { T operator()(const T& x) const { return static_cast(log(x)); } }; template struct reduction_op_add { T operator()(const T& x, const T& y) const { return x + y; } }; template struct reduction_op_mul { T operator()(const T& x, const T& y) const { return x * y; } }; template struct reduction_op_asum { T operator()(const T& x, const T& y) const { return static_cast(x + fabs(y)); } }; template struct reduction_op_sumsq { T operator()(const T& x, const T& y) const { return x + y * y; } }; template struct reduction_op_sumsexp { T operator()(const T& x, const T& y) const { return static_cast(x + exp(y)); } }; template struct reduction_op_max { T operator()(const T& x, const T& y) const { return std::max(x, y); } }; template struct reduction_op_min { T operator()(const T& x, const T& y) const { return std::min(x, y); } }; int Reduction::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int dims = bottom_blob.dims; int axes_flag[3] = {0}; bool reduce_w = false; bool reduce_h = false; bool reduce_c = false; if (reduce_all) { reduce_w = true; reduce_h = true; reduce_c = true; } else { const int* axes_ptr = axes; int reduced_axes_num = axes.w; for (int i = 0; i < reduced_axes_num; i++) { int axis = axes_ptr[i]; // handle negative axis if (axis < 0) axis += dims; axes_flag[axis] = 1; } if (dims == 1) { reduce_w = true; } else if (dims == 2) { if (axes_flag[0] == 1) reduce_h = true; if (axes_flag[1] == 1) reduce_w = true; } else if (dims == 3) { if (axes_flag[0] == 1) reduce_c = true; if (axes_flag[1] == 1) reduce_h = true; if (axes_flag[2] == 1) reduce_w = true; } } if (operation == ReductionOp_SUM) return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_ASUM) return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_SUMSQ) return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_MEAN) { int scale = 1; int dims = bottom_blob.dims; if (dims == 1) { scale = bottom_blob.w; } else if (dims == 2) { if (reduce_w) scale *= bottom_blob.w; if (reduce_h) scale *= bottom_blob.h; } else if (dims == 3) { if (reduce_w) scale *= bottom_blob.w; if (reduce_h) scale *= bottom_blob.h; if (reduce_c) scale *= bottom_blob.c; } float coeff_mean = coeff / scale; return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, true, coeff_mean, keepdims, opt); } if (operation == ReductionOp_MAX) return reduction, reduction_op_max, post_process_identity >(bottom_blob, top_blob, -FLT_MAX, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_MIN) return reduction, reduction_op_min, post_process_identity >(bottom_blob, top_blob, FLT_MAX, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_PROD) return reduction, reduction_op_mul, post_process_identity >(bottom_blob, top_blob, 1.f, reduce_w, reduce_h, reduce_c, false, coeff, keepdims, opt); if (operation == ReductionOp_L1) return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, false, 1.f, keepdims, opt); if (operation == ReductionOp_L2) return reduction, reduction_op_add, post_process_sqrt >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, true, 1.f, keepdims, opt); if (operation == ReductionOp_LogSum) return reduction, reduction_op_add, post_process_log >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, true, 1.f, keepdims, opt); if (operation == ReductionOp_LogSumExp) return reduction, reduction_op_add, post_process_log >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_c, true, 1.f, keepdims, opt); return 0; } } // namespace ncnn