GitOrigin-RevId: 97bc32f561
tags/v1.11.0
| @@ -21,6 +21,7 @@ | |||
| #include "src/fallback/resize/opr_impl.h" | |||
| #include "src/fallback/roi_copy/opr_impl.h" | |||
| #include "src/fallback/rotate/opr_impl.h" | |||
| #include "src/fallback/softmax/opr_impl.h" | |||
| #include "src/fallback/split/opr_impl.h" | |||
| #include "src/fallback/tile/opr_impl.h" | |||
| #include "src/fallback/type_cvt/opr_impl.h" | |||
| @@ -50,6 +51,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(TypeCvt) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(GroupLocal) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(Flip) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(GaussianBlur) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(SoftmaxForward) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(ROICopy) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(Rotate) | |||
| MEGDNN_SPECIALIZE_CREATE_OPERATOR(ElemwiseMultiType) | |||
| @@ -0,0 +1,163 @@ | |||
| #include "src/fallback/softmax/opr_impl.h" | |||
| #include <cstring> | |||
| #include <numeric> | |||
| #include "src/fallback/elemwise/gi_impl/gi_mathfun.h" | |||
| #include "src/naive/handle.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| void SoftmaxForwardImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| auto axis = param().axis; | |||
| if (axis < 0) | |||
| axis += src.layout.ndim; | |||
| megdnn_assert(axis >= 0); | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| if (!usable(src.layout)) { | |||
| naive::SoftmaxForwardImpl::exec(src, dst, workspace); | |||
| return; | |||
| } | |||
| typedef DTypeTrait<dtype::Float32>::ctype Float32; | |||
| auto sptr = src.ptr<Float32>(); | |||
| auto dptr = dst.ptr<Float32>(); | |||
| constexpr auto float_min = std::numeric_limits<Float32>::min(); | |||
| constexpr auto step = GI_SIMD_LEN_BYTE / sizeof(Float32); | |||
| size_t A, B, C; | |||
| reduce::get_ABC(src.layout, A, B, C, axis); | |||
| // TODO: When C=2,3,4..., src_ptr span is relatively large, the performance may | |||
| // be poor | |||
| if (C != 1) { | |||
| WorkspaceBundle workspace_bundle{ | |||
| workspace.raw_ptr, {A * C * sizeof(Float32), A * C * sizeof(Float32)}}; | |||
| Float32* max = workspace_bundle.get_workspace(0).raw_ptr->as<Float32>(); | |||
| GI_FLOAT32_t v_max = GiBroadcastFloat32(float_min); | |||
| size_t i = 0; | |||
| for (; i + step <= A * C; i += step) | |||
| GiStoreFloat32(max + i, v_max); | |||
| for (; i < A * C; i++) | |||
| max[i] = float_min; | |||
| for (size_t a = 0; a < A; a++) { | |||
| for (size_t b = 0; b < B; b++) { | |||
| auto max_ptr = max + a * C; | |||
| auto limit = max_ptr + C; | |||
| auto src_ptr = sptr + a * B * C + b * C; | |||
| for (; max_ptr + step <= limit; max_ptr += step, src_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr); | |||
| GI_FLOAT32_t v_max = GiLoadFloat32(max_ptr); | |||
| v_max = GiMaximumFloat32(v_max, v_p); | |||
| GiStoreFloat32(max_ptr, v_max); | |||
| } | |||
| for (; max_ptr < limit; ++max_ptr, ++src_ptr) { | |||
| *max_ptr = std::max(*src_ptr, *max_ptr); | |||
| } | |||
| } | |||
| } | |||
| Float32* sum = workspace_bundle.get_workspace(1).raw_ptr->as<Float32>(); | |||
| memset(sum, 0, A * C * sizeof(Float32)); | |||
| for (size_t a = 0; a < A; a++) { | |||
| for (size_t b = 0; b < B; b++) { | |||
| auto max_ptr = max + a * C; | |||
| auto limit = max_ptr + C; | |||
| auto sum_ptr = sum + a * C; | |||
| auto src_ptr = sptr + a * B * C + C * b; | |||
| auto dst_ptr = dptr + a * B * C + C * b; | |||
| for (; max_ptr + step <= limit; max_ptr += step, sum_ptr += step, | |||
| src_ptr += step, dst_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr); | |||
| GI_FLOAT32_t v_max = GiLoadFloat32(max_ptr); | |||
| GI_FLOAT32_t v_sum = GiLoadFloat32(sum_ptr); | |||
| v_p = GiExpPsFloat32(GiSubtractFloat32(v_p, v_max)); | |||
| v_sum = GiAddFloat32(v_p, v_sum); | |||
| GiStoreFloat32(dst_ptr, v_p); | |||
| GiStoreFloat32(sum_ptr, v_sum); | |||
| } | |||
| for (; max_ptr < limit; ++max_ptr, ++sum_ptr, ++src_ptr, ++dst_ptr) { | |||
| *dst_ptr = exp(*src_ptr - *max_ptr); | |||
| *sum_ptr += *dst_ptr; | |||
| } | |||
| } | |||
| } | |||
| for (size_t a = 0; a < A; a++) { | |||
| for (size_t b = 0; b < B; b++) { | |||
| auto sum_ptr = sum + a * C; | |||
| auto limit = sum_ptr + C; | |||
| auto dst_ptr = dptr + a * B * C + C * b; | |||
| for (; sum_ptr + step <= limit; sum_ptr += step, dst_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(dst_ptr); | |||
| GI_FLOAT32_t v_sum = GiLoadFloat32(sum_ptr); | |||
| v_p = GiDivideFloat32(v_p, v_sum); | |||
| GiStoreFloat32(dst_ptr, v_p); | |||
| } | |||
| for (; sum_ptr < limit; ++sum_ptr, ++dst_ptr) | |||
| *dst_ptr = *dst_ptr / *sum_ptr; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t a = 0; a < A; a++) { | |||
| auto max = float_min; | |||
| { | |||
| auto src_ptr = sptr + a * B; | |||
| auto limit = src_ptr + B; | |||
| GI_FLOAT32_t v_max = GiBroadcastFloat32(max); | |||
| for (; src_ptr + step <= limit; src_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr); | |||
| v_max = GiMaximumFloat32(v_max, v_p); | |||
| } | |||
| max = std::max(max, GiReduceMaxNanFloat32(v_max)); | |||
| for (; src_ptr < limit; ++src_ptr) { | |||
| max = std::max(*src_ptr, max); | |||
| } | |||
| } | |||
| auto sum = 0.f; | |||
| { | |||
| auto src_ptr = sptr + a * B; | |||
| auto limit = src_ptr + B; | |||
| auto dst_ptr = dptr + a * B; | |||
| GI_FLOAT32_t v_sum = GiZeroFloat32(); | |||
| GI_FLOAT32_t v_max = GiBroadcastFloat32(max); | |||
| for (; src_ptr + step <= limit; src_ptr += step, dst_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(src_ptr); | |||
| v_p = GiExpPsFloat32(GiSubtractFloat32(v_p, v_max)); | |||
| GiStoreFloat32(dst_ptr, v_p); | |||
| v_sum = GiAddFloat32(v_sum, v_p); | |||
| } | |||
| sum += GiReduceAddFloat32(v_sum); | |||
| for (; src_ptr < limit; ++src_ptr, ++dst_ptr) { | |||
| *dst_ptr = exp(*src_ptr - max); | |||
| sum += *dst_ptr; | |||
| } | |||
| } | |||
| { | |||
| auto dst_ptr = dptr + a * B; | |||
| auto limit = dst_ptr + B; | |||
| sum = 1 / sum; | |||
| GI_FLOAT32_t v_sum = GiBroadcastFloat32(sum); | |||
| for (; dst_ptr + step <= limit; dst_ptr += step) { | |||
| GI_FLOAT32_t v_p = GiLoadFloat32(dst_ptr); | |||
| v_p = GiMultiplyFloat32(v_p, v_sum); | |||
| GiStoreFloat32(dst_ptr, v_p); | |||
| } | |||
| for (; dst_ptr < limit; ++dst_ptr) { | |||
| *dst_ptr *= sum; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,45 @@ | |||
| #pragma once | |||
| #include "megdnn/tensor_format.h" | |||
| #include "src/common/reduce_helper.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/naive/softmax/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| class SoftmaxForwardImpl : public naive::SoftmaxForwardImpl { | |||
| public: | |||
| using naive::SoftmaxForwardImpl::SoftmaxForwardImpl; | |||
| void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| bool usable(const TensorLayout& src) { | |||
| return src.is_contiguous() && (src.dtype.enumv() == DTypeEnum::Float32) && | |||
| (src.format.type() == TensorFormat::Type::DEFAULT); | |||
| } | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) override { | |||
| if (!usable(src)) { | |||
| return naive::SoftmaxForwardImpl::get_workspace_in_bytes(src, dst); | |||
| } | |||
| auto axis = param().axis; | |||
| if (axis < 0) | |||
| axis += src.ndim; | |||
| typedef DTypeTrait<dtype::Float32>::ctype Float32; | |||
| size_t A, B, C; | |||
| reduce::get_ABC(src, A, B, C, axis); | |||
| if (C != 1) { | |||
| return WorkspaceBundle( | |||
| nullptr, {A * C * sizeof(Float32), A * C * sizeof(Float32)}) | |||
| .total_size_in_bytes(); | |||
| } | |||
| return 0; | |||
| } | |||
| }; | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,56 @@ | |||
| #include "test/fallback/fixture.h" | |||
| #include "megdnn/oprs.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/common/task_record_check.h" | |||
| #include "test/common/tensor.h" | |||
| #include "test/common/workspace_wrapper.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| TEST_F(FALLBACK, SOFTMAX_FORWARD) { | |||
| Checker<Softmax> checker(handle()); | |||
| Softmax::Param param0{0}; | |||
| checker.set_param(param0).exec(TensorShapeArray{{11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11, 11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 7, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param0).exec(TensorShapeArray{{11, 7, 5, 7, 5, 7, 7}, {}}); | |||
| Softmax::Param param1{1}; | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 11}, {}}); | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11}, {}}); | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11, 11}, {}}); | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}}); | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param1).exec(TensorShapeArray{{11, 7, 5, 7, 5, 7, 7}, {}}); | |||
| Softmax::Param param2{2}; | |||
| checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11}, {}}); | |||
| checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11, 11}, {}}); | |||
| checker.set_param(param2).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}}); | |||
| checker.set_param(param2).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param2).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}}); | |||
| Softmax::Param param3{3}; | |||
| checker.set_param(param3).exec(TensorShapeArray{{11, 11, 11, 11}, {}}); | |||
| checker.set_param(param3).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}}); | |||
| checker.set_param(param3).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param3).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}}); | |||
| Softmax::Param param4{4}; | |||
| checker.set_param(param4).exec(TensorShapeArray{{11, 11, 11, 11, 11}, {}}); | |||
| checker.set_param(param4).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param4).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}}); | |||
| Softmax::Param param5{5}; | |||
| checker.set_param(param5).exec(TensorShapeArray{{11, 5, 5, 5, 5, 11}, {}}); | |||
| checker.set_param(param5).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}}); | |||
| Softmax::Param param6{6}; | |||
| checker.set_param(param6).exec(TensorShapeArray{{11, 5, 5, 5, 5, 7, 7}, {}}); | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -43,60 +43,3 @@ TEST_F(NAIVE, SOFTMAX_BACKWARD) { | |||
| checker.set_param(param).exect(Testcase{input, diff, {}}, Testcase{{}, {}, output}); | |||
| } | |||
// Golden-value test: softmax forward along axis 0 on 5-D tensors (shape
// {N, C/4, H, W, 4} — presumably the NHWCD4 image format, per the test name).
TEST_F(NAIVE, SOFTMAX_FORWARD_NHWCD4) {
    Checker<Softmax> checker(handle(), false);
    Softmax::Param param{0};

    // N == 1: the reduction axis has extent 1, so softmax is 1 everywhere.
    TensorND input1 = TensorValue(
            {1, 2, 1, 2, 4}, dtype::Float32(),
            {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
    TensorND output1 = TensorValue(
            {1, 2, 1, 2, 4}, dtype::Float32(),
            {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
    checker.set_param(param).exect(Testcase{input1, {}}, Testcase{{}, output1});

    // N == 2: each position pairs values differing by 16, so every pair
    // yields the same softmax: (e^-16, e^0) normalized ≈ (1.1e-7, ~1).
    TensorND input2 = TensorValue(
            {2, 2, 1, 2, 4}, dtype::Float32(),
            {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
             16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
    TensorND output2 = TensorValue(
            {2, 2, 1, 2, 4}, dtype::Float32(),
            {1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01});
    checker.set_param(param).exect(Testcase{input2, {}}, Testcase{{}, output2});
}
// Golden-value test: softmax backward along axis 0 on 5-D tensors (same
// layout as SOFTMAX_FORWARD_NHWCD4; the input below is that test's output).
TEST_F(NAIVE, SOFTMAX_BACKWARD_NHWCD4) {
    Checker<SoftmaxBackward> checker(handle(), false);
    Softmax::Param param{0};
    // Forward softmax output fed back as the saved activation.
    TensorND input = TensorValue(
            {2, 2, 1, 2, 4}, dtype::Float32(),
            {1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             1.12535162e-07, 1.12535162e-07, 1.12535162e-07, 1.12535162e-07,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01,
             9.99999887e-01, 9.99999887e-01, 9.99999887e-01, 9.99999887e-01});
    // Uniform upstream gradient of 1 at every element.
    TensorND diff = TensorValue(
            {2, 2, 1, 2, 4}, dtype::Float32(),
            {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
             1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.});
    // Expected input-gradient: all zeros (a constant shift of the softmax
    // input does not change its output, so a uniform diff maps to zero grad).
    TensorND output = TensorValue(
            {2, 2, 1, 2, 4}, dtype::Float32(),
            {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.});
    checker.set_param(param).exect(Testcase{input, diff, {}}, Testcase{{}, {}, output});
}