
feat(src/gopt): add optpass on arm for fusing typecvt and elemwise to elemwise multi type

GitOrigin-RevId: e6bcbbf91b
tags/v1.7.2.m1
Megvii Engine Team, 4 years ago
commit e715423f20
6 changed files with 297 additions and 0 deletions
  1. +6 -0    src/gopt/impl/framework.cpp
  2. +140 -0  src/gopt/impl/inference.cpp
  3. +9 -0    src/gopt/include/megbrain/gopt/inference.h
  4. +132 -0  src/gopt/test/inference.cpp
  5. +8 -0    src/opr/impl/nn_int.cpp
  6. +2 -0    src/opr/include/megbrain/opr/nn_int.h
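
In short, the new FuseTypecvtElemwisePass rewrites a TypeCvt (Int16 or Uint8 to Float32) whose single reader is a float Elemwise (FUSE_MUL_ADD3 or MUL) into one fused ElemwiseMultiType operator, so arm_common can run a single optimized kernel without the intermediate f32 tensor. A minimal before/after sketch, assuming x_i16, s, b and cn are already-constructed SymbolVars and CompNode as in the tests of src/gopt/test/inference.cpp (only operator calls that appear in this diff are used):

// Before the pass: explicit Int16 -> Float32 conversion feeding a float
// FUSE_MUL_ADD3 elemwise (two operators plus an intermediate f32 tensor).
auto x_f32 = opr::TypeCvt::make(x_i16, dtype::Float32(), cn);
auto y = opr::Elemwise::make(
        {x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);

// After the pass (the dtype and single-reader checks of
// try_typecvt_elemwise_fma_i16xf32xf32xf32 hold): one fused operator that
// reads the Int16 input directly and still outputs Float32.
auto y_fused = opr::ElemwiseMultiType::make(
        {x_i16, s, b},
        {opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32},
        OperatorNodeConfig{dtype::Float32()});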

src/gopt/impl/framework.cpp  +6 -0

@@ -644,6 +644,11 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
add_pass<RemoveRedundantTypeCvtPass>();
add_pass<RemoveRedundantCopyPass>();

//! Only arm_common implements the fused TypeCvt + Elemwise optimized kernel
#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
add_pass<FuseTypecvtElemwisePass>();
#endif

#if MGB_JIT
using JITConfig = cg::ComputingGraph::Options::GraphOpt::JITConfig;
int jit_opt_level = 0;
@@ -691,6 +696,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes(
// remove shape hint after inference optimization
add_pass<RemoveShapeHintPass>();
}

return *this;
}



src/gopt/impl/inference.cpp  +140 -0

@@ -2187,4 +2187,144 @@ void ParamMergePass::apply(OptState& opt_state) const {
MIDOUT_E
}

/* ==================== FuseTypecvtElemwisePass ================= */
const char* FuseTypecvtElemwisePass::name() const {
return mgb_cstr_log("Fuse typecvt elemwise pass");
}

void FuseTypecvtElemwisePass::apply(OptState& opt) const {
MIDOUT_B("FuseTypecvtElemwisePass::apply")
opt.set_var_replace_check_flag(
VarReplaceCheckFlag::CHECK_DTYPE | VarReplaceCheckFlag::CHECK_SHAPE);
auto rewriter = opt.graph().make_rewriter();
auto uniq_reader_check = UniqReaderCheck{opt.graph()};

auto try_typecvt_elemwise_fma_i16xf32xf32xf32 = [&rewriter, &uniq_reader_check](
OperatorNodeBase* opr) {
// check elemwise
auto elemwise = try_cast_as_op<opr::Elemwise>(opr);
if (elemwise == nullptr)
return false;
if (elemwise->param().mode != opr::Elemwise::Mode::FUSE_MUL_ADD3)
return false;

bool is_elem_src_f32 =
elemwise->input(0)->dtype().enumv() == DTypeEnum::Float32;
bool is_elem_dst_f32 =
elemwise->output(0)->dtype().enumv() == DTypeEnum::Float32;

if (!(is_elem_src_f32 && is_elem_dst_f32))
return false;
if (!uniq_reader_check(elemwise->input(0)))
return false;

// check typecvt
auto typecvt = try_cast_as_op<opr::TypeCvt>(elemwise->input(0)->owner_opr());
if (typecvt == nullptr)
return false;

bool is_typecvt_src_i16 =
typecvt->input(0)->dtype().enumv() == DTypeEnum::Int16;
bool is_typecvt_src_u8 = typecvt->input(0)->dtype().enumv() == DTypeEnum::Uint8;
bool is_typecvt_dst_f32 =
typecvt->output(0)->dtype().enumv() == DTypeEnum::Float32;

if (!((is_typecvt_src_i16 || is_typecvt_src_u8) && is_typecvt_dst_f32))
return false;

SymbolVar new_elem;
auto src0 = rewriter.get_var(typecvt->input(0)),
src1 = rewriter.get_var(elemwise->input(1)),
src2 = rewriter.get_var(elemwise->input(2));
if (is_typecvt_src_i16) {
new_elem = opr::ElemwiseMultiType::make(
{src0, src1, src2},
{opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32},
OperatorNodeConfig{dtype::Float32()});
} else {
new_elem = opr::ElemwiseMultiType::make(
{src0, src1, src2},
{opr::ElemwiseMultiType::Mode::FUSE_MUL_ADD3_UINT8xF32xF32xF32},
OperatorNodeConfig{dtype::Float32()});
}

rewriter.replace_var(
opr->output(0), new_elem.node(),
mgb_cstr_log("replace typecvt + elemwise(FUSE_MUL_ADD3)"
"to ElemwiseMultiType(FUSE_MUL_ADD3_INTXxF32xF32xF32)"));

return true;
};

auto try_typecvt_elemwise_mul_i16xf32xf32 = [&rewriter, &uniq_reader_check](
OperatorNodeBase* opr) {
// check elemwise
auto elemwise = try_cast_as_op<opr::Elemwise>(opr);
if (elemwise == nullptr)
return false;
if (elemwise->param().mode != opr::Elemwise::Mode::MUL)
return false;

bool is_elem_src_f32 =
elemwise->input(0)->dtype().enumv() == DTypeEnum::Float32;
bool is_elem_dst_f32 =
elemwise->output(0)->dtype().enumv() == DTypeEnum::Float32;

if (!(is_elem_src_f32 && is_elem_dst_f32))
return false;
// the TypeCvt may feed either input(0) or input(1)
if (!(try_cast_as_op<opr::TypeCvt>(elemwise->input(0)->owner_opr()) ||
try_cast_as_op<opr::TypeCvt>(elemwise->input(1)->owner_opr())))
return false;

int typecvt_src_idx = (try_cast_as_op<opr::TypeCvt>(
elemwise->input(0)->owner_opr()) != nullptr)
? 0
: 1;

int other_src_idx = (typecvt_src_idx == 0) ? 1 : 0;

if (!uniq_reader_check(elemwise->input(typecvt_src_idx)))
return false;

// check typecvt
auto typecvt = try_cast_as_op<opr::TypeCvt>(
elemwise->input(typecvt_src_idx)->owner_opr());

bool is_typecvt_src_i16 =
typecvt->input(0)->dtype().enumv() == DTypeEnum::Int16;
bool is_typecvt_dst_f32 =
typecvt->output(0)->dtype().enumv() == DTypeEnum::Float32;

if (!(is_typecvt_src_i16 && is_typecvt_dst_f32))
return false;

SymbolVar new_elem;
auto src0 = rewriter.get_var(typecvt->input(0)),
src1 = rewriter.get_var(elemwise->input(other_src_idx));
new_elem = opr::ElemwiseMultiType::make(
{src0, src1}, {opr::ElemwiseMultiType::Mode::MUL_INT16xF32xF32},
OperatorNodeConfig{dtype::Float32()});

rewriter.replace_var(
opr->output(0), new_elem.node(),
mgb_cstr_log("replace typecvt + elemwise(MUL)"
"to ElemwiseMultiType(MUL_INT16xF32xF32)"));

return true;
};

auto on_opr = [&try_typecvt_elemwise_fma_i16xf32xf32xf32,
&try_typecvt_elemwise_mul_i16xf32xf32,
&rewriter](OperatorNodeBase* opr) {
if (!try_typecvt_elemwise_fma_i16xf32xf32xf32(opr) &&
!try_typecvt_elemwise_mul_i16xf32xf32(opr)) {
rewriter.auto_replace_outputs(opr);
}
};
opt.graph().iter(on_opr);
rewriter.apply_inplace();
MIDOUT_E
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

src/gopt/include/megbrain/gopt/inference.h  +9 -0

@@ -200,6 +200,15 @@ public:
void apply(OptState& opt_state) const override;
};

/*!
* \brief fuse TypeCvt and Elemwise into a single ElemwiseMultiType
*/
class FuseTypecvtElemwisePass final : public Pass {
public:
const char* name() const override;
void apply(OptState& opt) const override;
};

/*!
* \brief tensor format converter to accelerate inference speed on Nvidia
* platform


src/gopt/test/inference.cpp  +132 -0

@@ -1780,6 +1780,138 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
}
}

#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase0) {
HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
auto cn = CompNode::load("cpu0");
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;

size_t n = 1;
size_t c = 128;
size_t h = 16;
size_t w = 16;
auto host_x1 = gen({n, h, w, c}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);

auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
auto mkcvar = [&](const char* name, const TensorShape& shp) {
return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
};
auto s = mkcvar("s", {1, c, 1, 1});
auto b = mkcvar("b", {1, c, 1, 1});

auto result = opr::Elemwise::make(
{x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);

auto y = result;
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());

ASSERT_EQ(
opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_INT16xF32xF32xF32,
find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y)});
func->execute();
graph->options().graph_opt_level = 2;
auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
func_opt->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase1) {
HostTensorGenerator<dtype::Int16, RandomDistribution::UNIFORM> gen(0, 255);
auto cn = CompNode::load("cpu0");
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;

size_t n = 1;
size_t c = 128;
size_t h = 16;
size_t w = 16;
auto host_x1 = gen({n, h, w, c}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);

auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
auto mkcvar = [&](const char* name, const TensorShape& shp) {
return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
};
auto s = mkcvar("s", {1, c, 1, 1});

auto result = opr::Elemwise::make({x_f32, s}, opr::Elemwise::Param::Mode::MUL);

auto y = result;
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());

ASSERT_EQ(
opr::ElemwiseMultiType::Param::Mode::MUL_INT16xF32xF32,
find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y)});
func->execute();
graph->options().graph_opt_level = 2;
auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
func_opt->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

TEST(TestGoptInference, FuseTypeCvtAndElemwiseCase2) {
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
auto cn = CompNode::load("cpu0");
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;

size_t n = 1;
size_t c = 128;
size_t h = 16;
size_t w = 16;
auto host_x1 = gen({n, h, w, c}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);

auto x_nchw = opr::Dimshuffle::make(x, {0, 3, 1, 2}, 4, cn);
auto x_f32 = opr::TypeCvt::make(x_nchw, dtype::Float32(), cn);
auto mkcvar = [&](const char* name, const TensorShape& shp) {
return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name);
};
auto s = mkcvar("s", {1, c, 1, 1});
auto b = mkcvar("b", {1, c, 1, 1});

auto result = opr::Elemwise::make(
{x_f32, s, b}, opr::Elemwise::Param::Mode::FUSE_MUL_ADD3);

auto y = result;
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);

ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::ElemwiseMultiType>());

ASSERT_EQ(
opr::ElemwiseMultiType::Param::Mode::FUSE_MUL_ADD3_UINT8xF32xF32xF32,
find_opr<opr::ElemwiseMultiType>(y_opt).param().mode);

HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y)});
func->execute();
graph->options().graph_opt_level = 2;
auto func_opt = graph->compile({make_callback_copy(y, host_y_opt)});
func_opt->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}
#endif

TEST(TestGoptInference, ParamMerge) {
auto cns = load_multiple_xpus(2);
HostTensorGenerator<> gen;


src/opr/impl/nn_int.cpp  +8 -0

@@ -77,4 +77,12 @@ void ElemwiseMultiType::record_execute_deps(ExecDependencyArray& deps) {
record_megdnn_opr(deps);
}

void ElemwiseMultiType::add_input_layout_constraint() {
#if (MEGDNN_AARCH64 || MEGDNN_ARMV7) && !MGB_OPENCL && !MGB_CUDA
for (auto i : input()) {
i->add_layout_constraint_contiguous();
}
#endif
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

src/opr/include/megbrain/opr/nn_int.h  +2 -0

@@ -49,6 +49,8 @@ private:
void init_output_dtype() override;

void record_execute_deps(ExecDependencyArray& deps) override;

void add_input_layout_constraint() override;
};

//! deprecated; TODO: remove in megbrain 8

