- /**
- * \file src/core/test/graph/misc.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-
- #include "megbrain/opr/io.h"
- #include "megbrain/opr/basic_arith_wrapper.h"
- #include "megbrain/opr/dnn/convolution.h"
- #include "megbrain/opr/utility.h"
- #include "megbrain/opr/blas.h"
- #include "megbrain/opr/tensor_manip.h"
- #include "megbrain/opr/misc.h"
- #include "megbrain/opr/indexing.h"
- #include "megbrain/opr/tensor_manip.h"
- #include "megbrain/graph/helper.h"
- #include "megbrain/graph/grad_impl.h"
- #include "megbrain/graph/event.h"
- #include "megbrain/graph/execution_mask.h"
- #include "megbrain/utils/timer.h"
- #include "megbrain/comp_node_env.h"
- #include "megbrain/gopt/inference.h"
-
- #include "megbrain/test/helper.h"
- #include "megdnn/oprs/base.h"
-
- #include <atomic>
- #include <chrono>
- #include <array>
- #include <memory>
-
- using namespace mgb;
-
- namespace mgb {
- namespace cg {
- // declaration of impl class to access its methods
- class ComputingGraphImpl : public ComputingGraph {
- public:
- GraphExecutable::ExecEnv* current_exec_env();
- };
- class SeqCompNodeOptimizerImpl : public SeqCompNodeOptimizer {
- ~SeqCompNodeOptimizerImpl() = default;
- public:
- void optimize_comp_nodes(const VarNodeArray &endpoints);
- };
- } // namespace cg
- } // namespace mgb
-
- namespace {
-
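- // opr whose dedup key is its POD param, registered via PODHash; used by
- // TEST(TestGraph, PODDeDup) to check operator deduplication on POD params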
- MGB_DEFINE_OPR_CLASS(PODDedupTestOpr, cg::SingleCNOperatorNodeBase) // {
- public:
- struct Param {
- int v0;
- char v1;
- } MGB_PACKED;
-
- PODDedupTestOpr(ComputingGraph *owner, const Param &param):
- Super{owner, OperatorNodeConfig{}, "node", {}},
- m_param(param)
- {
- add_equivalence_component<PODHash<Param>>(&m_param);
- add_output(None)->dtype(dtype::Byte());
- }
-
- static SymbolVar make(ComputingGraph &owner, const Param &param) {
- return owner.insert_opr(std::make_unique<PODDedupTestOpr>(
- &owner, param))->output(0);
- }
- private:
- Param m_param;
-
- void scn_do_execute() override {
- }
-
- void init_output_comp_node() override {
- output(0)->comp_node(CompNode::load("xpu0"));
- }
-
- void init_output_static_infer_desc() override {
- using namespace mgb::cg::static_infer;
- owner_graph()->static_infer_manager().register_shape_infer(
- output(0), {
- SourceType::CONSTANT,
- {},
- [](TensorShape &dest, const InpVal &) {
- dest = {1};
- return true;
- }
- });
- }
- };
-
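- // opr with an extra workspace output (added by cg::add_workspace_output);
- // checks that the workspace var is allocated with the inferred shape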
- MGB_DEFINE_OPR_CLASS(WorkspaceAllocTestOpr,
- cg::SingleCNOutshapePureByInshapeOprBase) // {
-
- void get_output_var_shape(
- const TensorShapeArray &inp_shape,
- TensorShapeArray &out_shape) const override {
- MGB_MARK_USED_VAR(inp_shape);
- out_shape.at(0) = {2};
- out_shape.at(1) = {3};
- }
-
- void scn_do_execute() override {
- ASSERT_EQ(TensorShape{2}, output(0)->dev_tensor().shape());
- ASSERT_EQ(TensorShape{3}, output(1)->dev_tensor().shape());
- executed = true;
- }
-
- public:
- bool executed = false;
-
- WorkspaceAllocTestOpr(VarNode *inp):
- Super(inp->owner_graph(), {}, "test", {inp})
- {
- add_input({inp});
- add_output("out")->dtype(dtype::Float32());
- cg::add_workspace_output(this);
- }
- };
-
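- // opr whose grad rule returns a gradient for only one input per call,
- // alternating between the two inputs (the other slot gets nullptr)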
- MGB_DEFINE_OPR_CLASS(AllInputGradOpr,
- cg::SingleCNOutshapePureByInshapeOprBase) // {
-
- void get_output_var_shape(
- const TensorShapeArray &inp_shape,
- TensorShapeArray &out_shape) const override {
- MGB_MARK_USED_VAR(inp_shape);
- out_shape.at(0) = {2};
- }
-
- void scn_do_execute() override { }
-
- public:
- size_t nr_grad_call = 0;
- VarNode* prev_out_grad = nullptr;
-
- AllInputGradOpr(VarNode *a, VarNode *b):
- Super(a->owner_graph(), {}, "all_inp_grad", {a, b})
- {
- add_input({a, b});
- add_output(None);
- }
- };
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(AllInputGradOpr);
- MGB_IMPL_OPR_GRAD(AllInputGradOpr) {
- auto &&ncopr = const_cast<AllInputGradOpr&>(opr);
- ncopr.prev_out_grad = out_grad[0];
- ++ ncopr.nr_grad_call;
- SymbolVar x = opr.input(0), y = opr.input(1);
- if (ncopr.nr_grad_call & 1) {
- return VarNodeArray{(x + y).node(), nullptr};
- } else {
- return VarNodeArray{nullptr, (x * y).node()};
- }
- }
-
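- // compute x + y on host-generated inputs, optionally marking the vars
- // dynamic, and verify the result element-wise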
- template<bool dynamic, typename dtype>
- void test_aplusb() {
- using Gen = HostTensorGenerator<dtype>;
- using ctype = typename Gen::ctype;
- Gen gen;
- constexpr size_t SIZE = 1234;
- auto host_x = gen({SIZE}), host_y = gen({SIZE});
- auto graph = ComputingGraph::make();
- SymbolVar
- x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
- y = opr::Host2DeviceCopy::make(*graph, host_y).rename("y");
- if (dynamic) {
- x = opr::MarkDynamicVar::make(x).rename("xd");
- y = opr::MarkDynamicVar::make(y).rename("yd");
- }
- auto z = opr::add(x, y).rename("z");
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
-
- for (ctype delta = 0; delta < 2; ++ delta) {
- auto px = host_x->template ptr<ctype>();
- px[0] += delta; // test changing the input data between runs
- func->execute();
- auto py = host_y->template ptr<ctype>(),
- pz = host_z. template ptr<ctype>();
- ASSERT_EQ(host_x->shape(), host_z.shape());
- for (size_t i = 0; i < SIZE; ++ i) {
- MGB_ASSERT_FLOAT_EQ(px[i] + py[i], pz[i]) <<
- ssprintf("failed at %zu: %g+%g", i, float(px[i]), float(py[i]));
- }
- }
- }
-
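- // static allocator that records every alloc_static() call and exposes a
- // version number so tests can force static storage re-allocation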
- class TrackableStaticMemAlloc final : public cg::DeviceMemoryAllocator {
- SmallVector<DeviceTensorStorage> m_refhold;
-
- public:
- size_t version_num = 0, size_expect = 0;
-
- void alloc_static(ComputingGraph*, DeviceTensorStorage& dest,
- size_t size) override {
- dest.ensure_size(size);
- m_refhold.emplace_back(dest);
- if (size_expect) {
- ASSERT_EQ(size_expect, size);
- }
- }
-
- size_t nr_call() const { return m_refhold.size(); }
-
- size_t static_alloc_version(ComputingGraph*) const override {
- return version_num;
- }
- };
-
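- // dynamic allocator that tracks which vars currently hold storage, to
- // verify that dynamic storage is released as soon as possible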
- class TrackableDynamicMemAlloc final : public cg::DeviceMemoryAllocator {
- ThinHashSet<VarNode*> m_alive_vars;
- std::mutex m_mtx;
-
- public:
- void alloc_dynamic(VarNode* var, DeviceTensorStorage& dest,
- size_t size) override {
- ASSERT_LT(dest.size(), size);
- MGB_LOCK_GUARD(m_mtx);
- auto ptr = dest.comp_node().alloc_device(size);
- auto ins = m_alive_vars.insert(var);
- ASSERT_TRUE(ins.second);
- auto del = [ this, var, cn = dest.comp_node() ](void* ptr) {
- cn.free_device(ptr);
- MGB_LOCK_GUARD(m_mtx);
- auto nr = m_alive_vars.erase(var);
- ASSERT_EQ(1u, nr);
- };
- dest.reset(dest.comp_node(), size, {static_cast<dt_byte*>(ptr), del});
- }
-
- const ThinHashSet<VarNode*>& alive_vars() const { return m_alive_vars; }
-
- ~TrackableDynamicMemAlloc() { EXPECT_TRUE(m_alive_vars.empty()); }
- };
-
- } // anonymous namespace
-
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(PODDedupTestOpr);
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(WorkspaceAllocTestOpr);
-
- TEST(TestGraphBasic, APlusBF32) {
- test_aplusb<false, dtype::Float32>();
- }
-
- TEST(TestGraphBasic, APlusBI32) {
- test_aplusb<false, dtype::Int32>();
- }
-
- TEST(TestGraphBasic, DynAPlusBF32) {
- test_aplusb<true, dtype::Float32>();
- }
-
- TEST(TestGraphBasic, DynAPlusBI32) {
- test_aplusb<true, dtype::Int32>();
- }
-
- TEST(TestGraph, APlusBOnCPU) {
- HostTensorGenerator<> gen;
- constexpr size_t SIZE = 1234;
- auto host_x = gen({SIZE}, "cpu0"), host_y = gen({SIZE}, "cpu0");
- auto graph = ComputingGraph::make();
- SymbolVar
- x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
- y = opr::Host2DeviceCopy::make(*graph, host_y).rename("y");
- auto z = (x + y).rename("z");
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- func->execute();
-
- ASSERT_EQ(host_x->shape(), host_z.shape());
- auto px = host_x->ptr<float>(), py = host_y->ptr<float>(),
- pz = host_z.ptr<float>();
- for (size_t i = 0; i < SIZE; ++ i)
- MGB_ASSERT_FLOAT_EQ(px[i] + py[i], pz[i]);
- }
-
- TEST(TestGraph, DeDup) {
- auto t0 = std::make_shared<DeviceTensorND>(
- CompNode::load("xpu0"), TensorShape{2, 2}),
- t1 = std::make_shared<DeviceTensorND>(
- CompNode::load("xpu0"), TensorShape{2, 2}),
- t2 = std::make_shared<DeviceTensorND>(
- CompNode::load("xpu0"), TensorShape{2, 2});
- auto graph = ComputingGraph::make();
- auto st0 = opr::SharedDeviceTensor::make(*graph, t0),
- st1 = opr::SharedDeviceTensor::make(*graph, t1);
- SymbolVar
- x = opr::add(st0, st1),
- y = opr::add(opr::SharedDeviceTensor::make(*graph, t1),
- opr::SharedDeviceTensor::make(*graph, t0)),
- z = opr::add(opr::SharedDeviceTensor::make(*graph, t0),
- opr::SharedDeviceTensor::make(*graph, t2));
- EXPECT_EQ(x.node(), y.node());
- EXPECT_NE(x.node(), z.node());
- }
-
- TEST(TestGraph, PODDeDup) {
- auto graph = ComputingGraph::make();
- PODDedupTestOpr::Param param{42, 'x'};
- auto var0 = PODDedupTestOpr::make(*graph, param),
- var1 = PODDedupTestOpr::make(*graph, param);
- param.v1 = 'y';
- auto var2 = PODDedupTestOpr::make(*graph, param);
- EXPECT_NE(var0.node(), var2.node());
- EXPECT_NE(var1.node(), var2.node());
- EXPECT_EQ(var0.node(), var1.node());
- }
-
- TEST(TestGraph, MultiCard) {
- auto cns = load_multiple_xpus(2);
- constexpr size_t SIZE = 123456;
- constexpr double SLEEP_TIME = 0.8, MAX_EXE_TIME = 0.5;
- HostTensorGenerator<> gen;
- auto host_opr0 = gen({SIZE}, cns[0]), host_opr1 = gen({SIZE}, cns[1]);
- auto graph = ComputingGraph::make();
- graph->options().var_sanity_check_first_run = false;
- SymbolVar
- opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"}),
- opr1 = opr::Host2DeviceCopy::make(*graph, host_opr1, {"opr1"}),
- opr0_delay = opr::Sleep::make(opr0, SLEEP_TIME),
- opr1_delay = opr::Sleep::make(opr1, SLEEP_TIME),
- opr1_card0 = opr::Copy::make(opr1_delay,
- OperatorNodeConfig{"opr1_card0"}.comp_node(
- cns[0].change_stream(1))),
- opr0_double = opr::add(opr0_delay, opr0_delay, {"opr0_double"}),
- sum = opr::add(opr0_double, opr1_card0,
- OperatorNodeConfig{"sum"}.comp_node(cns[0].change_stream(2))),
- sum_delay = opr::Sleep::make(sum, SLEEP_TIME);
- HostTensorND host_sum;
- auto func = graph->compile({{
- sum_delay, [&](DeviceTensorND &s){
- host_sum.copy_from(s);
- }}});
-
- RealTimer timer;
- func->execute();
- // the sleep kernel in CUDA is easily affected by GPU frequency changes,
- // so we only log a warning instead of asserting; see XPU-226 for details
- auto use_time = timer.get_secs();
- if (use_time >= MAX_EXE_TIME) {
- mgb_log_warn("expect time [%f < %f], got %f", use_time, MAX_EXE_TIME,
- use_time);
- }
-
- ASSERT_EQ(host_sum.layout(), host_opr0->layout());
-
- auto p0 = host_opr0->ptr<float>(), p1 = host_opr1->ptr<float>(),
- ps = host_sum.sync().ptr<float>();
- // the sleep kernel in CUDA is easily affected by GPU frequency changes,
- // so we only log a warning instead of asserting; see XPU-226 for details
- use_time = timer.get_secs();
- if (use_time <= SLEEP_TIME * 2) {
- mgb_log_warn("expect time [%f > %f], got %f", use_time, SLEEP_TIME * 2,
- use_time);
- }
- use_time = timer.get_secs();
- if (use_time >= SLEEP_TIME * 3) {
- mgb_log_warn("expect time [%f < %f], got %f", use_time, SLEEP_TIME * 3,
- use_time);
- }
- for (size_t i = 0; i < SIZE; i++)
- ASSERT_FLOAT_EQ(p0[i] * 2 + p1[i], ps[i]);
- }
-
- TEST(TestGraph, AsyncExec) {
- static constexpr double SLEEP_TIME = 0.1;
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- graph->options().var_sanity_check_first_run = false;
- auto host_x = gen({1});
- SymbolVar
- x0 = opr::Host2DeviceCopy::make(*graph, host_x),
- xs = opr::Sleep::make(x0, SLEEP_TIME);
- auto func = graph->compile({{xs, [](DeviceTensorND&){}}});
-
- RealTimer timer;
- double t0, t1, t2, t3, t4, t5;
- t0 = timer.get_secs();
- func->execute();
- t1 = timer.get_secs();
- func->wait();
- t2 = timer.get_secs();
- func->execute();
- t3 = timer.get_secs();
- func->execute();
- t4 = timer.get_secs();
- func->wait();
- t5 = timer.get_secs();
-
- // the sleep kernel in CUDA is easily affected by GPU frequency changes,
- // so we only log a warning instead of asserting; see XPU-226 for details
- if ((t1 - t0) >= SLEEP_TIME / 2 || (t2 - t0) <= SLEEP_TIME ||
- (t3 - t2) >= SLEEP_TIME / 2 || (t4 - t2) <= SLEEP_TIME ||
- (t5 - t4) <= SLEEP_TIME / 2 ||
- func->get_prev_exec_time() <= SLEEP_TIME ||
- func->get_prev_exec_time() >= SLEEP_TIME * 1.5) {
- mgb_log_warn(
- "time issue, pls check detail: [t0: %f, t1:%f, t2:%f, t3: %f, "
- "t4: %f, t5: %f]",
- t0, t1, t2, t3, t4, t5);
- }
- }
-
- TEST(TestGraph, VSizeTensor) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1}), host_y = gen({1});
-
- auto graph = ComputingGraph::make();
- auto dev_x = opr::Host2DeviceCopy::make(*graph, host_x),
- dev_y = opr::Host2DeviceCopy::make(*graph, host_y),
- dev_z = dev_x + dev_y;
-
- HostTensorND host_z;
- auto func = graph->compile({{dev_z, [&](DeviceTensorND &z){
- host_z.copy_from(z).sync();
- }}});
-
- auto check = [&](size_t inp_sz) {
- *host_x = *gen({inp_sz});
- *host_y = *gen({inp_sz});
- func->execute();
- ASSERT_EQ(host_z.shape(), TensorShape({inp_sz}));
- auto px = host_x->ptr<float>(), py = host_y->ptr<float>(),
- pz = host_z.ptr<float>();
- for (size_t i = 0; i < inp_sz; i ++)
- ASSERT_EQ(px[i] + py[i], pz[i]);
- };
-
- check(100);
- check(456);
- check(456);
- check(10);
- }
-
- TEST(TestGraph, CompileTwice) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
-
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y1 = x + 1,
- y2 = x + 2,
- z1 = opr::Copy::make(y1),
- z2 = opr::Copy::make(y2);
- EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
- EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());
-
- HostTensorND host_z1, host_z2;
- auto func = graph->compile({make_callback_copy(z1, host_z1)});
- EXPECT_FALSE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
- EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());
- func->execute();
- EXPECT_EQ(host_x->ptr<float>()[0] + 1, host_z1.ptr<float>()[0]);
- EXPECT_FALSE(host_z2.storage().comp_node_valid());
- host_z1.ptr<float>()[0] ++;
-
- func = graph->compile({make_callback_copy(z2, host_z2)});
- EXPECT_TRUE(graph->var_receiver_in_current_comp_seq(y1.node()).empty());
- EXPECT_FALSE(graph->var_receiver_in_current_comp_seq(y2.node()).empty());
- func->execute();
- EXPECT_NE(host_x->ptr<float>()[0] + 1, host_z1.ptr<float>()[0]);
- EXPECT_EQ(host_x->ptr<float>()[0] + 2, host_z2.ptr<float>()[0]);
- }
-
- TEST(TestGraph, MultiCNDynamicInputs) {
- auto cns = load_multiple_xpus(3);
- HostTensorGenerator<> gen;
- auto host_x = gen({5, 2}, cns[0]), host_y = gen({5, 3}, cns[1]);
- auto graph = ComputingGraph::make();
-
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::Host2DeviceCopy::make(*graph, host_y),
- xd = opr::Sleep::make(opr::MarkDynamicVar::make(x), 0.1),
- yd = opr::Sleep::make(opr::MarkDynamicVar::make(y), 0.2),
- z = opr::Concat::make({xd, yd}, 1,
- OperatorNodeConfig().comp_node(cns[2]));
-
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- func->execute();
- ASSERT_EQ(host_z.shape(), TensorShape({5, 5}));
- for (size_t i = 0; i < 5; ++ i)
- for (size_t j = 0; j < 5; ++ j) {
- float expect;
- if (j < 2)
- expect = *host_x->ptr<float>({i, j});
- else
- expect = *host_y->ptr<float>({i, j - 2});
- ASSERT_FLOAT_EQ(expect, *host_z.ptr<float>({i, j}));
- }
- }
-
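- // use the same var as both the data and the shape input of Reshape,
- // exercising dep-map handling when one node appears with multiple dep types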
- TEST(TestGraph, DepMapSameNode) {
- auto run = [](bool dyn) {
- auto graph = ComputingGraph::make();
- auto cn = CompNode::load("xpu0");
- auto x = SymbolVar::make_scalar(1, *graph, cn);
- if (dyn)
- x = opr::MarkDynamicVar::make(x);
- auto y = opr::Reshape::make(x, x);
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y)});
- func->execute();
- ASSERT_EQ(TensorShape{1}, host_y.shape());
- ASSERT_EQ(1, host_y.ptr<dt_int32>()[0]);
- };
- run(false);
- run(true);
- }
-
- TEST(TestGraph, DoubleThrowOnInit) {
- HostTensorGenerator<> gen;
- auto host_x = gen({23});
- auto graph = ComputingGraph::make();
-
- auto x = opr::Host2DeviceCopy::make(*graph, host_x);
- using Ad = opr::SetSubtensor::AxisIndexer;
- std::vector<Ad> axis_desc{
- Ad::make_index(0, x.make_scalar(0.f))};
-
- ASSERT_THROW(opr::SetSubtensor::make(x, x, axis_desc), MegBrainError);
- ASSERT_THROW(opr::SetSubtensor::make(x, x, axis_desc), MegBrainError);
- }
-
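- // vars that are only needed for shape inference should not be computed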
- TEST(TestGraph, ShapeOnlyDep) {
- HostTensorGenerator<> gen;
- auto host_x = gen({2, 3});
- auto graph = ComputingGraph::make();
-
- using Ad = opr::AxisAddRemove::AxisDesc;
- bool shp_dep_exec = false;
- auto cb_set_shp_dep_exec = [&](DeviceTensorND &) {
- shp_dep_exec = true;
- };
- auto add_chk = [&](SymbolVar var) {
- return opr::CallbackInjector::make(var, cb_set_shp_dep_exec);
- };
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- xd_ = opr::MarkDynamicVar::make(x),
- xd = add_chk(x.make_scalar(0)).broadcast(opr::GetVarShape::make(xd_)),
- axadd = add_chk(opr::AxisAddRemove::make(xd, {Ad::make_add(0)})),
- y = opr::GetVarShape::make(axadd);
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y)});
- func->execute();
- TensorShape y_as_shp;
- DeviceTensorND yv{CompNode::default_cpu()};
- yv.copy_from(host_y);
- cg::copy_tensor_value_to_shape(y_as_shp, yv);
- ASSERT_EQ(TensorShape({1, 2, 3}), y_as_shp);
- ASSERT_FALSE(shp_dep_exec);
- }
-
- TEST(TestGraph, MemAllocForAsyncRead) {
- auto cns = load_multiple_xpus(2);
- auto cn1 = cns[1];
- HostTensorGenerator<> gen;
- auto host_x = gen({4, 3}, cns[0]);
-
- std::atomic_bool copy_issued = ATOMIC_VAR_INIT(false);
-
- RealTimer timer;
- auto cb_wait_copy_issue = [&](DeviceTensorND &) {
- while(!copy_issued.load());
- auto t = timer.get_secs();
- mgb_assert(t <= 0.1, "copy issue time too long: %.2f", t);
- };
-
- auto cb_set_copy_issue = [&](DeviceTensorND &) {
- copy_issued.store(true);
- };
-
- auto make_cb_async = [](SymbolVar dev, HostTensorND &host) {
- return std::make_pair(dev, [&](DeviceTensorND &d){host.copy_from(d);});
- };
-
- auto graph = ComputingGraph::make();
- // disable var sanity check to avoid stream sync
- graph->options().var_sanity_check_first_run = false;
- graph->options().seq_opt.enable_seq_comp_node_opt = false;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- xrshp = x.reshape({6, 2}),
- xv0_static = xrshp + 3 - 2,
- xv0_after_copy = opr::CallbackInjector::make(
- xv0_static, cb_wait_copy_issue),
- xdyn = opr::MarkDynamicVar::make(xv0_after_copy),
- y0 = xdyn + 1,
- xcp_cn1 = opr::CallbackInjector::make(
- opr::Copy::make(x, {cn1}), cb_set_copy_issue),
- y1 = xcp_cn1 + 3;
-
- HostTensorND host_y0, host_y1;
- auto func = graph->compile({make_cb_async(y0, host_y0),
- make_cb_async(y1, host_y1)});
-
- timer.reset();
- opr::Sleep::sleep(cn1, 0.2);
- func->execute().wait();
- ASSERT_EQ(x.node()->prev_dev_ptr(), xrshp.node()->prev_dev_ptr());
- ASSERT_NE(x.node()->prev_dev_ptr(), xdyn.node()->prev_dev_ptr());
- ASSERT_EQ(TensorShape({6, 2}), host_y0.shape());
- ASSERT_EQ(TensorShape({4, 3}), host_y1.shape());
- for (size_t i = 0; i < 12; ++ i) {
- auto xv = host_x->ptr<float>()[i];
- MGB_ASSERT_FLOAT_EQ(xv + 2, host_y0.ptr<float>()[i]);
- MGB_ASSERT_FLOAT_EQ(xv + 3, host_y1.ptr<float>()[i]);
- }
- }
-
- TEST(TestGraph, EmptyStaticAlloc) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- for (int i = 0; i < 2; ++i) {
- auto host_x = gen({2, 3});
- auto dev_x = std::make_shared<DeviceTensorND>();
- dev_x->copy_from(*host_x);
- auto x = opr::SharedDeviceTensor::make(*graph, dev_x),
- y = x.reshape({6});
- auto func = graph->compile({{y, {}}});
- func->execute();
- ASSERT_EQ(dev_x->raw_ptr(), prev_dev_ptr(y));
- }
- }
-
- TEST(TestGraph, MultiOutRelease) {
- // output(0) is released before output(1) starts execution, while output(2)
- // is forwarded but never used
- auto cns = load_multiple_xpus(4);
-
- auto cn0 = cns[1], cn1 = cns[2], cn2 = cns[3];
- HostTensorGenerator<> gen;
- auto host_x = gen({6, 3}, cns[0]), host_one = gen({1}, cns[0]);
- host_one->ptr<float>()[0] = 1;
- auto graph = ComputingGraph::make();
-
- // disable var sanity check to avoid stream sync
- graph->options().var_sanity_check_first_run = false;
- graph->options().async_exec_level = 0b10;
-
- std::atomic_bool cn0_finished{false};
-
- float *splt2_dev_ptr_produced = nullptr;
- DeviceTensorStorage splt2_alloc;
- splt2_alloc.comp_node(cn2.change_stream(CompNode::Stream::COPY)).
- ensure_size(6);
-
- VarNode *split_out0 = nullptr;
-
- auto cb_set_cn0_finish = [&](DeviceTensorND &) {
- mgb_assert(split_out0->contain_flag(
- VarNode::Flag::RT_FORCE_DYNAMIC_MEM_ALLOC));
- // wait for async releaser
- while (split_out0->mem_plan().valid()) {
- asm volatile ("": : : "memory");
- }
- mgb_assert(!split_out0->dev_tensor_valid());
-
- splt2_alloc = {};
- cn0_finished.store(true);
- };
-
- auto cb_wait_cn0_finish = [&](DeviceTensorND &) {
- while(!cn0_finished.load());
- };
-
- auto cb_record_ptr = [&](DeviceTensorND &dv) {
- splt2_dev_ptr_produced = dv.ptr<float>();
- };
-
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- tmp = opr::CallbackInjector::make(
- SymbolVar::make_scalar(2.3f, *graph,
- cn1.change_stream(CompNode::Stream::COPY)),
- cb_wait_cn0_finish),
- one0 = opr::Host2DeviceCopy::make(*graph, host_one, {cn0}),
- one1 = opr::Host2DeviceCopy::make(*graph, host_one, {cn1}),
- one2 = opr::Host2DeviceCopy::make(*graph, host_one, {cn2});
- set_priority(tmp, -100);
- // use Host2DeviceCopy to make constant values for multistream add
- auto splt = opr::Split::make(x,
- opr::Split::Options::make_average(0, 3),
- OperatorNodeConfig{}.comp_node_arr({
- cn0.change_stream(23),
- cn1.change_stream(23),
- cn2.change_stream(23)}));
- HostTensorND host_y1;
- split_out0 = splt[0].node();
-
- auto func = graph->compile({
- {opr::add(splt[0], one0, cn0), cb_set_cn0_finish},
- {tmp, {}},
- make_callback_copy(opr::add(splt[1], one1, cn1), host_y1),
- {opr::add(splt[2], one2, cn2), {}}, // mark dynamic
- {splt[2], cb_record_ptr},
- });
-
- func->execute();
- func->to_json()->writeto_fpath(
- output_file("TestGraph.MultiOutRelease.json"));
- ASSERT_EQ(TensorShape({2, 3}), host_y1.shape());
- auto py1 = host_y1.ptr<float>(), px = host_x->ptr<float>({2});
- for (size_t i = 0; i < 6; ++ i)
- MGB_ASSERT_FLOAT_EQ(px[i] + 1, py1[i]);
-
- ASSERT_EQ(splt2_dev_ptr_produced, splt[2].node()->prev_dev_ptr());
- }
-
- TEST(TestGraph, MemAllocForRemoteReadVars) {
- auto cn1 = CompNode::load("xpu0:1");
- HostTensorGenerator<> gen;
- auto host_x = gen({5, 6}),
- host_y = gen({5, 6});
-
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
- y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y).rename("y"),
- sum0 = (opr::Sleep::make(x, 0.2) * x + opr::Sleep::make(y, 0.2) * y).
- rename("sum0"),
- sum1 = opr::add(x, y, {cn1}).rename("sum1");
- HostTensorND host_sum0, host_sum1;
- auto func = graph->compile({make_callback_copy(sum0, host_sum0),
- make_callback_copy(sum1, host_sum1)});
- func->execute();
- func->wait();
- for (bool sleep_cn1: {false, true}) {
- host_sum0 = {};
- host_sum1 = {};
- if (sleep_cn1)
- opr::Sleep::sleep(cn1, 0.5);
-
- func->execute();
-
- auto px = host_x->ptr<float>(), py = host_y->ptr<float>(),
- ps0 = host_sum0.ptr<float>(), ps1 = host_sum1.ptr<float>();
- for (int i = 0; i < 30; ++ i) {
- auto x = px[i], y = py[i];
- ASSERT_FLOAT_EQ(x * x + y * y, ps0[i]);
- ASSERT_FLOAT_EQ(x + y, ps1[i]);
- }
- }
-
- ASSERT_FALSE(cg::is_static_var_storage(x.node()));
- ASSERT_FALSE(cg::is_static_var_storage(y.node()));
- }
-
- TEST(TestGraph, ShapeOnlyInput) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::GetVarShape::make(x);
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y)});
- for (size_t sz: {1, 3, 5}) {
- *host_x = *gen({sz});
- func->execute();
- ASSERT_EQ(sz, size_t(host_y.ptr<dt_int32>()[0]));
- }
- }
-
- TEST(TestGraph, HostAndDevValueDep) {
- HostTensorGenerator<dtype::Int32> gen;
- auto host_idx = gen({1}), host_x = gen({3});
- host_idx->ptr<dt_int32>()[0] = 0;
- for (int i = 0; i < 3; ++ i)
- host_x->ptr<dt_int32>()[i] = i + 1;
-
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
- idx = opr::Host2DeviceCopy::make(*graph, host_idx).rename("idx"),
- xsub = opr::IndexAt::make(x, {{0, idx}}).rename("xsub"),
- idx2 = (idx * idx).rename("idx2"),
- y = (xsub + idx2).rename("y");
-
- set_priority(xsub, -10);
- set_priority(idx2, 10);
-
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y)});
-
- bool found = false;
- for (auto &&i: func->get_rt_static_source_deps()) {
- constexpr auto V = cg::static_infer::DepType::VALUE;
- if (i.dest == idx.node() && i.type == V) {
- found = true;
- break;
- }
- }
- ASSERT_TRUE(found);
-
- for (int i = 0; i < 3; ++ i) {
- host_idx->ptr<dt_int32>()[0] = i;
- func->execute();
- ASSERT_EQ(i + 1 + i * i, host_y.ptr<dt_int32>()[0]) << "fail at " << i;
- }
- }
-
- TEST(TestGraph, ExtraVarDeps) {
- HostTensorND hv{dtype::Float32()};
- hv.comp_node(CompNode::load("xpu0")).resize({1}).ptr<float>()[0] = 0;
- auto dv = std::make_shared<DeviceTensorND>();
- dv->copy_from(hv);
-
- float cbv0 = -1, cbv1 = -1;
-
- auto cb0 = [&](DeviceTensorND &v) {
- cbv0 = HostTensorND().copy_from(v).sync().ptr<float>()[0];
- };
- auto cb1 = [&](DeviceTensorND &v) {
- cbv1 = HostTensorND().copy_from(v).sync().ptr<float>()[0];
- };
-
- auto graph = ComputingGraph::make();
- auto x = opr::SharedDeviceTensor::make(*graph, dv),
- xu = opr::AddUpdate::make(x, x.make_scalar(1.f)),
- y0 = opr::CallbackInjector::make(x, cb0),
- y1 = opr::CallbackInjector::make(xu, cb1);
- graph->options().extra_vardeps[xu.node()].push_back(y0.node());
- graph->options().extra_vardeps[xu.node()].push_back(y1.node());
- auto func = graph->compile({{xu, {}}});
- for (int i = 0; i < 3; ++ i) {
- func->execute();
- MGB_ASSERT_FLOAT_EQ(i, cbv0);
- MGB_ASSERT_FLOAT_EQ(i + 1, cbv1);
- }
- }
-
- TEST(TestGraph, WorkspaceAlloc) {
- auto graph = ComputingGraph::make();
- auto x = SymbolVar::make_scalar(0, *graph, CompNode::load("xpu0"));
- auto opr = graph->insert_opr(
- std::make_unique<WorkspaceAllocTestOpr>(x.node()));
- ASSERT_EQ(2u, opr->output().size());
- ASSERT_EQ(TensorShape{2}, opr->output(0)->shape());
- ASSERT_EQ(TensorShape{}, opr->output(1)->shape());
- auto func = graph->compile({{opr->output(0), {}}});
- func->execute();
- ASSERT_TRUE(opr->cast_final_safe<WorkspaceAllocTestOpr>().executed);
- }
-
- TEST(TestGraph, ConstFolding) {
- auto graph = ComputingGraph::make();
- auto a = SymbolVar::make_scalar(3, *graph, CompNode::load("xpu0")),
- b = SymbolVar::make_scalar(3, *graph, CompNode::load("xpu0")),
- c = a + b,
- d = a + b;
- ASSERT_EQ(a.node(), b.node());
- ASSERT_EQ(c.node(), d.node());
- ASSERT_NE(a.node(), c.node());
- ASSERT_EQ(d.node()->owner_opr()->dyn_typeinfo(),
- opr::ImmutableTensor::typeinfo());
- }
-
- TEST(TestGraph, MergeBroadcast) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
- auto graph = ComputingGraph::make();
- auto a = opr::Host2DeviceCopy::make(*graph, host_x),
- b = a.broadcast({1,2}),
- c = b.broadcast({3,4});
- ASSERT_EQ(a.node(), c.node()->owner_opr()->input(0)); // broadcasts merged
- ASSERT_EQ(c.node()->shape(), TensorShape({3, 4}));
- }
-
- TEST(TestGraph, SwapTypeCvtAndBcast) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
- auto graph = ComputingGraph::make();
- auto a = opr::Host2DeviceCopy::make(*graph, host_x),
- b = a.broadcast({1, 2}), c = opr::TypeCvt::make(b, dtype::Int32());
- ASSERT_EQ(b.node()->owner_opr()->dyn_typeinfo(),
- opr::Broadcast::typeinfo());
- ASSERT_EQ(c.node()->dtype(), dtype::Int32());
- }
-
- TEST(TestGraph, SingleGraphMultipleCompile) {
- HostTensorGenerator<> gen;
- auto host_x = gen({2, 3});
- auto graph = ComputingGraph::make();
- auto x = opr::MarkDynamicVar::make(
- opr::Host2DeviceCopy::make(*graph, host_x)),
- y = x + 1;
- HostTensorND host_y0, host_y1, host_y_expect;
- host_y_expect.copy_from(*host_x);
- for (size_t i = 0, it = host_x->shape().total_nr_elems(); i < it; ++ i)
- host_y_expect.ptr<float>()[i] ++;
-
- auto func0 = graph->compile({make_callback_copy(y, host_y0)});
- func0->execute();
- auto func1 = graph->compile({make_callback_copy(y, host_y1)});
- func1->execute();
-
- ASSERT_THROW(func0->execute(), MegBrainError);
-
- MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y0);
- MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y1);
- }
-
- TEST(TestGraph, VarVirtualReceiverGrad) {
- HostTensorGenerator<> gen;
- constexpr size_t SIZE = 23;
-
- // add a virtual opr that takes (i0, i1, i2) and outputs
- // (i0^2, (i1+1)^3, (i2+2)^4)
- // in this test, i0 = i2 = x, i1 = x * .9f
- //
- // test for var multiple receivers and same input var of virtual opr
-
- auto graph = ComputingGraph::make();
- auto bind_vo = [&](
- const std::array<SymbolVar, 3> &inp,
- const std::array<SymbolVar, 3> &out) {
- HostTensorND iv[3];
- ComputingGraph::OutputSpec outspec;
- for (int i = 0; i < 3; ++ i) {
- outspec.push_back(make_callback_copy(inp[i], iv[i]));
- inp[i].rename(ssprintf("vinp%d", i));
- out[i].rename(ssprintf("vout%d", i));
- }
- graph->compile(outspec)->execute();
-
- auto grad = [](const VarNodeArray &inp, const VarNodeArray&,
- size_t idx, const VarNodeArray &out_grad) {
- SymbolVar
- x = inp[idx],
- exp = x.make_scalar(float(idx + 2)),
- gx = exp * opr::pow(x + float(idx), exp - 1.f) * out_grad[idx];
- return gx.node();
- };
-
- VarNodeArray vinp(3), vout(3);
- for (int i = 0; i < 3; ++ i) {
- vinp[i] = inp[i].node();
- vout[i] = out[i].node();
- }
- cg::add_var_virtual_receiver(vinp, vout, grad);
-
- float *iptr[3], *optr[3];
- for (int i = 0; i < 3; ++ i) {
- iptr[i] = iv[i].ptr<float>();
- optr[i] = out[i].node()->owner_opr()->
- cast_final_safe<opr::Host2DeviceCopy>().host_data()->
- ptr<float>();
- }
- for (size_t i = 0; i < SIZE; ++ i) {
- for (int j = 0; j < 3; ++ j)
- optr[j][i] = std::pow(iptr[j][i] + j, 2.0 + j);
- }
- };
- std::shared_ptr<HostTensorND>
- host_x = gen({SIZE}),
- host_vo[3], host_loss_p[5];
- for (int i = 0; i < 5; ++ i) {
- if (i < 3)
- host_vo[i] = gen({SIZE});
- host_loss_p[i] = gen({SIZE});
- }
-
- auto mkl = [&](SymbolVar x, size_t idx) {
- return opr::Dot::make(x,
- opr::Host2DeviceCopy::make(*graph, host_loss_p[idx]));
- };
-
- auto
- x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
- loss = mkl(x, 3) + mkl(opr::pow(x, x.make_scalar(-1.f)), 4);
- std::array<SymbolVar, 3> vout;
- for (int i = 0; i < 3; ++ i) {
- vout[i] = opr::Host2DeviceCopy::make(*graph, host_vo[i]);
- loss = loss + mkl(vout[i], i);
- }
- bind_vo({x, x * .9f, x}, vout);
-
- HostTensorND gx, host_loss;
- auto func = graph->compile({
- make_callback_copy(cg::grad(loss, x), gx),
- make_callback_copy(loss, host_loss)});
- func->execute();
-
- auto px = host_x->ptr<float>(), pgx = gx.ptr<float>();
- float *plp[5], *pvo[3], scale[5], bias[5], exp[5];
- for (int i = 0; i < 5; ++ i) {
- plp[i] = host_loss_p[i]->ptr<float>();
- scale[i] = 1;
- bias[i] = 0;
- exp[i] = 1;
- if (i < 3)
- pvo[i] = host_vo[i]->ptr<float>();
- }
- exp[0] = 2;
- scale[1] = 0.9; bias[1] = 1; exp[1] = 3;
- bias[2] = 2; exp[2] = 4;
- exp[4] = -1;
- float loss_expect = 0;
- for (size_t i = 0; i < SIZE; ++ i) {
- float gx = 0, x = px[i];
- for (int j = 0; j < 5; ++ j) {
- auto a = scale[j], b = bias[j], c = exp[j];
- // (ax + b)**c
- auto base = a * x + b;
- gx += plp[j][i] * c * a * std::pow(base, c - 1.f);
- loss_expect += plp[j][i] * std::pow(base, c);
-
- if (j < 3) {
- MGB_ASSERT_FLOAT_EQ(std::pow(base, c), pvo[j][i]);
- }
- }
- MGB_ASSERT_FLOAT_EQ(gx, pgx[i]);
- }
- MGB_ASSERT_FLOAT_EQ(loss_expect, host_loss.ptr<float>()[0]);
- }
-
- TEST(TestGraph, ClearDeviceMemory) {
- HostTensorGenerator<> gen;
- auto host_x = gen({2, 3});
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = x + 1;
- auto func = graph->compile({{y, {}}});
- for (int i = 0; i < 2; ++ i) {
- ASSERT_EQ(0u, graph->clear_device_memory());
- func->execute();
- ASSERT_EQ(1u, graph->clear_device_memory());
- ASSERT_EQ(0u, graph->clear_device_memory());
- }
- }
-
- TEST(TestGraph, CopyStream) {
- REQUIRE_GPU(2);
-
- HostTensorGenerator<> gen;
- auto cn0 = CompNode::load("gpu0"),
- cn1 = CompNode::load("gpu1");
- auto host_x = gen({23}, cn0);
- auto sum_sqr = [](SymbolVar x) {
- return opr::reduce_sum_sqr(x, x.make_scalar(1));
- };
- auto graph = ComputingGraph::make();
- graph->options().log_level = 3;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- loss0 = opr::Copy::make(
- sum_sqr(x) + opr::reduce_sum(x, x.make_scalar(1)), cn1),
- loss1 = sum_sqr(opr::Copy::make(x, cn1)),
- gx = opr::VirtualGrad::make(loss0 + loss1, x);
- HostTensorND host_gx;
- auto func = graph->compile({make_callback_copy(gx, host_gx)});
- func->execute();
- ASSERT_EQ(host_gx.shape(), host_x->shape());
- auto px = host_x->ptr<float>(), pgx = host_gx.ptr<float>();
- for (size_t i = 0; i < 23; ++ i) {
- MGB_ASSERT_FLOAT_EQ(px[i] * 4 + 1, pgx[i]);
- }
-
- ASSERT_EQ(int(CompNode::Stream::COPY),
- host_gx.comp_node().locator().stream);
- }
-
- TEST(TestGraph, DynShapeDepCrossCN) {
- auto cns = load_multiple_xpus(2);
- HostTensorGenerator<> gen;
- auto host_x = gen({23}, cns[0]);
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::Sleep::make(x, 0.1, {true, true}),
- a = opr::MarkDynamicVar::make(y),
- ao = opr::Copy::make(a, cns[1].change_stream(1)),
- b = opr::GetVarShape::make(ao, {}, cns[1]) + 1;
- graph->options().var_sanity_check_first_run = false;
- graph->options().async_exec_level |= 0b10;
- set_priority(b, -100);
- HostTensorND host_a, host_b;
- auto func = graph->compile({make_callback_copy(a, host_a, false),
- make_callback_copy(b, host_b, false)});
- func->execute().wait();
- MGB_ASSERT_TENSOR_EQ(*host_x, host_a);
- ASSERT_EQ(TensorShape{1}, host_b.shape());
- ASSERT_EQ(24, host_b.ptr<int>()[0]);
- }
-
- namespace {
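- // slow copy opr whose output is marked VOLATILE_CONTENT; used to check
- // that an AddUpdate on its input waits until the copy has finished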
- MGB_DEFINE_OPR_CLASS(CustomCopy, cg::SingleCNOperatorNodeBase) // {
- std::shared_ptr<DeviceTensorND> m_data;
-
- void scn_do_execute() override {
- using namespace std::literals;
- std::this_thread::sleep_for(100ms);
- m_data->copy_from(input(0)->dev_tensor());
- }
-
- void init_output_static_infer_desc() override {
- using namespace cg::static_infer;
- owner_graph()->static_infer_manager().register_shape_infer(
- output(0), ShapeInferDesc::make_const({}));
- }
-
- public:
- CustomCopy(VarNode* x, std::shared_ptr<DeviceTensorND> dv)
- : Super{x->owner_graph(), {dv->comp_node()}, "d2h", {x}},
- m_data(dv) {
- add_input({x});
- using F = VarNode::Flag;
- add_output(None)
- ->add_flag(F::ALLOW_EMPTY_SHAPE)
- .add_flag(F::VOLATILE_CONTENT);
- }
- };
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(CustomCopy);
- } // anonymous namespace
-
- TEST(TestGraph, DependentOnVolatileContent) {
- HostTensorGenerator<> gen;
- auto cn0 = CompNode::load("xpu0"),
- cn1 = cn0.change_stream(1);
- auto host_x = gen({233}, cn0);
- auto dev_y = std::make_shared<DeviceTensorND>(cn1);
-
- auto graph = ComputingGraph::make();
- auto x = opr::SharedDeviceTensor::make(*graph, *host_x),
- y = x.insert_single_output_opr<CustomCopy>(x.node(), dev_y),
- x_new = opr::AddUpdate::make(x, x.make_scalar(1));
-
- auto func = graph->compile({{y, {}}, {x_new, {}}});
- func->execute().wait();
- HostTensorND host_y;
- host_y.copy_from(*dev_y).sync();
- MGB_ASSERT_TENSOR_EQ(*host_x, host_y);
- }
-
- namespace {
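- // check that dest's owner opr waits exactly on dep (or waits on nothing
- // when dep is an empty SymbolVar)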
- void check_wait(SymbolVar dest, SymbolVar dep) {
- if (!dep.node()) {
- ASSERT_EQ(0u,
- dest.node()->owner_opr()->input_waiting_spec().size());
- return;
- }
- cg::OperatorNodeBase::InputWaitingSpecElem ws;
- unpack_vector(dest.node()->owner_opr()->input_waiting_spec(), ws);
- ASSERT_EQ(ws.comp_node, dest.node()->comp_node());
- VarNode *get;
- unpack_vector(ws.dev_ready, get);
- ASSERT_EQ(dep, get);
- }
- } // anonymous namespace
-
- TEST(TestGraph, InputWaitingSpec) {
- auto cns = load_multiple_xpus(2);
- constexpr size_t SIZE = 12345;
- HostTensorGenerator<> gen;
- auto host_x = gen({SIZE}, cns[0]);
- auto graph = ComputingGraph::make();
- graph->options().seq_opt.enable_seq_comp_node_opt = false; // no copy stream
- auto cn0 = cns[0], cn1 = cns[1];
- auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
- y0 = opr::Copy::make(x, cn1),
- y1 = opr::Copy::make(x + 1, cn1),
- z1 = opr::Copy::make(y1 + 1, cn0),
- z0 = opr::Copy::make(y0 + 1, cn0);
- set_priority(y0, 5);
- set_priority(y1, 10);
- set_priority(z1, 15);
- set_priority(z0, 20);
-
- HostTensorND host_z0, host_z1;
- auto func = graph->compile({
- make_callback_copy(z0, host_z0),
- make_callback_copy(z1, host_z1)});
- func->execute();
-
- auto px = host_x->ptr<float>(),
- pz0 = host_z0.ptr<float>(), pz1 = host_z1.ptr<float>();
- for (size_t i = 0; i < SIZE; ++ i) {
- MGB_ASSERT_FLOAT_EQ(px[i] + 1, pz0[i]);
- MGB_ASSERT_FLOAT_EQ(px[i] + 2, pz1[i]);
- }
- check_wait(y0, x);
- check_wait(y1, x + 1);
- check_wait(z1, y1 + 1);
- check_wait(z0, {});
- }
-
- TEST(TestGraph, InputWaitingSpecMultiOut) {
- auto cn0 = CompNode::load("xpu0:0"), cn1 = CompNode::load("xpu0:1");
- HostTensorGenerator<> gen;
- auto graph = cg::ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- graph->options().var_sanity_check_first_run = 0;
- graph->options().async_exec_level = 0b100;
- graph->options().seq_opt.enable_seq_comp_node_opt = false;
- size_t nr_out = 1024, length = 32;
- auto hv = gen({nr_out * length}, cn0);
- auto x = opr::Host2DeviceCopy::make(*graph, hv);
- auto outs = opr::Split::make(x, opr::Split::Options::make_average(0, nr_out));
- cg::ComputingGraph::OutputSpec output_spec;
- for (size_t i = 0; i < nr_out; ++ i) {
- auto y = opr::Copy::make(outs[i], cn1);
- y.node()->owner_opr()->node_prop().attribute().priority = i ? nr_out - i : 0;
- output_spec.push_back({y, {}});
- }
- auto func = graph->compile(output_spec);
- func->execute().wait();
-
- check_wait(output_spec[0].first, outs[0]);
- check_wait(output_spec[nr_out - 1].first, outs[nr_out - 1]);
- for (size_t i = 1; i < nr_out - 1; ++ i) {
- check_wait(output_spec[i].first, {});
- }
- }
-
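- // with enable_grad_var_static_reshape, the grad var gets a static shape
- // even though the forward var is dynamic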
- TEST(TestGraph, GradStaticShape) {
- for (bool enable: {false, true}) {
- auto graph = ComputingGraph::make();
- graph->options().enable_grad_var_static_reshape = enable;
- HostTensorGenerator<> gen;
- auto host_x = gen({234});
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- x1 = x + 1.f,
- y = opr::MarkDynamicVar::make(x1) * x1,
- gx = cg::grad(opr::reduce_sum(y, y.make_scalar(1)), x);
- ASSERT_FALSE(cg::is_static_var_shape(y.node()));
- ASSERT_EQ(enable, cg::is_static_var_shape(gx.node()));
-
- HostTensorND host_gx;
- auto func = graph->compile({make_callback_copy(gx, host_gx)});
- func->execute();
- auto px = host_x->ptr<float>(), pgx = host_gx.ptr<float>();
- for (size_t i = 0; i < 234; ++ i) {
- MGB_ASSERT_FLOAT_EQ(2 * (px[i] + 1), pgx[i]);
- }
- }
- }
-
- TEST(TestGraph, AllInputGrad) {
- auto graph = ComputingGraph::make();
- HostTensorGenerator<> gen;
- auto host_x = gen({2}), host_y = gen({2});
- auto x = opr::Host2DeviceCopy::make(*graph, host_x, {"x"}),
- y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"}),
- z = x.insert_single_output_opr<AllInputGradOpr>(x.node(), y.node()),
- loss0 = opr::reduce_sum_sqr(z, z.make_scalar(1)),
- loss1 = opr::reduce_sum_sqr(z * 2, z.make_scalar(1));
-
- auto&& op = z.node()->owner_opr()->cast_final_safe<AllInputGradOpr>();
- auto grad = [](SymbolVar x, SymbolVar y) {
- return cg::grad(x, y, true, false);
- };
- auto gx0 = grad(loss0, x), gy0 = grad(loss0, y);
- ASSERT_EQ(1u, op.nr_grad_call);
- ASSERT_EQ(x + y, gx0);
- ASSERT_EQ(nullptr, gy0.node());
-
- auto gx1 = grad(loss1, x), gy1 = grad(loss1, y);
- ASSERT_EQ(2u, op.nr_grad_call);
- ASSERT_EQ(nullptr, gx1.node());
- ASSERT_EQ(x * y, gy1);
- }
-
- TEST(TestGraph, CPPMemLeak) {
- auto run = [](){
- auto graph = ComputingGraph::make();
- HostTensorGenerator<> gen;
- auto host_x = gen({23}, "cpux");
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y0 = x + 1.f,
- y1 = x + 1.f;
- ASSERT_EQ(y0, y1); // opr dedup calls clear() in static inference
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y0, host_y)});
- func->execute();
- auto px = host_x->ptr<float>(), py = host_y.ptr<float>();
- for (size_t i = 0; i < 23; ++ i) {
- MGB_ASSERT_FLOAT_EQ(px[i] + 1, py[i]);
- }
- };
- // finalize first so the runs below re-initialize global objects
- CompNode::finalize();
- run();
- run(); // memleak should be caught by asan, if there is any
- }
-
- TEST(TestGraph, ReplaceVarHelper) {
- auto graph = ComputingGraph::make();
- HostTensorGenerator<> gen;
- auto x = opr::Host2DeviceCopy::make(*graph, gen({2})),
- y = x + 1.f,
- z = y + 1.f;
- auto x1 = opr::Host2DeviceCopy::make(*graph, gen({3}));
- SymbolVar y1, z1;
- unpack_vector(cg::replace_vars({y, z}, {{x, x1}}), y1, z1);
- ASSERT_EQ(x1 + 1.f, y1);
- ASSERT_EQ(y1 + 1.f, z1);
- }
-
- TEST(TestGraph, ReplaceVarWithDeps) {
- auto cn = CompNode::load("xpu0");
- auto graph = ComputingGraph::make();
- HostTensorGenerator<> gen;
-
- using Iter = std::pair<std::shared_ptr<DeviceTensorND>, SymbolVar>;
-
- auto make_iter = [&]() -> Iter {
- HostTensorND host(cn, {1});
- host.ptr<float>()[0] = 0.0;
- auto dev = opr::SharedDeviceTensor::make(*graph, host);
- auto iter = opr::AddUpdate::make(dev, dev.make_scalar(1));
- return {dev.node()
- ->owner_opr()
- ->cast_final_safe<opr::SharedDeviceTensor>()
- .dev_data(),
- iter};
- };
- auto check_iter = [&](float val, const Iter& iter) {
- HostTensorND host(cn, {1});
- host.copy_from_fixlayout(*iter.first);
- host.sync();
- MGB_ASSERT_FLOAT_EQ(val, host.ptr<float>()[0]);
- };
-
- auto iter0 = make_iter();
- auto iter1 = make_iter();
- auto iter2 = make_iter();
- auto iter3 = make_iter();
-
- auto a = iter0.second + 1;
- auto b = iter1.second + 2;
- auto c = b * 5;
-
- graph->options().extra_vardeps[b.node()].push_back(a.node());
-
- auto y = cg::replace_vars({c},
- {{iter0.second.node(), iter2.second.node()},
- {iter1.second.node(), iter3.second.node()}});
-
- ASSERT_EQ(y.size(), 1u);
-
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y[0], host_y)});
-
- func->execute();
-
- check_iter(0, iter0);
- check_iter(0, iter1);
- check_iter(1, iter2);
- check_iter(1, iter3);
- }
-
- TEST(TestGraph, EmptyShapeCheck) {
- auto cn = CompNode::load("xpux");
- auto graph = ComputingGraph::make();
- auto host_x = std::make_shared<HostTensorND>(cn, TensorShape{1});
- host_x->ptr<float>()[0] = 2;
- using Param = opr::CondTake::Param;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::CondTake::make(x, x, {Param::Mode::GT})[0],
- z = opr::reduce_sum(y, y.make_scalar(1));
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- func->execute();
- MGB_ASSERT_FLOAT_EQ(2.f, host_z.ptr<float>()[0]);
-
- host_x->ptr<float>()[0] = -2;
- ASSERT_THROW(
- {
- try {
- func->execute();
- } catch (const MegBrainError& exc) {
- std::string msg{exc.what()};
- ASSERT_TRUE(msg.find("empty output var") !=
- std::string::npos)
- << "bad message " << msg;
- throw;
- }
- },
- MegBrainError);
- }
-
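- // after execution each var's refcnt should drop to zero; storage remains
- // valid only for statically allocated vars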
- TEST(TestGraph, RefCntManage) {
- HostTensorGenerator<> gen;
- auto cns = load_multiple_xpus(2);
- auto graph = ComputingGraph::make();
- auto host_x = gen({2, 3}, cns[0]), host_y = gen({1, 3}, cns[1]);
- auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
- y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y).rename("y"),
- x_cn1 = opr::Copy::make(x, {cns[1]}).rename("x_cn1"),
- z = (x_cn1 + y).rename("z");
- HostTensorND host_z;
- // disable comp node opt to avoid copy stream
- graph->options().seq_opt.enable_seq_comp_node_opt = false;
- graph->compile({make_callback_copy(z, host_z)})->execute();
- auto chk_dyn = [](SymbolVar var) {
- auto v = var.node();
- ASSERT_FALSE(cg::is_static_var_storage(v)) << v->name();
- ASSERT_FALSE(v->dev_tensor_valid()) << v->name();
- ASSERT_EQ(0u, v->refcnt()) << v->name();
- };
-
- bool cross_cn_mem_share = cns[0].mem_node() == cns[1].mem_node();
-
- for (auto i : {x, y, x_cn1, z}) {
- ASSERT_EQ(0u, i.node()->refcnt()) << i.node()->name();
- if (i.node() == x.node() ||
- (cross_cn_mem_share && i.node() == x_cn1.node())) {
- chk_dyn(i);
- } else {
- ASSERT_TRUE(cg::is_static_var_storage(i.node()))
- << i.node()->name();
- ASSERT_TRUE(i.node()->dev_tensor_valid()) << i.node()->name();
- }
- }
-
- graph->options().force_dynamic_alloc = true;
- HostTensorND host_z1;
- graph->compile({make_callback_copy(z, host_z1)})->execute();
- MGB_ASSERT_TENSOR_EQ(host_z, host_z1);
- for (auto i : {x, y, x_cn1, z}) {
- chk_dyn(i);
- }
-
- // var with refcnt and without reader
- graph->compile({{z, {}}})->execute().wait();
- chk_dyn(z);
- }
-
- TEST(TestGraph, CompNodeFinalize) {
- for (int rec = 0; rec < 3; ++rec) {
- auto cn = CompNode::load(rec ? "cpu0" : "xpux");
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- auto host_x = gen({1}, cn), host_y = gen({1}, cn);
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::Host2DeviceCopy::make(*graph, host_y), z = x + y;
- HostTensorND host_z;
- if (rec) {
- graph->options().var_sanity_check_first_run = false;
- graph->options().comp_node_seq_record_level = rec;
- }
- auto func = graph->compile({make_callback_copy(z, host_z)});
- if (rec == 2) {
- ComputingGraph::assert_destroy(graph);
- }
- for (int i = 0; i < 5; ++i) {
- host_x->copy_from(*gen({1}, cn));
- func->execute();
- MGB_ASSERT_FLOAT_EQ(
- host_x->ptr<float>()[0] + host_y->ptr<float>()[0],
- host_z.ptr<float>()[0]);
- }
- CompNode::finalize();
- ASSERT_THROW(func->execute(), InternalError);
- }
- }
-
- namespace {
- class GraphHolder final : public UserDataContainer::UserData {
- MGB_TYPEINFO_OBJ_DECL;
- std::shared_ptr<ComputingGraph> m_graph;
- int* m_del_chk;
-
- public:
- GraphHolder(std::shared_ptr<ComputingGraph> graph, int* del_chk)
- : m_graph{std::move(graph)}, m_del_chk{del_chk} {}
- ~GraphHolder() { ++*m_del_chk; }
- };
- MGB_TYPEINFO_OBJ_IMPL(GraphHolder);
- } // anonymous namespace
-
- TEST(TestGraph, CompNodeFinalizeRecursive) {
- // recursive case may occur in python
- int del_chk = 0;
- auto graph = ComputingGraph::make();
- graph->options().user_data.get_user_data_or_create<GraphHolder>([&]() {
- return std::make_shared<GraphHolder>(std::move(graph), &del_chk);
- });
- graph.reset();
- ASSERT_EQ(0, del_chk);
- CompNode::finalize();
- ASSERT_EQ(1, del_chk);
- }
-
- #if MGB_NEED_MEGDNN_ASYNC_ERROR
- TEST(TestGraph, SignalCompSeqExecFinishedAsyncError) {
- REQUIRE_GPU(1);
- auto graph = ComputingGraph::make();
- HostTensorGenerator<dtype::Int32> gen;
- auto host_x = gen({10});
- auto host_y = gen({1});
- host_y->ptr<int>()[0] = 20;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x);
- auto y = opr::Host2DeviceCopy::make(*graph, host_y);
- y = opr::MarkDynamicVar::make(y);
- using AIdx = opr::indexing::AxisIndexer;
- auto out1 = opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});
- size_t exec_cnt = 0;
- auto cb = [&exec_cnt](const cg::event::CompSeqExecFinished &ev) {
- MGB_MARK_USED_VAR(ev);
- exec_cnt ++;
- };
- auto handle = graph->event().register_receiver<cg::event::CompSeqExecFinished>(cb);
- auto func = graph->compile({{out1, {}}});
- for(size_t i = 0; i < 3; ++ i) {
- ASSERT_THROW({
- try {
- func->execute().wait();
- } catch(const MegBrainError&) {
- ASSERT_EQ(exec_cnt, i + 1);
- throw;
- }
- }, MegBrainError);
- }
- }
-
- TEST(TestGraph, RecoverFromAsyncError) {
- REQUIRE_GPU(1);
- auto graph = ComputingGraph::make();
- HostTensorGenerator<dtype::Int32> gen;
- auto host_x = gen({10});
- auto host_y = gen({1});
- host_y->ptr<int>()[0] = 5;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x);
- auto y = opr::Host2DeviceCopy::make(*graph, host_y);
- y = opr::MarkDynamicVar::make(y);
- using AIdx = opr::indexing::AxisIndexer;
- auto out1 = opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});
-
- auto func = graph->compile({{out1, {}}});
-
- func->execute().wait();
-
- ASSERT_THROW({
- try{
- host_y->ptr<int>()[0] = 20;
- func->execute().wait();
- } catch(const MegBrainError&) {
- host_y->ptr<int>()[0] = 5;
- throw;
- }
- }, MegBrainError);
-
- func->execute().wait();
- }
-
- TEST(TestGraph, AsyncErrorMultiCompGraph) {
- REQUIRE_GPU(1);
- HostTensorGenerator<dtype::Int32> gen;
- auto host_x = gen({10});
- auto host_y0 = gen({1}), host_y1 = gen({1});
-
- auto gen_func = [&](decltype(host_y0) host_y) {
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x);
- auto y = opr::Host2DeviceCopy::make(*graph, host_y);
- y = opr::MarkDynamicVar::make(y);
- using AIdx = opr::indexing::AxisIndexer;
- auto out1 =
- opr::IndexingMultiAxisVec::make({x}, {AIdx::make_index(0, y)});
- return graph->compile({{out1, {}}});
- };
-
- auto func0 = gen_func(host_y0);
- auto func1 = gen_func(host_y1);
-
- ASSERT_THROW({
- host_y0->ptr<int>()[0] = 20;
- host_y1->ptr<int>()[0] = 5;
- ASSERT_NO_THROW({
- func0->execute();
- func1->execute().wait();
- });
- func0->wait();
- }, MegBrainError);
-
- ASSERT_NO_THROW({
- host_y0->ptr<int>()[0] = 5;
- host_y1->ptr<int>()[0] = 5;
- func0->execute().wait();
- func1->execute().wait();
- });
- }
- #endif
-
- TEST(TestGraph, WaitAfterException) {
- auto graph = ComputingGraph::make();
- HostTensorGenerator<dtype::Int32> gen;
- auto host_x = gen({10});
- auto host_y = gen({10});
- size_t flag;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x);
- auto y = opr::Host2DeviceCopy::make(*graph, host_y);
- auto z = opr::CallbackInjector::make(
- x + y,
- [&](DeviceTensorND &) {
- mgb_throw_if(flag, MegBrainError,
- "throw exception after a + b.");
- }
- );
- auto cb = [&](const cg::event::CompSeqExecFinished &ev) {
- MGB_MARK_USED_VAR(ev);
- mgb_throw_if(flag, MegBrainError,
- "It should not signal CompSeqExecFinished "
- "if any exception is thrown during execution.");
- };
- auto handle = graph->event().register_receiver<cg::event::CompSeqExecFinished>(cb);
- auto func = graph->compile({{z, {}}});
-
- flag = 1;
- ASSERT_THROW(func->execute(), MegBrainError);
- ASSERT_NO_THROW(func->wait());
- flag = 0;
- ASSERT_NO_THROW(func->execute().wait());
- }
-
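- // ExecEnv::pause_exec() should block further opr dispatch until
- // resume_exec() is called; verified by callbacks on two chained oprs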
- TEST(TestGraph, PauseExecEnv) {
- auto graph = ComputingGraph::make();
- graph->options().var_sanity_check_first_run = false;
- graph->options().async_exec_level = 0b100;
- HostTensorGenerator<> gen;
- auto host_x = gen({2, 3}, CompNode::default_cpu());
- std::atomic_bool flag0{false}, flag1{false};
- auto cb0 = [&flag0](DeviceTensorND&) {
- flag0 = true;
- while (flag0.load()) {
- std::this_thread::yield();
- }
- };
- auto cb1 = [&flag1](DeviceTensorND&) { flag1 = true; };
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::CallbackInjector::make(x, cb0),
- z = opr::CallbackInjector::make(y, cb1);
- auto func = graph->compile({{z, {}}});
-
- auto exec_env = static_cast<cg::ComputingGraphImpl*>(graph.get())
- ->current_exec_env();
- auto worker = [&flag0, &flag1, exec_env]() {
- while (!flag0.load()) {
- std::this_thread::yield();
- }
- exec_env->pause_exec();
- flag0 = false;
-
- using namespace std::chrono_literals;
- std::this_thread::sleep_for(100ms);
- ASSERT_FALSE(flag1.load());
- exec_env->resume_exec();
- std::this_thread::sleep_for(100ms);
- ASSERT_TRUE(flag1.load());
- };
- std::thread worker_th{worker};
-
- func->execute();
- func->wait();
- worker_th.join();
- }
-
- TEST(TestGraph, CustomStaticDeviceMemoryAllocator) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1, 3});
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = x + opr::ImmutableTensor::make(*graph, *gen({2, 1}));
- auto func = graph->compile({{y, {}}});
- auto allocator = std::make_shared<TrackableStaticMemAlloc>();
- graph->set_device_memory_allocator(allocator);
-
- ASSERT_EQ(0u, allocator->nr_call());
- ThinHashSet<const void*> y_addrs;
- size_t expected_nr_call = 1;
- auto check = [&]() {
- func->execute();
- y_addrs.insert(prev_dev_ptr(y));
- ASSERT_EQ(expected_nr_call, allocator->nr_call());
- ASSERT_EQ(expected_nr_call, y_addrs.size());
- };
-
- for (int i = 1; i < 12; ++i) {
- if (i % 3 == 0) {
- ++expected_nr_call;
- ++allocator->version_num;
- }
- check();
- }
-
- *host_x = *gen({1, 1023});
- ++expected_nr_call;
- check();
-
- *host_x = *gen({1, 2047});
- allocator->size_expect = func->update_static_alloc_plan_and_get_size().at(
- host_x->comp_node());
- ASSERT_EQ(expected_nr_call, allocator->nr_call());
- ++expected_nr_call;
- check();
-
- allocator->version_num = TrackableStaticMemAlloc::VERSION_INVALID;
- ASSERT_THROW(func->execute(), MegBrainError);
- }
-
- TEST(TestGraph, CustomDynamicDeviceMemoryAllocator) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
- auto allocator = std::make_shared<TrackableDynamicMemAlloc>();
- SymbolVar x, xp1, y, z;
- auto cb = [&](DeviceTensorND& dv) {
- HostTensorND hv;
- hv.copy_from(dv).sync();
- ASSERT_EQ(host_x->ptr<float>()[0] + 1.f, hv.ptr<float>()[0]);
- // CallbackInjector output should reuse its input, so only one var here
- EXPECT_EQ(1u, allocator->alive_vars().count(xp1.node()));
- EXPECT_EQ(1u, allocator->alive_vars().size());
- };
- auto graph = ComputingGraph::make();
- graph->options().graph_opt_level = 0;
- graph->options().force_dynamic_alloc = true;
-
- x = opr::Host2DeviceCopy::make(*graph, host_x);
- xp1 = x + 1;
- y = opr::CallbackInjector::make(xp1, cb);
- z = y * 2;
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- graph->set_device_memory_allocator(allocator);
-
- ASSERT_TRUE(allocator->alive_vars().empty());
- func->execute();
- ASSERT_EQ(2.f * (host_x->ptr<float>()[0] + 1.f), host_z.ptr<float>()[0]);
- ASSERT_TRUE(allocator->alive_vars().empty());
-
- *host_x = *gen({1});
- func->execute();
- ASSERT_EQ(2.f * (host_x->ptr<float>()[0] + 1.f), host_z.ptr<float>()[0]);
- ASSERT_TRUE(allocator->alive_vars().empty());
- }
-
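- // an opr registered to a disabled ExecutionMask is skipped at execution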
- TEST(TestGraph, ExecutionMask) {
- HostTensorGenerator<> gen;
- auto host_x = gen({1});
- int called = 0;
- auto cb = [&](DeviceTensorND&) { ++called; };
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- y = opr::CallbackInjector::make(x, cb);
- auto exec_mask = std::make_shared<cg::ExecutionMask>(nullptr);
- exec_mask->register_to_opr(y.node()->owner_opr());
- auto func = graph->compile({{y, {}}});
- func->execute();
- ASSERT_EQ(0, called);
- exec_mask->enable(true);
- func->execute();
- ASSERT_EQ(1, called);
- func->execute();
- ASSERT_EQ(2, called);
- exec_mask->enable(false);
- func->execute();
- ASSERT_EQ(2, called);
- }
-
- TEST(TestGraph, AsyncRelease) {
- // check that async release happens before var mem plans are reset (once a
- // mem plan is reset, the var's storage may be reclaimed and reused)
- HostTensorGenerator<> gen;
- auto host_x = gen({1024});
- auto dev_x = std::make_shared<DeviceTensorND>();
- dev_x->copy_from(*host_x).sync();
- auto cn1 = host_x->comp_node().change_stream(1);
-
- auto host_tshp = std::make_shared<HostTensorND>(
- host_x->comp_node(), TensorShape{2}, dtype::Int32{});
- auto set_shape = [p = host_tshp->ptr<int>()](int x) {
- p[0] = 1 << x;
- p[1] = 1 << (10 - x);
- };
- set_shape(0);
-
- auto graph = ComputingGraph::make();
- auto x = opr::SharedDeviceTensor::make(*graph, dev_x),
- tshp = opr::Host2DeviceCopy::make(*graph, host_tshp),
- x_fwd = x.reshape(tshp), y = opr::Sleep::make(x_fwd, 0.05, {}, cn1);
- auto func = graph->compile({{y, {}}});
-
- ASSERT_TRUE(cg::is_static_var_storage(x.node()));
- ASSERT_FALSE(cg::is_static_var_storage(x_fwd.node()));
-
- for (int i = 0; i < 3; ++i) {
- set_shape(i + 1);
- func->execute();
- ASSERT_EQ(prev_dev_ptr(x_fwd), dev_x->raw_ptr());
- ASSERT_EQ(TensorShape({2u << i, 1u << (9 - i)}), y.shape());
- }
- }
-
- TEST(TestGraph, UpdateStaticAllocPlan) {
- HostTensorGenerator<> gen;
- auto host_x = gen({3});
- auto graph = ComputingGraph::make();
- auto x = opr::Sleep::make(opr::Host2DeviceCopy::make(*graph, host_x), 0.5),
- y = x + opr::ImmutableTensor::make(*graph, *gen({1}));
- auto func = graph->compile({{y, {}}});
- func->update_static_alloc_plan_and_get_size();
- func->execute();
-
- *host_x = *gen({1023});
- func->execute();
-
- *host_x = *gen({2047});
- func->update_static_alloc_plan_and_get_size();
- func->execute();
- }
-
- TEST(TestGraph, CPUGPUHybrid) {
- REQUIRE_GPU(1);
- auto cn_gpu = CompNode::load("gpu0");
- for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
- auto graph = ComputingGraph::make();
- HostTensorGenerator<> gen;
- constexpr size_t length = 23333;
- auto host_x = gen({length});
- graph->options().var_sanity_check_first_run = false;
- auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
- y = opr::Sleep::make(x, 0.5) * 2,
- z_gpu = opr::Copy::make(y, cn_gpu) + 1,
- z = opr::Copy::make(z_gpu, cn_cpu) * 2;
- HostTensorND host_z;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- func->execute();
- for (size_t i = 0; i < length; ++ i) {
- MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
- host_z.ptr<float>()[i]);
- }
- }
- }
-
- TEST(TestGraph, In2OutOpStreamPropagate) {
- REQUIRE_GPU(1); // seq_comp_node_opt works on comp_node with HAS_COPY_STREAM
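- // register a copy-stream hint on one input of VirtualDep and check how it
- // propagates: the output follows input 0, so only a hint on inp[0] should
- // move the output onto the copy stream, for both STRONG and WEAK hints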
- HostTensorGenerator<> gen;
- SmallVector<std::shared_ptr<HostTensorND>> host_v = {gen({233}), gen({23})};
- using PropType = cg::SeqCompNodeOptimizer::StreamPropType;
- for (auto type : {PropType::STRONG, PropType::WEAK})
- for (size_t idx : {0, 1}) {
- auto graph = ComputingGraph::make();
- SymbolVarArray inp(2);
- for (size_t i = 0; i < 2; ++i) {
- inp[i] = opr::Host2DeviceCopy::make(*graph, host_v[i]);
- }
- auto out = opr::VirtualDep::make(inp);
- auto &&mgr = static_cast<cg::SeqCompNodeOptimizerImpl&>(graph->seq_comp_node_optimizer());
- mgr.register_stream_var(inp[idx].node(), PropType{CompNode::Stream::COPY, type});
- mgr.optimize_comp_nodes({out.node()});
- ASSERT_EQ(inp[0].node()->comp_node(), out.node()->comp_node());
- auto o_stream = out.node()->comp_node().locator().stream;
- int expect = idx ? 0 : int(CompNode::Stream::COPY);
- ASSERT_EQ(o_stream, expect);
- }
- }
-
- TEST(TestGraph, OperatorNodeConfigInstanceID) {
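- // instance_id is derived from the sequence of update_instance_id() calls:
- // the same pointers applied in the same order give equal ids, while
- // different pointers or a different order give distinct ids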
- OperatorNodeConfig config0, config1;
- void *p0 = &config0, *p1 = &config1;
- { // set and reset
- ASSERT_EQ(config0.instance_id(), config1.instance_id());
- config0.update_instance_id(p0);
- ASSERT_NE(config0.instance_id(), config1.instance_id());
- config0.reset_instance_id();
- ASSERT_EQ(config0.instance_id(), config1.instance_id());
- }
- { // set to the same pointer
- config0.reset_instance_id();
- config0.update_instance_id(p1);
- config1.reset_instance_id();
- config1.update_instance_id(p1);
- ASSERT_EQ(config0.instance_id(), config1.instance_id());
- }
- { // check update semantics
- config0.reset_instance_id();
- config0.update_instance_id(p0);
- config1.reset_instance_id();
- config1.update_instance_id(p1);
- ASSERT_NE(config0.instance_id(), config1.instance_id());
- config0.update_instance_id(p1);
- ASSERT_NE(config0.instance_id(), config1.instance_id());
- }
- { // set in different order
- config0.reset_instance_id();
- config0.update_instance_id(p1);
- config0.update_instance_id(p0);
- config1.reset_instance_id();
- config1.update_instance_id(p0);
- config1.update_instance_id(p1);
- ASSERT_NE(config0.instance_id(), config1.instance_id());
- }
- }
-
- TEST(TestGraph, NaiveRecord2NCHW44) {
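- // with comp_node_seq_record_level == 2 the kernel sequence is recorded when
- // the function is compiled; the graph itself can then be destroyed and the
- // function replayed on refreshed input data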
- auto cn = CompNode::load("cpu0");
- using ConvParam = megdnn::ConvBias::Param;
- ConvParam param;
- param.sparse = ConvParam::Sparse::DENSE;
- param.format = ConvParam::Format::NCHW44;
- HostTensorGenerator<> gen;
- auto host_x = gen({1, 2, 12, 12, 4}, cn),
- host_w = gen({2, 2, 3, 3, 4, 4}, cn),
- host_b = gen({1, 2, 1, 1, 4}, cn);
-
- HostTensorND host_z;
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make(*graph, host_x),
- w = opr::Host2DeviceCopy::make(*graph, host_w),
- b = opr::Host2DeviceCopy::make(*graph, host_b),
- z = opr::ConvBiasForward::make(x, w, b, param, {});
- graph->options().comp_node_seq_record_level = 2;
- graph->options().var_sanity_check_first_run = false;
- auto func = graph->compile({make_callback_copy(z, host_z)});
- ComputingGraph::assert_destroy(graph);
- host_x->copy_from_fixlayout(*gen(host_x->shape(), cn));
- func->execute().wait();
- }
-
- namespace {
- template <typename DnnOp, typename... Args>
- megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo(
- DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
- Args&& ...args) {
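- // scan all algorithms of dnn_op and pick the first one that reports a
- // non-empty preprocessed-filter layout; the result is memoized in `found`
- // so repeated calls reuse the first decision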
- if (found.valid()) {
- if (found.val()) {
- return dnn_op->execution_policy();
- } else {
- return {};
- }
- }
- for (auto&& algo : dnn_op->get_all_algorithms_info(
- std::forward<Args>(args)...)) {
- dnn_op->execution_policy().algo = algo.desc;
- auto layouts = dnn_op->deduce_preprocessed_filter_layout(
- std::forward<Args>(args)...);
- if (layouts.empty()) continue;
- bool valid = false;
- for (auto&& l: layouts) {
- if (!l.is_empty()) {
- valid = true;
- break;
- }
- }
- if (valid) {
- found.emplace(true);
- return {algo.desc, {}};
- }
- }
- found.emplace(false);
- mgb_log_warn("Can't find weight preprocess algo for op %s", mgb_info);
- return {};
- }
-
- template <typename DnnOp, typename... Args>
- megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo(
- DnnOp* dnn_op, const char* mgb_info, Maybe<bool>& found,
- Args&& ...args) {
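- // like the weight variant above, but requires at least two preprocessed
- // layouts and a non-empty second entry, which corresponds to the bias slot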
- if (found.valid()) {
- if (found.val()) {
- return dnn_op->execution_policy();
- } else {
- return {};
- }
- }
- for (auto&& algo : dnn_op->get_all_algorithms_info(
- std::forward<Args>(args)...)) {
- dnn_op->execution_policy().algo = algo.desc;
- auto layouts = dnn_op->deduce_preprocessed_filter_layout(
- std::forward<Args>(args)...);
- if (layouts.size() <= 1)
- continue;
- bool valid = false;
- if (!layouts[1].is_empty()) {
- valid = true;
- }
- if (valid) {
- found.emplace(true);
- return {algo.desc, {}};
- }
- }
- found.emplace(false);
- mgb_log_warn("Can't find bias preprocess algo for op %s", mgb_info);
- return {};
- }
-
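- // with weight_preprocess enabled, the first execution flags const weights
- // as MEMORY_NO_NEED and the second execution actually frees them; the
- // checks below apply only when a weight-preprocess algo exists on this
- // comp node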
- void test_free_memory_in_weight_preprocess(int record_level, CompNode cn) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- graph->options().graph_opt.weight_preprocess = true;
- graph->options().comp_node_seq_record_level = record_level;
- auto mkvar = [&](const char* name, const TensorShape& shp) {
- return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp) {
- return opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn))
- .rename(name);
- };
- auto x = mkvar("x", {1, 32, 16, 16});
- // ConvBias: test the DENSE sparse mode
- opr::ConvBias::Param param_conv_bias;
- param_conv_bias.pad_h = param_conv_bias.pad_w = 0;
- param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
- auto w1 = mkcvar("w1", {32, 32, 1, 1}), b1 = mkcvar("b1", {1, 32, 1, 1});
- auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
- Maybe<bool> wp1, wp2;
- conv1.node()->owner_opr()->cast_final_safe<opr::ConvBias>()
- .setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
- return try_find_any_weight_preprocess_algo(
- opr->cast_final_safe<opr::ConvBias>().megdnn_opr(),
- opr->cname(), wp1,
- opr->input(0)->layout(), opr->input(1)->layout(),
- opr->input(2)->layout(), TensorLayout{},
- opr->output(0)->layout());
- });
- // Convolution
- opr::Convolution::Param param_conv;
- param_conv.pad_h = param_conv.pad_w = 0;
- param_conv.sparse = opr::Convolution::Param::Sparse::DENSE;
- auto w2 = mkcvar("w2", {32, 32, 1, 1});
- auto y = opr::Convolution::make(conv1, w2, param_conv);
- y.node()->owner_opr()->cast_final_safe<opr::Convolution>()
- .setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
- return try_find_any_weight_preprocess_algo(
- opr->cast_final_safe<opr::Convolution>().megdnn_opr(),
- opr->cname(), wp2,
- opr->input(0)->layout(), opr->input(1)->layout(),
- opr->output(0)->layout());
- });
-
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y)});
- //! the first execution flags vars whose memory is no longer needed
- func->execute();
- //! the second execution frees the flagged memory
- func->execute();
- auto check = [&](SymbolVar v) {
- ASSERT_TRUE(v.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
- ASSERT_TRUE(v.node()->dev_tensor().empty());
- ASSERT_TRUE(v.node()->owner_opr()
- ->cast_final_safe<opr::SharedDeviceTensor>()
- .get_dev_tensor()
- .empty());
- };
- ASSERT_TRUE(wp1.valid() && wp2.valid());
- if (wp1.val()) {
- check(w1);
- }
- if (wp2.val()) {
- check(w2);
- }
- }
- } // anonymous namespace
-
- TEST(TestGraph, FreeMemoryInWeightPreprocess) {
- test_free_memory_in_weight_preprocess(0, CompNode::load("xpu0"));
- }
-
- TEST(TestGraph, RecordFreeMemoryInWeightPreprocess) {
- test_free_memory_in_weight_preprocess(1, CompNode::load("cpu0"));
- }
-
- namespace {
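- // an opr that declares a HOST_VALUE dependency on its input, forcing the
- // static infer manager to fetch the input's host value on every execution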
- MGB_DEFINE_OPR_CLASS(HostValueReader, cg::SingleCNOutshapePureByInshapeOprBase) // {
- void scn_do_execute() override {
- auto&& hv = owner_graph()->static_infer_manager().infer_value(input(0));
- MGB_MARK_USED_VAR(hv);
- }
-
- NodeProp* do_make_node_prop() const override {
- auto ret = Super::do_make_node_prop();
- ret->dep_map()[input(0)] = NodeProp::DepType::HOST_VALUE;
- return ret;
- }
-
- void get_output_var_shape(
- const TensorShapeArray &,
- TensorShapeArray &out_shape) const override {
- out_shape.at(0) = {};
- }
-
- public:
- HostValueReader(VarNode* inp)
- : Super{inp->owner_graph(), {}, "host_value_reader", {inp}} {
- add_input({inp});
- using F = VarNode::Flag;
- add_output(None)
- ->add_flag(F::ALLOW_EMPTY_SHAPE)
- .add_flag(F::VOLATILE_CONTENT);
- }
-
- static SymbolVar make(SymbolVar inp) {
- return inp.node()->owner_graph()->insert_opr(
- std::make_unique<HostValueReader>(inp.node()))->output(0);
- }
- };
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(HostValueReader);
- } // anonymous namespace
-
- TEST(TestGraph, FreeMemoryInWeightPreprocessWithValueInfer) {
- HostTensorGenerator<> gen;
- CompNode cn = CompNode::load("xpux");
- auto graph = ComputingGraph::make();
- graph->options().graph_opt.weight_preprocess = true;
- graph->options().var_sanity_check_first_run = false;
- auto mkvar = [&](const char* name, const TensorShape& shp) {
- return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp) {
- return opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn))
- .rename(name);
- };
- auto x = mkvar("x", {1, 32, 16, 16});
- auto w = mkcvar("w", {32, 32, 1, 1});
- auto y = opr::Convolution::make(x, w);
- Maybe<bool> found;
- y.node()->owner_opr()->cast_final_safe<opr::Convolution>()
- .setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
- return try_find_any_weight_preprocess_algo(
- opr->cast_final_safe<opr::Convolution>().megdnn_opr(),
- opr->cname(), found,
- opr->input(0)->layout(), opr->input(1)->layout(),
- opr->output(0)->layout());
- });
- auto reader = HostValueReader::make(w);
-
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(y, host_y), {reader, {}}});
- func->execute();
- // FIXME: fails on the second execution: it requires the host value of an
- // empty tensor that was already freed by weight preprocessing
- func->execute();
- ASSERT_FALSE(w.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
- ASSERT_FALSE(w.node()->dev_tensor().empty());
- ASSERT_FALSE(w.node()->owner_opr()
- ->cast_final_safe<opr::SharedDeviceTensor>()
- .get_dev_tensor()
- .empty());
- }
-
- TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) {
- HostTensorGenerator<> gen;
- CompNode cn = CompNode::load("xpux");
- auto graph = ComputingGraph::make();
- graph->options().graph_opt.weight_preprocess = true;
- graph->options().var_sanity_check_first_run = false;
- graph->options().graph_opt_level = 0;
- auto mkvar = [&](const char* name, const TensorShape& shp) {
- return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp) {
- return opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn))
- .rename(name);
- };
- auto x = mkvar("x", {1, 32, 16, 16});
- auto w = mkcvar("w", {32, 32, 1, 1});
- auto y = opr::Convolution::make(x, w);
- Maybe<bool> found;
- y.node()->owner_opr()->cast_final_safe<opr::Convolution>()
- .setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
- return try_find_any_weight_preprocess_algo(
- opr->cast_final_safe<opr::Convolution>().megdnn_opr(),
- opr->cname(), found,
- opr->input(0)->layout(), opr->input(1)->layout(),
- opr->output(0)->layout());
- });
- auto y1 = w * 2 + 1;
-
- HostTensorND host_y, host_y1;
- auto func = graph->compile({
- make_callback_copy(y, host_y), make_callback_copy(y1, host_y1)});
- func->execute();
- // FIXME: fails on the second execution: the expression (w * 2 + 1) is
- // evaluated with an empty tensor
- func->execute();
- ASSERT_FALSE(w.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
- ASSERT_FALSE(w.node()->dev_tensor().empty());
- ASSERT_FALSE(w.node()->owner_opr()
- ->cast_final_safe<opr::SharedDeviceTensor>()
- .get_dev_tensor()
- .empty());
- }
-
- TEST(TestGraph, FreeBias) {
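- // same as FreeMemoryInWeightPreprocess, but the operand expected to be
- // freed is the const bias b1 rather than a weight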
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- auto cn = CompNode::load("xpu0");
- graph->options().graph_opt.weight_preprocess = true;
- auto mkvar = [&](const char* name, const TensorShape& shp) {
- return opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
- };
- auto mkcvar = [&](const char* name, const TensorShape& shp) {
- return opr::SharedDeviceTensor::make_const(*graph, *gen(shp, cn))
- .rename(name);
- };
- auto x = mkvar("x", {1, 32, 16, 16});
- // ConvBias: test the DENSE sparse mode
- opr::ConvBias::Param param_conv_bias;
- param_conv_bias.pad_h = param_conv_bias.pad_w = 0;
- param_conv_bias.sparse = opr::ConvBias::Param::Sparse::DENSE;
- auto w1 = mkcvar("w1", {32, 32, 1, 1}), b1 = mkcvar("b1", {1, 32, 1, 1});
- auto conv1 = opr::ConvBias::make(x, w1, b1, param_conv_bias);
- auto w2 = mkcvar("w2", {32, 32, 1, 1});
- auto conv2 = opr::ConvBias::make(conv1, w2, param_conv_bias);
- Maybe<bool> wp1;
- conv1.node()->owner_opr()->cast_final_safe<opr::ConvBias>()
- .setup_algo_chooser([&](const cg::OperatorNodeBase* opr) {
- return try_find_any_bias_preprocess_algo(
- opr->cast_final_safe<opr::ConvBias>().megdnn_opr(),
- opr->cname(), wp1,
- opr->input(0)->layout(), opr->input(1)->layout(),
- opr->input(2)->layout(), TensorLayout{},
- opr->output(0)->layout());
- });
-
- HostTensorND host_y;
- auto func = graph->compile({make_callback_copy(conv2, host_y)});
- //! the first execution flags vars whose memory is no longer needed
- func->execute();
- //! the second execution frees the flagged memory
- func->execute();
- auto check = [&](SymbolVar v) {
- ASSERT_TRUE(v.node()->contain_flag(VarNode::Flag::MEMORY_NO_NEED));
- ASSERT_TRUE(v.node()->dev_tensor().empty());
- ASSERT_TRUE(v.node()->owner_opr()
- ->cast_final_safe<opr::SharedDeviceTensor>()
- .get_dev_tensor()
- .empty());
- };
- ASSERT_TRUE(wp1.valid());
- if (wp1.val()) {
- check(b1);
- }
- }
-
- TEST(TestGraph, CallbackCaller) {
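- // compile four callbacks on the outputs of two Split oprs (one split
- // spanning multiple comp nodes) and verify that every partition matches
- // the corresponding slice of the source tensor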
- using namespace opr;
- auto cns = load_multiple_xpus(3);
- constexpr size_t C1 = 20, C2 = 30, C3 = 10, C4 = 40;
- constexpr size_t N = 2, C = C1 + C2;
- HostTensorGenerator<> gen;
- auto host_opr0 = gen({N, C}, cns[0]);
- auto graph = ComputingGraph::make();
- SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"});
-
- auto spl0 = opr::Split::make(
- opr0, Split::Options::make_partition(opr0, 1, {C1, C2}),
- OperatorNodeConfig("split0").comp_node_arr({cns[1], cns[2]}));
-
- auto spl1 = opr::Split::make(
- opr0, Split::Options::make_partition(opr0, 1, {C3, C4}),
- OperatorNodeConfig("split1"));
-
- HostTensorND host_spl00, host_spl01, host_spl10, host_spl11;
- auto func = graph->compile({make_callback_copy(spl0[0], host_spl00),
- make_callback_copy(spl0[1], host_spl01),
- make_callback_copy(spl1[0], host_spl10),
- make_callback_copy(spl1[1], host_spl11)});
- func->execute();
- auto o00 = host_spl00.ptr<float>(),
- o01 = host_spl01.ptr<float>(),
- o10 = host_spl10.ptr<float>(),
- o11 = host_spl11.ptr<float>(), c = host_opr0->ptr<float>();
- for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; i++) {
- auto ch = i % C;
- auto n = i / C;
- if (ch < C1) {
- MGB_ASSERT_FLOAT_EQ(o00[n * C1 + ch], c[i])
- << ssprintf("failed at %zd", i);
- } else {
- MGB_ASSERT_FLOAT_EQ(o01[n * C2 + ch - C1], c[i])
- << ssprintf("failed at %zd", i);
- }
- if (ch < C3) {
- MGB_ASSERT_FLOAT_EQ(o10[n * C3 + ch], c[i])
- << ssprintf("failed at %zd", i);
- } else {
- MGB_ASSERT_FLOAT_EQ(o11[n * C4 + ch - C3], c[i])
- << ssprintf("failed at %zd", i);
- }
- }
- }
-
- // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
|