@@ -72,7 +72,7 @@ DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
         dev_tensor.reset(storage, layout);
         return dev_tensor;
     }
-    MGB_TRY { return alloc_workspace(cn, layout); }
+    MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
     MGB_CATCH(MemAllocError&, {
         mgb_log_warn("memory allocation failed for workspace; try defragmenting");
         defrag(cn);
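
Note on the hunk above: `return`ing from inside `MGB_TRY` exited on the success path, while the `MGB_CATCH` recovery path assigns into `dev_tensor` and presumably falls through to a shared `return dev_tensor;` after the catch block (not shown in the hunk). Assigning in both paths gives the function a single exit. A minimal self-contained sketch of the same alloc-then-defrag-retry pattern; `alloc_once` and `defragment` are hypothetical stand-ins, not MegEngine's internals:

```cpp
#include <cstddef>
#include <cstdio>
#include <new>
#include <vector>

static bool g_first_attempt_fails = true;

std::vector<char> alloc_once(std::size_t bytes) {
    if (g_first_attempt_fails) {
        g_first_attempt_fails = false;
        throw std::bad_alloc{};  // simulate a fragmented heap on the first try
    }
    return std::vector<char>(bytes);
}

void defragment() { std::puts("defragmenting..."); }

std::vector<char> alloc_with_retry(std::size_t bytes) {
    std::vector<char> buf;
    try {
        buf = alloc_once(bytes);  // fast path: first attempt succeeds
    } catch (const std::bad_alloc&) {
        defragment();             // reclaim space, then retry once
        buf = alloc_once(bytes);  // a second failure propagates to the caller
    }
    return buf;  // single exit shared by both paths
}

int main() {
    std::printf("got %zu bytes\n", alloc_with_retry(1024).size());
}
```
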
@@ -583,9 +583,7 @@ TensorInfo* ChannelImpl::alloc() {
     auto& state = get_channel_state();
     auto info = [this] {
         MGB_LOCK_GUARD(m_pool_spin);
-        auto* ptr = m_pool.alloc_raw();
-        new (ptr) TensorInfo();
-        return (TensorInfo*)ptr;
+        return m_pool.alloc();
     }();
     info->id = Profiler::next_id();
     if (Profiler::is_profiling()) {
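
Note: this replacement assumes `m_pool.alloc()` constructs the `TensorInfo` itself, folding the removed raw-alloc, placement-new, and cast into one typed call. A sketch of such a wrapper; `TypedPool` and its `alloc_raw` stand-in are illustrative, not MegEngine's actual `MemPool` API:

```cpp
#include <new>
#include <utility>

template <typename T>
class TypedPool {
public:
    // alloc() = alloc_raw() + placement-new + typed pointer, i.e. exactly
    // the three removed lines collapsed into one call.
    template <typename... Args>
    T* alloc(Args&&... args) {
        void* ptr = alloc_raw();
        return new (ptr) T(std::forward<Args>(args)...);
    }

    void free(T* obj) {
        obj->~T();               // destroy in place
        ::operator delete(obj);  // stand-in; a real pool would recycle the slot
    }

private:
    void* alloc_raw() { return ::operator new(sizeof(T)); }  // stand-in for pooled storage
};
```

Usage: `TypedPool<int> p; int* x = p.alloc(42); p.free(x);` — call sites never touch raw pointers or placement-new.
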
@@ -816,7 +814,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
     for (auto&& [device, kernel_id] : kernels) {
         MGB_RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device);
         MGB_RECORD_EVENT_IF(
-                profiling_device, RecordDeviceEvent, Timer::record_device(device));
+                (Profiler::get_option("profile_device", 0)), RecordDeviceEvent,
+                Timer::record_device(device));
     }
     // Apply op
     SmallVector<LogicalTensorDesc> output_descs;
@@ -830,7 +829,8 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
     // After execute
     for (auto&& [device, kernel_id] : kernels) {
         MGB_RECORD_EVENT_IF(
-                profiling_device, RecordDeviceEvent, Timer::record_device(device));
+                (Profiler::get_option("profile_device", 0)), RecordDeviceEvent,
+                Timer::record_device(device));
         MGB_RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device);
     }
     // End profiling operator
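
Note covering this hunk and the one before it: both swap the cached `profiling_device` flag for a live `Profiler::get_option("profile_device", 0)` lookup, presumably so that toggling the option mid-run takes effect on subsequent kernel records. A sketch of the lookup-with-default idiom; the `Options` class below is illustrative, not MegEngine's `Profiler`:

```cpp
#include <string>
#include <unordered_map>

class Options {
public:
    void set(const std::string& key, int value) { m_table[key] = value; }

    // get_option(key, fallback): live lookup with a caller-supplied default,
    // mirroring the Profiler::get_option("profile_device", 0) shape above.
    int get_option(const std::string& key, int fallback) const {
        auto it = m_table.find(key);
        return it == m_table.end() ? fallback : it->second;
    }

private:
    std::unordered_map<std::string, int> m_table;
};
```
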
@@ -847,9 +847,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
         MGB_RECORD_EVENT(OpOutputEvent, output->id);
         produce_tensor(output, outputs[i]);
         MGB_RECORD_EVENT(OpOutputFinishEvent, output->id);
-        if (Profiler::is_profiling()) {
-            sample_on_device(output->desc.comp_node, false);
-        }
+        sample_on_device(output->desc.comp_node, false);
     }
 }
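
Note: the `Profiler::is_profiling()` guard around `sample_on_device` is dropped at this call site. One reading (not verified against the rest of the patch) is that the check lives inside the callee, so call sites stay unconditional, as in this hypothetical sketch:

```cpp
#include <atomic>

std::atomic<bool> g_profiling{false};

// The callee early-outs when profiling is off and sampling is not forced,
// so callers no longer need to wrap every call in an is_profiling() check.
void sample_on_device(bool force) {
    if (!force && !g_profiling.load(std::memory_order_relaxed))
        return;
    // ... take a device memory snapshot here ...
}
```
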
@@ -0,0 +1,88 @@
+#include "megbrain/imperative/opr_utility.h"
+#include "megbrain/imperative/ops/autogen.h"
+#include "megbrain/imperative/utils/stats.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/blas.h"
+#include "megbrain/opr/utility.h"
+
+#include "../blob_manager_impl.h"
+#include "../dnn_op_helper.h"
+#include "../op_trait.h"
+
+namespace mgb {
+namespace imperative {
+namespace {
+namespace dot {
+
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& op = def.cast_final_safe<Dot>();
+    mgb_assert(inputs.size() == 2);
+    OperatorNodeConfig config{op.make_name()};
+    return opr::Dot::make(inputs[0], inputs[1], config);
+}
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    auto comp_node = inputs[0]->comp_node();
+    using TensorND = megdnn::TensorND;
+    SmallVector<TensorND> inp_tensornds;
+    inp_tensornds.reserve(inputs.size());
+    auto&& dnn_opr = opr::intl::create_megdnn_opr<megdnn::Dot>(comp_node);
+    for (unsigned i = 0; i < inputs.size(); ++i) {
+        auto dnn_ten = inputs[i]->dnn_tensor();
+        inp_tensornds.push_back(dnn_ten);
+    }
+    TensorLayout oup_layout{inputs[0]->dtype()};
+    auto inp1_tensor = inputs[0]->dnn_tensor();
+    auto inp2_tensor = inputs[1]->dnn_tensor();
+    dnn_opr->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout);
+    if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) {
+        auto fill_opr = opr::intl::create_megdnn_opr<megdnn::Fill>(comp_node);
+        DeviceTensorND out =
+                BlobManager::inst()->alloc_workspace_with_defrag(comp_node, oup_layout);
+        fill_opr->param() = 0;
+        fill_opr->exec(out.as_megdnn(), {});
+        return {Tensor::make(out)};
+    }
+    auto wk_size = dnn_opr->get_workspace_in_bytes(
+            inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout);
+    DeviceTensorND out_devtensor =
+            BlobManager::inst()->alloc_workspace_with_defrag(comp_node, oup_layout);
+    TensorLayout wk_layout{TensorShape{wk_size}, inputs[0]->dtype()};
+    DeviceTensorND workspace =
+            BlobManager::inst()->alloc_workspace_with_defrag(comp_node, wk_layout);
+    megdnn::Workspace dnn_wk(workspace.raw_ptr(), wk_size);
+    dnn_opr->exec(
+            inp_tensornds[0], inp_tensornds[1], out_devtensor.as_megdnn(), dnn_wk);
+    return {Tensor::make(out_devtensor)};
+}
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    mgb_assert(
+            inputs.size() == 2, "Dot expects 2 inputs; got %lu actually",
+            inputs.size());
+    SmallVector<LogicalTensorDesc> dests(1);
+    dests[0].layout = TensorLayout(TensorShape{1}, inputs[0].layout.dtype);
+    dests[0].comp_node = inputs[0].comp_node;
+    bool validated = inputs[0].layout.ndim != 0 && inputs[1].layout.ndim != 0;
+    return {dests, validated};
+}
+
+OP_TRAIT_REG(Dot, Dot, mgb::opr::Dot)
+        .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .fallback();
+
+} // namespace dot
+} // anonymous namespace
+} // namespace imperative
+} // namespace mgb
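
Note on the new file: `Dot` reduces two equal-length 1-D inputs to a single element, hence the fixed `TensorShape{1}` output in `infer_output_attrs_fallible`, and a dot product over empty inputs is 0, which is why the empty-layout branch simply `Fill`s the freshly allocated output with 0 instead of invoking the kernel. A plain-C++ restatement of that contract:

```cpp
#include <cstddef>
#include <cstdio>

double dot(const double* a, const double* b, std::size_t n) {
    double acc = 0.0;  // n == 0 falls through and returns 0
    for (std::size_t i = 0; i < n; ++i)
        acc += a[i] * b[i];
    return acc;
}

int main() {
    double a[] = {1, 2, 3}, b[] = {4, 5, 6};
    std::printf("%g\n", dot(a, b, 3));  // 1*4 + 2*5 + 3*6 = 32
    std::printf("%g\n", dot(a, b, 0));  // 0, matching the Fill-with-0 fast path
}
```
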
@@ -372,81 +372,6 @@ OP_TRAIT_REG(BatchedMatrixMul, BatchedMatrixMul)
 } // namespace batched_matrix_mul
 } // namespace
 
-namespace {
-namespace dot {
-
-auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
-    auto&& op = def.cast_final_safe<Dot>();
-    mgb_assert(inputs.size() == 2);
-    OperatorNodeConfig config{op.make_name()};
-    return opr::Dot::make(inputs[0], inputs[1], config);
-}
-
-// std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
-//     auto* node = &node_->cast_final_safe<opr::Dot>();
-//     return Dot::make(node->param());
-// }
-
-SmallVector<TensorPtr> apply_on_physical_tensor(
-        const OpDef& def, const SmallVector<TensorPtr>& inputs,
-        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
-    auto a = inputs[0]->layout();
-    auto comp_node = inputs[0]->comp_node();
-    using TensorND = megdnn::TensorND;
-    SmallVector<TensorND> inp_tensornds;
-    inp_tensornds.reserve(inputs.size());
-    auto dnn_opr = opr::intl::create_megdnn_opr<megdnn::Dot>(comp_node);
-    for (unsigned i = 0; i < inputs.size(); ++i) {
-        auto dnn_ten = inputs[i]->dnn_tensor();
-        inp_tensornds.push_back(dnn_ten);
-    }
-    TensorLayout oup_layout{inputs[0]->dtype()};
-    auto inp1_tensor = inputs[0]->dnn_tensor();
-    auto inp2_tensor = inputs[1]->dnn_tensor();
-    dnn_opr->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout);
-    if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) {
-        auto fill_opr = opr::intl::create_megdnn_opr<megdnn::Fill>(comp_node);
-        DeviceTensorND out =
-                BlobManager::inst()->alloc_workspace_with_defrag(comp_node, oup_layout);
-        fill_opr->param() = 0;
-        fill_opr->exec(out.as_megdnn(), {});
-        return {Tensor::make(out)};
-    }
-    auto wk_size = dnn_opr->get_workspace_in_bytes(
-            inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout);
-    DeviceTensorND out_devtensor =
-            BlobManager::inst()->alloc_workspace_with_defrag(comp_node, oup_layout);
-    TensorLayout wk_layout{TensorShape{wk_size}, inputs[0]->dtype()};
-    DeviceTensorND workspace =
-            BlobManager::inst()->alloc_workspace_with_defrag(comp_node, wk_layout);
-    megdnn::Workspace dnn_wk(workspace.raw_ptr(), wk_size);
-    dnn_opr->exec(
-            inp_tensornds[0], inp_tensornds[1], out_devtensor.as_megdnn(), dnn_wk);
-    return {Tensor::make(out_devtensor)};
-}
-
-std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
-        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
-    auto&& op_def = def.cast_final_safe<Dot>();
-    SmallVector<LogicalTensorDesc> dests(1);
-    dests[0].layout = TensorLayout(TensorShape{1}, inputs[0].layout.dtype);
-    dests[0].comp_node = inputs[0].comp_node;
-    return {dests, true};
-}
-
-OP_TRAIT_REG(Dot, Dot, opr::Dot)
-        .apply_on_var_node(apply_on_var_node)
-        .infer_output_attrs_fallible(infer_output_attrs_fallible)
-        .apply_on_physical_tensor(apply_on_physical_tensor)
-        .fallback();
-
-} // namespace dot
-} // namespace
 
 namespace {
 namespace argsort {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {