GitOrigin-RevId: 402cba209a
tags/v1.11.0
| @@ -397,7 +397,8 @@ public: | |||
| OutputDType infer_dtype(DType data, DType mask); | |||
| virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0; | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& data, const TensorLayout& mask) = 0; | |||
| virtual Output exec( | |||
| _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
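Every CondTake call site now passes the mask layout when sizing the workspace; the test-suite hunk near the end of this diff shows the updated pattern. A minimal caller-side sketch (data, mask and malloc_policy as in that test code):

    // query the workspace with both layouts, then execute
    size_t ws_size = opr->get_workspace_in_bytes(data->layout, mask->layout);
    auto ws_ptr = malloc_policy.alloc_workspace(ws_size, nullptr);
    auto out = opr->exec(*data, *mask, {(dt_byte*)ws_ptr, ws_size}, &malloc_policy);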
| @@ -512,7 +513,8 @@ public: | |||
| virtual void exec( | |||
| _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayoutArray& srcs, TensorLayout& dst); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; | |||
| @@ -596,7 +598,7 @@ public: | |||
| _megdnn_workspace workspace) = 0; | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorShapeArray& srcs, const TensorShape& offsets, | |||
| const TensorShape& srcs, const TensorShape& offsets, | |||
| const TensorShape& dst) = 0; | |||
| }; | |||
| @@ -1145,7 +1147,7 @@ protected: | |||
| /*! | |||
| * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis) | |||
| */ | |||
| static size_t deduce_layout_fwd( | |||
| MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd( | |||
| const TensorLayout& data, const IndexDescLayoutOnly& index, | |||
| TensorLayout& dst); | |||
| @@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase { | |||
| public: | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorNDArray& srcs, const TensorLayout& dst) = 0; | |||
| const TensorLayoutArray& srcs, const TensorLayout& dst) = 0; | |||
| void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayoutArray& srcs, TensorLayout& dst); | |||
| virtual void exec( | |||
| _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | |||
| @@ -1420,7 +1423,7 @@ public: | |||
| } | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) = 0; | |||
| void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
| MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
| const TensorLayout& src, TensorLayout& dst, const Param& p); | |||
| @@ -1464,7 +1467,7 @@ public: | |||
| const TensorLayout& m_t, const TensorLayout& v_t, | |||
| const TensorLayout& new_param) = 0; | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& m_t_1, const TensorLayout& v_t_1, | |||
| const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, | |||
| TensorLayout& v_t, TensorLayout& new_param); | |||
| @@ -27,7 +27,8 @@ public: | |||
| _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, | |||
| _megdnn_workspace workspace) = 0; | |||
| MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); | |||
| void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; | |||
| @@ -64,7 +65,8 @@ public: | |||
| _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C, | |||
| _megdnn_workspace workspace) = 0; | |||
| MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C); | |||
| void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& A, const TensorLayout& B, TensorLayout& C); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0; | |||
| @@ -224,9 +224,9 @@ public: | |||
| const TensorLayout& src_layout, _megdnn_tensor_in filter, | |||
| const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_dtype(DType src, DType filter, DType& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst); | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); | |||
| /** | |||
| @@ -300,7 +300,7 @@ public: | |||
| const TensorLayout& grad) = 0; | |||
| MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad); | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); | |||
| static Algorithm::OprType get_opr_type() { | |||
| @@ -378,6 +378,12 @@ public: | |||
| const PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) = 0; | |||
| MGE_WIN_DECLSPEC_FUC void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, | |||
| _megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| exec(src, filter, bias, z, dst, nullptr, workspace); | |||
| } | |||
| /** | |||
| * \brief execute weight preprocessing, read weights from filter and bias, | |||
| * write to preprocessed_filter after preprocessing. | |||
| @@ -390,8 +396,9 @@ public: | |||
| _megdnn_tensor_in bias, const TensorLayout& z_layout, | |||
| const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst); | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_dtype( | |||
| DType src, DType filter, DType bias, DType z, DType& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst); | |||
| @@ -775,7 +782,7 @@ protected: | |||
| void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); | |||
| public: | |||
| MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
| static void deduce_layout_impl( | |||
| const TensorLayout& src, const Param& param, TensorLayout& dst); | |||
| }; | |||
| @@ -791,7 +798,7 @@ public: | |||
| virtual void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) = 0; | |||
| @@ -1253,7 +1260,7 @@ public: | |||
| virtual void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst); | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& filter, | |||
| @@ -1281,18 +1288,16 @@ public: | |||
| * \param[in] diff (n, oc, od, oh, ow) | |||
| * \param[out] grad (n, ic, id, ih, iw) | |||
| */ | |||
| MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl( | |||
| static void deduce_layout_impl( | |||
| const TensorLayout& filter, const TensorLayout& diff, const Param& param, | |||
| TensorLayout& grad); | |||
| virtual void exec( | |||
| _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||
| _megdnn_workspace workspace) = 0; | |||
| virtual size_t get_workspace_in_bytes( | |||
| const TensorLayout& filter, const TensorLayout& diff, | |||
| const TensorLayout& grad) = 0; | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); | |||
| static Algorithm::OprType get_opr_type() { | |||
| @@ -1472,7 +1477,7 @@ public: | |||
| virtual void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst, | |||
| _megdnn_tensor_out index, _megdnn_workspace workspace) = 0; | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& src, const TensorLayout& rois, TensorLayout& dst, | |||
| TensorLayout& index); | |||
| virtual size_t get_workspace_in_bytes( | |||
| @@ -1963,7 +1968,7 @@ public: | |||
| _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias, | |||
| _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_layout( | |||
| MGE_WIN_DECLSPEC_FUC void deduce_layout( | |||
| const TensorLayout& data, const TensorLayout& weight, | |||
| const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean, | |||
| TensorLayout& rstd); | |||
| @@ -7,7 +7,11 @@ void CheckNonFinite::check_exec( | |||
| const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) { | |||
| megdnn_assert_contiguous(dst.layout); | |||
| megdnn_assert(srcs.size() > 0); | |||
| auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout); | |||
| TensorLayoutArray src_layouts; | |||
| for (auto&& src : srcs) { | |||
| src_layouts.push_back(src.layout); | |||
| } | |||
| auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); | |||
| } | |||
| @@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size( | |||
| mask.TensorShape::to_string().c_str()); | |||
| megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous()); | |||
| megdnn_assert(m_param.eps > 0, "eps must be positive; got: %g", m_param.eps); | |||
| megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data)); | |||
| megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask)); | |||
| return data.total_nr_elems(); | |||
| } | |||
| @@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout( | |||
| const TensorLayout& m_t_1, const TensorLayout& v_t_1, | |||
| const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t, | |||
| TensorLayout& v_t, TensorLayout& new_param) { | |||
| m_t = TensorLayout(m_t_1); | |||
| v_t = TensorLayout(v_t_1); | |||
| new_param = TensorLayout(lamb_param); | |||
| m_t = m_t_1; | |||
| v_t = v_t_1; | |||
| new_param = lamb_param; | |||
| MEGDNN_MARK_USED_VAR(grad); | |||
| } | |||
| @@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() { | |||
| } | |||
| size_t CheckNonFiniteImpl::get_workspace_in_bytes( | |||
| const TensorNDArray& srcs, const TensorLayout&) { | |||
| const TensorLayoutArray& srcs, const TensorLayout&) { | |||
| m_size = 0; | |||
| for (const auto& src : srcs) { | |||
| m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max); | |||
| m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max); | |||
| } | |||
| if (srcs.begin()->layout.dtype == dtype::Float32()) { | |||
| if (srcs.begin()->dtype == dtype::Float32()) { | |||
| return _get_workspace_in_bytes<dt_float32>(); | |||
| } else if (srcs.begin()->layout.dtype == dtype::Float16()) { | |||
| } else if (srcs.begin()->dtype == dtype::Float16()) { | |||
| return _get_workspace_in_bytes<dt_float16>(); | |||
| } else { | |||
| megdnn_log_warn("only support fp16 and fp32, fallback to fp32"); | |||
| @@ -19,7 +19,7 @@ public: | |||
| using CheckNonFinite::CheckNonFinite; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorNDArray& srcs, const TensorLayout& dst) override; | |||
| const TensorLayoutArray& srcs, const TensorLayout& dst) override; | |||
| bool is_thread_safe() const override { return true; } | |||
| @@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) { | |||
| handle()->alignment_requirement()}; | |||
| } | |||
| size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { | |||
| size_t CondTakeImpl::get_workspace_in_bytes( | |||
| const TensorLayout& data, const TensorLayout&) { | |||
| return make_bundle(data.total_nr_elems()).total_size_in_bytes(); | |||
| } | |||
| @@ -15,7 +15,8 @@ public: | |||
| _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
| DynOutMallocPolicyCall malloc_policy) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout& data) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& data, const TensorLayout& mask) override; | |||
| }; | |||
| } // namespace cuda | |||
| @@ -6,8 +6,8 @@ namespace megdnn { | |||
| namespace cuda { | |||
| size_t ParamPackConcatImpl::get_workspace_in_bytes( | |||
| const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { | |||
| return sizeof(size_t) * srcs.size(); | |||
| const TensorShape&, const TensorShape& offsets, const TensorShape&) { | |||
| return sizeof(size_t) * (offsets.shape[0] / 2); | |||
| } | |||
| template <typename T> | |||
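The new size depends on the layout of offsets: it stores a begin/end offset pair for every packed parameter, so offsets.shape[0] / 2 is the number of parameters and the kernel needs one size_t slot per parameter. A sketch of the same computation (variable names are illustrative):

    // offsets = [begin_0, end_0, begin_1, end_1, ...]
    size_t nr_params = offsets.shape[0] / 2;       // e.g. 16 offsets -> 8 params
    size_t ws_bytes = sizeof(size_t) * nr_params;  // 8 params -> 64 bytes on LP64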
| @@ -12,7 +12,7 @@ public: | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorShapeArray& srcs, const TensorShape& table, | |||
| const TensorShape& srcs, const TensorShape& table, | |||
| const TensorShape& dst) override; | |||
| private: | |||
| @@ -13,7 +13,8 @@ public: | |||
| bool is_thread_safe() const override { return true; } | |||
| size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override { | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayoutArray&, const TensorLayout&) override { | |||
| m_size = 0; | |||
| return _get_workspace_in_bytes(); | |||
| } | |||
| @@ -38,7 +38,8 @@ void copy_data( | |||
| } // anonymous namespace | |||
| size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) { | |||
| size_t CondTakeImpl::get_workspace_in_bytes( | |||
| const TensorLayout& data, const TensorLayout&) { | |||
| return (data.total_nr_elems() + 1) * sizeof(dt_int32); | |||
| } | |||
| @@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake { | |||
| public: | |||
| using CondTake::CondTake; | |||
| size_t get_workspace_in_bytes(const TensorLayout& data) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorLayout& data, const TensorLayout& mask) override; | |||
| Output exec( | |||
| _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace, | |||
| @@ -11,7 +11,7 @@ public: | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorShapeArray&, const TensorShape&, const TensorShape&) override { | |||
| const TensorShape&, const TensorShape&, const TensorShape&) override { | |||
| return 0; | |||
| } | |||
| }; | |||
| @@ -7,8 +7,8 @@ namespace megdnn { | |||
| namespace rocm { | |||
| size_t ParamPackConcatImpl::get_workspace_in_bytes( | |||
| const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) { | |||
| return sizeof(size_t) * srcs.size(); | |||
| const TensorShape&, const TensorShape& offsets, const TensorShape&) { | |||
| return sizeof(size_t) * (offsets.shape[0] / 2); | |||
| } | |||
| template <typename T> | |||
| @@ -12,7 +12,7 @@ public: | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes( | |||
| const TensorShapeArray& srcs, const TensorShape& table, | |||
| const TensorShape& srcs, const TensorShape& table, | |||
| const TensorShape& dst) override; | |||
| private: | |||
| @@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) { | |||
| opr->param() = m_param; | |||
| DynOutMallocPolicyImpl malloc_policy(handle); | |||
| auto workspace_size = opr->get_workspace_in_bytes(data->layout); | |||
| auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout); | |||
| auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr); | |||
| auto result = opr->exec( | |||
| *data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy); | |||
| @@ -205,9 +205,14 @@ struct OprProxy<CheckNonFinite> { | |||
| auto inps = tensors; | |||
| inps.pop_back(); | |||
| TensorLayoutArray inp_layouts(inps.size()); | |||
| std::transform( | |||
| inps.begin(), inps.end(), inp_layouts.begin(), | |||
| [](const TensorND& tensor) { return tensor.layout; }); | |||
| WorkspaceWrapper W( | |||
| opr->handle(), | |||
| opr->get_workspace_in_bytes(inps, tensors.back().layout)); | |||
| opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout)); | |||
| opr->exec(inps, tensors.back(), W.workspace()); | |||
| } | |||
| }; | |||
| @@ -95,7 +95,7 @@ void test_param_pack_concat( | |||
| test::WorkspaceWrapper workspace( | |||
| handle, | |||
| concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); | |||
| concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); | |||
| TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); | |||
| concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); | |||
| @@ -97,7 +97,7 @@ void test_param_pack_concat( | |||
| test::WorkspaceWrapper workspace( | |||
| handle, | |||
| concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size})); | |||
| concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size})); | |||
| TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32())); | |||
| concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace()); | |||
| @@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) { | |||
| blob = in_blob; | |||
| DeviceTensorStorage d_storage; | |||
| d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage); | |||
| h_storage = HostTensorStorage(blob->m_comp_node); | |||
| h_storage.ensure_size(blob->m_size); | |||
| h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size); | |||
| } | |||
| @@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) { | |||
| } | |||
| void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) { | |||
| if (custom_allocator) { | |||
| blob->m_storage = custom_allocator(blob->m_comp_node, size); | |||
| if (m_custom_allocator) { | |||
| blob->m_storage = m_custom_allocator(blob->m_comp_node, size); | |||
| return; | |||
| } | |||
| // try alloc | |||
| MGB_TRY { alloc_direct(blob, size); } | |||
| // if fail, try defrag, alloc again | |||
| MGB_CATCH(MemAllocError&, { | |||
| if (!try_alloc_direct(blob, size)) { | |||
| mgb_log_warn("memory allocation failed for blob; try defragmenting"); | |||
| defrag(blob->m_comp_node); | |||
| alloc_direct(blob, size); | |||
| }); | |||
| } | |||
| } | |||
| void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) { | |||
| DeviceTensorStorage storage(blob->m_comp_node); | |||
| mgb_assert(blob->m_comp_node.valid()); | |||
| DeviceTensorStorage storage(blob->m_comp_node); | |||
| storage.ensure_size(size); | |||
| blob->m_storage = storage.raw_storage(); | |||
| } | |||
| DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag( | |||
| CompNode cn, TensorLayout& layout) { | |||
| DeviceTensorND dev_tensor; | |||
| if (custom_allocator) { | |||
| DeviceTensorStorage storage(cn); | |||
| size_t sz = layout.dtype.size(layout.total_nr_elems()); | |||
| storage.reset(cn, sz, custom_allocator(cn, sz)); | |||
| dev_tensor.reset(storage, layout); | |||
| return dev_tensor; | |||
| } | |||
| MGB_TRY { dev_tensor = alloc_workspace(cn, layout); } | |||
| MGB_CATCH(MemAllocError&, { | |||
| mgb_log_warn("memory allocation failed for workspace; try defragmenting"); | |||
| defrag(cn); | |||
| dev_tensor = alloc_workspace(cn, layout); | |||
| }); | |||
| return dev_tensor; | |||
| }; | |||
| DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) { | |||
| DeviceTensorStorage storage(cn); | |||
| storage.ensure_size(layout.dtype.size(layout.total_nr_elems())); | |||
| DeviceTensorND dev_tensor; | |||
| dev_tensor.reset(storage, layout); | |||
| return dev_tensor; | |||
| } | |||
| void BlobManagerImpl::set_allocator(allocator_t allocator) { | |||
| custom_allocator = allocator; | |||
| m_custom_allocator = allocator; | |||
| } | |||
| void BlobManagerImpl::defrag(const CompNode& cn) { | |||
| BlobSetWithMux* blobs_set_ptr; | |||
| { | |||
| auto& blobs_set_ptr = ([&]() -> auto& { | |||
| MGB_LOCK_GUARD(m_mtx); | |||
| blobs_set_ptr = &m_comp2blobs_map[cn]; | |||
| } | |||
| MGB_LOCK_GUARD(blobs_set_ptr->mtx); | |||
| return m_comp2blobs_map[cn]; | |||
| })(); | |||
| MGB_LOCK_GUARD(blobs_set_ptr.mtx); | |||
| std::vector<BlobData> blob_data_arrary; | |||
| std::set<Blob::RawStorage> storage_set; | |||
| @@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) { | |||
| size_t tot_sz = 0; | |||
| // copy to HostTensorStorage, and release | |||
| for (auto i : blobs_set_ptr->blobs_set) { | |||
| for (auto i : blobs_set_ptr.blobs_set) { | |||
| // skip if blob does not have m_storage | |||
| if (!i->m_storage) | |||
| continue; | |||
| @@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager { | |||
| void alloc_with_defrag(OwnedBlob* blob, size_t size) { | |||
| mgb_assert(0, "prohibited after global variable destruction"); | |||
| }; | |||
| DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) { | |||
| mgb_assert(0, "prohibited after global variable destruction"); | |||
| }; | |||
| void register_blob(OwnedBlob* blob) { | |||
| mgb_assert(0, "prohibited after global variable destruction"); | |||
| }; | |||
| @@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager { | |||
| void defrag(const CompNode& cn) { | |||
| mgb_assert(0, "prohibited after global variable destruction"); | |||
| }; | |||
| virtual void set_allocator(allocator_t allocator) { | |||
| void set_allocator(allocator_t allocator) { | |||
| mgb_assert(0, "prohibited after global variable destruction"); | |||
| }; | |||
| }; | |||
| @@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager { | |||
| std::mutex m_mtx; | |||
| CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map; | |||
| void defrag(const CompNode& cn) override; | |||
| BlobManager::allocator_t m_custom_allocator; | |||
| void alloc_direct(OwnedBlob* blob, size_t size) override; | |||
| DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout); | |||
| BlobManager::allocator_t custom_allocator; | |||
| public: | |||
| static BlobManager* inst(); | |||
| void alloc_with_defrag(OwnedBlob* blob, size_t size) override; | |||
| DeviceTensorND alloc_workspace_with_defrag( | |||
| CompNode cn, TensorLayout& layout) override; | |||
| void register_blob(OwnedBlob* blob) override; | |||
| void unregister_blob(OwnedBlob* blob) override; | |||
| void defrag(const CompNode& cn) override; | |||
| void set_allocator(allocator_t allocator) override; | |||
| }; | |||
| @@ -1,79 +1,331 @@ | |||
| #pragma once | |||
| #include <optional> | |||
| #include <type_traits> | |||
| #include "algo_chooser.h" | |||
| #include "megbrain/comp_node.h" | |||
| #include "megbrain/comp_node_env.h" | |||
| #include "megbrain/imperative/blob_manager.h" | |||
| #include "megbrain/imperative/physical_tensor.h" | |||
| #include "megbrain/imperative/utils/helper.h" | |||
| #include "megbrain/imperative/utils/platform.h" | |||
| #include "megbrain/rdnn/management.h" | |||
| using namespace megdnn; | |||
| #include "megdnn/basic_types.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| /*! | |||
| * \brief A struct for safely calling DNN oprs | |||
| * In some cases, op may be released before the complete of the execution | |||
| * This destructor will prevent this | |||
| * \brief Helps deduce layout and dtype | |||
| */ | |||
| template <typename Opr> | |||
| struct DnnOprCaller { | |||
| CompNode cn; | |||
| DeviceTensorND dev_tensor; | |||
| Workspace workspace; | |||
| mgb::opr::intl::UniqPtrWithCN<Opr> op; | |||
| class DnnOprDeducer { | |||
| private: | |||
| Opr* m_opr; | |||
| DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {} | |||
| public: | |||
| DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); } | |||
| static mgb::opr::intl::UniqPtrWithCN<Opr> create_operator(CompNode cn) { | |||
| return mgb::opr::intl::create_megdnn_opr<Opr>(cn); | |||
| // FIXME: maybe in-place style deduction works better | |||
| template <typename... TArgs> | |||
| TensorLayout deduce_layout(TArgs&&... args) { | |||
| static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
| TensorLayout output_layout; | |||
| m_opr->deduce_layout(args..., output_layout); | |||
| return output_layout; | |||
| } | |||
| Workspace create_workspace(size_t sz) { | |||
| if (workspace.raw_ptr) { | |||
| mgb_throw(MegBrainError, "workspace should not be applicated many times"); | |||
| } | |||
| if (sz) { | |||
| TensorLayout layout({sz}, dtype::Byte()); | |||
| dev_tensor = Tensor::make(layout, cn)->dev_tensor(); | |||
| workspace = megdnn::Workspace( | |||
| dev_tensor.raw_ptr(), dev_tensor.storage().size()); | |||
| template <typename... TArgs> | |||
| TensorLayout deduce_layout_fallible(TArgs&&... args) { | |||
| static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
| TensorLayout output_layout; | |||
| bool success = (args.ndim * ...) > 0; | |||
| if (success) { | |||
| m_opr->deduce_layout(args..., output_layout); | |||
| } else { | |||
| m_opr->deduce_dtype(args.dtype..., output_layout.dtype); | |||
| } | |||
| return workspace; | |||
| return output_layout; | |||
| } | |||
| ~DnnOprCaller() { | |||
| template <size_t nr_outputs, typename... TArgs> | |||
| std::array<TensorLayout, nr_outputs> deduce_layouts(TArgs&&... args) { | |||
| static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...)); | |||
| std::array<TensorLayout, nr_outputs> layouts; | |||
| std::apply( | |||
| [&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); }, | |||
| layouts); | |||
| return layouts; | |||
| } | |||
| }; | |||
| /*! | |||
| * \brief Declare an abstract operator and initialize its param | |||
| */ | |||
| template <typename Opr> | |||
| class DnnOprStub { | |||
| private: | |||
| // TODO: make opr concrete | |||
| std::aligned_storage_t<sizeof(Opr), alignof(Opr)> m_storage; | |||
| using Param = typename Opr::Param; | |||
| private: | |||
| DnnOprStub() { new (¶m()) Param(); } | |||
| public: | |||
| DnnOprStub(const Param& param) { this->param() = param; } | |||
| // undefined behavior: Opr itself is never constructed in m_storage | |||
| Opr& opr() { return *reinterpret_cast<Opr*>(&m_storage); } | |||
| auto& param() { return opr().param(); } | |||
| auto& param() const { return opr().param(); } | |||
| ~DnnOprStub() { param().~Param(); } | |||
| }; | |||
| /*! | |||
| * \brief Deduce layout without creating a concrete opr | |||
| */ | |||
| template <typename Opr> | |||
| class DnnOprHelper : public DnnOprStub<Opr>, public DnnOprDeducer<Opr> { | |||
| private: | |||
| using Stub = DnnOprStub<Opr>; | |||
| using Deducer = DnnOprDeducer<Opr>; | |||
| public: | |||
| DnnOprHelper(const typename Opr::Param& param) | |||
| : Stub(param), Deducer(&Stub::opr()) {} | |||
| }; | |||
| // hold a concrete operator in given comp_node | |||
| template <typename Opr> | |||
| class DnnOprHolder { | |||
| private: | |||
| CompNode m_comp_node; | |||
| opr::intl::UniqPtrWithCN<Opr> m_opr = | |||
| opr::intl::create_megdnn_opr<Opr>(m_comp_node); | |||
| public: | |||
| DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {} | |||
| auto& op() { return m_opr; } | |||
| auto comp_node() { return m_comp_node; } | |||
| auto& param() { return m_opr->param(); } | |||
| auto& param() const { return m_opr->param(); } | |||
| ~DnnOprHolder() { | |||
| using DT = CompNode::DeviceType; | |||
| if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) { | |||
| CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( | |||
| [p = op.release()] { delete p; }); | |||
| if (m_comp_node.device_type() == DT::CPU && | |||
| m_comp_node != CompNode::default_cpu()) { | |||
| CompNodeEnv::from_comp_node(m_comp_node) | |||
| .cpu_env() | |||
| .dispatch([p = m_opr.release()] { delete p; }); | |||
| } | |||
| } | |||
| }; | |||
| /*! | |||
| * \brief Prevent binary bloat | |||
| */ | |||
| class DnnOprCallerBase { | |||
| protected: | |||
| static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; } | |||
| static auto get_layout(const megdnn::TensorNDArray& tensors) { | |||
| SmallVector<TensorLayout> layouts; | |||
| for (auto&& tensor : tensors) { | |||
| layouts.push_back(tensor.layout); | |||
| } | |||
| return layouts; | |||
| } | |||
| }; | |||
| template <size_t OSize> | |||
| class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy { | |||
| using Output = std::array<TensorPtr, OSize>; | |||
| /*! | |||
| * \brief A struct for safely calling DNN oprs | |||
| * | |||
| * In some cases, op may be released before the execution completes; | |||
| * this destructor prevents that. | |||
| */ | |||
| template <typename Opr> | |||
| class DnnOprCaller final : public DnnOprHolder<Opr>, | |||
| public DnnOprDeducer<Opr>, | |||
| public DnnOprCallerBase { | |||
| private: | |||
| using Holder = DnnOprHolder<Opr>; | |||
| using Deducer = DnnOprDeducer<Opr>; | |||
| using Base = DnnOprCallerBase; | |||
| std::optional<DnnTensorND> m_workspace; | |||
| std::optional<megdnn::param::ExecutionPolicy> m_policy; | |||
| CompNode m_cn; | |||
| Output m_out; | |||
| megdnn::Workspace create_workspace(size_t sz) { | |||
| mgb_assert( | |||
| !m_workspace, "workspace asked more than once by op: %s", | |||
| demangled_typename<Opr>()); | |||
| dt_byte* ptr = nullptr; | |||
| if (sz) { | |||
| TensorLayout layout({sz}, dtype::Byte()); | |||
| m_workspace.emplace( | |||
| Tensor::make(layout, Holder::comp_node())->dnn_tensor()); | |||
| ptr = reinterpret_cast<dt_byte*>(m_workspace->raw_ptr()); | |||
| } | |||
| return {ptr, sz}; | |||
| } | |||
| public: | |||
| MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {} | |||
| megdnn::TensorND alloc_output( | |||
| size_t id, DType dtype, const TensorShape& shape, | |||
| void* user_data) override { | |||
| TensorLayout m_layout(shape, dtype); | |||
| m_out[id] = Tensor::make(m_layout, m_cn); | |||
| return m_out[id]->dev_tensor().as_megdnn(); | |||
| using Param = typename Opr::Param; | |||
| DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {} | |||
| DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) { | |||
| Holder::param() = param; | |||
| } | |||
| DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy) | |||
| : DnnOprCaller(cn, param) { | |||
| m_policy.emplace(policy); | |||
| } | |||
| void* alloc_workspace(size_t sz, void* user_data) override { | |||
| return m_cn.alloc_device(sz); | |||
| /** | |||
| * \brief Convert TensorPtr args to megdnn::TensorND and call f | |||
| * | |||
| */ | |||
| template <typename TFunctor, typename... TArgs> | |||
| auto call_dnn(TFunctor&& f, TArgs&&... args) { | |||
| std::optional<SmallVector<std::shared_ptr<dt_byte>>> input_ptrs; | |||
| // recursively convert: | |||
| // 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND); | |||
| // 2. DeviceTensorND, HostTensorND to megdnn::TensorND; | |||
| // 3. SmallVector of any of the above to SmallVector<megdnn::TensorND>. | |||
| auto to_dnn = [&](auto&& arg, auto&& to_dnn) { | |||
| using T = decltype(arg); | |||
| if constexpr (std::is_convertible_v<T, TensorPtr>) { | |||
| return arg->dnn_tensor(); | |||
| } else if constexpr ( | |||
| std::is_convertible_v<T, DeviceTensorND> || | |||
| std::is_convertible_v<T, HostTensorND>) { | |||
| return arg.as_megdnn(); | |||
| } else if constexpr ( | |||
| std::is_convertible_v<T, megdnn::TensorND> || | |||
| std::is_convertible_v<T, SmallVector<megdnn::TensorND>>) { | |||
| return std::forward<T>(arg); | |||
| } else if constexpr (is_small_vector_v<std::decay_t<T>>) { | |||
| using TItem = std::decay_t<decltype(to_dnn(arg[0], to_dnn))>; | |||
| SmallVector<megdnn::TensorND> dnn_tensors; | |||
| for (auto&& tensor : arg) { | |||
| if constexpr (std::is_same_v<TItem, DnnTensorND>) { | |||
| if (!input_ptrs) { | |||
| input_ptrs.emplace(); | |||
| } | |||
| auto dnn_tensor = to_dnn(tensor, to_dnn); | |||
| input_ptrs->push_back(std::move(dnn_tensor.reference)); | |||
| dnn_tensors.push_back(std::move(dnn_tensor)); | |||
| } else if constexpr (std::is_same_v<TItem, megdnn::TensorND>) { | |||
| dnn_tensors.push_back(to_dnn(tensor, to_dnn)); | |||
| } else { | |||
| static_assert(!std::is_same_v<TItem, TItem>); | |||
| } | |||
| } | |||
| return dnn_tensors; | |||
| } else { | |||
| static_assert(!std::is_same_v<T, T>); | |||
| } | |||
| }; | |||
| return f(to_dnn(std::forward<TArgs>(args), to_dnn)...); | |||
| } | |||
| void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); } | |||
| // common execution (opr->exec(inputs..., outputs...)) | |||
| template <typename... TArgs> | |||
| void exec(TArgs&&... args) { | |||
| call_dnn( | |||
| [this](auto&&... args) { | |||
| Holder::op()->exec(std::forward<decltype(args)>(args)...); | |||
| }, | |||
| std::forward<TArgs>(args)...); | |||
| } | |||
| // execute with fastrun (algorithm auto-selection) | |||
| // (opr->exec(inputs..., outputs..., create_ws(setup_algo(...)))) | |||
| template <typename... TArgs> | |||
| void exec_fastrun(TArgs&&... args) { | |||
| call_dnn( | |||
| [&](auto&&... args) { | |||
| using FixedTensorLayouts = | |||
| typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts; | |||
| SmallVector<megdnn::TensorND> dnn_inputs = {args...}; | |||
| mgb_assert(m_policy, "policy not set"); | |||
| size_t workspace_size = setup_algo<Opr>( | |||
| FixedTensorLayouts{args.layout...}, Holder::op().get(), 0, | |||
| false, false, Holder::comp_node(), *m_policy, false, | |||
| &dnn_inputs); | |||
| Holder::op()->exec( | |||
| std::forward<decltype(args)>(args)..., | |||
| create_workspace(workspace_size)); | |||
| }, | |||
| std::forward<TArgs>(args)...); | |||
| } | |||
| // execute with fixed workspace | |||
| // (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...)))) | |||
| template <typename... TArgs> | |||
| void exec_with_ws(TArgs&&... args) { | |||
| call_dnn( | |||
| [&](auto&&... args) { | |||
| size_t workspace_size = | |||
| Holder::op()->get_workspace_in_bytes(get_layout(args)...); | |||
| Holder::op()->exec( | |||
| std::forward<decltype(args)>(args)..., | |||
| create_workspace(workspace_size)); | |||
| }, | |||
| std::forward<TArgs>(args)...); | |||
| } | |||
| TensorPtr at(size_t id) { return m_out[id]; } | |||
| // execute dynamic out opr | |||
| // (opr->exec(inputs..., outputs... create_ws(get_workspace_in_bytes(...)), alloc)) | |||
| template <size_t nr_out, typename... TArgs> | |||
| auto exec_dynout(TArgs&&... args) { | |||
| struct Alloc final : public megdnn::DynOutMallocPolicy { | |||
| CompNode comp_node; | |||
| std::array<TensorPtr, nr_out> output_tensors; | |||
| std::array<std::optional<DnnTensorND>, nr_out> output_dnn_tensors; | |||
| public: | |||
| Alloc(CompNode comp_node) : comp_node(comp_node) {} | |||
| megdnn::TensorND alloc_output( | |||
| size_t id, DType dtype, const TensorShape& shape, | |||
| void* user_data) override { | |||
| TensorLayout layout(shape, dtype); | |||
| output_tensors[id] = Tensor::make(layout, comp_node); | |||
| output_dnn_tensors[id].emplace( | |||
| output_tensors[id]->dnn_tensor()); // pin output | |||
| return *output_dnn_tensors[id]; | |||
| } | |||
| void* alloc_workspace(size_t sz, void* user_data) override { | |||
| mgb_assert(false); | |||
| } | |||
| void free_workspace(void* ptr, void* user_data) override { | |||
| mgb_assert(false); | |||
| } | |||
| } alloc{Holder::comp_node()}; | |||
| call_dnn( | |||
| [&](auto&&... args) { | |||
| size_t workspace_size = | |||
| Holder::op()->get_workspace_in_bytes(get_layout(args)...); | |||
| Holder::op()->exec( | |||
| std::forward<decltype(args)>(args)..., | |||
| create_workspace(workspace_size), &alloc); | |||
| }, | |||
| std::forward<TArgs>(args)...); | |||
| return alloc.output_tensors; | |||
| } | |||
| }; | |||
| } // namespace imperative | |||
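The rest of this diff migrates the imperative ops onto these helpers. Pieced together from the call sites below (AdaptivePooling, BatchNorm, CondTake, Convolution), the intended usage looks roughly like this sketch (the param objects and tensors are placeholders, not taken from the diff):

    // deduce an output layout without a comp_node: DnnOprHelper wraps a stub opr
    DnnOprHelper<megdnn::ConvolutionForward> helper(conv_param);
    TensorLayout out_layout = helper.deduce_layout(src_layout, filter_layout);

    // run a concrete opr: DnnOprCaller owns the opr, its workspace and the policy
    DnnOprCaller<megdnn::Pooling> pooling(cn, pooling_param, megdnn::param::ExecutionPolicy{});
    pooling.exec_fastrun(src_tensor, dst_tensor);  // TensorPtr args; fastrun sizes the workspace

    DnnOprCaller<megdnn::CondTake> cond_take(cn, cond_take_param);
    auto&& [values, indices] = cond_take.exec_dynout<2>(src_tensor, mask_tensor);  // TensorPtr outputs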
| @@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() { | |||
| void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) { | |||
| m_valid_handle.insert(reinterpret_cast<Handle>(info)); | |||
| MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name); | |||
| mgb_assert(desc.comp_node.valid(), "comp_node invalid"); | |||
| info->status = TensorInfo::Allocated; | |||
| info->desc = std::move(desc); | |||
| } | |||
| @@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) { | |||
| output_descs.push_back(i->desc); | |||
| } | |||
| } else { | |||
| // i may be null | |||
| validated = false; | |||
| } | |||
| // Here std::move is REQUIRED for removing duplicated references. | |||
| @@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { | |||
| if (in_worker) { | |||
| reserve_size(x->size()); | |||
| } | |||
| MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } | |||
| MGB_CATCH(MemAllocError&, { | |||
| if (!BlobManager::inst()->try_alloc_direct(x, x->size())) { | |||
| bool suc = false; | |||
| if (in_worker) { | |||
| while (!suc) { | |||
| if (!auto_evict(1)) { | |||
| break; | |||
| } | |||
| MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); } | |||
| MGB_CATCH(MemAllocError&, { continue; }); | |||
| suc = true; | |||
| if (BlobManager::inst()->try_alloc_direct(x, x->size())) { | |||
| suc = true; | |||
| } | |||
| } | |||
| } | |||
| if (!suc) { | |||
| @@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) { | |||
| imperative_log_profile_begin("defrag"); | |||
| BlobManager::inst()->defrag(x->comp_node()); | |||
| imperative_log_profile_end("defrag"); | |||
| BlobManager::inst()->alloc_direct(x, x->size()); | |||
| mgb_assert( | |||
| BlobManager::inst()->try_alloc_direct(x, x->size()), | |||
| "allocation failed after defrag"); | |||
| } | |||
| }); | |||
| } | |||
| set_log_level(pre_level); | |||
| } | |||
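try_alloc_direct itself does not appear in this diff; judging from the call sites above (it replaces the MGB_TRY / MGB_CATCH(MemAllocError) pattern and is used as a bool), it is presumably a thin wrapper along these lines (assumed sketch, not the actual implementation):

    // assumed helper: report allocation failure instead of letting MemAllocError escape
    bool BlobManager::try_alloc_direct(OwnedBlob* blob, size_t size) {
        MGB_TRY { alloc_direct(blob, size); return true; }
        MGB_CATCH(MemAllocError&, { return false; });
    }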
| @@ -75,13 +75,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto&& pool = static_cast<const AdaptivePooling&>(def); | |||
| auto&& pooling = def.cast_final_safe<AdaptivePooling>(); | |||
| auto&& cn = inputs[0]->comp_node(); | |||
| using TensorND = megdnn::TensorND; | |||
| auto&& src_layout = inputs[0]->layout(); | |||
| TensorLayout dst_layout = output_descs[0].layout; | |||
| auto param_format = pool.format; | |||
| TensorLayout dst_layout{inputs[0]->dtype()}; | |||
| auto param_format = pooling.format; | |||
| if (!validated) { | |||
| dst_layout.ndim = src_layout.ndim; | |||
| const dt_int32* oshp2d = nullptr; | |||
| @@ -91,7 +90,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| tshp1n = inputs[1]->layout().total_nr_elems() == 1; | |||
| oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr<dt_int32>(); | |||
| } else { | |||
| oshp2d = pool.shape.data(); | |||
| oshp2d = pooling.shape.data(); | |||
| } | |||
| if (param_format == opr::AdaptivePooling::Param::Format::NCHW) { | |||
| dst_layout[0] = src_layout[0]; | |||
| @@ -108,15 +107,17 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||
| } | |||
| dst_layout.init_contiguous_stride(); | |||
| } else { | |||
| dst_layout = output_descs[0].layout; | |||
| } | |||
| size_t IH, IW, OH, OW; | |||
| if (param_format == param::AdaptivePooling::Format::NCHW) { | |||
| if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) { | |||
| IH = src_layout[2]; | |||
| IW = src_layout[3]; | |||
| OH = dst_layout[2]; | |||
| OW = dst_layout[3]; | |||
| } else if (param_format == param::AdaptivePooling::Format::NHWC) { | |||
| } else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) { | |||
| IH = src_layout[1]; | |||
| IW = src_layout[2]; | |||
| OH = dst_layout[1]; | |||
| @@ -124,26 +125,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| } else { | |||
| mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||
| } | |||
| DnnOprCaller<megdnn::Pooling> dnn_opr(cn); | |||
| auto&& param = dnn_opr.op->param(); | |||
| param.mode = pool.mode; | |||
| param.format = pool.format; | |||
| // adaptive_pooling param to pooling | |||
| auto&& param = megdnn::Pooling::Param(); | |||
| param.mode = pooling.mode; | |||
| param.format = pooling.format; | |||
| param.pad_h = param.pad_w = 0; | |||
| param.stride_h = floor(IH / OH); | |||
| param.stride_w = floor(IW / OW); | |||
| param.stride_h = IH / OH; | |||
| param.stride_w = IW / OW; | |||
| param.window_h = IH - (OH - 1) * param.stride_h; | |||
| param.window_w = IW - (OW - 1) * param.stride_w; | |||
| TensorND src = inputs[0]->dnn_tensor(); | |||
| DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{}); | |||
| auto src = inputs[0]; | |||
| auto dst = Tensor::make(dst_layout, cn); | |||
| size_t sz = setup_algo<megdnn::Pooling>( | |||
| {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
| ::megdnn::param::ExecutionPolicy{}, false); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inputs[0], dst); | |||
| return {dst}; | |||
| } | |||
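The stride/window translation above is the usual reduction of adaptive pooling to plain pooling: take the integer stride IH / OH and let the window absorb the remainder. Worked example with illustrative numbers: IH = 7, OH = 3 gives stride_h = 7 / 3 = 2 and window_h = 7 - (3 - 1) * 2 = 3, so three windows of size 3 at stride 2 cover exactly rows 0..6.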
| @@ -145,79 +145,44 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| auto&& op_def = def.cast_final_safe<BatchNorm>(); | |||
| auto&& comp_node = inputs[0]->comp_node(); | |||
| using TensorND = megdnn::TensorND; | |||
| DnnOprCaller<megdnn::BN> dnn_opr(comp_node, op_def.param()); | |||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
| for (size_t i = 0; i < inputs.size(); ++i) { | |||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
| } | |||
| DnnOprCaller<megdnn::BN> dnn_opr(comp_node); | |||
| dnn_opr.op->param() = op_def.param(); | |||
| TensorLayout src_layout = inputs[0]->layout(); | |||
| TensorLayout scale_layout = inputs[1]->layout(); | |||
| auto src_layout = inputs[0]->layout(); | |||
| auto scale_layout = inputs[1]->layout(); | |||
| bool empty_input = src_layout.is_empty(); | |||
| size_t nr_inp = inputs.size(); | |||
| size_t sz = 0, rsz = 0; | |||
| TensorLayout r_layout({rsz}, dtype::Byte()); | |||
| if (!empty_input) { | |||
| sz = dnn_opr.op->get_workspace_in_bytes( | |||
| src_layout, src_layout, src_layout, src_layout, src_layout, src_layout, | |||
| src_layout, src_layout, src_layout); | |||
| rsz = dnn_opr.op->get_reserve_in_bytes(src_layout); | |||
| r_layout = TensorLayout({rsz}, dtype::Byte()); | |||
| } | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| auto reserve = Tensor::make(r_layout, comp_node); | |||
| // size_t ws_size = 0, reserve_size = 0; | |||
| size_t reserve_size = | |||
| empty_input ? (size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout); | |||
| // alloc memory | |||
| // alloc outputs | |||
| auto y = Tensor::make(src_layout, comp_node); | |||
| auto save_mean = Tensor::make(scale_layout, comp_node); | |||
| auto save_variance = Tensor::make(scale_layout, comp_node); | |||
| auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node); | |||
| if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) { | |||
| if (!empty_input) | |||
| dnn_opr.op->exec( | |||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
| inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(), | |||
| save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||
| dnn_wk); | |||
| if (!empty_input) { | |||
| dnn_opr.exec_with_ws( | |||
| inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean, | |||
| save_variance, reserve, y); | |||
| } | |||
| return {inputs[3], inputs[4], reserve, y}; | |||
| } else { | |||
| if (nr_inp == 5) { | |||
| auto mean = Tensor::make(scale_layout, comp_node); | |||
| auto variance = Tensor::make(scale_layout, comp_node); | |||
| megdnn::RefPtr src_ptr1( | |||
| inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset()); | |||
| megdnn::RefPtr dst_ptr1( | |||
| mean->dev_tensor().storage().get_ref_ptr(), | |||
| mean->dev_tensor().storage().offset(), false); | |||
| comp_node.peer_copy_to_ref( | |||
| comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte); | |||
| megdnn::RefPtr src_ptr2( | |||
| inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset()); | |||
| megdnn::RefPtr dst_ptr2( | |||
| variance->dev_tensor().storage().get_ref_ptr(), | |||
| variance->dev_tensor().storage().offset(), false); | |||
| comp_node.peer_copy_to_ref( | |||
| comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte); | |||
| if (!empty_input) | |||
| dnn_opr.op->exec( | |||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
| mean->dnn_tensor(), variance->dnn_tensor(), | |||
| save_mean->dnn_tensor(), save_variance->dnn_tensor(), | |||
| reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk); | |||
| // FIXME | |||
| mean->dev_tensor().copy_from(inputs[3]->dev_tensor()); | |||
| variance->dev_tensor().copy_from(inputs[4]->dev_tensor()); | |||
| if (!empty_input) { | |||
| dnn_opr.exec_with_ws( | |||
| inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||
| save_variance, reserve, y); | |||
| } | |||
| return {mean, variance, save_mean, save_variance, reserve, y}; | |||
| } | |||
| @@ -227,11 +192,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| auto variance = Tensor::make(m_layout, comp_node); | |||
| if (!empty_input) { | |||
| dnn_opr.op->exec( | |||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||
| mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(), | |||
| save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||
| dnn_wk); | |||
| dnn_opr.exec_with_ws( | |||
| inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||
| save_variance, reserve, y); | |||
| } | |||
| return {save_mean, save_variance, reserve, y}; | |||
| @@ -28,33 +28,26 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| auto&& inp = inputs[0]; | |||
| auto&& msk = inputs[1]; | |||
| SmallVector<TensorPtr> out; | |||
| mgb_assert( | |||
| inp->layout().eq_shape(msk->layout()), | |||
| "input shape does not match mask shape"); | |||
| mgb_assert( | |||
| msk->get_value().dtype().enumv() == DTypeEnum::Bool, | |||
| "mask dtype must be bool"); | |||
| MegDNNDynOutMallocImpl<2> policy{inp->comp_node()}; | |||
| if (inp->layout().is_empty()) { | |||
| // empty tensor | |||
| policy.alloc_output(0, inp->layout().dtype, {0}, nullptr); | |||
| policy.alloc_output(1, dtype::Int32(), {0}, nullptr); | |||
| return { | |||
| Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()), | |||
| Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()), | |||
| }; | |||
| } else { | |||
| DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node()); | |||
| dnn_op.op->param().val = 1; | |||
| size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout()); | |||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||
| dnn_op.op->exec( | |||
| inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(), | |||
| dnn_workspace, &policy); | |||
| // maybe we need to split CondTake | |||
| megdnn::CondTake::Param param; | |||
| param.val = 1; | |||
| DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param); | |||
| auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk); | |||
| return {out0, out1}; | |||
| } | |||
| out.push_back(policy.at(0)); | |||
| out.push_back(policy.at(1)); | |||
| return out; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| @@ -8,14 +8,7 @@ | |||
| namespace mgb { | |||
| namespace imperative { | |||
| namespace { | |||
| size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) { | |||
| mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt); | |||
| return (inp + 2 * pad - flt) / stride + 1; | |||
| } | |||
| namespace convolution { | |||
| std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | |||
| auto* node = &node_->cast_final_safe<opr::Convolution>(); | |||
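The removed helper computed the standard convolution output size, out = (inp + 2 * pad - flt) / stride + 1, with flt already dilated; the same result now comes from megdnn's deduce_layout. Worked example with illustrative numbers: inp = 224, flt = 7, stride = 2, pad = 3 gives (224 + 6 - 7) / 2 + 1 = 112.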
| @@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| inputs[0], inputs[1], conv.param(), conv.policy(), config); | |||
| } | |||
| TensorLayout do_shape_infer( | |||
| const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||
| auto&& conv = static_cast<const Convolution&>(def); | |||
| using Param = ::megdnn::param::Convolution; | |||
| auto img_ndim = src_ndim - 2; | |||
| mgb_assert( | |||
| img_ndim == 2, | |||
| "only 2D convolution is supported, and input should be 4-dim; " | |||
| "got input dim = %zu", | |||
| src_ndim); | |||
| size_t group = 1; | |||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
| if (conv.sparse == Param::Sparse::DENSE) { | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
| "bad filter ndim for dense convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| group = 1; | |||
| flt_start = 0; | |||
| } else { // Param::Sparse::GROUP | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
| "bad filter ndim for group convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| // grp, oc, ic, dims[] | |||
| group = filter[0]; | |||
| flt_start = 1; | |||
| } | |||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||
| size_t src_or_dst_c_pos = 0; | |||
| size_t src_or_dst_spatial_start = 0; | |||
| if (conv.format == Param::Format::NCHW) { | |||
| // filter should be (oc, ic, fh, fw) | |||
| flt_spatial_start = 2; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 1; | |||
| src_or_dst_c_pos = 1; | |||
| src_or_dst_spatial_start = 2; | |||
| } else { // Param::Format::NHWC | |||
| // filter should be (oc, fh, fw, ic) | |||
| flt_spatial_start = 1; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 3; | |||
| src_or_dst_c_pos = 3; | |||
| src_or_dst_spatial_start = 1; | |||
| } | |||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
| uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||
| dilation[0] = conv.dilate_h; | |||
| dilation[1] = conv.dilate_w; | |||
| stride[0] = conv.stride_h; | |||
| stride[1] = conv.stride_w; | |||
| padding[0] = conv.pad_h; | |||
| padding[1] = conv.pad_w; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| mgb_assert( | |||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
| dilation[i]); | |||
| dilated_spatial[i] = | |||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
| } | |||
| mgb_assert( | |||
| icpg * group == src[src_or_dst_c_pos], | |||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
| "hint: weight may be changed by mistake\n", | |||
| icpg * group, src[src_or_dst_c_pos]); | |||
| TensorLayout dst{src.dtype}; | |||
| dst.ndim = src_ndim; | |||
| dst[0] = src[0]; | |||
| dst[src_or_dst_c_pos] = ocpg * group; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||
| src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||
| padding[i]); | |||
| } | |||
| dst.init_contiguous_stride(); | |||
| return dst; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| SmallVector<LogicalTensorDesc> dests(1); | |||
| auto&& desc = dests[0]; | |||
| desc.comp_node = inputs[0].comp_node; | |||
| TensorLayout src = inputs[0].layout; | |||
| TensorLayout filter = inputs[1].layout; | |||
| size_t src_ndim = src.ndim; | |||
| if (src_ndim == 0 || filter.ndim == 0) { | |||
| desc.layout = TensorLayout{{}, src.dtype}; | |||
| return {dests, false}; | |||
| auto&& conv = def.cast_final_safe<Convolution>(); | |||
| DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param()); | |||
| auto&& data = inputs[0].layout; | |||
| auto&& filter = inputs[1].layout; | |||
| TensorLayout output_layout{data.dtype}; | |||
| if (data.ndim && filter.ndim) { | |||
| // deduce_layout won't override existing dtype | |||
| dnn_opr.opr().deduce_layout(data, filter, output_layout); | |||
| } | |||
| desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||
| return {dests, true}; | |||
| return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| // create megdnn opr | |||
| auto&& conv = static_cast<const Convolution&>(def); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| TensorLayout out_layout = output_descs[0].layout; | |||
| if (!validated) | |||
| out_layout = do_shape_infer( | |||
| def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||
| inputs[1]->layout()); | |||
| using TensorND = megdnn::TensorND; | |||
| SmallVector<TensorND> inp_tensornds(inputs.size() + 2); | |||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
| inp_shapes[i] = inputs[i]->layout(); | |||
| } | |||
| oup_shapes[0] = out_layout; | |||
| DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn); | |||
| auto&& param = dnn_opr.op->param(); | |||
| // Convolution::Param -> ConvBias::Param | |||
| auto conv_bias_param_from_convolution(const Convolution& conv) { | |||
| megdnn::ConvBias::Param param; | |||
| param.pad_h = conv.pad_h; | |||
| param.pad_w = conv.pad_w; | |||
| param.stride_h = conv.stride_h; | |||
| @@ -163,30 +48,37 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| param.sparse = conv.sparse; | |||
| param.compute_mode = conv.compute_mode; | |||
| param.format = conv.format; | |||
| return param; | |||
| } | |||
| // shape infer | |||
| TensorLayout empty_shp({0}, inputs[0]->dtype()); | |||
| empty_shp.ndim = 0; | |||
| auto empty_bias = Tensor::make(empty_shp, cn); | |||
| inp_tensornds[2] = empty_bias->dnn_tensor(); | |||
| inp_tensornds[3] = empty_bias->dnn_tensor(); | |||
| size_t sz = setup_algo<megdnn::ConvBiasForward>( | |||
| {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, | |||
| dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, | |||
| &inp_tensornds); | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| // create megdnn opr | |||
| auto&& conv = def.cast_final_safe<Convolution>(); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| auto&& param = conv_bias_param_from_convolution(conv); | |||
| DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy()); | |||
| megdnn::TensorND empty_bias; | |||
| empty_bias.layout.dtype = inputs[0]->dtype(); | |||
| empty_bias.layout.ndim = 0; | |||
| auto out_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| TensorLayout out_layout{inputs[0]->dtype()}; | |||
| dnn_opr.op()->deduce_layout( | |||
| inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout, | |||
| empty_bias.layout, out_layout); | |||
| return out_layout; | |||
| } | |||
| }(); | |||
| // alloc memory | |||
| auto out = Tensor::make(out_layout, cn); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| // execute | |||
| dnn_opr.op->exec( | |||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3], | |||
| out->dnn_tensor(), nullptr, dnn_wk); | |||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out); | |||
| return {out}; | |||
| } | |||
| @@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| } | |||
| } | |||
| TensorLayout convbwd_do_shape_infer( | |||
| const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff, | |||
| CompNode cn) { | |||
| auto&& bwd_conv = static_cast<const ConvolutionBackwardData&>(def); | |||
| DnnOprCaller<megdnn::ConvolutionBackwardData> caller(cn); | |||
| auto&& dnn_opr = caller.op; | |||
| using Param = ::megdnn::param::Convolution; | |||
| // using Param1 = ::megdnn::param::ConvolutionBackwardData; | |||
| auto img_ndim = diff_ndim - 2; | |||
| mgb_assert( | |||
| img_ndim == 2, | |||
| "only 2D convolution is supported, and input should be 4-dim; " | |||
| "got input dim = %zu", | |||
| diff_ndim); | |||
| size_t group = 1; | |||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
| if (bwd_conv.sparse == Param::Sparse::DENSE) { | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
| "bad filter ndim for dense convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| group = 1; | |||
| flt_start = 0; | |||
| } else { // Param::Sparse::GROUP | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
| "bad filter ndim for group convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| // grp, oc, ic, dims[] | |||
| group = filter[0]; | |||
| flt_start = 1; | |||
| } | |||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||
| size_t src_or_dst_c_pos = 0; | |||
| size_t src_or_dst_spatial_start = 0; | |||
| if (bwd_conv.format == Param::Format::NCHW) { | |||
| // filter should be (oc, ic, fh, fw) | |||
| flt_spatial_start = 2; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 1; | |||
| src_or_dst_c_pos = 1; | |||
| src_or_dst_spatial_start = 2; | |||
| } else { // Param::Format::NHWC | |||
| // filter should be (oc, fh, fw, ic) | |||
| flt_spatial_start = 1; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 3; | |||
| src_or_dst_c_pos = 3; | |||
| src_or_dst_spatial_start = 1; | |||
| } | |||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
| uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||
| dilation[0] = bwd_conv.dilate_h; | |||
| dilation[1] = bwd_conv.dilate_w; | |||
| stride[0] = bwd_conv.stride_h; | |||
| stride[1] = bwd_conv.stride_w; | |||
| padding[0] = bwd_conv.pad_h; | |||
| padding[1] = bwd_conv.pad_w; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| mgb_assert( | |||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
| dilation[i]); | |||
| dilated_spatial[i] = | |||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
| } | |||
| mgb_assert( | |||
| ocpg * group == diff[src_or_dst_c_pos], | |||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
| "hint: weight may be changed by mistake\n", | |||
| ocpg * group, diff[src_or_dst_c_pos]); | |||
| auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) { | |||
| auto i = (out - 1) * stride + filter; | |||
| mgb_assert(i > pad * 2); | |||
| return i - pad * 2; | |||
| }; | |||
| DType dst_dtype = bwd_conv.dtype; | |||
| dnn_opr->deduce_dtype(filter.dtype, diff.dtype, dst_dtype); | |||
| TensorLayout dst{dst_dtype}; | |||
| dst.ndim = diff_ndim; | |||
| dst[0] = diff[0]; | |||
| dst[src_or_dst_c_pos] = icpg * group; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| dst[i + src_or_dst_spatial_start] = | |||
| deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i], | |||
| stride[i], padding[i]); | |||
| } | |||
| dst.init_contiguous_stride(); | |||
| return dst; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| SmallVector<LogicalTensorDesc> dests(1); | |||
| auto&& desc = dests[0]; | |||
| desc.comp_node = inputs[0].comp_node; | |||
| TensorLayout filter = inputs[0].layout; | |||
| TensorLayout diff = inputs[1].layout; | |||
| size_t diff_ndim = diff.ndim; | |||
| if (diff_ndim == 0 || filter.ndim == 0) { | |||
| desc.layout = TensorLayout{{}, diff.dtype}; | |||
| return {dests, false}; | |||
| auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||
| DnnOprHelper<megdnn::ConvolutionBackwardData> dnn_opr(convbwd.param()); | |||
| // force set dtype | |||
| auto&& filter = inputs[0].layout; | |||
| auto&& diff = inputs[1].layout; | |||
| TensorLayout output_layout{convbwd.dtype}; | |||
| if (filter.ndim && diff.ndim) { | |||
| // deduce_layout won't override existing dtype | |||
| dnn_opr.opr().deduce_layout(filter, diff, output_layout); | |||
| } | |||
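| // if either input shape is still unknown, output_layout.ndim stays 0 and the | |||
| // descriptor returned below is marked as not yet validated | |||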
| desc.layout = | |||
| convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node); | |||
| return {dests, true}; | |||
| return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| // create megdnn opr | |||
| auto&& convbwd = static_cast<const ConvolutionBackwardData&>(def); | |||
| auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| TensorLayout out_layout = output_descs[0].layout; | |||
| if (!validated) | |||
| out_layout = convbwd_do_shape_infer( | |||
| def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), | |||
| cn); | |||
| DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr( | |||
| cn, convbwd.param(), convbwd.policy()); | |||
| auto out_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| TensorLayout out_layout{inputs[0]->dtype()}; | |||
| dnn_opr.op()->deduce_layout( | |||
| inputs[0]->layout(), inputs[1]->layout(), out_layout); | |||
| return out_layout; | |||
| } | |||
| }(); | |||
| auto out = Tensor::make(out_layout, cn); | |||
| using TensorND = megdnn::TensorND; | |||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
| inp_shapes[i] = inputs[i]->layout(); | |||
| } | |||
| oup_shapes[0] = out_layout; | |||
| DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(cn); | |||
| dnn_opr.op->param() = convbwd.param(); | |||
| size_t sz = setup_algo<megdnn::ConvolutionBackwardData>( | |||
| {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||
| false, cn, convbwd.policy(), false, &inp_tensornds); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| // execute | |||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||
| return {out}; | |||
| } | |||
| @@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | |||
| } | |||
| TensorLayout do_shape_infer( | |||
| const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||
| auto&& conv = static_cast<const Convolution3D&>(def); | |||
| using Param = ::megdnn::param::Convolution3D; | |||
| auto img_ndim = src_ndim - 2; | |||
| mgb_assert( | |||
| img_ndim == 3, | |||
| "only 3D convolution is supported, and input should be 5-dim; " | |||
| "got input dim = %zu", | |||
| src_ndim); | |||
| size_t group = 1; | |||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
| if (conv.sparse == Param::Sparse::DENSE) { | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||
| "bad filter ndim for dense convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| group = 1; | |||
| flt_start = 0; | |||
| } else { // Param::Sparse::GROUP | |||
| mgb_assert( | |||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||
| "bad filter ndim for group convolution: " | |||
| "spatial_ndim=%zu filter_ndim=%zu", | |||
| img_ndim, filter.ndim); | |||
| // grp, oc, ic, dims[] | |||
| group = filter[0]; | |||
| flt_start = 1; | |||
| } | |||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||
| size_t src_or_dst_c_pos = 0; | |||
| size_t src_or_dst_spatial_start = 0; | |||
| if (conv.format == Param::Format::NCDHW) { | |||
| // filter should be (oc, ic, fd, fh, fw) | |||
| flt_spatial_start = 2; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 1; | |||
| src_or_dst_c_pos = 1; | |||
| src_or_dst_spatial_start = 2; | |||
| } else { // Param::Format::NDHWC | |||
| // filter should be (oc, fd, fh, fw, ic) | |||
| flt_spatial_start = 1; | |||
| ocpg_pos = 0; | |||
| icpg_pos = 4; | |||
| src_or_dst_c_pos = 4; | |||
| src_or_dst_spatial_start = 1; | |||
| } | |||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||
| uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3]; | |||
| dilation[0] = conv.dilate_d; | |||
| dilation[1] = conv.dilate_h; | |||
| dilation[2] = conv.dilate_w; | |||
| stride[0] = conv.stride_d; | |||
| stride[1] = conv.stride_h; | |||
| stride[2] = conv.stride_w; | |||
| padding[0] = conv.pad_d; | |||
| padding[1] = conv.pad_h; | |||
| padding[2] = conv.pad_w; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| mgb_assert( | |||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||
| dilation[i]); | |||
| dilated_spatial[i] = | |||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||
| } | |||
| mgb_assert( | |||
| icpg * group == src[src_or_dst_c_pos], | |||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||
| "hint: weight may be changed by mistake\n", | |||
| icpg * group, src[src_or_dst_c_pos]); | |||
| TensorLayout dst{src.dtype}; | |||
| dst.ndim = src_ndim; | |||
| dst[0] = src[0]; | |||
| dst[src_or_dst_c_pos] = ocpg * group; | |||
| for (size_t i = 0; i < img_ndim; ++i) { | |||
| dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||
| src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||
| padding[i]); | |||
| } | |||
| dst.init_contiguous_stride(); | |||
| return dst; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| SmallVector<LogicalTensorDesc> dests(1); | |||
| auto&& desc = dests[0]; | |||
| desc.comp_node = inputs[0].comp_node; | |||
| auto&& conv = def.cast_final_safe<Convolution3D>(); | |||
| TensorLayout src = inputs[0].layout; | |||
| TensorLayout filter = inputs[1].layout; | |||
| size_t src_ndim = src.ndim; | |||
| if (src_ndim == 0 || filter.ndim == 0) { | |||
| desc.layout = TensorLayout{{}, src.dtype}; | |||
| return {dests, false}; | |||
| if (src.ndim == 0 || filter.ndim == 0) { | |||
| return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false}; | |||
| } | |||
| desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||
| return {dests, true}; | |||
| DnnOprHelper<megdnn::Convolution3DForward> dnn_opr(conv.param()); | |||
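| // DnnOprHelper appears to wrap a megdnn operator configured only with the param, used | |||
| // purely for layout/dtype deduction (no comp_node binding, no execution) | |||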
| auto output = dnn_opr.deduce_layout(src, filter); | |||
| return {{{output, inputs[0].comp_node}}, false}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| // create megdnn opr | |||
| auto&& conv = static_cast<const Convolution3D&>(def); | |||
| TensorLayout out_layout = output_descs[0].layout; | |||
| if (!validated) | |||
| out_layout = do_shape_infer( | |||
| def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||
| inputs[1]->layout()); | |||
| using TensorND = megdnn::TensorND; | |||
| auto&& conv = def.cast_final_safe<Convolution3D>(); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||
| inp_shapes[i] = inputs[i]->layout(); | |||
| } | |||
| oup_shapes[0] = out_layout; | |||
| DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn); | |||
| dnn_opr.op->param() = conv.param(); | |||
| // shape infer | |||
| size_t sz = setup_algo<megdnn::Convolution3D>( | |||
| {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||
| false, cn, conv.policy(), false, &inp_tensornds); | |||
| DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn, conv.param(), conv.policy()); | |||
| auto out_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout()); | |||
| } | |||
| }(); | |||
| // alloc memory | |||
| auto out = Tensor::make(out_layout, cn); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| // execute | |||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||
| return {out}; | |||
| } | |||
| @@ -579,51 +244,38 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| inputs.size() == 2, | |||
| "inputs num of conv_transpose3d should be 2 but you give %zu", | |||
| inputs.size()); | |||
| auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||
| auto&& weight = inputs[0]; | |||
| auto&& diff = inputs[1]; | |||
| auto& cn = weight.comp_node; | |||
| if (weight.layout.ndim == 0 || diff.layout.ndim == 0) { | |||
| return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false}; | |||
| if (!(weight.layout.ndim && diff.layout.ndim)) { | |||
| return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false}; | |||
| } | |||
| TensorLayout oup_layout; | |||
| megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||
| weight.layout, diff.layout, op_def.param(), oup_layout); | |||
| return {{{oup_layout, cn, {}}}, true}; | |||
| DnnOprHelper<megdnn::Convolution3DBackwardData> dnn_opr(op_def.param()); | |||
| auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout); | |||
| return {{{oup_layout, weight.comp_node}}, true}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||
| auto&& conv = def.cast_final_safe<Convolution3DBackwardData>(); | |||
| auto cn = inputs[0]->comp_node(); | |||
| auto&& wlayout = inputs[0]->layout(); | |||
| auto&& dlayout = inputs[1]->layout(); | |||
| DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn); | |||
| auto&& dnn_opr = caller.op; | |||
| dnn_opr->param() = op_def.param(); | |||
| DnnOprCaller<megdnn::Convolution3DBackwardData> dnn_op( | |||
| cn, conv.param(), conv.policy()); | |||
| TensorLayout& oup_layout = output_descs[0].layout; | |||
| if (!validated) { | |||
| megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||
| wlayout, dlayout, op_def.param(), oup_layout); | |||
| } | |||
| auto oup_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| return dnn_op.deduce_layout(wlayout, dlayout); | |||
| } | |||
| }(); | |||
| auto oup = Tensor::make(oup_layout, cn); | |||
| SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||
| inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||
| inp_tensornds[1] = inputs[1]->dnn_tensor(); | |||
| size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>( | |||
| {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||
| op_def.policy(), false, &inp_tensornds); | |||
| auto dnn_wk = caller.create_workspace(wk_size); | |||
| dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk); | |||
| dnn_op.exec_fastrun(inputs[0], inputs[1], oup); | |||
| return {oup}; | |||
| } | |||
| @@ -94,52 +94,44 @@ void apply_on_device_tensornd( | |||
| mgb_assert( | |||
| inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | |||
| trait.name, trait.arity, inputs.size()); | |||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node()); | |||
| opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op); | |||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node(), {op_def.mode}); | |||
| opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op()); | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto comp_node = inputs[0]->comp_node(); | |||
| auto dtype = inputs[0]->dtype(); | |||
| using Mode = Elemwise::Mode; | |||
| using TensorND = megdnn::TensorND; | |||
| auto&& op_def = def.cast_final_safe<Elemwise>(); | |||
| SmallVector<TensorND> inp_tensornds; | |||
| TensorShapeArray inp_shapes(inputs.size()); | |||
| inp_tensornds.reserve(inputs.size()); | |||
| TensorLayout layout{inputs[0]->layout().dtype}; | |||
| bool is_empty = false; | |||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||
| if (inputs[i]->layout().is_empty()) { | |||
| is_empty = true; | |||
| } | |||
| inp_tensornds.push_back(inputs[i]->dnn_tensor()); | |||
| inp_shapes[i] = inputs[i]->layout(); | |||
| auto mode = op_def.mode; | |||
| TensorShapeArray input_shapes; | |||
| input_shapes.reserve(inputs.size()); | |||
| for (auto&& input : inputs) { | |||
| input_shapes.push_back(input->shape()); | |||
| } | |||
| megdnn::Elemwise::deduce_shape(inp_shapes, layout); | |||
| layout.init_contiguous_stride(); | |||
| auto out = Tensor::make(layout, comp_node); | |||
| if (is_empty) { | |||
| return {out}; | |||
| // deduce_shape is static and fast | |||
| TensorLayout output_layout{dtype}; | |||
| // TODO: deduce_layout directly | |||
| megdnn::Elemwise::deduce_shape(input_shapes, output_layout); | |||
| output_layout.init_contiguous_stride(); | |||
| auto output = Tensor::make(output_layout, comp_node); | |||
| if (output_layout.is_empty()) { | |||
| return {output}; | |||
| } | |||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node); | |||
| dnn_opr.op->param() = op_def.param(); | |||
| if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 || | |||
| dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 || | |||
| (inp_tensornds.size() && | |||
| inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) { | |||
| opr::Elemwise::perform_dnn( | |||
| comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op); | |||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node, op_def.param()); | |||
| if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 || | |||
| dtype.category() == DTypeCategory::QUANTIZED) { | |||
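| // FUSE_MUL_ADD3/4 and quantized dtypes still go through opr::Elemwise::perform_dnn; | |||
| // call_dnn seems to convert the tensor arguments to megdnn::TensorNDs for the callback | |||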
| dnn_opr.call_dnn( | |||
| [&](auto&& inputs, auto&& output) { | |||
| opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op()); | |||
| }, | |||
| inputs, output); | |||
| } else { | |||
| dnn_opr.op->exec(inp_tensornds, out->dnn_tensor()); | |||
| dnn_opr.exec(inputs, output); | |||
| } | |||
| return {out}; | |||
| return {output}; | |||
| } | |||
| MGB_DEFINE_OPR_CLASS( | |||
| @@ -179,7 +171,7 @@ protected: | |||
| return ret; | |||
| } | |||
| void create_megdnn_opr() override { | |||
| auto opr = DnnOprCaller<megdnn::Elemwise>::create_operator(comp_node()); | |||
| auto opr = mgb::opr::intl::create_megdnn_opr<megdnn::Elemwise>(comp_node()); | |||
| opr->param().mode = m_param.mode; | |||
| set_megdnn_opr(std::move(opr)); | |||
| } | |||
| @@ -243,22 +235,19 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor( | |||
| "This inplace modification may change the elements of other tensors. " | |||
| "Fallback to non-inplace update."); | |||
| DeviceTensorStorage storage; | |||
| storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage()); | |||
| storage = storage.sub(dest->offset()); | |||
| DeviceTensorND dv; | |||
| dv.reset(storage, dest->layout()); | |||
| DeviceTensorND dv_new; | |||
| dv_new.copy_from(dv); | |||
| dest = Tensor::make(dv_new); | |||
| auto dest_layout = inputs[0]->layout(); | |||
| dest_layout.init_contiguous_stride(); | |||
| auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node()); | |||
| new_dest->dev_tensor().copy_from(dest->dev_tensor()); | |||
| dest = new_dest; | |||
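| // dest now owns private contiguous storage, so the in-place AddUpdate below cannot | |||
| // disturb other tensors sharing the original blob | |||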
| } | |||
| auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | |||
| return *tensor->get_value().ptr<float>(); | |||
| }; | |||
| DnnOprCaller<megdnn::AddUpdate> caller{dest->comp_node()}; | |||
| caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)}; | |||
| caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn()); | |||
| DnnOprCaller<megdnn::AddUpdate> caller{ | |||
| dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}}; | |||
| caller.exec(dest, delta); | |||
| // FIXME: inplace update host value | |||
| return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | |||
| } | |||
| @@ -67,10 +67,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| auto&& op = def.cast_final_safe<IndexingOneHot>(); | |||
| auto&& inp = inputs[0]; | |||
| auto&& index = inputs[1]; | |||
| TensorLayout layout = inp->layout(); | |||
| TensorLayout index_layout = index->layout(); | |||
| DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node()); | |||
| auto&& indexing_one_hot_param = dnn_op.op->param(); | |||
| auto&& layout = inp->layout(); | |||
| auto&& index_layout = index->layout(); | |||
| int real_axis = static_cast<int>(op.axis); | |||
| if (real_axis < 0) { | |||
| real_axis += static_cast<int>(layout.ndim); | |||
| @@ -79,16 +77,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| 0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | |||
| "Dimension out of range (expected to be in range of [%d, %d], but got %d)", | |||
| 0, static_cast<int>(layout.ndim) - 1, op.axis); | |||
| indexing_one_hot_param = real_axis; | |||
| TensorLayout tlayout; | |||
| dnn_op.op->deduce_layout(layout, index_layout, tlayout); | |||
| TensorPtr out = Tensor::make(tlayout, inp->comp_node()); | |||
| megdnn::TensorND in = inp->dnn_tensor(); | |||
| megdnn::TensorND ind = index->dnn_tensor(); | |||
| size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||
| dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace); | |||
| DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis); | |||
| auto tlayout = dnn_op.deduce_layout(layout, index_layout); | |||
| auto out = Tensor::make(tlayout, inp->comp_node()); | |||
| dnn_op.exec_with_ws(inp, index, out); | |||
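| // exec_with_ws evidently bundles the removed get_workspace_in_bytes/create_workspace/ | |||
| // exec sequence into one call | |||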
| return {out}; | |||
| } | |||
| @@ -105,15 +97,14 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | |||
| mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | |||
| auto comp_node = input_descs[0].comp_node; | |||
| TensorLayout src = input_descs[0].layout, index = input_descs[1].layout; | |||
| auto&& src = input_descs[0].layout; | |||
| auto&& index = input_descs[1].layout; | |||
| mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | |||
| if (!src.ndim) { | |||
| return {{{{{}, src.dtype}, comp_node}}, false}; | |||
| } | |||
| mgb_assert(src.is_contiguous(), "src should be contiguous"); | |||
| return {{input_descs[0]}, true}; | |||
| return {{{src, comp_node}}, true}; | |||
| } | |||
| auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| @@ -136,25 +127,15 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| auto&& index = inputs[1]; | |||
| auto&& sub = inputs[2]; | |||
| TensorLayout layout = inp->layout(); | |||
| TensorLayout index_layout = index->layout(); | |||
| TensorLayout tlayout = sub->layout(); | |||
| mgb_assert(layout.is_contiguous()); | |||
| DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node()); | |||
| auto&& indexing_one_hot_param = dnn_op.op->param(); | |||
| int real_axis = static_cast<int>(op.axis); | |||
| if (real_axis < 0) { | |||
| real_axis += static_cast<int>(layout.ndim); | |||
| } | |||
| indexing_one_hot_param = real_axis; | |||
| DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node(), real_axis); | |||
| TensorPtr out = Tensor::make(layout, inp->comp_node()); | |||
| out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | |||
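| // the output starts as a copy of the input and is then updated in place along | |||
| // real_axis by the set-one-hot kernel | |||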
| megdnn::TensorND in = inp->dnn_tensor(); | |||
| megdnn::TensorND ind = index->dnn_tensor(); | |||
| megdnn::TensorND su = sub->dnn_tensor(); | |||
| size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||
| dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace); | |||
| dnn_op.exec_with_ws(out, index, sub); | |||
| return {out}; | |||
| } | |||
| @@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( | |||
| TensorPtr megray_recv_tensor( | |||
| std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | |||
| CompNode cn, uint32_t rank_from) { | |||
| DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout); | |||
| auto out = Tensor::make(layout, cn); | |||
| auto dnn_out = out->dnn_tensor(); | |||
| auto megray_ctx = mgb::opr::get_megray_context(cn); | |||
| size_t data_size = layout.total_nr_elems(); | |||
| auto status = megray_comm->recv( | |||
| out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||
| dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||
| rank_from, megray_ctx); | |||
| mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | |||
| return Tensor::make(out); | |||
| return out; | |||
| } | |||
| void megray_send_tensor( | |||
| @@ -105,9 +106,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor_remote_send( | |||
| mgb_assert(megray_comm != nullptr); | |||
| megray_send_tensor(megray_comm, inputs[0], op.rank_to); | |||
| TensorLayout layout({0}, inputs[0]->dtype()); | |||
| DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag( | |||
| inputs[0]->comp_node(), layout); | |||
| return {Tensor::make(out)}; | |||
| return {Tensor::make(layout, inputs[0]->comp_node())}; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | |||
| @@ -21,14 +21,17 @@ SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | |||
| mgb_assert(input_descs.size() == 4, "LAMBUpdate expects 4 inputs"); | |||
| auto comp_node = input_descs[0].comp_node; | |||
| auto comp_node1 = input_descs[1].comp_node; | |||
| auto comp_node2 = input_descs[2].comp_node; | |||
| TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout, | |||
| lamb_param = input_descs[2].layout, grad = input_descs[3].layout; | |||
| TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1; | |||
| auto&& m_t_1 = input_descs[0].layout; | |||
| auto&& v_t_1 = input_descs[1].layout; | |||
| auto&& lamb_param = input_descs[2].layout; | |||
| auto&& grad = input_descs[3].layout; | |||
| MGB_MARK_USED_VAR(grad); | |||
| auto&& new_param = lamb_param; | |||
| auto&& m_t = m_t_1; | |||
| auto&& v_t = v_t_1; | |||
| return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | |||
| } | |||
| @@ -46,23 +49,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| TensorLayout lamb_param_layout{lamb_param->layout()}; | |||
| auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | |||
| auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | |||
| auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | |||
| DnnOprCaller<megdnn::LAMBUpdate> caller{lamb_param->comp_node()}; | |||
| size_t sz = caller.op->get_workspace_in_bytes( | |||
| m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(), | |||
| m_t->layout(), v_t->layout(), new_param->layout()); | |||
| auto dnn_workspace = caller.create_workspace(sz); | |||
| caller.op->param() = op.param(); | |||
| caller.op->exec( | |||
| m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(), | |||
| lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(), | |||
| m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(), | |||
| dnn_workspace); | |||
| DnnOprCaller<megdnn::LAMBUpdate> dnn_opr{lamb_param->comp_node(), op.param()}; | |||
| dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param); | |||
| return {m_t, v_t, new_param}; | |||
| } | |||
| @@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| auto&& op_def = def.cast_final_safe<LayerNorm>(); | |||
| auto&& layer_norm = def.cast_final_safe<LayerNorm>(); | |||
| size_t nr_inp = inputs.size(); | |||
| auto p = op_def.param(); | |||
| auto affine = layer_norm.affine; | |||
| mgb_assert( | |||
| (nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine), | |||
| (nr_inp == 3 && affine) || (nr_inp == 1 && !affine), | |||
| "num of inputs of pooling should be 1 or 3 but you give %zu", | |||
| inputs.size()); | |||
| @@ -47,9 +47,9 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| false}; | |||
| } | |||
| TensorLayout oup_layout, mean_layout, rstd_layout; | |||
| megdnn::LayerNorm::deduce_layout_fwd_impl( | |||
| inp.layout, p, oup_layout, mean_layout, rstd_layout); | |||
| DnnOprHelper<megdnn::LayerNorm> dnn_opr(layer_norm.param()); | |||
| auto&& [oup_layout, mean_layout, rstd_layout] = | |||
| dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{}); | |||
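| // deduce_layouts<3> yields the three output layouts (normalized output, mean, rstd) at | |||
| // once; the empty TensorLayout{} arguments stand for the optional affine weight/bias, | |||
| // which do not affect the output shapes | |||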
| return {{{oup_layout, inp_cn, {}}, | |||
| {mean_layout, inp_cn, {}}, | |||
| {rstd_layout, inp_cn, {}}}, | |||
| @@ -69,32 +69,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| inputs.size()); | |||
| auto cn = inputs[0]->comp_node(); | |||
| DnnOprCaller<megdnn::LayerNorm> caller(cn); | |||
| auto&& dnn_opr = caller.op; | |||
| dnn_opr->param() = p; | |||
| DnnOprCaller<megdnn::LayerNorm> caller(cn, op_def.param()); | |||
| TensorLayout oup_layout, mean_layout, rstd_layout; | |||
| megdnn::LayerNorm::deduce_layout_fwd_impl( | |||
| inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout); | |||
| auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>( | |||
| inputs[0]->layout(), TensorLayout{}, TensorLayout{}); | |||
| auto out = Tensor::make(oup_layout, cn); | |||
| auto mean = Tensor::make(mean_layout, cn); | |||
| auto rstd = Tensor::make(rstd_layout, cn); | |||
| auto wk_size = caller.op->get_workspace_in_bytes( | |||
| inputs[0]->dnn_tensor().layout, | |||
| p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(), | |||
| p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout, | |||
| mean_layout, rstd_layout); | |||
| auto dnn_wk = caller.create_workspace(wk_size); | |||
| caller.op->exec( | |||
| inputs[0]->dnn_tensor(), | |||
| p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(), | |||
| p.affine ? inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(), | |||
| mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk); | |||
| if (p.affine) { | |||
| caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd); | |||
| } else { | |||
| megdnn::TensorND empty_dnn; | |||
| caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd); | |||
| } | |||
| return {out, mean, rstd}; | |||
| } | |||
| @@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm) | |||
| .fallback(); | |||
| } // namespace layer_norm | |||
| } // namespace mgb::imperative | |||
| } // namespace mgb::imperative | |||
| @@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| auto dim1 = matmul.dimA, dim2 = matmul.dimB; | |||
| auto cn = inputs[0]->comp_node(); | |||
| using Desc = opr::AxisAddRemove::AxisDesc; | |||
| using IndexDesc = opr::Subtensor::IndexDesc; | |||
| OperatorNodeConfig config{matmul.make_name(), cn}; | |||
| @@ -104,9 +103,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| dim1 = dim2 = 2; | |||
| } | |||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||
| dnn_opr.op->param() = matmul.param(); | |||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||
| dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| if (dim1 == 0 || dim2 == 0) { | |||
| return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | |||
| @@ -143,8 +141,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||
| TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | |||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn); | |||
| dnn_opr.op->param() = matmul.param(); | |||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||
| if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | |||
| for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | |||
| @@ -160,7 +157,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| } | |||
| DType dst_dtype; | |||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| // only matters when layout1 has dim 2 | |||
| if (matmul.transposeA) | |||
| @@ -229,13 +226,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| inp_tensornds[0].layout = layout_a; | |||
| inp_tensornds[1].layout = layout_b; | |||
| } | |||
| size_t sz = setup_algo<megdnn::MatrixMul>( | |||
| {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
| matmul.policy(), false, &inp_tensornds); | |||
| auto out = Tensor::make(dst_layout, cn); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||
| return {out->sub(0, real_dst_layout)}; | |||
| } | |||
| @@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| auto dim1 = matmul.dimA, dim2 = matmul.dimB; | |||
| auto cn = inputs[0]->comp_node(); | |||
| using Desc = opr::AxisAddRemove::AxisDesc; | |||
| using IndexDesc = opr::Subtensor::IndexDesc; | |||
| OperatorNodeConfig config{matmul.make_name(), cn}; | |||
| @@ -343,9 +334,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| DType dst_dtype; | |||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||
| dnn_opr.op->param() = matmul.param(); | |||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||
| dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| if (dim1 == 0 || dim2 == 0) { | |||
| return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | |||
| @@ -386,10 +376,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | |||
| size_t dim1 = layout1.ndim, dim2 = layout2.ndim; | |||
| DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn); | |||
| dnn_opr.op->param() = matmul.param(); | |||
| DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||
| DType dst_dtype; | |||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||
| TensorShape tshp, batch_shp; | |||
| size_t j = 0; | |||
| @@ -473,14 +462,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| inp_tensornds[1] = inp2->dnn_tensor(); | |||
| inp_tensornds[1].layout = layout2; | |||
| size_t sz = setup_algo<megdnn::BatchedMatrixMul>( | |||
| {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||
| matmul.policy(), false, &inp_tensornds); | |||
| auto out = Tensor::make(dst_layout, cn); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||
| shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; | |||
| shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; | |||
| @@ -533,7 +517,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| TensorLayout oup_layout{inputs[0]->dtype()}; | |||
| auto inp1_tensor = inputs[0]->dnn_tensor(); | |||
| auto inp2_tensor = inputs[1]->dnn_tensor(); | |||
| dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout); | |||
| oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout); | |||
| if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) { | |||
| auto out = Tensor::make(oup_layout, comp_node); | |||
| @@ -543,14 +527,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| return {out}; | |||
| } | |||
| auto sz = dnn_opr.op->get_workspace_in_bytes( | |||
| inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout); | |||
| auto out = Tensor::make(oup_layout, comp_node); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out); | |||
| return {out}; | |||
| } | |||
| @@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| size_t size = inputs.size(); | |||
| auto&& op = def.cast_final_safe<CheckNonFinite>(); | |||
| SmallVector<TensorPtr> outputs(size + 1); | |||
| outputs[size] = Tensor::make( | |||
| TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node()); | |||
| auto dest = outputs[size]; | |||
| auto cn = dest->comp_node(); | |||
| DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(cn); | |||
| SmallVector<megdnn::TensorND> srcs(size); | |||
| // copy inputs into writable outputs for the in-place dnn kernel | |||
| for (size_t i = 0; i < size; ++i) { | |||
| outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node()); | |||
| outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor()); | |||
| srcs[i] = outputs[i]->dev_tensor().as_megdnn(); | |||
| auto comp_node = inputs[0]->comp_node(); | |||
| auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node); | |||
| SmallVector<TensorPtr> outputs; | |||
| outputs.reserve(inputs.size() + 1); | |||
| for (auto&& input : inputs) { | |||
| outputs.push_back(Tensor::make(input->layout(), comp_node)); | |||
| outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor()); | |||
| } | |||
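| // the kernel modifies its sources in place (presumably scaling by op.scale), hence the | |||
| // writable copies above; dest is a single Int32 flag recording the non-finite check | |||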
| megdnn::CheckNonFinite::Param param({op.scale}); | |||
| dnn_opr.op->param() = param; | |||
| size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, dest->layout()); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk); | |||
| DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(comp_node, {op.scale}); | |||
| dnn_opr.exec_with_ws(outputs, dest); | |||
| outputs.push_back(dest); | |||
| return outputs; | |||
| } | |||
| @@ -45,13 +36,15 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| size_t size = inputs.size(); | |||
| SmallVector<LogicalTensorDesc> dests(size + 1); | |||
| bool validated = true; | |||
| for (size_t i = 0; i < size; ++i) { | |||
| dests[i].comp_node = inputs[i].comp_node; | |||
| dests[i].layout = inputs[i].layout; | |||
| validated &= bool(dests[i].layout.ndim); | |||
| } | |||
| dests[size].comp_node = inputs[0].comp_node; | |||
| dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32()); | |||
| return {dests, true}; | |||
| dests[size].layout = TensorLayout({1}, dtype::Int32()); | |||
| return {dests, validated}; | |||
| } | |||
| OP_TRAIT_REG(CheckNonFinite, CheckNonFinite) | |||
| @@ -27,40 +27,31 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto comp_node = inputs[0]->comp_node(); | |||
| auto&& op_def = def.cast_final_safe<Padding>(); | |||
| DnnOprCaller<megdnn::Padding> dnn_op(comp_node); | |||
| dnn_op.op->param() = op_def.param(); | |||
| TensorLayout dst = output_descs[0].layout; | |||
| if (!validated) { | |||
| megdnn::Padding::deduce_layout_impl( | |||
| inputs[0]->dnn_tensor().layout, dst, op_def.param()); | |||
| } | |||
| DeviceTensorND out = | |||
| BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst); | |||
| dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn()); | |||
| return {Tensor::make(out)}; | |||
| DnnOprCaller<megdnn::Padding> dnn_op(comp_node, op_def.param()); | |||
| auto dst = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| return dnn_op.deduce_layout(inputs[0]->layout()); | |||
| } | |||
| }(); | |||
| auto out = Tensor::make(dst, comp_node); | |||
| dnn_op.exec(inputs[0], out); | |||
| return {out}; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| auto&& op_def = def.cast_final_safe<Padding>(); | |||
| size_t nr_inp = inputs.size(); | |||
| auto p = op_def.param(); | |||
| auto&& inp = inputs[0]; | |||
| auto& inp_cn = inp.comp_node; | |||
| if (inp.layout.ndim == 0) { | |||
| return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||
| return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false}; | |||
| } | |||
| TensorLayout oup_layout; | |||
| megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p); | |||
| return {{{oup_layout, inp_cn, {}}}, true}; | |||
| DnnOprHelper<megdnn::Padding> dnn_op(op_def.param()); | |||
| auto oup_layout = dnn_op.deduce_layout(inp.layout); | |||
| return {{{oup_layout, inp.comp_node}}, true}; | |||
| } | |||
| OP_TRAIT_REG(Padding, Padding, opr::Padding) | |||
| @@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding) | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -25,19 +25,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| mgb_assert( | |||
| inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | |||
| inputs.size()); | |||
| auto&& op_def = def.cast_final_safe<Pooling>(); | |||
| auto&& inp = inputs[0]; | |||
| auto& inp_cn = inp.comp_node; | |||
| if (inp.layout.ndim == 0) { | |||
| return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||
| if (!inputs[0].layout.ndim) { | |||
| return {{{inputs[0].layout, inputs[0].comp_node}}, false}; | |||
| } | |||
| TensorLayout oup_layout; | |||
| megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout); | |||
| return {{{oup_layout, inp_cn, {}}}, true}; | |||
| DnnOprHelper<megdnn::Pooling> dnn_opr(op_def.param()); | |||
| auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout); | |||
| return {{{oup_layout, inputs[0].comp_node}}, true}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| @@ -47,30 +41,18 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | |||
| inputs.size()); | |||
| auto&& op_def = def.cast_final_safe<Pooling>(); | |||
| auto&& pooling = def.cast_final_safe<Pooling>(); | |||
| auto cn = inputs[0]->comp_node(); | |||
| DnnOprCaller<megdnn::Pooling> caller(cn); | |||
| auto&& dnn_opr = caller.op; | |||
| dnn_opr->param() = op_def.param(); | |||
| SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||
| inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||
| TensorLayout& oup_layout = output_descs[0].layout; | |||
| if (!validated) { | |||
| megdnn::Pooling::deduce_layout_impl( | |||
| inp_tensornds[0].layout, op_def.param(), oup_layout); | |||
| } | |||
| size_t wk_size = setup_algo<megdnn::Pooling>( | |||
| {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||
| op_def.policy(), false, &inp_tensornds); | |||
| DnnOprCaller<megdnn::Pooling> dnn_opr(cn, pooling.param(), pooling.policy()); | |||
| auto oup_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| return dnn_opr.deduce_layout(inputs[0]->layout()); | |||
| } | |||
| }(); | |||
| auto out = Tensor::make(oup_layout, cn); | |||
| auto dnn_wk = caller.create_workspace(wk_size); | |||
| caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk); | |||
| dnn_opr.exec_fastrun(inputs[0], out); | |||
| return {out}; | |||
| } | |||
| @@ -18,33 +18,31 @@ namespace reduce { | |||
| auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| auto&& reduce = static_cast<const Reduce&>(def); | |||
| auto comp_node = inputs[0]->comp_node(); | |||
| OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()}; | |||
| auto name = reduce.make_name(); | |||
| if (inputs.size() > 1) { | |||
| return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config); | |||
| } | |||
| using Param = megdnn::param::Reduce; | |||
| auto param = reduce.param(); | |||
| if (param.axis < 0) { | |||
| param.axis = inputs[0]->shape().ndim + param.axis; | |||
| auto axis = param.axis; | |||
| auto keepdim = reduce.keepdim; | |||
| if (inputs.size() == 2) { | |||
| return opr::Reduce::make(inputs[0], param, inputs[1], {name}); | |||
| } | |||
| mgb_assert(inputs.size() == 1); | |||
| SymbolVar target_shape = (cg::VarNode*)nullptr; | |||
| if (param.axis == INT_MAX) { | |||
| DTypeScalar vi{1}; | |||
| // auto graph = ComputingGraph::make(); | |||
| if (axis == INT_MAX) { | |||
| // keepdim could be ignored when ndim == 1 | |||
| auto graph = inputs[0]->owner_graph(); | |||
| target_shape = opr::ImmutableTensor::make(*graph, vi, config); | |||
| auto scalar_shape = | |||
| opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node}); | |||
| return opr::Reduce::make(inputs[0], param, scalar_shape, {name}); | |||
| } | |||
| auto res = opr::Reduce::make(inputs[0], param, target_shape, config); | |||
| if (!reduce.keepdim && param.axis != INT_MAX) { | |||
| // mgb::opr::Reduce supports negative axis | |||
| auto res = opr::Reduce::make(inputs[0], param, {}, {name}); | |||
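| // megdnn Reduce keeps the reduced axis as a size-1 dim, so when keepdim is false an | |||
| // AxisAddRemove node is appended below to drop that axis | |||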
| if (!keepdim) { | |||
| using Desc = opr::AxisAddRemove::AxisDesc; | |||
| std::vector<Desc> remove_param; | |||
| remove_param.push_back(Desc::make_remove(param.axis)); | |||
| OperatorNodeConfig remove_config{ | |||
| def.make_name(), comp_node, inputs[0]->dtype()}; | |||
| return opr::AxisAddRemove::make(res, remove_param, remove_config); | |||
| std::vector<Desc> remove_axis_param; | |||
| remove_axis_param.push_back(Desc::make_remove(axis)); | |||
| res = opr::AxisAddRemove::make(res, remove_axis_param, {name}); | |||
| } | |||
| return res; | |||
| } | |||
| @@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector<TensorPtr> inputs) { | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| // memory forward | |||
| if (memory_forward_success(def, inputs)) { | |||
| // effectively forwards inputs[0]: the returned tensor shares its blob | |||
| return {Tensor::make( | |||
| inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())}; | |||
| } | |||
| auto size = inputs.size(); | |||
| if (size > 1) { | |||
| if (inputs.size() == 2) { | |||
| // reduce to target shape, fallback to proxy_graph | |||
| return proxy_graph_detail::apply_on_physical_tensor( | |||
| def, inputs, output_descs, validated); | |||
| } | |||
| mgb_assert(inputs.size() == 1); | |||
| auto comp_node = inputs[0]->comp_node(); | |||
| using TensorND = megdnn::TensorND; | |||
| auto&& op_def = def.cast_final_safe<Reduce>(); | |||
| SmallVector<TensorND> inp_tensornds; | |||
| inp_tensornds.reserve(inputs.size()); | |||
| auto src = inputs[0]->layout(); | |||
| DnnOprCaller<megdnn::Reduce> dnn_op(comp_node); | |||
| dnn_op.op->param() = op_def.param(); | |||
| auto axis = op_def.param().axis; | |||
| DnnOprCaller<megdnn::Reduce> dnn_op(comp_node, op_def.param()); | |||
| auto&& mode = dnn_op.param().mode; | |||
| auto& axis = dnn_op.param().axis; | |||
| auto keepdim = op_def.keepdim; | |||
| if (axis < 0) { | |||
| axis = inputs[0]->layout().ndim + axis; | |||
| } | |||
| dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis; | |||
| if (axis == INT_MAX) { | |||
| src.shape[0] = src.total_nr_elems(); | |||
| src.ndim = 1; | |||
| src.init_contiguous_stride(); | |||
| } | |||
| TensorLayout layout{src.dtype}; | |||
| dnn_op.op->deduce_layout(src, layout); | |||
| if (inputs[0]->layout().is_empty()) { | |||
| inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src); | |||
| auto mode = op_def.param().mode; | |||
| if (!keepdim && src.ndim > 1) { | |||
| layout.remove_axis_inplace(axis); | |||
| layout.init_contiguous_stride(); | |||
| DnnTensorND dnn_input = [&] { | |||
| if (axis == INT_MAX) { // reduce to scalar | |||
| axis = 0; | |||
| // flatten input | |||
| return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()}); | |||
| } else { | |||
| if (axis < 0) { | |||
| axis = inputs[0]->layout().ndim + axis; | |||
| } | |||
| mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim); | |||
| return inputs[0]->dnn_tensor(); | |||
| } | |||
| auto out = Tensor::make(layout, comp_node); | |||
| }(); | |||
| auto output_layout = dnn_op.deduce_layout(dnn_input.layout); | |||
| auto resolve_keepdim = [&] { | |||
| if (!keepdim) { | |||
| if (output_layout.ndim > 1) { | |||
| mgb_assert(output_layout.shape[axis] == 1); | |||
| output_layout.remove_axis_inplace(axis); | |||
| } | |||
| } | |||
| }; | |||
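| // the dnn kernel always produces the keepdim layout; resolve_keepdim drops the size-1 | |||
| // axis from the layout handed back to the caller, while exec still sees the keepdim | |||
| // layout (output_layout_keepdim below) | |||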
| std::string err_msg; | |||
| TensorPtr output; | |||
| if (output_layout.is_empty()) { | |||
| // output empty, no computation | |||
| resolve_keepdim(); | |||
| output = Tensor::make(output_layout, comp_node); | |||
| } else if (dnn_input.layout.is_empty()) { | |||
| // input empty but output not, do fill | |||
| resolve_keepdim(); | |||
| output = Tensor::make(output_layout, comp_node); | |||
| auto on_bad_empty_reduce = [](const char* name) { | |||
| mgb_throw( | |||
| MegBrainError, "empty input is not allowed for reduce mode: %s", | |||
| name); | |||
| }; | |||
| switch (mode) { | |||
| case Reduce::Mode::SUM: | |||
| if (!out->empty()) { | |||
| dev_tensor_memset(out->dev_tensor(), 0); | |||
| } | |||
| // fill 0 | |||
| dev_tensor_memset(output->dev_tensor(), 0); | |||
| break; | |||
| case Reduce::Mode::PRODUCT: | |||
| if (!out->empty()) { | |||
| DnnOprCaller<megdnn::Fill> fill_op(comp_node); | |||
| fill_op.op->param() = 1; | |||
| fill_op.op->exec(out->dnn_tensor(), {}); | |||
| } | |||
| case Reduce::Mode::PRODUCT: { | |||
| // fill 1 | |||
| DnnOprCaller<megdnn::Fill> fill_op(comp_node, {1}); | |||
| fill_op.exec_with_ws(output); | |||
| break; | |||
| } | |||
| case Reduce::Mode::MEAN: | |||
| err_msg = "mean"; | |||
| on_bad_empty_reduce("mean"); | |||
| break; | |||
| case Reduce::Mode::MIN: | |||
| err_msg = "min"; | |||
| on_bad_empty_reduce("min"); | |||
| break; | |||
| case Reduce::Mode::MAX: | |||
| err_msg = "max"; | |||
| on_bad_empty_reduce("max"); | |||
| break; | |||
| case Reduce::Mode::SUM_SQR: | |||
| err_msg = "sum_sqr"; | |||
| on_bad_empty_reduce("sum_sqr"); | |||
| break; | |||
| default: | |||
| mgb_throw(MegBrainError, "bad reduce mode"); | |||
| } | |||
| if (!err_msg.empty()) { | |||
| mgb_throw( | |||
| MegBrainError, "empty input is not allowed for reduce mode: %s", | |||
| err_msg.c_str()); | |||
| } else { | |||
| // common reduction | |||
| if (keepdim) { | |||
| output = Tensor::make(output_layout, comp_node); | |||
| dnn_op.exec_with_ws(dnn_input, output); | |||
| } else { | |||
| // used by megdnn::exec | |||
| auto output_layout_keepdim = output_layout; | |||
| resolve_keepdim(); | |||
| output = Tensor::make(output_layout, comp_node); | |||
| dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim)); | |||
| } | |||
| return {out}; | |||
| } | |||
| auto dnn_ten = inputs[0]->dnn_tensor(); | |||
| dnn_ten.layout = src; | |||
| inp_tensornds.push_back(dnn_ten); | |||
| auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout); | |||
| auto dnn_wk = dnn_op.create_workspace(wk_size); | |||
| TensorLayout ori_layout = layout; | |||
| if (!keepdim && src.ndim > 1) { | |||
| layout.remove_axis_inplace(axis); | |||
| layout.init_contiguous_stride(); | |||
| } | |||
| auto out = Tensor::make(layout, comp_node); | |||
| auto dnn_out = out->dnn_tensor(); | |||
| dnn_out.layout = ori_layout; | |||
| dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk); | |||
| return {out}; | |||
| return {output}; | |||
| } | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| @@ -184,16 +175,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| auto axis = op_def.param().axis; | |||
| auto keepdim = op_def.keepdim; | |||
| size_t size = inputs.size(); | |||
| SmallVector<LogicalTensorDesc> dests(size); | |||
| mgb_assert(inputs.size() > 0); | |||
| auto&& comp_node = inputs[0].comp_node; | |||
| auto&& input_layout = inputs[0].layout; | |||
| for (size_t i = 0; i < size; i++) { | |||
| if (inputs[i].layout.ndim == 0) { | |||
| return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}}, | |||
| false}; | |||
| } | |||
| } | |||
| if (size > 1) { | |||
| if (inputs.size() == 2) { | |||
| // fallback to proxy_graph, matters on backward | |||
| auto [output_descs, validated] = | |||
| proxy_graph_detail::infer_output_attrs_fallible(def, inputs); | |||
| if (!inputs[1].value.empty()) { | |||
| @@ -203,30 +190,37 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| return {output_descs, validated}; | |||
| } | |||
| mgb_assert(inputs.size() == 1); | |||
| if (axis == INT_MAX) { | |||
| // reduce to scalar | |||
| // ignore keepdim because ndim is 1 | |||
| auto&& dtype = input_layout.dtype; | |||
| auto&& format = input_layout.format; | |||
| auto output_layout = TensorLayout{{1}, dtype, format}; | |||
| return {{{output_layout, comp_node}}, true}; | |||
| } | |||
| if (input_layout.ndim == 0) { | |||
| // shape incomplete | |||
| return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}}, | |||
| false}; | |||
| } | |||
| if (axis < 0) { | |||
| axis = inputs[0].layout.ndim + axis; | |||
| axis = input_layout.ndim + axis; | |||
| } | |||
| mgb_assert(axis >= 0 && axis < input_layout.ndim); | |||
| if (axis == INT_MAX || inputs[0].layout.ndim == 1) { | |||
| TensorLayout layout{inputs[0].layout.dtype}; | |||
| layout.shape[0] = 1; | |||
| layout.ndim = 1; | |||
| dests[0].layout = layout; | |||
| dests[0].comp_node = inputs[0].comp_node; | |||
| TensorLayout output_layout = input_layout; | |||
| bool remove_axis = (!keepdim) && input_layout.ndim > 1; | |||
| if (remove_axis) { | |||
| output_layout.remove_axis_inplace(axis); | |||
| } else { | |||
| for (size_t i = 0; i < size; ++i) { | |||
| dests[i].comp_node = inputs[i].comp_node; | |||
| dests[i].layout = inputs[i].layout; | |||
| if (!keepdim && dests[i].layout.ndim > 1) { | |||
| dests[i].layout.remove_axis_inplace(axis); | |||
| } else { | |||
| dests[i].layout.shape[axis] = 1; | |||
| } | |||
| dests[i].layout.init_contiguous_stride(); | |||
| } | |||
| output_layout.shape[axis] = 1; | |||
| } | |||
| return {dests, true}; | |||
| output_layout.init_contiguous_stride(); | |||
| return {{{output_layout, comp_node}}, true}; | |||
| } | |||
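A minimal standalone sketch of the keepdim handling implemented in the hunk above; `reduce_output_layout` is a hypothetical helper name, and the layout methods are the megdnn ones already used in the diff.

```cpp
// Squeeze the reduced axis when keepdim is false (and the tensor is not already
// 1-D); otherwise keep the axis with extent 1. Mirrors the deduction above.
TensorLayout reduce_output_layout(TensorLayout src, size_t axis, bool keepdim) {
    if (!keepdim && src.ndim > 1) {
        src.remove_axis_inplace(axis);  // e.g. (2, 3, 4), axis=1 -> (2, 4)
    } else {
        src.shape[axis] = 1;            // e.g. (2, 3, 4), axis=1 -> (2, 1, 4)
    }
    src.init_contiguous_stride();
    return src;
}
```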
| SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
| @@ -230,31 +230,19 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor( | |||
| } | |||
| auto dest_layout = TensorLayout({nr_elems}, dtype); | |||
| auto output = Tensor::make(dest_layout, comp_node); | |||
| auto caller = DnnOprCaller<megdnn::ParamPackConcat>(comp_node); | |||
| size_t srcs_size = sizeof(void*) * nr_inputs; | |||
| void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size); | |||
| std::shared_ptr<dt_byte> srcs_ptr = { | |||
| (dt_byte*)srcs_raw_ptr, | |||
| [comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }}; | |||
| // FIXME: add param to ParamPackConcat | |||
| DnnOprCaller<megdnn::ParamPackConcat> caller{comp_node}; | |||
| HostTensorStorage srcs_storage{comp_node}; | |||
| srcs_storage.ensure_size(sizeof(void*) * nr_inputs); | |||
| TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()}; | |||
| size_t ws_size; | |||
| { | |||
| TensorShapeArray src_shapes; | |||
| for (size_t i = 0; i < nr_inputs; ++i) { | |||
| src_shapes.push_back(inputs[i]->shape()); | |||
| } | |||
| ws_size = caller.op->get_workspace_in_bytes( | |||
| src_shapes, inputs.back()->shape(), TensorShape{}); | |||
| } | |||
| HostTensorND srcs_tensornd; | |||
| srcs_tensornd.reset(srcs_storage, srcs_layout); | |||
| auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr()); | |||
| for (size_t i = 0; i < nr_inputs; ++i) { | |||
| srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr(); | |||
| srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr(); | |||
| } | |||
| HostTensorStorage srcs_storage; | |||
| srcs_storage.reset(comp_node, srcs_size, srcs_ptr); | |||
| caller.op->exec( | |||
| {srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(), | |||
| output->dnn_tensor(), caller.create_workspace(ws_size)); | |||
| async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage)); | |||
| caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output); | |||
| async_release(srcs_tensornd); | |||
| return {output}; | |||
| } | |||
| @@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| auto&& op = static_cast<const ROIAlign&>(def); | |||
| if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) { | |||
| return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}, | |||
| {TensorLayout(dtype::Int32()), inputs[1].comp_node}}, | |||
| false}; | |||
| } | |||
| SmallVector<LogicalTensorDesc> descs(2u); | |||
| size_t n = inputs[1].layout[0]; | |||
| size_t c = inputs[0].layout[1]; | |||
| descs[0].layout = TensorLayout( | |||
| {n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype); | |||
| descs[0].layout.init_contiguous_stride(); | |||
| descs[0].comp_node = inputs[0].comp_node; | |||
| descs[1].layout = | |||
| TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||
| descs[1].layout.init_contiguous_stride(); | |||
| descs[1].comp_node = descs[0].comp_node; | |||
| return {descs, true}; | |||
| auto&& op = def.cast_final_safe<ROIAlign>(); | |||
| DnnOprHelper<megdnn::ROIAlign> dnn_opr(op.param()); | |||
| auto cn = inputs[0].comp_node; | |||
| auto&& [out_layout, ind_layout] = | |||
| dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout); | |||
| bool validated = out_layout.ndim != 0 && ind_layout.ndim != 0; | |||
| return {{{out_layout, cn}, {ind_layout, cn}}, validated}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto&& op = static_cast<const ROIAlign&>(def); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| auto&& op = def.cast_final_safe<ROIAlign>(); | |||
| auto cn = inputs[0]->comp_node(); | |||
| TensorLayout out_layout = output_descs[0].layout; | |||
| TensorLayout ind_layout = output_descs[1].layout; | |||
| if (!validated) { | |||
| size_t n = inputs[1]->layout()[0]; | |||
| size_t c = inputs[0]->layout()[1]; | |||
| out_layout = TensorLayout( | |||
| {n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype); | |||
| out_layout.init_contiguous_stride(); | |||
| ind_layout = | |||
| TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||
| ind_layout.init_contiguous_stride(); | |||
| } | |||
| DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn, op.param()); | |||
| auto&& [out_layout, ind_layout] = [&]() -> std::array<TensorLayout, 2> { | |||
| if (validated) { | |||
| return {output_descs[0].layout, output_descs[1].layout}; | |||
| } else { | |||
| return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout()); | |||
| } | |||
| }(); | |||
| DeviceTensorND out = | |||
| BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); | |||
| DeviceTensorND inds = | |||
| BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout); | |||
| auto out = Tensor::make(out_layout, cn); | |||
| auto ind = Tensor::make(ind_layout, cn); | |||
| if (out_layout.is_empty() || ind_layout.is_empty()) { | |||
| return {Tensor::make(out), Tensor::make(inds)}; | |||
| return {out, ind}; | |||
| } | |||
| DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn); | |||
| dnn_opr.op->param() = op.param(); | |||
| size_t sz = dnn_opr.op->get_workspace_in_bytes( | |||
| inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout); | |||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||
| dnn_opr.op->exec( | |||
| inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(), | |||
| inds.as_megdnn(), dnn_wk); | |||
| return {Tensor::make(out), Tensor::make(inds)}; | |||
| dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind); | |||
| return {out, ind}; | |||
| } | |||
| SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||
| @@ -570,11 +570,17 @@ bool Tensor::empty() { | |||
| return !m_blob->size(); | |||
| } | |||
| megdnn::TensorND Tensor::dnn_tensor() { | |||
| DnnTensorND Tensor::dnn_tensor() { | |||
| mgb_assert(m_blob, "uninitialized tensor."); | |||
| mgb_assert(m_layout.ndim, "megdnn does not support scalar tensors"); | |||
| return DnnTensorND{m_layout, m_blob->storage(), m_offset}; | |||
| } | |||
| DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) { | |||
| mgb_assert(m_blob, "uninitialized tensor."); | |||
| return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset}; | |||
| } | |||
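A hedged usage sketch of the new reshape overload, tying it back to the keepdim path earlier in this diff; `squeezed_layout`, `keepdim_shape`, `comp_node`, `dnn_input`, and `dnn_op` are illustrative names, not part of the change.

```cpp
// The owning Tensor stores the squeezed layout, while the kernel is handed a
// view with the keepdim shape (same element count, so reshape() is valid).
auto out = Tensor::make(squeezed_layout, comp_node);   // e.g. (2, 4)
auto dnn_out = out->dnn_tensor(keepdim_shape);         // viewed as (2, 1, 4)
dnn_op.exec_with_ws(dnn_input, dnn_out);
```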
| void Tensor::fetch_value() { | |||
| MGB_LOCK_GUARD(m_value_mtx); | |||
| if (m_value.empty()) { | |||
| @@ -334,9 +334,16 @@ public: | |||
| size_t j = 0; | |||
| for (auto&& var : m_opr->output()) { | |||
| if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { | |||
| TensorLayout layout{var->shape(), var->dtype(), var->format()}; | |||
| var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag( | |||
| var->comp_node(), layout); | |||
| auto comp_node = var->comp_node(); | |||
| auto dtype = var->dtype(); | |||
| auto&& shape = var->shape(); | |||
| size_t size = dtype.size(shape.total_nr_elems()); | |||
| mgb_assert( | |||
| var->format().is_default(), "non default format for workspace"); | |||
| auto raw_storage = Blob::make(comp_node, size)->storage(); | |||
| DeviceTensorStorage storage; | |||
| storage.reset(comp_node, size, raw_storage); | |||
| var->m_dev_tensor.reset(storage, {shape, dtype}); | |||
| } else { | |||
| mgb_assert(j < outputs.size()); | |||
| auto&& tensor = outputs[j]; | |||
| @@ -1,6 +1,7 @@ | |||
| #pragma once | |||
| #include "megbrain/imperative/physical_tensor.h" | |||
| #include "megbrain/imperative/utils/helper.h" | |||
| namespace mgb { | |||
| namespace imperative { | |||
| @@ -15,13 +16,19 @@ public: | |||
| virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0; | |||
| virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) { | |||
| try { | |||
| alloc_direct(blob, size); | |||
| return true; | |||
| } catch (MemAllocError&) { | |||
| return false; | |||
| } | |||
| } | |||
| virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0; | |||
| virtual void set_allocator(allocator_t allocator) = 0; | |||
| virtual DeviceTensorND alloc_workspace_with_defrag( | |||
| CompNode cn, TensorLayout& layout) = 0; | |||
| virtual void register_blob(OwnedBlob* blob) = 0; | |||
| virtual void unregister_blob(OwnedBlob* blob) = 0; | |||
| @@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr<CompNode::Event, EventDeleter>; | |||
| class Tensor; | |||
| using TensorPtr = std::shared_ptr<Tensor>; | |||
| /* | |||
| DnnTensorND keeps a reference to the workspace allocated by the | |||
| BlobManager, preventing it from being invalidated while in use | |||
| */ | |||
| struct DnnTensorND : megdnn::TensorND { | |||
| private: | |||
| std::shared_ptr<dt_byte> m_reference; | |||
| // hold an extra reference to prevent defrag while the storage is in use | |||
| std::shared_ptr<dt_byte> reference; | |||
| public: | |||
| DnnTensorND(TensorLayout& layout_, std::shared_ptr<dt_byte> ref_ptr, size_t offset) | |||
| : megdnn::TensorND(layout_, {ref_ptr.get(), offset}) { | |||
| m_reference = ref_ptr; | |||
| DnnTensorND( | |||
| const TensorLayout& layout_, std::shared_ptr<dt_byte> ptr, size_t offset) | |||
| : megdnn::TensorND(layout_, {ptr.get(), offset}) { | |||
| reference = std::move(ptr); | |||
| } | |||
| }; | |||
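A brief illustration, as an assumption rather than part of the diff, of why DnnTensorND carries the shared_ptr; `some_tensor` and `run_dnn_kernel` are hypothetical names.

```cpp
{
    // some_tensor is a hypothetical TensorPtr; dnn copies the storage shared_ptr.
    DnnTensorND dnn = some_tensor->dnn_tensor();
    run_dnn_kernel(dnn);  // hypothetical call; the memory behind dnn.raw_ptr()
                          // stays valid while `dnn` exists, even if a defrag
                          // would otherwise move or free the blob
}
```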
| class Tensor : public NonCopyableObj { | |||
| public: | |||
| Tensor() = default; | |||
| Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, | |||
| const HostTensorND& hv = {}); | |||
| Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) | |||
| @@ -154,7 +149,9 @@ public: | |||
| void assign_from_dev_tensor(DeviceTensorND); | |||
| megdnn::TensorND dnn_tensor(); | |||
| DnnTensorND dnn_tensor(); | |||
| DnnTensorND dnn_tensor(TensorShape new_shape); | |||
| static TensorPtr make_scalar(DTypeScalar value, CompNode cn); | |||
| @@ -3,6 +3,7 @@ | |||
| #include <iomanip> | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <optional> | |||
| #include <sstream> | |||
| #include "megbrain/utils/metahelper.h" | |||
| @@ -14,11 +15,28 @@ namespace imperative { | |||
| template <typename T = std::function<void()>> | |||
| class CleanupGuard : public NonCopyableObj { | |||
| private: | |||
| T m_callback; | |||
| std::optional<T> m_callback; | |||
| public: | |||
| CleanupGuard() = default; | |||
| explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {} | |||
| ~CleanupGuard() { m_callback(); } | |||
| ~CleanupGuard() { reset(); } | |||
| CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) { | |||
| rhs.m_callback.reset(); | |||
| } | |||
| CleanupGuard& operator=(CleanupGuard&& rhs) { | |||
| swap(m_callback, rhs.m_callback); | |||
| rhs.reset(); | |||
| return *this; | |||
| } | |||
| public: | |||
| void reset() { | |||
| if (m_callback) { | |||
| (*m_callback)(); | |||
| m_callback.reset(); | |||
| } | |||
| } | |||
| }; | |||
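A usage sketch for the move-aware, resettable CleanupGuard above; the call site is hypothetical.

```cpp
{
    CleanupGuard guard{[] { /* release a temporary buffer */ }};
    auto moved = std::move(guard);  // the callback now belongs to `moved`
    moved.reset();                  // runs the callback exactly once, right here
}   // both destructors are now no-ops
```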
| inline std::string quoted(std::string str) { | |||
| @@ -33,6 +51,19 @@ inline std::string quoted(std::string str) { | |||
| std::call_once(_once_flag, [&] { __VA_ARGS__; }); \ | |||
| } while (false) | |||
| template <typename T> | |||
| struct is_small_vector { | |||
| static constexpr bool value = false; | |||
| }; | |||
| template <typename T> | |||
| struct is_small_vector<SmallVector<T>> { | |||
| static constexpr bool value = true; | |||
| }; | |||
| template <typename T> | |||
| static constexpr bool is_small_vector_v = is_small_vector<T>::value; | |||
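A compile-time illustration of the trait introduced above; the std::vector comparison is only for contrast and assumes the usual standard-library include.

```cpp
static_assert(is_small_vector_v<SmallVector<int>>, "matches SmallVector");
static_assert(!is_small_vector_v<std::vector<int>>, "rejects other containers");
```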
| } // namespace imperative | |||
| } // namespace mgb | |||
| @@ -6,4 +6,10 @@ namespace mgb::imperative { | |||
| std::string demangle(std::string mangled); | |||
| template <typename T> | |||
| const char* demangled_typename() { | |||
| static auto name = demangle(typeid(T).name()); | |||
| return name.c_str(); | |||
| } | |||
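A small usage sketch for the cached demangling helper above; `log_type_example` is a hypothetical function, `<cstdio>` and `<memory>` are assumed to be included, and the exact printed string depends on the platform ABI.

```cpp
// The static local inside demangled_typename() ensures demangle() runs once per
// instantiated type; the returned pointer stays valid for the program's lifetime.
void log_type_example() {
    const char* name = demangled_typename<std::shared_ptr<int>>();
    printf("type = %s\n", name);  // e.g. "std::shared_ptr<int>" under the Itanium ABI
}
```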
| } // namespace mgb::imperative | |||
| @@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() { | |||
| auto dtype = input(0)->dtype(); | |||
| TensorLayout ily(iv.val[0].shape(), dtype); | |||
| dest.ndim = 1; | |||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily); | |||
| TensorLayout mly(iv.val[0].shape(), dtype::Int32()); | |||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly); | |||
| return true; | |||
| }; | |||
| owner_graph()->static_infer_manager().register_shape_infer( | |||
| @@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() { | |||
| auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | |||
| dest.ndim = 1; | |||
| megdnn::TensorNDArray inp_arr(input().size()); | |||
| SmallVector<megdnn::TensorLayout> inp_arr(input().size()); | |||
| for (size_t i = 0; i < input().size(); ++i) { | |||
| inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}}; | |||
| inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()}; | |||
| } | |||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes( | |||
| inp_arr, {output(input().size() + 1)->shape(), | |||
| @@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() { | |||
| auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | |||
| TensorShapeArray shapes; | |||
| auto vals = inp.val; | |||
| shapes.reserve(vals.size() - 1); | |||
| for (size_t i = 0; i < vals.size() - 1; i++) { | |||
| shapes.push_back(vals[i].shape()); | |||
| } | |||
| dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)}; | |||
| size_t nr_params = vals.size() - 1; | |||
| dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)}; | |||
| return true; | |||
| }; | |||
| mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out}); | |||
| @@ -970,8 +970,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||
| if (!policy.algo.valid()) | |||
| continue; | |||
| size_t workspace_needed = get_workspace_size_bytes(policy); | |||
| if (m_inputs != nullptr) | |||
| if (m_inputs == nullptr) { | |||
| workspace_needed += data_size; | |||
| } | |||
| if (workspace_needed > | |||
| m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { | |||
| continue; | |||
| @@ -18,7 +18,8 @@ failed_files = Manager().list() | |||
| def process_file(file, clang_format, write): | |||
| source = open(file, "r").read() | |||
| original_source = open(file, "r").read() | |||
| source = original_source | |||
| source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source) | |||
| source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source) | |||
| @@ -38,7 +39,7 @@ def process_file(file, clang_format, write): | |||
| result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result) | |||
| result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result) | |||
| if write: | |||
| if write and original_source != result: | |||
| with tempfile.NamedTemporaryFile( | |||
| dir=os.path.dirname(file), delete=False | |||
| ) as tmp_file: | |||