GitOrigin-RevId: 402cba209a
tags/v1.11.0
@@ -397,7 +397,8 @@ public:
     OutputDType infer_dtype(DType data, DType mask);
-    virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0;
+    virtual size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) = 0;
     virtual Output exec(
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -512,7 +513,8 @@ public:
     virtual void exec(
             _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayoutArray& srcs, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;
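// Context for the MGE_WIN_DECLSPEC_FUC additions that recur in these header hunks (an
// assumption about the macro, whose definition is outside this diff): it wraps Windows
// __declspec(dllexport)/__declspec(dllimport), so tagging the non-virtual
// deduce_layout/deduce_dtype helpers lets callers outside the DLL reach them directly.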
@@ -596,7 +598,7 @@ public:
             _megdnn_workspace workspace) = 0;
     virtual size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& offsets,
+            const TensorShape& srcs, const TensorShape& offsets,
             const TensorShape& dst) = 0;
 };
@@ -1145,7 +1147,7 @@ protected:
     /*!
      * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis)
      */
-    static size_t deduce_layout_fwd(
+    MGE_WIN_DECLSPEC_FUC static size_t deduce_layout_fwd(
             const TensorLayout& data, const IndexDescLayoutOnly& index,
             TensorLayout& dst);
@@ -1362,9 +1364,10 @@ class CheckNonFinite : public OperatorBase {
 public:
     virtual size_t get_workspace_in_bytes(
-            const TensorNDArray& srcs, const TensorLayout& dst) = 0;
+            const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;
-    void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayoutArray& srcs, TensorLayout& dst);
     virtual void exec(
             _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
@@ -1420,7 +1423,7 @@ public:
     }
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& dst) = 0;
-    void deduce_layout(const TensorLayout& src, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
     MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
             const TensorLayout& src, TensorLayout& dst, const Param& p);
@@ -1464,7 +1467,7 @@ public:
             const TensorLayout& m_t, const TensorLayout& v_t,
             const TensorLayout& new_param) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& m_t_1, const TensorLayout& v_t_1,
             const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
             TensorLayout& v_t, TensorLayout& new_param);
@@ -27,7 +27,8 @@ public:
             _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
             _megdnn_workspace workspace) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
-    void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;
@@ -64,7 +65,8 @@ public:
             _megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
             _megdnn_workspace workspace) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType A, DType B, DType& C);
-    void deduce_layout(const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
+            const TensorLayout& A, const TensorLayout& B, TensorLayout& C);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) = 0;
@@ -224,9 +224,9 @@ public:
             const TensorLayout& src_layout, _megdnn_tensor_in filter,
             const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
-    void deduce_dtype(DType src, DType filter, DType& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType src, DType filter, DType& dst);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);
     /**
@@ -300,7 +300,7 @@ public:
             const TensorLayout& grad) = 0;
     MGE_WIN_DECLSPEC_FUC void deduce_dtype(DType filter, DType diff, DType& grad);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);
     static Algorithm::OprType get_opr_type() {
@@ -378,6 +378,12 @@ public:
             const PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
+    MGE_WIN_DECLSPEC_FUC void exec(
+            _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
+            _megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+        exec(src, filter, bias, z, dst, nullptr, workspace);
+    }
     /**
      * \brief execute weight preprocessing, read weights from filter and bias,
      * write to preprocessed_filter after preprocessing.
@@ -390,8 +396,9 @@ public:
             _megdnn_tensor_in bias, const TensorLayout& z_layout,
             const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
             _megdnn_workspace workspace) = 0;
-    void deduce_dtype(DType src, DType filter, DType bias, DType z, DType& dst);
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_dtype(
+            DType src, DType filter, DType bias, DType z, DType& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter,
             const TensorLayout& bias, const TensorLayout& z, TensorLayout& dst);
@@ -775,7 +782,7 @@ protected:
     void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
 public:
-    MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
+    static void deduce_layout_impl(
             const TensorLayout& src, const Param& param, TensorLayout& dst);
 };
@@ -791,7 +798,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(const TensorLayout& src, TensorLayout& dst);
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& dst) = 0;
@@ -1253,7 +1260,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& filter, TensorLayout& dst);
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& src, const TensorLayout& filter,
@@ -1281,18 +1288,16 @@ public:
      * \param[in] diff (n, oc, od, oh, ow)
      * \param[out] grad (n, ic, id, ih, iw)
      */
-    MGE_WIN_DECLSPEC_FUC static void deduce_layout_impl(
+    static void deduce_layout_impl(
             const TensorLayout& filter, const TensorLayout& diff, const Param& param,
             TensorLayout& grad);
     virtual void exec(
             _megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad,
             _megdnn_workspace workspace) = 0;
     virtual size_t get_workspace_in_bytes(
             const TensorLayout& filter, const TensorLayout& diff,
             const TensorLayout& grad) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad);
     static Algorithm::OprType get_opr_type() {
@@ -1472,7 +1477,7 @@ public:
     virtual void exec(
             _megdnn_tensor_in src, _megdnn_tensor_in rois, _megdnn_tensor_out dst,
             _megdnn_tensor_out index, _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& src, const TensorLayout& rois, TensorLayout& dst,
             TensorLayout& index);
     virtual size_t get_workspace_in_bytes(
@@ -1963,7 +1968,7 @@ public:
             _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias,
             _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd,
             _megdnn_workspace workspace) = 0;
-    void deduce_layout(
+    MGE_WIN_DECLSPEC_FUC void deduce_layout(
             const TensorLayout& data, const TensorLayout& weight,
             const TensorLayout& bias, TensorLayout& dst, TensorLayout& mean,
             TensorLayout& rstd);
@@ -7,7 +7,11 @@ void CheckNonFinite::check_exec(
         const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes) {
     megdnn_assert_contiguous(dst.layout);
     megdnn_assert(srcs.size() > 0);
-    auto required_workspace_in_bytes = get_workspace_in_bytes(srcs, dst.layout);
+    TensorLayoutArray src_layouts;
+    for (auto&& src : srcs) {
+        src_layouts.push_back(src.layout);
+    }
+    auto required_workspace_in_bytes = get_workspace_in_bytes(src_layouts, dst.layout);
     megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
 }
@@ -11,7 +11,7 @@ size_t CondTake::check_exec_get_size(
             mask.TensorShape::to_string().c_str());
     megdnn_assert(data.is_physical_contiguous() && mask.is_physical_contiguous());
     megdnn_assert(m_param.eps > 0, "eps must be positive; got: %g", m_param.eps);
-    megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data));
+    megdnn_assert(workspace_in_bytes >= get_workspace_in_bytes(data, mask));
     return data.total_nr_elems();
 }
@@ -7,9 +7,9 @@ void LAMBUpdate::deduce_layout(
         const TensorLayout& m_t_1, const TensorLayout& v_t_1,
         const TensorLayout& lamb_param, const TensorLayout& grad, TensorLayout& m_t,
         TensorLayout& v_t, TensorLayout& new_param) {
-    m_t = TensorLayout(m_t_1);
-    v_t = TensorLayout(v_t_1);
-    new_param = TensorLayout(lamb_param);
+    m_t = m_t_1;
+    v_t = v_t_1;
+    new_param = lamb_param;
     MEGDNN_MARK_USED_VAR(grad);
 }
@@ -26,14 +26,14 @@ size_t CheckNonFiniteImpl::_get_workspace_in_bytes() {
 }
 size_t CheckNonFiniteImpl::get_workspace_in_bytes(
-        const TensorNDArray& srcs, const TensorLayout&) {
+        const TensorLayoutArray& srcs, const TensorLayout&) {
     m_size = 0;
     for (const auto& src : srcs) {
-        m_size += DIVUP(src.layout.total_nr_elems(), total_nr_elems_max);
+        m_size += DIVUP(src.total_nr_elems(), total_nr_elems_max);
     }
-    if (srcs.begin()->layout.dtype == dtype::Float32()) {
+    if (srcs.begin()->dtype == dtype::Float32()) {
         return _get_workspace_in_bytes<dt_float32>();
-    } else if (srcs.begin()->layout.dtype == dtype::Float16()) {
+    } else if (srcs.begin()->dtype == dtype::Float16()) {
         return _get_workspace_in_bytes<dt_float16>();
     } else {
         megdnn_log_warn("only support fp16 and fp32, fallback to fp32");
@@ -19,7 +19,7 @@ public:
     using CheckNonFinite::CheckNonFinite;
     size_t get_workspace_in_bytes(
-            const TensorNDArray& srcs, const TensorLayout& dst) override;
+            const TensorLayoutArray& srcs, const TensorLayout& dst) override;
     bool is_thread_safe() const override { return true; }
@@ -20,7 +20,8 @@ WorkspaceBundle CondTakeImpl::make_bundle(size_t nr_item) {
             handle()->alignment_requirement()};
 }
-size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
+size_t CondTakeImpl::get_workspace_in_bytes(
+        const TensorLayout& data, const TensorLayout&) {
     return make_bundle(data.total_nr_elems()).total_size_in_bytes();
 }
@@ -15,7 +15,8 @@ public:
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
             DynOutMallocPolicyCall malloc_policy) override;
-    size_t get_workspace_in_bytes(const TensorLayout& data) override;
+    size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) override;
 };
 } // namespace cuda
@@ -6,8 +6,8 @@ namespace megdnn {
 namespace cuda {
 size_t ParamPackConcatImpl::get_workspace_in_bytes(
-        const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
-    return sizeof(size_t) * srcs.size();
+        const TensorShape&, const TensorShape& offsets, const TensorShape&) {
+    return sizeof(size_t) * (offsets.shape[0] / 2);
 }
 template <typename T>
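// A worked reading of the new workspace formula above, assuming (as the replaced
// srcs.size() expression implies) that the offsets tensor stores a (begin, end) offset
// pair per packed source tensor: nr_sources = offsets.shape[0] / 2, so packing 3 params
// gives offsets.shape[0] == 6 and a workspace of 3 * sizeof(size_t).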
@@ -12,7 +12,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& table,
+            const TensorShape& srcs, const TensorShape& table,
             const TensorShape& dst) override;
 private:
@@ -13,7 +13,8 @@ public:
     bool is_thread_safe() const override { return true; }
-    size_t get_workspace_in_bytes(const TensorNDArray&, const TensorLayout&) override {
+    size_t get_workspace_in_bytes(
+            const TensorLayoutArray&, const TensorLayout&) override {
         m_size = 0;
         return _get_workspace_in_bytes();
     }
@@ -38,7 +38,8 @@ void copy_data(
 } // anonymous namespace
-size_t CondTakeImpl::get_workspace_in_bytes(const TensorLayout& data) {
+size_t CondTakeImpl::get_workspace_in_bytes(
+        const TensorLayout& data, const TensorLayout&) {
     return (data.total_nr_elems() + 1) * sizeof(dt_int32);
 }
@@ -11,7 +11,8 @@ class CondTakeImpl : public CondTake {
 public:
     using CondTake::CondTake;
-    size_t get_workspace_in_bytes(const TensorLayout& data) override;
+    size_t get_workspace_in_bytes(
+            const TensorLayout& data, const TensorLayout& mask) override;
     Output exec(
             _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
@@ -11,7 +11,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray&, const TensorShape&, const TensorShape&) override {
+            const TensorShape&, const TensorShape&, const TensorShape&) override {
         return 0;
     }
 };
@@ -7,8 +7,8 @@ namespace megdnn {
 namespace rocm {
 size_t ParamPackConcatImpl::get_workspace_in_bytes(
-        const TensorShapeArray& srcs, const TensorShape&, const TensorShape&) {
-    return sizeof(size_t) * srcs.size();
+        const TensorShape&, const TensorShape& offsets, const TensorShape&) {
+    return sizeof(size_t) * (offsets.shape[0] / 2);
 }
 template <typename T>
@@ -12,7 +12,7 @@ public:
             _megdnn_workspace workspace) override;
     size_t get_workspace_in_bytes(
-            const TensorShapeArray& srcs, const TensorShape& table,
+            const TensorShape& srcs, const TensorShape& table,
             const TensorShape& dst) override;
 private:
@@ -71,7 +71,7 @@ CondTakeTestcase::Result CondTakeTestcase::run(CondTake* opr) {
     opr->param() = m_param;
     DynOutMallocPolicyImpl malloc_policy(handle);
-    auto workspace_size = opr->get_workspace_in_bytes(data->layout);
+    auto workspace_size = opr->get_workspace_in_bytes(data->layout, mask->layout);
     auto workspace_ptr = malloc_policy.alloc_workspace(workspace_size, nullptr);
     auto result = opr->exec(
             *data, *mask, {(dt_byte*)workspace_ptr, workspace_size}, &malloc_policy);
@@ -205,9 +205,14 @@ struct OprProxy<CheckNonFinite> {
         auto inps = tensors;
         inps.pop_back();
+        TensorLayoutArray inp_layouts(inps.size());
+        std::transform(
+                inps.begin(), inps.end(), inp_layouts.begin(),
+                [](const TensorND& tensor) { return tensor.layout; });
         WorkspaceWrapper W(
                 opr->handle(),
-                opr->get_workspace_in_bytes(inps, tensors.back().layout));
+                opr->get_workspace_in_bytes(inp_layouts, tensors.back().layout));
         opr->exec(inps, tensors.back(), W.workspace());
     }
 };
@@ -95,7 +95,7 @@ void test_param_pack_concat(
     test::WorkspaceWrapper workspace(
             handle,
-            concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
+            concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
     TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));
     concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());
@@ -97,7 +97,7 @@ void test_param_pack_concat(
     test::WorkspaceWrapper workspace(
             handle,
-            concat->get_workspace_in_bytes(shapes, offsets_layout, {pack_size}));
+            concat->get_workspace_in_bytes({nr_params}, offsets_layout, {pack_size}));
     TensorND src_tensor(param_ptrs.data(), TensorLayout({nr_params}, dtype::Int32()));
     concat->exec(src_tensor, offsets_tensor, dst_tensor, workspace.workspace());
@@ -9,11 +9,8 @@ BlobManagerImpl::BlobData::BlobData(OwnedBlob* in_blob) {
     blob = in_blob;
     DeviceTensorStorage d_storage;
     d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);
     h_storage = HostTensorStorage(blob->m_comp_node);
     h_storage.ensure_size(blob->m_size);
     h_storage.copy_from(const_cast<DeviceTensorStorage&>(d_storage), blob->m_size);
 }
@@ -30,65 +27,36 @@ void BlobManagerImpl::unregister_blob(OwnedBlob* blob) {
 }
 void BlobManagerImpl::alloc_with_defrag(OwnedBlob* blob, size_t size) {
-    if (custom_allocator) {
-        blob->m_storage = custom_allocator(blob->m_comp_node, size);
+    if (m_custom_allocator) {
+        blob->m_storage = m_custom_allocator(blob->m_comp_node, size);
         return;
     }
     // try alloc
-    MGB_TRY { alloc_direct(blob, size); }
     // if fail, try defrag, alloc again
-    MGB_CATCH(MemAllocError&, {
+    if (!try_alloc_direct(blob, size)) {
         mgb_log_warn("memory allocation failed for blob; try defragmenting");
         defrag(blob->m_comp_node);
         alloc_direct(blob, size);
-    });
+    }
 }
 void BlobManagerImpl::alloc_direct(OwnedBlob* blob, size_t size) {
-    DeviceTensorStorage storage(blob->m_comp_node);
     mgb_assert(blob->m_comp_node.valid());
+    DeviceTensorStorage storage(blob->m_comp_node);
     storage.ensure_size(size);
     blob->m_storage = storage.raw_storage();
 }
-DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
-        CompNode cn, TensorLayout& layout) {
-    DeviceTensorND dev_tensor;
-    if (custom_allocator) {
-        DeviceTensorStorage storage(cn);
-        size_t sz = layout.dtype.size(layout.total_nr_elems());
-        storage.reset(cn, sz, custom_allocator(cn, sz));
-        dev_tensor.reset(storage, layout);
-        return dev_tensor;
-    }
-    MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
-    MGB_CATCH(MemAllocError&, {
-        mgb_log_warn("memory allocation failed for workspace; try defragmenting");
-        defrag(cn);
-        dev_tensor = alloc_workspace(cn, layout);
-    });
-    return dev_tensor;
-};
-DeviceTensorND BlobManagerImpl::alloc_workspace(CompNode cn, TensorLayout layout) {
-    DeviceTensorStorage storage(cn);
-    storage.ensure_size(layout.dtype.size(layout.total_nr_elems()));
-    DeviceTensorND dev_tensor;
-    dev_tensor.reset(storage, layout);
-    return dev_tensor;
-}
 void BlobManagerImpl::set_allocator(allocator_t allocator) {
-    custom_allocator = allocator;
+    m_custom_allocator = allocator;
 }
 void BlobManagerImpl::defrag(const CompNode& cn) {
-    BlobSetWithMux* blobs_set_ptr;
-    {
+    auto& blobs_set_ptr = ([&]() -> auto& {
         MGB_LOCK_GUARD(m_mtx);
-        blobs_set_ptr = &m_comp2blobs_map[cn];
-    }
-    MGB_LOCK_GUARD(blobs_set_ptr->mtx);
+        return m_comp2blobs_map[cn];
+    })();
+    MGB_LOCK_GUARD(blobs_set_ptr.mtx);
     std::vector<BlobData> blob_data_arrary;
     std::set<Blob::RawStorage> storage_set;
@@ -96,7 +64,7 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     size_t tot_sz = 0;
     // copy to HostTensorStorage, and release
-    for (auto i : blobs_set_ptr->blobs_set) {
+    for (auto i : blobs_set_ptr.blobs_set) {
         // skip if blob do not have m_storage
         if (!i->m_storage)
             continue;
@@ -153,9 +121,6 @@ struct BlobManagerStub : BlobManager {
     void alloc_with_defrag(OwnedBlob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    DeviceTensorND alloc_workspace_with_defrag(CompNode cn, TensorLayout& layout) {
-        mgb_assert(0, "prohibited after global variable destruction");
-    };
     void register_blob(OwnedBlob* blob) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
@@ -163,7 +128,7 @@ struct BlobManagerStub : BlobManager {
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
-    virtual void set_allocator(allocator_t allocator) {
+    void set_allocator(allocator_t allocator) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
 };
@@ -27,27 +27,21 @@ class BlobManagerImpl final : public BlobManager {
     std::mutex m_mtx;
     CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map;
-    void defrag(const CompNode& cn) override;
+    BlobManager::allocator_t m_custom_allocator;
     void alloc_direct(OwnedBlob* blob, size_t size) override;
-    DeviceTensorND alloc_workspace(CompNode cn, TensorLayout layout);
-    BlobManager::allocator_t custom_allocator;
 public:
     static BlobManager* inst();
     void alloc_with_defrag(OwnedBlob* blob, size_t size) override;
-    DeviceTensorND alloc_workspace_with_defrag(
-            CompNode cn, TensorLayout& layout) override;
     void register_blob(OwnedBlob* blob) override;
     void unregister_blob(OwnedBlob* blob) override;
+    void defrag(const CompNode& cn) override;
     void set_allocator(allocator_t allocator) override;
 };
@@ -1,79 +1,331 @@
+#pragma once
+#include <optional>
+#include <type_traits>
+#include "algo_chooser.h"
 #include "megbrain/comp_node.h"
 #include "megbrain/comp_node_env.h"
+#include "megbrain/imperative/blob_manager.h"
 #include "megbrain/imperative/physical_tensor.h"
+#include "megbrain/imperative/utils/helper.h"
+#include "megbrain/imperative/utils/platform.h"
 #include "megbrain/rdnn/management.h"
-using namespace megdnn;
+#include "megdnn/basic_types.h"
 namespace mgb {
 namespace imperative {
 /*!
- * \brief A struct for safely calling DNN oprs
- * In some cases, op may be released before the complete of the execution
- * This destructor will prevent this
+ * \brief Helps deduce layout and dtype
  */
 template <typename Opr>
-struct DnnOprCaller {
-    CompNode cn;
-    DeviceTensorND dev_tensor;
-    Workspace workspace;
-    mgb::opr::intl::UniqPtrWithCN<Opr> op;
+class DnnOprDeducer {
+private:
+    Opr* m_opr;
-    DnnOprCaller(CompNode cn) : cn(cn), op(std::move(create_operator(cn))) {}
+public:
+    DnnOprDeducer(Opr* opr) : m_opr(opr) { mgb_assert(opr); }
-    static mgb::opr::intl::UniqPtrWithCN<Opr> create_operator(CompNode cn) {
-        return mgb::opr::intl::create_megdnn_opr<Opr>(cn);
+    // FIXME: maybe in-place style deduction works better
+    template <typename... TArgs>
+    TensorLayout deduce_layout(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        TensorLayout output_layout;
+        m_opr->deduce_layout(args..., output_layout);
+        return output_layout;
    }
-    Workspace create_workspace(size_t sz) {
-        if (workspace.raw_ptr) {
-            mgb_throw(MegBrainError, "workspace should not be applicated many times");
-        }
-        if (sz) {
-            TensorLayout layout({sz}, dtype::Byte());
-            dev_tensor = Tensor::make(layout, cn)->dev_tensor();
-            workspace = megdnn::Workspace(
-                    dev_tensor.raw_ptr(), dev_tensor.storage().size());
+    template <typename... TArgs>
+    TensorLayout deduce_layout_fallible(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        TensorLayout output_layout;
+        bool success = (args.ndim * ...) > 0;
+        if (success) {
+            m_opr->deduce_layout(args..., output_layout);
+        } else {
+            m_opr->deduce_dtype(args.dtype..., output_layout.dtype);
        }
-        return workspace;
+        return output_layout;
    }
-    ~DnnOprCaller() {
+    template <size_t nr_outputs, typename... TArgs>
+    std::array<TensorLayout, nr_outputs> deduce_layouts(TArgs&&... args) {
+        static_assert((std::is_convertible_v<TArgs, TensorLayout> && ...));
+        std::array<TensorLayout, nr_outputs> layouts;
+        std::apply(
+                [&](auto&&... outputs) { m_opr->deduce_layout(args..., outputs...); },
+                layouts);
+        return layouts;
+    }
+};
+/*!
+ * \brief Declare an abstract operator and initialize its param
+ */
+template <typename Opr>
+class DnnOprStub {
+private:
+    // TODO: make opr concrete
+    std::aligned_storage_t<sizeof(Opr), alignof(Opr)> m_storage;
+    using Param = typename Opr::Param;
+private:
+    DnnOprStub() { new (&param()) Param(); }
+public:
+    DnnOprStub(const Param& param) { this->param() = param; }
+    // undefined behavior
+    Opr& opr() { return *reinterpret_cast<Opr*>(&m_storage); }
+    auto& param() { return opr().param(); }
+    auto& param() const { return opr().param(); }
+    ~DnnOprStub() { param().~Param(); }
+};
+/*!
+ * \brief Deduce layout without creating a concrete opr
+ */
+template <typename Opr>
+class DnnOprHelper : public DnnOprStub<Opr>, public DnnOprDeducer<Opr> {
+private:
+    using Stub = DnnOprStub<Opr>;
+    using Deducer = DnnOprDeducer<Opr>;
+public:
+    DnnOprHelper(const typename Opr::Param& param)
+            : Stub(param), Deducer(&Stub::opr()) {}
+};
+// hold a concrete operator in given comp_node
+template <typename Opr>
+class DnnOprHolder {
+private:
+    CompNode m_comp_node;
+    opr::intl::UniqPtrWithCN<Opr> m_opr =
+            opr::intl::create_megdnn_opr<Opr>(m_comp_node);
+public:
+    DnnOprHolder(CompNode comp_node) : m_comp_node(comp_node) {}
+    auto& op() { return m_opr; }
+    auto comp_node() { return m_comp_node; }
+    auto& param() { return m_opr->param(); }
+    auto& param() const { return m_opr->param(); }
+    ~DnnOprHolder() {
         using DT = CompNode::DeviceType;
-        if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
-            CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
-                    [p = op.release()] { delete p; });
+        if (m_comp_node.device_type() == DT::CPU &&
+            m_comp_node != CompNode::default_cpu()) {
+            CompNodeEnv::from_comp_node(m_comp_node)
+                    .cpu_env()
+                    .dispatch([p = m_opr.release()] { delete p; });
         }
     }
 };
+/*!
+ * \brief Prevent binary bloat
+ */
+class DnnOprCallerBase {
+protected:
+    static auto&& get_layout(const megdnn::TensorND& tensor) { return tensor.layout; }
+    static auto get_layout(const megdnn::TensorNDArray& tensors) {
+        SmallVector<TensorLayout> layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
         }
+        return layouts;
     }
 };
-template <size_t OSize>
-class MegDNNDynOutMallocImpl final : public megdnn::DynOutMallocPolicy {
-    using Output = std::array<TensorPtr, OSize>;
+/*!
+ * \brief A struct for safely calling DNN oprs
+ *
+ * In some cases, op may be released before the execution completes
+ * This destructor will prevent this
+ */
+template <typename Opr>
+class DnnOprCaller final : public DnnOprHolder<Opr>,
+                           public DnnOprDeducer<Opr>,
+                           public DnnOprCallerBase {
+private:
+    using Holder = DnnOprHolder<Opr>;
+    using Deducer = DnnOprDeducer<Opr>;
+    using Base = DnnOprCallerBase;
+    std::optional<DnnTensorND> m_workspace;
+    std::optional<megdnn::param::ExecutionPolicy> m_policy;
-    CompNode m_cn;
-    Output m_out;
+    megdnn::Workspace create_workspace(size_t sz) {
+        mgb_assert(
+                !m_workspace, "workspace asked more than once by op: %s",
+                demangled_typename<Opr>());
+        dt_byte* ptr = nullptr;
+        if (sz) {
+            TensorLayout layout({sz}, dtype::Byte());
+            m_workspace.emplace(
+                    Tensor::make(layout, Holder::comp_node())->dnn_tensor());
+            ptr = reinterpret_cast<dt_byte*>(m_workspace->raw_ptr());
+        }
+        return {ptr, sz};
+    }
 public:
-    MegDNNDynOutMallocImpl(CompNode cn) : m_cn{cn} {}
-    megdnn::TensorND alloc_output(
-            size_t id, DType dtype, const TensorShape& shape,
-            void* user_data) override {
-        TensorLayout m_layout(shape, dtype);
-        m_out[id] = Tensor::make(m_layout, m_cn);
-        return m_out[id]->dev_tensor().as_megdnn();
+    using Param = typename Opr::Param;
+    DnnOprCaller(CompNode cn) : Holder(cn), Deducer(Holder::op().get()) {}
+    DnnOprCaller(CompNode cn, const Param& param) : DnnOprCaller(cn) {
+        Holder::param() = param;
+    }
+    DnnOprCaller(CompNode cn, const Param& param, megdnn::param::ExecutionPolicy policy)
+            : DnnOprCaller(cn, param) {
+        m_policy.emplace(policy);
+    }
-    void* alloc_workspace(size_t sz, void* user_data) override {
-        return m_cn.alloc_device(sz);
+    /**
+     * \brief Convert TensorPtr args to megdnn::TensorND and call f
+     *
+     */
+    template <typename TFunctor, typename... TArgs>
+    auto call_dnn(TFunctor&& f, TArgs&&... args) {
+        std::optional<SmallVector<std::shared_ptr<dt_byte>>> input_ptrs;
+        // recursive convert:
+        // 1. TensorPtr to DnnTensorND (subclass of megdnn::TensorND) ;
+        // 2. DeviceTensorND, HostTensorND to megdnn::TensorND ;
+        // 3. SmallVector of above to SmallVector<megdnn::TensorND> .
+        auto to_dnn = [&](auto&& arg, auto&& to_dnn) {
+            using T = decltype(arg);
+            if constexpr (std::is_convertible_v<T, TensorPtr>) {
+                return arg->dnn_tensor();
+            } else if constexpr (
+                    std::is_convertible_v<T, DeviceTensorND> ||
+                    std::is_convertible_v<T, HostTensorND>) {
+                return arg.as_megdnn();
+            } else if constexpr (
+                    std::is_convertible_v<T, megdnn::TensorND> ||
+                    std::is_convertible_v<T, SmallVector<megdnn::TensorND>>) {
+                return std::forward<T>(arg);
+            } else if constexpr (is_small_vector_v<std::decay_t<T>>) {
+                using TItem = std::decay_t<decltype(to_dnn(arg[0], to_dnn))>;
+                SmallVector<megdnn::TensorND> dnn_tensors;
+                for (auto&& tensor : arg) {
+                    if constexpr (std::is_same_v<TItem, DnnTensorND>) {
+                        if (!input_ptrs) {
+                            input_ptrs.emplace();
+                        }
+                        auto dnn_tensor = to_dnn(tensor, to_dnn);
+                        input_ptrs->push_back(std::move(dnn_tensor.reference));
+                        dnn_tensors.push_back(std::move(dnn_tensor));
+                    } else if constexpr (std::is_same_v<TItem, megdnn::TensorND>) {
+                        dnn_tensors.push_back(to_dnn(tensor, to_dnn));
+                    } else {
+                        static_assert(!std::is_same_v<TItem, TItem>);
+                    }
+                }
+                return dnn_tensors;
+            } else {
+                static_assert(!std::is_same_v<T, T>);
+            }
+        };
+        return f(to_dnn(std::forward<TArgs>(args), to_dnn)...);
    }
-    void free_workspace(void* ptr, void* user_data) override { m_cn.free_device(ptr); }
+    // common execution (opr->exec(inputs..., outputs...))
+    template <typename... TArgs>
+    void exec(TArgs&&... args) {
+        call_dnn(
+                [this](auto&&... args) {
+                    Holder::op()->exec(std::forward<decltype(args)>(args)...);
+                },
+                std::forward<TArgs>(args)...);
+    }
+    // execution fastrun opr
+    // (opr->exec(inputs..., outputs..., create_ws(setup_algo(...))))
+    template <typename... TArgs>
+    void exec_fastrun(TArgs&&... args) {
+        call_dnn(
+                [&](auto&&... args) {
+                    using FixedTensorLayouts =
+                            typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts;
+                    SmallVector<megdnn::TensorND> dnn_inputs = {args...};
+                    mgb_assert(m_policy, "policy not set");
+                    size_t workspace_size = setup_algo<Opr>(
+                            FixedTensorLayouts{args.layout...}, Holder::op().get(), 0,
+                            false, false, Holder::comp_node(), *m_policy, false,
+                            &dnn_inputs);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size));
+                },
+                std::forward<TArgs>(args)...);
+    }
+    // execute with fixed workspace
+    // (opr->exec(input..., outputs..., create_ws(get_workspace_in_bytes(...))))
+    template <typename... TArgs>
+    void exec_with_ws(TArgs&&... args) {
+        call_dnn(
+                [&](auto&&... args) {
+                    size_t workspace_size =
+                            Holder::op()->get_workspace_in_bytes(get_layout(args)...);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size));
+                },
+                std::forward<TArgs>(args)...);
+    }
-    TensorPtr at(size_t id) { return m_out[id]; }
+    // execute dynamic out opr
+    // (opr->exec(inputs..., outputs... create_ws(get_workspace_in_bytes(...)), alloc))
+    template <size_t nr_out, typename... TArgs>
+    auto exec_dynout(TArgs&&... args) {
+        struct Alloc final : public megdnn::DynOutMallocPolicy {
+            CompNode comp_node;
+            std::array<TensorPtr, nr_out> output_tensors;
+            std::array<std::optional<DnnTensorND>, nr_out> output_dnn_tensors;
+        public:
+            Alloc(CompNode comp_node) : comp_node(comp_node) {}
+            megdnn::TensorND alloc_output(
+                    size_t id, DType dtype, const TensorShape& shape,
+                    void* user_data) override {
+                TensorLayout layout(shape, dtype);
+                output_tensors[id] = Tensor::make(layout, comp_node);
+                output_dnn_tensors[id].emplace(
+                        output_tensors[id]->dnn_tensor());  // pin output
+                return *output_dnn_tensors[id];
+            }
+            void* alloc_workspace(size_t sz, void* user_data) override {
+                mgb_assert(false);
+            }
+            void free_workspace(void* ptr, void* user_data) override {
+                mgb_assert(false);
+            }
+        } alloc{Holder::comp_node()};
+        call_dnn(
+                [&](auto&&... args) {
+                    size_t workspace_size =
+                            Holder::op()->get_workspace_in_bytes(get_layout(args)...);
+                    Holder::op()->exec(
+                            std::forward<decltype(args)>(args)...,
+                            create_workspace(workspace_size), &alloc);
+                },
+                std::forward<TArgs>(args)...);
+        return alloc.output_tensors;
+    }
 };
 } // namespace imperative
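// Two usage sketches for the classes above (assumes a MegEngine build; megdnn::Pooling,
// the param and the layouts are placeholders; the second sketch mirrors the
// adaptive_pooling call site further down in this diff rather than prescribing an API).
namespace mgb::imperative {
inline TensorLayout example_deduce_pooling_layout(
        const megdnn::Pooling::Param& param, const TensorLayout& src) {
    // layout deduction with no CompNode and no concrete device operator
    DnnOprHelper<megdnn::Pooling> pooling(param);
    return pooling.deduce_layout(src);
}
inline TensorPtr example_pooling_fwd(
        const TensorPtr& src, const TensorLayout& dst_layout,
        const megdnn::Pooling::Param& param, CompNode cn) {
    // concrete execution: ctor takes CompNode + Param + ExecutionPolicy, and
    // exec_fastrun runs algo selection, workspace allocation and exec in one call
    DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{});
    auto dst = Tensor::make(dst_layout, cn);
    dnn_opr.exec_fastrun(src, dst);
    return dst;
}
}  // namespace mgb::imperative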
@@ -605,6 +605,7 @@ TensorInfo* ChannelImpl::alloc() {
 void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc&& desc) {
     m_valid_handle.insert(reinterpret_cast<Handle>(info));
     MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name);
+    mgb_assert(desc.comp_node.valid(), "comp_node invalid");
     info->status = TensorInfo::Allocated;
     info->desc = std::move(desc);
 }
@@ -831,6 +832,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
             output_descs.push_back(i->desc);
         }
     } else {
+        // i may be null
         validated = false;
     }
     // Here std::move is REQUIRED for removing duplicated references.
@@ -1064,17 +1066,16 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
     if (in_worker) {
         reserve_size(x->size());
     }
-    MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
-    MGB_CATCH(MemAllocError&, {
+    if (!BlobManager::inst()->try_alloc_direct(x, x->size())) {
         bool suc = false;
         if (in_worker) {
             while (!suc) {
                 if (!auto_evict(1)) {
                     break;
                 }
-                MGB_TRY { BlobManager::inst()->alloc_direct(x, x->size()); }
-                MGB_CATCH(MemAllocError&, { continue; });
-                suc = true;
+                if (BlobManager::inst()->try_alloc_direct(x, x->size())) {
+                    suc = true;
+                }
             }
         }
         if (!suc) {
@@ -1086,9 +1087,11 @@ void ChannelImpl::alloc_tensor_with_evict(OwnedBlob* x) {
             imperative_log_profile_begin("defrag");
             BlobManager::inst()->defrag(x->comp_node());
             imperative_log_profile_end("defrag");
-            BlobManager::inst()->alloc_direct(x, x->size());
+            mgb_assert(
+                    BlobManager::inst()->try_alloc_direct(x, x->size()),
+                    "allocation failed after defrag");
         }
-    });
+    }
     set_log_level(pre_level);
 }
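// try_alloc_direct is used here and in BlobManagerImpl::alloc_with_defrag above, but its
// body is not part of this diff; a plausible shape, inferred from the call sites and
// offered only as an assumption, is a bool-returning wrapper over alloc_direct:
//
//     bool BlobManager::try_alloc_direct(OwnedBlob* blob, size_t size) {
//         MGB_TRY { alloc_direct(blob, size); return true; }
//         MGB_CATCH(MemAllocError&, { return false; });
//     }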
@@ -75,13 +75,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
 SmallVector<TensorPtr> apply_on_physical_tensor(
         const OpDef& def, const SmallVector<TensorPtr>& inputs,
         SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
-    auto&& pool = static_cast<const AdaptivePooling&>(def);
+    auto&& pooling = def.cast_final_safe<AdaptivePooling>();
     auto&& cn = inputs[0]->comp_node();
-    using TensorND = megdnn::TensorND;
     auto&& src_layout = inputs[0]->layout();
-    TensorLayout dst_layout = output_descs[0].layout;
-    auto param_format = pool.format;
+    TensorLayout dst_layout{inputs[0]->dtype()};
+    auto param_format = pooling.format;
     if (!validated) {
         dst_layout.ndim = src_layout.ndim;
         const dt_int32* oshp2d = nullptr;
@@ -91,7 +90,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
             tshp1n = inputs[1]->layout().total_nr_elems() == 1;
             oshp2d = tshp_nd->get_value().proxy_to_default_cpu().ptr<dt_int32>();
         } else {
-            oshp2d = pool.shape.data();
+            oshp2d = pooling.shape.data();
         }
         if (param_format == opr::AdaptivePooling::Param::Format::NCHW) {
             dst_layout[0] = src_layout[0];
@@ -108,15 +107,17 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
                     MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
         }
         dst_layout.init_contiguous_stride();
+    } else {
+        dst_layout = output_descs[0].layout;
     }
     size_t IH, IW, OH, OW;
-    if (param_format == param::AdaptivePooling::Format::NCHW) {
+    if (param_format == megdnn::param::AdaptivePooling::Format::NCHW) {
         IH = src_layout[2];
         IW = src_layout[3];
         OH = dst_layout[2];
         OW = dst_layout[3];
-    } else if (param_format == param::AdaptivePooling::Format::NHWC) {
+    } else if (param_format == megdnn::param::AdaptivePooling::Format::NHWC) {
         IH = src_layout[1];
         IW = src_layout[2];
         OH = dst_layout[1];
@@ -124,26 +125,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     } else {
         mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format");
     }
-    DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
-    auto&& param = dnn_opr.op->param();
-    param.mode = pool.mode;
-    param.format = pool.format;
+    // adaptive_pooling param to pooling
+    auto&& param = megdnn::Pooling::Param();
+    param.mode = pooling.mode;
+    param.format = pooling.format;
     param.pad_h = param.pad_w = 0;
-    param.stride_h = floor(IH / OH);
-    param.stride_w = floor(IW / OW);
+    param.stride_h = IH / OH;
+    param.stride_w = IW / OW;
     param.window_h = IH - (OH - 1) * param.stride_h;
     param.window_w = IW - (OW - 1) * param.stride_w;
-    TensorND src = inputs[0]->dnn_tensor();
+    DnnOprCaller<megdnn::Pooling> dnn_opr(cn, param, megdnn::param::ExecutionPolicy{});
+    auto src = inputs[0];
     auto dst = Tensor::make(dst_layout, cn);
-    size_t sz = setup_algo<megdnn::Pooling>(
-            {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
-            ::megdnn::param::ExecutionPolicy{}, false);
-    auto dnn_wk = dnn_opr.create_workspace(sz);
-    dnn_opr.op->exec(src, dst->dnn_tensor(), dnn_wk);
+    dnn_opr.exec_fastrun(inputs[0], dst);
     return {dst};
 }
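// Worked numbers for the stride/window change above (illustrative values): with IH = 7
// and OH = 3, size_t division already truncates, so stride_h = 7 / 3 = 2 and
// window_h = IH - (OH - 1) * stride_h = 7 - 2 * 2 = 3; dropping floor() around an
// integer division does not change the result.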
@@ -145,79 +145,44 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
     auto&& op_def = def.cast_final_safe<BatchNorm>();
     auto&& comp_node = inputs[0]->comp_node();
-    using TensorND = megdnn::TensorND;
+    DnnOprCaller<megdnn::BN> dnn_opr(comp_node, op_def.param());
-    SmallVector<TensorND> inp_tensornds(inputs.size());
-    for (size_t i = 0; i < inputs.size(); ++i) {
-        inp_tensornds[i] = inputs[i]->dnn_tensor();
-    }
-    DnnOprCaller<megdnn::BN> dnn_opr(comp_node);
-    dnn_opr.op->param() = op_def.param();
-    TensorLayout src_layout = inputs[0]->layout();
-    TensorLayout scale_layout = inputs[1]->layout();
+    auto src_layout = inputs[0]->layout();
+    auto scale_layout = inputs[1]->layout();
     bool empty_input = src_layout.is_empty();
     size_t nr_inp = inputs.size();
-    size_t sz = 0, rsz = 0;
-    TensorLayout r_layout({rsz}, dtype::Byte());
-    if (!empty_input) {
-        sz = dnn_opr.op->get_workspace_in_bytes(
-                src_layout, src_layout, src_layout, src_layout, src_layout, src_layout,
-                src_layout, src_layout, src_layout);
-        rsz = dnn_opr.op->get_reserve_in_bytes(src_layout);
-        r_layout = TensorLayout({rsz}, dtype::Byte());
-    }
-    auto dnn_wk = dnn_opr.create_workspace(sz);
-    auto reserve = Tensor::make(r_layout, comp_node);
+    // size_t ws_size = 0, reserve_size = 0;
+    size_t reserve_size =
+            empty_input ? (size_t)0 : dnn_opr.op()->get_reserve_in_bytes(src_layout);
-    // alloc memory
+    // alloc outputs
     auto y = Tensor::make(src_layout, comp_node);
     auto save_mean = Tensor::make(scale_layout, comp_node);
     auto save_variance = Tensor::make(scale_layout, comp_node);
+    auto reserve = Tensor::make(TensorLayout{{reserve_size}, dtype::Byte()}, comp_node);
     if (op_def.fwd_mode == ::megdnn::param::BN::FwdMode::INFERENCE) {
-        if (!empty_input)
-            dnn_opr.op->exec(
-                    inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
-                    inp_tensornds[3], inp_tensornds[4], save_mean->dnn_tensor(),
-                    save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(),
-                    dnn_wk);
+        if (!empty_input) {
+            dnn_opr.exec_with_ws(
+                    inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], save_mean,
+                    save_variance, reserve, y);
+        }
         return {inputs[3], inputs[4], reserve, y};
     } else {
         if (nr_inp == 5) {
             auto mean = Tensor::make(scale_layout, comp_node);
             auto variance = Tensor::make(scale_layout, comp_node);
-            megdnn::RefPtr src_ptr1(
-                    inp_tensornds[3].get_ref_ptr().get_ptr(), inputs[3]->offset());
-            megdnn::RefPtr dst_ptr1(
-                    mean->dev_tensor().storage().get_ref_ptr(),
-                    mean->dev_tensor().storage().offset(), false);
-            comp_node.peer_copy_to_ref(
-                    comp_node, dst_ptr1, src_ptr1, scale_layout.span().high_byte);
-            megdnn::RefPtr src_ptr2(
-                    inp_tensornds[4].get_ref_ptr().get_ptr(), inputs[4]->offset());
-            megdnn::RefPtr dst_ptr2(
-                    variance->dev_tensor().storage().get_ref_ptr(),
-                    variance->dev_tensor().storage().offset(), false);
-            comp_node.peer_copy_to_ref(
-                    comp_node, dst_ptr2, src_ptr2, scale_layout.span().high_byte);
-            if (!empty_input)
-                dnn_opr.op->exec(
-                        inp_tensornds[0], inp_tensornds[1], inp_tensornds[2],
-                        mean->dnn_tensor(), variance->dnn_tensor(),
-                        save_mean->dnn_tensor(), save_variance->dnn_tensor(),
-                        reserve->dnn_tensor(), y->dnn_tensor(), dnn_wk);
+            // FIXME
+            mean->dev_tensor().copy_from(inputs[3]->dev_tensor());
+            variance->dev_tensor().copy_from(inputs[4]->dev_tensor());
+            if (!empty_input) {
+                dnn_opr.exec_with_ws(
+                        inputs[0], inputs[1], inputs[2], mean, variance, save_mean,
+                        save_variance, reserve, y);
+            }
             return {mean, variance, save_mean, save_variance, reserve, y};
         }
@@ -227,11 +192,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
             auto variance = Tensor::make(m_layout, comp_node);
| if (!empty_input) { | if (!empty_input) { | ||||
| dnn_opr.op->exec( | |||||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], | |||||
| mean->dnn_tensor(), variance->dnn_tensor(), save_mean->dnn_tensor(), | |||||
| save_variance->dnn_tensor(), reserve->dnn_tensor(), y->dnn_tensor(), | |||||
| dnn_wk); | |||||
| dnn_opr.exec_with_ws( | |||||
| inputs[0], inputs[1], inputs[2], mean, variance, save_mean, | |||||
| save_variance, reserve, y); | |||||
| } | } | ||||
| return {save_mean, save_variance, reserve, y}; | return {save_mean, save_variance, reserve, y}; | ||||
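For reference, the quantity megdnn::BN produces in training mode is standard batch normalization, computed per channel over N and the spatial dims. The scalar sketch below is illustrative only; the exact contents of save_mean / save_variance (for instance whether the latter stores an inverse standard deviation) are backend-defined and not asserted here:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Per-channel batch norm forward over NCHW data:
//   y = gamma * (x - mean) / sqrt(var + eps) + beta
void batch_norm_fwd(
        const std::vector<float>& x, std::vector<float>& y, size_t N, size_t C,
        size_t HW, const std::vector<float>& gamma, const std::vector<float>& beta,
        float eps) {
    for (size_t c = 0; c < C; ++c) {
        double sum = 0, sqsum = 0;
        size_t cnt = N * HW;
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                double v = x[(n * C + c) * HW + i];
                sum += v;
                sqsum += v * v;
            }
        double mean = sum / cnt;
        double var = sqsum / cnt - mean * mean;
        double inv_std = 1.0 / std::sqrt(var + eps);
        for (size_t n = 0; n < N; ++n)
            for (size_t i = 0; i < HW; ++i) {
                size_t idx = (n * C + c) * HW + i;
                y[idx] = static_cast<float>(
                        gamma[c] * (x[idx] - mean) * inv_std + beta[c]);
            }
    }
}
```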
| @@ -28,33 +28,26 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| auto&& inp = inputs[0]; | auto&& inp = inputs[0]; | ||||
| auto&& msk = inputs[1]; | auto&& msk = inputs[1]; | ||||
| SmallVector<TensorPtr> out; | |||||
| mgb_assert( | mgb_assert( | ||||
| inp->layout().eq_shape(msk->layout()), | inp->layout().eq_shape(msk->layout()), | ||||
| "input shape does not match mask shape"); | "input shape does not match mask shape"); | ||||
| mgb_assert( | mgb_assert( | ||||
| msk->get_value().dtype().enumv() == DTypeEnum::Bool, | msk->get_value().dtype().enumv() == DTypeEnum::Bool, | ||||
| "mask dtype must be bool"); | "mask dtype must be bool"); | ||||
| MegDNNDynOutMallocImpl<2> policy{inp->comp_node()}; | |||||
| if (inp->layout().is_empty()) { | if (inp->layout().is_empty()) { | ||||
| // empty tensor | // empty tensor | ||||
| policy.alloc_output(0, inp->layout().dtype, {0}, nullptr); | |||||
| policy.alloc_output(1, dtype::Int32(), {0}, nullptr); | |||||
| return { | |||||
| Tensor::make(TensorLayout{{0}, inp->dtype()}, inp->comp_node()), | |||||
| Tensor::make(TensorLayout{{0}, dtype::Int32()}, inp->comp_node()), | |||||
| }; | |||||
| } else { | } else { | ||||
| DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node()); | |||||
| dnn_op.op->param().val = 1; | |||||
| size_t sz = dnn_op.op->get_workspace_in_bytes(inp->layout()); | |||||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
| dnn_op.op->exec( | |||||
| inp->dev_tensor().as_megdnn(), msk->dev_tensor().as_megdnn(), | |||||
| dnn_workspace, &policy); | |||||
| // maybe we need to split CondTake | |||||
| megdnn::CondTake::Param param; | |||||
| param.val = 1; | |||||
| DnnOprCaller<megdnn::CondTake> dnn_op(inp->comp_node(), param); | |||||
| auto&& [out0, out1] = dnn_op.exec_dynout<2>(inp, msk); | |||||
| return {out0, out1}; | |||||
| } | } | ||||
| out.push_back(policy.at(0)); | |||||
| out.push_back(policy.at(1)); | |||||
| return out; | |||||
| } | } | ||||
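exec_dynout<2> defers allocating the two outputs until their sizes are known, because the number of taken elements depends on the mask contents; with param.val = 1 and a bool mask the op keeps the positions where the mask is true and also returns their flattened int32 indices. A plain-C++ sketch of that semantics (not the MegDNN kernel):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// CondTake with a boolean mask: return the selected values and their flat indices.
std::pair<std::vector<float>, std::vector<int32_t>> cond_take(
        const std::vector<float>& data, const std::vector<bool>& mask) {
    std::vector<float> values;
    std::vector<int32_t> indices;
    for (size_t i = 0; i < data.size(); ++i) {
        if (mask[i]) {
            values.push_back(data[i]);
            indices.push_back(static_cast<int32_t>(i));
        }
    }
    return {values, indices};
}
```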
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| @@ -8,14 +8,7 @@ | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace imperative { | namespace imperative { | ||||
| namespace { | namespace { | ||||
| size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad) { | |||||
| mgb_assert(inp + 2 * pad >= flt, "input=%zu padding=%zu filter=%zu", inp, pad, flt); | |||||
| return (inp + 2 * pad - flt) / stride + 1; | |||||
| } | |||||
| namespace convolution { | namespace convolution { | ||||
| std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) { | ||||
| auto* node = &node_->cast_final_safe<opr::Convolution>(); | auto* node = &node_->cast_final_safe<opr::Convolution>(); | ||||
| @@ -29,131 +22,23 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| inputs[0], inputs[1], conv.param(), conv.policy(), config); | inputs[0], inputs[1], conv.param(), conv.policy(), config); | ||||
| } | } | ||||
| TensorLayout do_shape_infer( | |||||
| const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||||
| auto&& conv = static_cast<const Convolution&>(def); | |||||
| using Param = ::megdnn::param::Convolution; | |||||
| auto img_ndim = src_ndim - 2; | |||||
| mgb_assert( | |||||
| img_ndim == 2, | |||||
| "only 2D convolution is supported, and input should be 4-dim; " | |||||
| "got input dim = %zu", | |||||
| src_ndim); | |||||
| size_t group = 1; | |||||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
| if (conv.sparse == Param::Sparse::DENSE) { | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
| "bad filter ndim for dense convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| group = 1; | |||||
| flt_start = 0; | |||||
| } else { // Param::Sparse::GROUP | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
| "bad filter ndim for group convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| // grp, oc, ic, dims[] | |||||
| group = filter[0]; | |||||
| flt_start = 1; | |||||
| } | |||||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
| size_t src_or_dst_c_pos = 0; | |||||
| size_t src_or_dst_spatial_start = 0; | |||||
| if (conv.format == Param::Format::NCHW) { | |||||
| // filter should be (oc, ic, fh, fw) | |||||
| flt_spatial_start = 2; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 1; | |||||
| src_or_dst_c_pos = 1; | |||||
| src_or_dst_spatial_start = 2; | |||||
| } else { // Param::Format::NHWC | |||||
| // filter should be (oc, fh, fw, ic) | |||||
| flt_spatial_start = 1; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 3; | |||||
| src_or_dst_c_pos = 3; | |||||
| src_or_dst_spatial_start = 1; | |||||
| } | |||||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
| uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||||
| dilation[0] = conv.dilate_h; | |||||
| dilation[1] = conv.dilate_w; | |||||
| stride[0] = conv.stride_h; | |||||
| stride[1] = conv.stride_w; | |||||
| padding[0] = conv.pad_h; | |||||
| padding[1] = conv.pad_w; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| mgb_assert( | |||||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
| dilation[i]); | |||||
| dilated_spatial[i] = | |||||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
| } | |||||
| mgb_assert( | |||||
| icpg * group == src[src_or_dst_c_pos], | |||||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
| "hint: weight may be changed by mistake\n", | |||||
| icpg * group, src[src_or_dst_c_pos]); | |||||
| TensorLayout dst{src.dtype}; | |||||
| dst.ndim = src_ndim; | |||||
| dst[0] = src[0]; | |||||
| dst[src_or_dst_c_pos] = ocpg * group; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||||
| src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||||
| padding[i]); | |||||
| } | |||||
| dst.init_contiguous_stride(); | |||||
| return dst; | |||||
| } | |||||
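The removed do_shape_infer duplicated what megdnn::ConvolutionForward::deduce_layout already computes; the new code simply delegates. The per-spatial-dimension formula both rely on, including dilation, is sketched below:

```cpp
#include <cassert>
#include <cstddef>

// Output extent of one spatial dimension of a (possibly dilated) convolution.
size_t infer_conv_shape(size_t inp, size_t flt, size_t stride, size_t pad, size_t dilation) {
    size_t dilated = (flt - 1) * dilation + 1;  // effective filter size
    assert(inp + 2 * pad >= dilated);
    return (inp + 2 * pad - dilated) / stride + 1;
}

int main() {
    // 224x224 input, 3x3 filter, stride 1, pad 1, no dilation -> 224.
    assert(infer_conv_shape(224, 3, 1, 1, 1) == 224);
    return 0;
}
```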
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| SmallVector<LogicalTensorDesc> dests(1); | |||||
| auto&& desc = dests[0]; | |||||
| desc.comp_node = inputs[0].comp_node; | |||||
| TensorLayout src = inputs[0].layout; | |||||
| TensorLayout filter = inputs[1].layout; | |||||
| size_t src_ndim = src.ndim; | |||||
| if (src_ndim == 0 || filter.ndim == 0) { | |||||
| desc.layout = TensorLayout{{}, src.dtype}; | |||||
| return {dests, false}; | |||||
| auto&& conv = def.cast_final_safe<Convolution>(); | |||||
| DnnOprHelper<megdnn::ConvolutionForward> dnn_opr(conv.param()); | |||||
| auto&& data = inputs[0].layout; | |||||
| auto&& filter = inputs[1].layout; | |||||
| TensorLayout output_layout{data.dtype}; | |||||
| if (data.ndim && filter.ndim) { | |||||
| // deduce_layout won't override existing dtype | |||||
| dnn_opr.opr().deduce_layout(data, filter, output_layout); | |||||
| } | } | ||||
| desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||||
| return {dests, true}; | |||||
| return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||||
| // create megdnn opr | |||||
| auto&& conv = static_cast<const Convolution&>(def); | |||||
| CompNode cn = inputs[0]->comp_node(); | |||||
| TensorLayout out_layout = output_descs[0].layout; | |||||
| if (!validated) | |||||
| out_layout = do_shape_infer( | |||||
| def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||||
| inputs[1]->layout()); | |||||
| using TensorND = megdnn::TensorND; | |||||
| SmallVector<TensorND> inp_tensornds(inputs.size() + 2); | |||||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
| inp_shapes[i] = inputs[i]->layout(); | |||||
| } | |||||
| oup_shapes[0] = out_layout; | |||||
| DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn); | |||||
| auto&& param = dnn_opr.op->param(); | |||||
| // Convolution::Param -> ConvBias::Param | |||||
| auto conv_bias_param_from_convolution(const Convolution& conv) { | |||||
| megdnn::ConvBias::Param param; | |||||
| param.pad_h = conv.pad_h; | param.pad_h = conv.pad_h; | ||||
| param.pad_w = conv.pad_w; | param.pad_w = conv.pad_w; | ||||
| param.stride_h = conv.stride_h; | param.stride_h = conv.stride_h; | ||||
| @@ -163,30 +48,37 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| param.sparse = conv.sparse; | param.sparse = conv.sparse; | ||||
| param.compute_mode = conv.compute_mode; | param.compute_mode = conv.compute_mode; | ||||
| param.format = conv.format; | param.format = conv.format; | ||||
| return param; | |||||
| } | |||||
| // shape infer | |||||
| TensorLayout empty_shp({0}, inputs[0]->dtype()); | |||||
| empty_shp.ndim = 0; | |||||
| auto empty_bias = Tensor::make(empty_shp, cn); | |||||
| inp_tensornds[2] = empty_bias->dnn_tensor(); | |||||
| inp_tensornds[3] = empty_bias->dnn_tensor(); | |||||
| size_t sz = setup_algo<megdnn::ConvBiasForward>( | |||||
| {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, | |||||
| dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, | |||||
| &inp_tensornds); | |||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||||
| // create megdnn opr | |||||
| auto&& conv = def.cast_final_safe<Convolution>(); | |||||
| CompNode cn = inputs[0]->comp_node(); | |||||
| auto&& param = conv_bias_param_from_convolution(conv); | |||||
| DnnOprCaller<megdnn::ConvBiasForward> dnn_opr(cn, param, conv.policy()); | |||||
| megdnn::TensorND empty_bias; | |||||
| empty_bias.layout.dtype = inputs[0]->dtype(); | |||||
| empty_bias.layout.ndim = 0; | |||||
| auto out_layout = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| TensorLayout out_layout{inputs[0]->dtype()}; | |||||
| dnn_opr.op()->deduce_layout( | |||||
| inputs[0]->layout(), inputs[1]->layout(), empty_bias.layout, | |||||
| empty_bias.layout, out_layout); | |||||
| return out_layout; | |||||
| } | |||||
| }(); | |||||
| // alloc memory | // alloc memory | ||||
| auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| // execute | |||||
| dnn_opr.op->exec( | |||||
| inp_tensornds[0], inp_tensornds[1], inp_tensornds[2], inp_tensornds[3], | |||||
| out->dnn_tensor(), nullptr, dnn_wk); | |||||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], empty_bias, empty_bias, out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -243,155 +135,41 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| } | } | ||||
| } | } | ||||
| TensorLayout convbwd_do_shape_infer( | |||||
| const OpDef& def, size_t diff_ndim, TensorLayout filter, TensorLayout diff, | |||||
| CompNode cn) { | |||||
| auto&& bwd_conv = static_cast<const ConvolutionBackwardData&>(def); | |||||
| DnnOprCaller<megdnn::ConvolutionBackwardData> caller(cn); | |||||
| auto&& dnn_opr = caller.op; | |||||
| using Param = ::megdnn::param::Convolution; | |||||
| // using Param1 = ::megdnn::param::ConvolutionBackwardData; | |||||
| auto img_ndim = diff_ndim - 2; | |||||
| mgb_assert( | |||||
| img_ndim == 2, | |||||
| "only 2D convolution is supported, and input should be 4-dim; " | |||||
| "got input dim = %zu", | |||||
| diff_ndim); | |||||
| size_t group = 1; | |||||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
| if (bwd_conv.sparse == Param::Sparse::DENSE) { | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
| "bad filter ndim for dense convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| group = 1; | |||||
| flt_start = 0; | |||||
| } else { // Param::Sparse::GROUP | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
| "bad filter ndim for group convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| // grp, oc, ic, dims[] | |||||
| group = filter[0]; | |||||
| flt_start = 1; | |||||
| } | |||||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
| size_t src_or_dst_c_pos = 0; | |||||
| size_t src_or_dst_spatial_start = 0; | |||||
| if (bwd_conv.format == Param::Format::NCHW) { | |||||
| // filter should be (oc, ic, fh, fw) | |||||
| flt_spatial_start = 2; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 1; | |||||
| src_or_dst_c_pos = 1; | |||||
| src_or_dst_spatial_start = 2; | |||||
| } else { // Param::Format::NHWC | |||||
| // filter should be (oc, fh, fw, ic) | |||||
| flt_spatial_start = 1; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 3; | |||||
| src_or_dst_c_pos = 3; | |||||
| src_or_dst_spatial_start = 1; | |||||
| } | |||||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
| uint32_t dilation[2], dilated_spatial[2], stride[2], padding[2]; | |||||
| dilation[0] = bwd_conv.dilate_h; | |||||
| dilation[1] = bwd_conv.dilate_w; | |||||
| stride[0] = bwd_conv.stride_h; | |||||
| stride[1] = bwd_conv.stride_w; | |||||
| padding[0] = bwd_conv.pad_h; | |||||
| padding[1] = bwd_conv.pad_w; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| mgb_assert( | |||||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
| dilation[i]); | |||||
| dilated_spatial[i] = | |||||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
| } | |||||
| mgb_assert( | |||||
| ocpg * group == diff[src_or_dst_c_pos], | |||||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
| "hint: weight may be changed by mistake\n", | |||||
| ocpg * group, diff[src_or_dst_c_pos]); | |||||
| auto deduce = [](size_t out, size_t filter, size_t stride, size_t pad) { | |||||
| auto i = (out - 1) * stride + filter; | |||||
| mgb_assert(i > pad * 2); | |||||
| return i - pad * 2; | |||||
| }; | |||||
| DType dst_dtype = bwd_conv.dtype; | |||||
| dnn_opr->deduce_dtype(filter.dtype, diff.dtype, dst_dtype); | |||||
| TensorLayout dst{dst_dtype}; | |||||
| dst.ndim = diff_ndim; | |||||
| dst[0] = diff[0]; | |||||
| dst[src_or_dst_c_pos] = icpg * group; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| dst[i + src_or_dst_spatial_start] = | |||||
| deduce(diff[i + src_or_dst_spatial_start], dilated_spatial[i], | |||||
| stride[i], padding[i]); | |||||
| } | |||||
| dst.init_contiguous_stride(); | |||||
| return dst; | |||||
| } | |||||
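Likewise, convbwd_do_shape_infer applied the inverse of the forward formula (the deduce lambda above) and is now replaced by ConvolutionBackwardData's own deduce_layout. A sketch of that inverse relation for one spatial dimension:

```cpp
#include <cassert>
#include <cstddef>

// Input extent recovered from a convolution's output extent (no dilation here;
// with dilation, substitute the dilated filter size for flt).
size_t deduce_deconv_shape(size_t out, size_t flt, size_t stride, size_t pad) {
    size_t full = (out - 1) * stride + flt;
    assert(full > 2 * pad);
    return full - 2 * pad;
}

int main() {
    // Inverse of infer_conv_shape(224, 3, 1, 1): (224 - 1) * 1 + 3 - 2 = 224.
    assert(deduce_deconv_shape(224, 3, 1, 1) == 224);
    return 0;
}
```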
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| SmallVector<LogicalTensorDesc> dests(1); | |||||
| auto&& desc = dests[0]; | |||||
| desc.comp_node = inputs[0].comp_node; | |||||
| TensorLayout filter = inputs[0].layout; | |||||
| TensorLayout diff = inputs[1].layout; | |||||
| size_t diff_ndim = diff.ndim; | |||||
| if (diff_ndim == 0 || filter.ndim == 0) { | |||||
| desc.layout = TensorLayout{{}, diff.dtype}; | |||||
| return {dests, false}; | |||||
| auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||||
| DnnOprHelper<megdnn::ConvolutionBackwardData> dnn_opr(convbwd.param()); | |||||
| // force set dtype | |||||
| auto&& filter = inputs[0].layout; | |||||
| auto&& diff = inputs[1].layout; | |||||
| TensorLayout output_layout{convbwd.dtype}; | |||||
| if (filter.ndim && diff.ndim) { | |||||
| // deduce_layout won't override existing dtype | |||||
| dnn_opr.opr().deduce_layout(filter, diff, output_layout); | |||||
| } | } | ||||
| desc.layout = | |||||
| convbwd_do_shape_infer(def, diff_ndim, filter, diff, inputs[0].comp_node); | |||||
| return {dests, true}; | |||||
| return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| // create megdnn opr | // create megdnn opr | ||||
| auto&& convbwd = static_cast<const ConvolutionBackwardData&>(def); | |||||
| auto&& convbwd = def.cast_final_safe<ConvolutionBackwardData>(); | |||||
| CompNode cn = inputs[0]->comp_node(); | CompNode cn = inputs[0]->comp_node(); | ||||
| TensorLayout out_layout = output_descs[0].layout; | |||||
| if (!validated) | |||||
| out_layout = convbwd_do_shape_infer( | |||||
| def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), | |||||
| cn); | |||||
| DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr( | |||||
| cn, convbwd.param(), convbwd.policy()); | |||||
| auto out_layout = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| TensorLayout out_layout{inputs[0]->dtype()}; | |||||
| dnn_opr.op()->deduce_layout( | |||||
| inputs[0]->layout(), inputs[1]->layout(), out_layout); | |||||
| return out_layout; | |||||
| } | |||||
| }(); | |||||
| auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
| using TensorND = megdnn::TensorND; | |||||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
| inp_shapes[i] = inputs[i]->layout(); | |||||
| } | |||||
| oup_shapes[0] = out_layout; | |||||
| DnnOprCaller<megdnn::ConvolutionBackwardData> dnn_opr(cn); | |||||
| dnn_opr.op->param() = convbwd.param(); | |||||
| size_t sz = setup_algo<megdnn::ConvolutionBackwardData>( | |||||
| {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||||
| false, cn, convbwd.policy(), false, &inp_tensornds); | |||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| // execute | |||||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -415,149 +193,36 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | return opr::Convolution3D::make(inputs[0], inputs[1], conv.param(), conv.policy()); | ||||
| } | } | ||||
| TensorLayout do_shape_infer( | |||||
| const OpDef& def, size_t src_ndim, TensorLayout src, TensorLayout filter) { | |||||
| auto&& conv = static_cast<const Convolution3D&>(def); | |||||
| using Param = ::megdnn::param::Convolution3D; | |||||
| auto img_ndim = src_ndim - 2; | |||||
| mgb_assert( | |||||
| img_ndim == 3, | |||||
| "only 3D convolution is supported, and input should be 5-dim; " | |||||
| "got input dim = %zu", | |||||
| src_ndim); | |||||
| size_t group = 1; | |||||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||||
| if (conv.sparse == Param::Sparse::DENSE) { | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 2 || filter.ndim == img_ndim + 4, | |||||
| "bad filter ndim for dense convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| group = 1; | |||||
| flt_start = 0; | |||||
| } else { // Param::Sparse::GROUP | |||||
| mgb_assert( | |||||
| filter.ndim == img_ndim + 3 || filter.ndim == img_ndim + 5, | |||||
| "bad filter ndim for group convolution: " | |||||
| "spatial_ndim=%zu filter_ndim=%zu", | |||||
| img_ndim, filter.ndim); | |||||
| // grp, oc, ic, dims[] | |||||
| group = filter[0]; | |||||
| flt_start = 1; | |||||
| } | |||||
| uint32_t ic_block_size = 1, oc_block_size = 1; | |||||
| size_t src_or_dst_c_pos = 0; | |||||
| size_t src_or_dst_spatial_start = 0; | |||||
| if (conv.format == Param::Format::NCDHW) { | |||||
| // filter should be (oc, ic, fd, fh, fw) | |||||
| flt_spatial_start = 2; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 1; | |||||
| src_or_dst_c_pos = 1; | |||||
| src_or_dst_spatial_start = 2; | |||||
| } else { // Param::Format::NDHWC | |||||
| // filter should be (oc, fd, fh, fw, ic) | |||||
| flt_spatial_start = 1; | |||||
| ocpg_pos = 0; | |||||
| icpg_pos = 4; | |||||
| src_or_dst_c_pos = 4; | |||||
| src_or_dst_spatial_start = 1; | |||||
| } | |||||
| size_t ocpg = filter[flt_start + ocpg_pos] * oc_block_size; | |||||
| size_t icpg = filter[flt_start + icpg_pos] * ic_block_size; | |||||
| uint32_t dilation[3], dilated_spatial[3], stride[3], padding[3]; | |||||
| dilation[0] = conv.dilate_d; | |||||
| dilation[1] = conv.dilate_h; | |||||
| dilation[2] = conv.dilate_w; | |||||
| stride[0] = conv.stride_d; | |||||
| stride[1] = conv.stride_h; | |||||
| stride[2] = conv.stride_w; | |||||
| padding[0] = conv.pad_d; | |||||
| padding[1] = conv.pad_h; | |||||
| padding[2] = conv.pad_w; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| mgb_assert( | |||||
| dilation[i] > 0, "invalid dilation on spatial dim %zu: %u", i, | |||||
| dilation[i]); | |||||
| dilated_spatial[i] = | |||||
| (filter[i + flt_start + flt_spatial_start] - 1) * dilation[i] + 1; | |||||
| } | |||||
| mgb_assert( | |||||
| icpg * group == src[src_or_dst_c_pos], | |||||
| "group conv invalid: input channel of Conv expect %zu, but got %zu\n" | |||||
| "hint: weight may be changed by mistake\n", | |||||
| icpg * group, src[src_or_dst_c_pos]); | |||||
| TensorLayout dst{src.dtype}; | |||||
| dst.ndim = src_ndim; | |||||
| dst[0] = src[0]; | |||||
| dst[src_or_dst_c_pos] = ocpg * group; | |||||
| for (size_t i = 0; i < img_ndim; ++i) { | |||||
| dst[i + src_or_dst_spatial_start] = infer_conv_shape( | |||||
| src[i + src_or_dst_spatial_start], dilated_spatial[i], stride[i], | |||||
| padding[i]); | |||||
| } | |||||
| dst.init_contiguous_stride(); | |||||
| return dst; | |||||
| } | |||||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| SmallVector<LogicalTensorDesc> dests(1); | |||||
| auto&& desc = dests[0]; | |||||
| desc.comp_node = inputs[0].comp_node; | |||||
| auto&& conv = def.cast_final_safe<Convolution3D>(); | |||||
| TensorLayout src = inputs[0].layout; | TensorLayout src = inputs[0].layout; | ||||
| TensorLayout filter = inputs[1].layout; | TensorLayout filter = inputs[1].layout; | ||||
| size_t src_ndim = src.ndim; | |||||
| if (src_ndim == 0 || filter.ndim == 0) { | |||||
| desc.layout = TensorLayout{{}, src.dtype}; | |||||
| return {dests, false}; | |||||
| if (src.ndim == 0 || filter.ndim == 0) { | |||||
| return {{{TensorLayout{src.dtype}, inputs[0].comp_node}}, false}; | |||||
| } | } | ||||
| desc.layout = do_shape_infer(def, src_ndim, src, filter); | |||||
| return {dests, true}; | |||||
| DnnOprHelper<megdnn::Convolution3DForward> dnn_opr(conv.param()); | |||||
| auto output = dnn_opr.deduce_layout(src, filter); | |||||
| return {{{output, inputs[0].comp_node}}, false}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| // create megdnn opr | // create megdnn opr | ||||
| auto&& conv = static_cast<const Convolution3D&>(def); | |||||
| TensorLayout out_layout = output_descs[0].layout; | |||||
| if (!validated) | |||||
| out_layout = do_shape_infer( | |||||
| def, inputs[0]->layout().ndim, inputs[0]->layout(), | |||||
| inputs[1]->layout()); | |||||
| using TensorND = megdnn::TensorND; | |||||
| auto&& conv = def.cast_final_safe<Convolution3D>(); | |||||
| CompNode cn = inputs[0]->comp_node(); | CompNode cn = inputs[0]->comp_node(); | ||||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | |||||
| TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); | |||||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
| inp_tensornds[i] = inputs[i]->dnn_tensor(); | |||||
| inp_shapes[i] = inputs[i]->layout(); | |||||
| } | |||||
| oup_shapes[0] = out_layout; | |||||
| DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn); | |||||
| dnn_opr.op->param() = conv.param(); | |||||
| // shape infer | |||||
| size_t sz = setup_algo<megdnn::Convolution3D>( | |||||
| {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, | |||||
| false, cn, conv.policy(), false, &inp_tensornds); | |||||
| DnnOprCaller<megdnn::Convolution3D> dnn_opr(cn, conv.param(), conv.policy()); | |||||
| auto out_layout = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| return dnn_opr.deduce_layout(inputs[0]->layout(), inputs[1]->layout()); | |||||
| } | |||||
| }(); | |||||
| // alloc memory | // alloc memory | ||||
| auto out = Tensor::make(out_layout, cn); | auto out = Tensor::make(out_layout, cn); | ||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| // execute | |||||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_fastrun(inputs[0], inputs[1], out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -579,51 +244,38 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| inputs.size() == 2, | inputs.size() == 2, | ||||
| "inputs num of conv_transpose3d should be 2 but you give %zu", | "inputs num of conv_transpose3d should be 2 but you give %zu", | ||||
| inputs.size()); | inputs.size()); | ||||
| auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | ||||
| auto&& weight = inputs[0]; | auto&& weight = inputs[0]; | ||||
| auto&& diff = inputs[1]; | auto&& diff = inputs[1]; | ||||
| auto& cn = weight.comp_node; | |||||
| if (weight.layout.ndim == 0 || diff.layout.ndim == 0) { | |||||
| return {{{TensorLayout{weight.layout.dtype}, cn, {}}}, false}; | |||||
| if (!(weight.layout.ndim && diff.layout.ndim)) { | |||||
| return {{{TensorLayout{weight.layout.dtype}, weight.comp_node}}, false}; | |||||
| } | } | ||||
| TensorLayout oup_layout; | |||||
| megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||||
| weight.layout, diff.layout, op_def.param(), oup_layout); | |||||
| return {{{oup_layout, cn, {}}}, true}; | |||||
| DnnOprHelper<megdnn::Convolution3DBackwardData> dnn_opr(op_def.param()); | |||||
| auto oup_layout = dnn_opr.deduce_layout(weight.layout, diff.layout); | |||||
| return {{{oup_layout, weight.comp_node}}, true}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); | |||||
| auto&& conv = def.cast_final_safe<Convolution3DBackwardData>(); | |||||
| auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
| auto&& wlayout = inputs[0]->layout(); | auto&& wlayout = inputs[0]->layout(); | ||||
| auto&& dlayout = inputs[1]->layout(); | auto&& dlayout = inputs[1]->layout(); | ||||
| DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn); | |||||
| auto&& dnn_opr = caller.op; | |||||
| dnn_opr->param() = op_def.param(); | |||||
| DnnOprCaller<megdnn::Convolution3DBackwardData> dnn_op( | |||||
| cn, conv.param(), conv.policy()); | |||||
| TensorLayout& oup_layout = output_descs[0].layout; | |||||
| if (!validated) { | |||||
| megdnn::Convolution3DBackwardData::deduce_layout_impl( | |||||
| wlayout, dlayout, op_def.param(), oup_layout); | |||||
| } | |||||
| auto oup_layout = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| return dnn_op.deduce_layout(wlayout, dlayout); | |||||
| } | |||||
| }(); | |||||
| auto oup = Tensor::make(oup_layout, cn); | auto oup = Tensor::make(oup_layout, cn); | ||||
| SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||||
| inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||||
| inp_tensornds[1] = inputs[1]->dnn_tensor(); | |||||
| size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>( | |||||
| {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||||
| op_def.policy(), false, &inp_tensornds); | |||||
| auto dnn_wk = caller.create_workspace(wk_size); | |||||
| dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup->dnn_tensor(), dnn_wk); | |||||
| dnn_op.exec_fastrun(inputs[0], inputs[1], oup); | |||||
| return {oup}; | return {oup}; | ||||
| } | } | ||||
| @@ -94,52 +94,44 @@ void apply_on_device_tensornd( | |||||
| mgb_assert( | mgb_assert( | ||||
| inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | inputs.size() == trait.arity, "%s expects %u inputs; got %zu actually", | ||||
| trait.name, trait.arity, inputs.size()); | trait.name, trait.arity, inputs.size()); | ||||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node()); | |||||
| opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op); | |||||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(inputs[0].comp_node(), {op_def.mode}); | |||||
| opr::Elemwise::perform(op_def.mode, (*outputs)[0], inputs, dnn_opr.op()); | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| auto comp_node = inputs[0]->comp_node(); | auto comp_node = inputs[0]->comp_node(); | ||||
| auto dtype = inputs[0]->dtype(); | |||||
| using Mode = Elemwise::Mode; | using Mode = Elemwise::Mode; | ||||
| using TensorND = megdnn::TensorND; | |||||
| auto&& op_def = def.cast_final_safe<Elemwise>(); | auto&& op_def = def.cast_final_safe<Elemwise>(); | ||||
| SmallVector<TensorND> inp_tensornds; | |||||
| TensorShapeArray inp_shapes(inputs.size()); | |||||
| inp_tensornds.reserve(inputs.size()); | |||||
| TensorLayout layout{inputs[0]->layout().dtype}; | |||||
| bool is_empty = false; | |||||
| for (unsigned i = 0; i < inputs.size(); ++i) { | |||||
| if (inputs[i]->layout().is_empty()) { | |||||
| is_empty = true; | |||||
| } | |||||
| inp_tensornds.push_back(inputs[i]->dnn_tensor()); | |||||
| inp_shapes[i] = inputs[i]->layout(); | |||||
| auto mode = op_def.mode; | |||||
| TensorShapeArray input_shapes; | |||||
| input_shapes.reserve(inputs.size()); | |||||
| for (auto&& input : inputs) { | |||||
| input_shapes.push_back(input->shape()); | |||||
| } | } | ||||
| megdnn::Elemwise::deduce_shape(inp_shapes, layout); | |||||
| layout.init_contiguous_stride(); | |||||
| auto out = Tensor::make(layout, comp_node); | |||||
| if (is_empty) { | |||||
| return {out}; | |||||
| // deduce_shape is static and fast | |||||
| TensorLayout output_layout{dtype}; | |||||
| // TODO: deduce_layout directly | |||||
| megdnn::Elemwise::deduce_shape(input_shapes, output_layout); | |||||
| output_layout.init_contiguous_stride(); | |||||
| auto output = Tensor::make(output_layout, comp_node); | |||||
| if (output_layout.is_empty()) { | |||||
| return {output}; | |||||
| } | } | ||||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node); | |||||
| dnn_opr.op->param() = op_def.param(); | |||||
| if (dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD3 || | |||||
| dnn_opr.op->param().mode == Mode::FUSE_MUL_ADD4 || | |||||
| (inp_tensornds.size() && | |||||
| inp_tensornds[0].layout.dtype.category() == DTypeCategory::QUANTIZED)) { | |||||
| opr::Elemwise::perform_dnn( | |||||
| comp_node, out->dnn_tensor(), inp_tensornds, dnn_opr.op); | |||||
| DnnOprCaller<megdnn::Elemwise> dnn_opr(comp_node, op_def.param()); | |||||
| if (mode == Mode::FUSE_MUL_ADD3 || mode == Mode::FUSE_MUL_ADD4 || | |||||
| dtype.category() == DTypeCategory::QUANTIZED) { | |||||
| dnn_opr.call_dnn( | |||||
| [&](auto&& inputs, auto&& output) { | |||||
| opr::Elemwise::perform_dnn(comp_node, output, inputs, dnn_opr.op()); | |||||
| }, | |||||
| inputs, output); | |||||
| } else { | } else { | ||||
| dnn_opr.op->exec(inp_tensornds, out->dnn_tensor()); | |||||
| dnn_opr.exec(inputs, output); | |||||
| } | } | ||||
| return {out}; | |||||
| return {output}; | |||||
| } | } | ||||
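Elemwise::deduce_shape is a pure shape computation, which is why the new code can run it first and return early for empty outputs. A hedged sketch of the conventional right-aligned broadcasting rule it is expected to follow (corner cases such as empty tensors may differ in MegDNN):

```cpp
#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <utility>
#include <vector>

// Right-aligned broadcasting: trailing dims must match or one of them must be 1.
std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b) {
    if (a.size() < b.size())
        std::swap(a, b);
    std::vector<size_t> out(a.size());
    size_t offset = a.size() - b.size();
    for (size_t i = 0; i < a.size(); ++i) {
        size_t da = a[i];
        size_t db = i < offset ? 1 : b[i - offset];
        if (da != db && da != 1 && db != 1)
            throw std::runtime_error("shapes are not broadcastable");
        out[i] = std::max(da, db);
    }
    return out;
}
```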
| MGB_DEFINE_OPR_CLASS( | MGB_DEFINE_OPR_CLASS( | ||||
| @@ -179,7 +171,7 @@ protected: | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| void create_megdnn_opr() override { | void create_megdnn_opr() override { | ||||
| auto opr = DnnOprCaller<megdnn::Elemwise>::create_operator(comp_node()); | |||||
| auto opr = mgb::opr::intl::create_megdnn_opr<megdnn::Elemwise>(comp_node()); | |||||
| opr->param().mode = m_param.mode; | opr->param().mode = m_param.mode; | ||||
| set_megdnn_opr(std::move(opr)); | set_megdnn_opr(std::move(opr)); | ||||
| } | } | ||||
| @@ -243,22 +235,19 @@ SmallVector<TensorPtr> apply_inplace_add_on_physical_tensor( | |||||
| "This inplace modification may change the elements of other tensors. " | "This inplace modification may change the elements of other tensors. " | ||||
| "Fallback to non-inplace update."); | "Fallback to non-inplace update."); | ||||
| DeviceTensorStorage storage; | |||||
| storage.reset(dest->comp_node(), dest->blob()->size(), dest->blob()->storage()); | |||||
| storage = storage.sub(dest->offset()); | |||||
| DeviceTensorND dv; | |||||
| dv.reset(storage, dest->layout()); | |||||
| DeviceTensorND dv_new; | |||||
| dv_new.copy_from(dv); | |||||
| dest = Tensor::make(dv_new); | |||||
| auto dest_layout = inputs[0]->layout(); | |||||
| dest_layout.init_contiguous_stride(); | |||||
| auto new_dest = Tensor::make(dest_layout, inputs[0]->comp_node()); | |||||
| new_dest->dev_tensor().copy_from(dest->dev_tensor()); | |||||
| dest = new_dest; | |||||
| } | } | ||||
| auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | auto tensor_to_scalar = [](const TensorPtr& tensor) -> float { | ||||
| return *tensor->get_value().ptr<float>(); | return *tensor->get_value().ptr<float>(); | ||||
| }; | }; | ||||
| DnnOprCaller<megdnn::AddUpdate> caller{dest->comp_node()}; | |||||
| caller.op->param() = {tensor_to_scalar(alpha), tensor_to_scalar(beta)}; | |||||
| caller.op->exec(dest->dev_tensor().as_megdnn(), delta->dev_tensor().as_megdnn()); | |||||
| DnnOprCaller<megdnn::AddUpdate> caller{ | |||||
| dest->comp_node(), {tensor_to_scalar(alpha), tensor_to_scalar(beta)}}; | |||||
| caller.exec(dest, delta); | |||||
| // FIXME: inplace update host value | |||||
| return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | return {std::make_shared<Tensor>(dest->blob(), dest->offset(), dest->layout())}; | ||||
| } | } | ||||
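The inplace path above drives AddUpdate with scalars read from the alpha and beta tensors. AddUpdate's conventional semantics, assumed here since only alpha and beta are set explicitly, are dest = alpha * dest + beta * delta plus a constant bias left at its default:

```cpp
#include <cstddef>
#include <vector>

// dest <- alpha * dest + beta * delta + bias, applied elementwise in place.
void add_update(std::vector<float>& dest, const std::vector<float>& delta,
                float alpha, float beta, float bias = 0.f) {
    for (size_t i = 0; i < dest.size(); ++i)
        dest[i] = alpha * dest[i] + beta * delta[i] + bias;
}
```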
| @@ -67,10 +67,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| auto&& op = def.cast_final_safe<IndexingOneHot>(); | auto&& op = def.cast_final_safe<IndexingOneHot>(); | ||||
| auto&& inp = inputs[0]; | auto&& inp = inputs[0]; | ||||
| auto&& index = inputs[1]; | auto&& index = inputs[1]; | ||||
| TensorLayout layout = inp->layout(); | |||||
| TensorLayout index_layout = index->layout(); | |||||
| DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node()); | |||||
| auto&& indexing_one_hot_param = dnn_op.op->param(); | |||||
| auto&& layout = inp->layout(); | |||||
| auto&& index_layout = index->layout(); | |||||
| int real_axis = static_cast<int>(op.axis); | int real_axis = static_cast<int>(op.axis); | ||||
| if (real_axis < 0) { | if (real_axis < 0) { | ||||
| real_axis += static_cast<int>(layout.ndim); | real_axis += static_cast<int>(layout.ndim); | ||||
| @@ -79,16 +77,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| 0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | 0 <= real_axis && real_axis < static_cast<int>(layout.ndim), | ||||
| "Dimension out of range (expected to be in range of [%d, %d], but got %d)", | "Dimension out of range (expected to be in range of [%d, %d], but got %d)", | ||||
| 0, static_cast<int>(layout.ndim) - 1, op.axis); | 0, static_cast<int>(layout.ndim) - 1, op.axis); | ||||
| indexing_one_hot_param = real_axis; | |||||
| TensorLayout tlayout; | |||||
| dnn_op.op->deduce_layout(layout, index_layout, tlayout); | |||||
| TensorPtr out = Tensor::make(tlayout, inp->comp_node()); | |||||
| megdnn::TensorND in = inp->dnn_tensor(); | |||||
| megdnn::TensorND ind = index->dnn_tensor(); | |||||
| size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
| dnn_op.op->exec(in, ind, out->dnn_tensor(), dnn_workspace); | |||||
| DnnOprCaller<megdnn::IndexingOneHot> dnn_op(inp->comp_node(), real_axis); | |||||
| auto tlayout = dnn_op.deduce_layout(layout, index_layout); | |||||
| auto out = Tensor::make(tlayout, inp->comp_node()); | |||||
| dnn_op.exec_with_ws(inp, index, out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
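IndexingOneHot with the axis resolved to real_axis gathers one element per index along that axis. A sketch for the 2-D, axis = 1 case (out[i] = src[i][index[i]]); the higher-rank behaviour is assumed to follow the same pattern:

```cpp
#include <cstdint>
#include <cstddef>
#include <vector>

// 2-D IndexingOneHot along axis 1: pick one column per row.
std::vector<float> indexing_one_hot_axis1(
        const std::vector<float>& src, size_t rows, size_t cols,
        const std::vector<int32_t>& index) {
    std::vector<float> out(rows);
    for (size_t i = 0; i < rows; ++i)
        out[i] = src[i * cols + static_cast<size_t>(index[i])];
    return out;
}
```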
| @@ -105,15 +97,14 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | ||||
| mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | mgb_assert(input_descs.size() == 3, "IndexingSetOneHot expects three inputs"); | ||||
| auto comp_node = input_descs[0].comp_node; | auto comp_node = input_descs[0].comp_node; | ||||
| TensorLayout src = input_descs[0].layout, index = input_descs[1].layout; | |||||
| auto&& src = input_descs[0].layout; | |||||
| auto&& index = input_descs[1].layout; | |||||
| mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | mgb_assert(index.dtype == dtype::Int32(), "index dtype must be int32"); | ||||
| if (!src.ndim) { | if (!src.ndim) { | ||||
| return {{{{{}, src.dtype}, comp_node}}, false}; | return {{{{{}, src.dtype}, comp_node}}, false}; | ||||
| } | } | ||||
| mgb_assert(src.is_contiguous(), "src should be contiguous"); | mgb_assert(src.is_contiguous(), "src should be contiguous"); | ||||
| return {{input_descs[0]}, true}; | |||||
| return {{{src, comp_node}}, true}; | |||||
| } | } | ||||
| auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | ||||
| @@ -136,25 +127,15 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| auto&& index = inputs[1]; | auto&& index = inputs[1]; | ||||
| auto&& sub = inputs[2]; | auto&& sub = inputs[2]; | ||||
| TensorLayout layout = inp->layout(); | TensorLayout layout = inp->layout(); | ||||
| TensorLayout index_layout = index->layout(); | |||||
| TensorLayout tlayout = sub->layout(); | |||||
| mgb_assert(layout.is_contiguous()); | mgb_assert(layout.is_contiguous()); | ||||
| DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node()); | |||||
| auto&& indexing_one_hot_param = dnn_op.op->param(); | |||||
| int real_axis = static_cast<int>(op.axis); | int real_axis = static_cast<int>(op.axis); | ||||
| if (real_axis < 0) { | if (real_axis < 0) { | ||||
| real_axis += static_cast<int>(layout.ndim); | real_axis += static_cast<int>(layout.ndim); | ||||
| } | } | ||||
| indexing_one_hot_param = real_axis; | |||||
| DnnOprCaller<megdnn::IndexingSetOneHot> dnn_op(inp->comp_node(), real_axis); | |||||
| TensorPtr out = Tensor::make(layout, inp->comp_node()); | TensorPtr out = Tensor::make(layout, inp->comp_node()); | ||||
| out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | out->dev_tensor().copy_from_fixlayout(inp->dev_tensor()); | ||||
| megdnn::TensorND in = inp->dnn_tensor(); | |||||
| megdnn::TensorND ind = index->dnn_tensor(); | |||||
| megdnn::TensorND su = sub->dnn_tensor(); | |||||
| size_t sz = dnn_op.op->get_workspace_in_bytes(layout, index_layout, tlayout); | |||||
| auto dnn_workspace = dnn_op.create_workspace(sz); | |||||
| dnn_op.op->exec(out->dnn_tensor(), ind, su, dnn_workspace); | |||||
| dnn_op.exec_with_ws(out, index, sub); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -54,14 +54,15 @@ cg::OperatorNodeBase* apply_on_var_node_remote_recv( | |||||
| TensorPtr megray_recv_tensor( | TensorPtr megray_recv_tensor( | ||||
| std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | std::shared_ptr<MegRay::Communicator> megray_comm, TensorLayout& layout, | ||||
| CompNode cn, uint32_t rank_from) { | CompNode cn, uint32_t rank_from) { | ||||
| DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, layout); | |||||
| auto out = Tensor::make(layout, cn); | |||||
| auto dnn_out = out->dnn_tensor(); | |||||
| auto megray_ctx = mgb::opr::get_megray_context(cn); | auto megray_ctx = mgb::opr::get_megray_context(cn); | ||||
| size_t data_size = layout.total_nr_elems(); | size_t data_size = layout.total_nr_elems(); | ||||
| auto status = megray_comm->recv( | auto status = megray_comm->recv( | ||||
| out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||||
| dnn_out.raw_ptr(), data_size, mgb::opr::get_megray_dtype(layout.dtype), | |||||
| rank_from, megray_ctx); | rank_from, megray_ctx); | ||||
| mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | mgb_assert(status == MegRay::MEGRAY_OK, "MegRay recv failed"); | ||||
| return Tensor::make(out); | |||||
| return out; | |||||
| } | } | ||||
| void megray_send_tensor( | void megray_send_tensor( | ||||
| @@ -105,9 +106,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor_remote_send( | |||||
| mgb_assert(megray_comm != nullptr); | mgb_assert(megray_comm != nullptr); | ||||
| megray_send_tensor(megray_comm, inputs[0], op.rank_to); | megray_send_tensor(megray_comm, inputs[0], op.rank_to); | ||||
| TensorLayout layout({0}, inputs[0]->dtype()); | TensorLayout layout({0}, inputs[0]->dtype()); | ||||
| DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag( | |||||
| inputs[0]->comp_node(), layout); | |||||
| return {Tensor::make(out)}; | |||||
| return {Tensor::make(layout, inputs[0]->comp_node())}; | |||||
| } | } | ||||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible_remote_recv( | ||||
| @@ -21,14 +21,17 @@ SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | |||||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& input_descs) { | ||||
| mgb_assert(input_descs.size() == 4, "IndexingOneHot expects 4 inputs"); | mgb_assert(input_descs.size() == 4, "IndexingOneHot expects 4 inputs"); | ||||
| auto comp_node = input_descs[0].comp_node; | auto comp_node = input_descs[0].comp_node; | ||||
| auto comp_node1 = input_descs[1].comp_node; | auto comp_node1 = input_descs[1].comp_node; | ||||
| auto comp_node2 = input_descs[2].comp_node; | auto comp_node2 = input_descs[2].comp_node; | ||||
| TensorLayout m_t_1 = input_descs[0].layout, v_t_1 = input_descs[1].layout, | |||||
| lamb_param = input_descs[2].layout, grad = input_descs[3].layout; | |||||
| TensorLayout new_param = lamb_param, m_t = m_t_1, v_t = v_t_1; | |||||
| auto&& m_t_1 = input_descs[0].layout; | |||||
| auto&& v_t_1 = input_descs[1].layout; | |||||
| auto&& lamb_param = input_descs[2].layout; | |||||
| auto&& grad = input_descs[3].layout; | |||||
| MGB_MARK_USED_VAR(grad); | |||||
| auto&& new_param = lamb_param; | |||||
| auto&& m_t = m_t_1; | |||||
| auto&& v_t = v_t_1; | |||||
| return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | return {{{m_t, comp_node}, {v_t, comp_node1}, {new_param, comp_node2}}, true}; | ||||
| } | } | ||||
| @@ -46,23 +49,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| TensorLayout lamb_param_layout{lamb_param->layout()}; | TensorLayout lamb_param_layout{lamb_param->layout()}; | ||||
| auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | auto m_t = Tensor::make(m_t_1_layout, m_t_1->comp_node()); | ||||
| auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | auto v_t = Tensor::make(v_t_1_layout, v_t_1->comp_node()); | ||||
| auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | auto new_param = Tensor::make(lamb_param_layout, lamb_param->comp_node()); | ||||
| DnnOprCaller<megdnn::LAMBUpdate> caller{lamb_param->comp_node()}; | |||||
| size_t sz = caller.op->get_workspace_in_bytes( | |||||
| m_t_1->layout(), v_t_1->layout(), lamb_param->layout(), grad->layout(), | |||||
| m_t->layout(), v_t->layout(), new_param->layout()); | |||||
| auto dnn_workspace = caller.create_workspace(sz); | |||||
| caller.op->param() = op.param(); | |||||
| caller.op->exec( | |||||
| m_t_1->dev_tensor().as_megdnn(), v_t_1->dev_tensor().as_megdnn(), | |||||
| lamb_param->dev_tensor().as_megdnn(), grad->dev_tensor().as_megdnn(), | |||||
| m_t->dnn_tensor(), v_t->dnn_tensor(), new_param->dnn_tensor(), | |||||
| dnn_workspace); | |||||
| DnnOprCaller<megdnn::LAMBUpdate> dnn_opr{lamb_param->comp_node(), op.param()}; | |||||
| dnn_opr.exec_with_ws(m_t_1, v_t_1, lamb_param, grad, m_t, v_t, new_param); | |||||
| return {m_t, v_t, new_param}; | return {m_t, v_t, new_param}; | ||||
| } | } | ||||
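For context, a reference formulation of the LAMB step that LAMBUpdate maps (m_{t-1}, v_{t-1}, param, grad) onto (m_t, v_t, new_param). The exact parameterization (bias correction, weight-decay placement, trust-ratio clipping) is an assumption about the textbook algorithm, not a statement about the MegDNN kernel:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// One LAMB step over a single parameter tensor (reference formula). `step` starts at 1.
void lamb_update(std::vector<float>& m, std::vector<float>& v, std::vector<float>& param,
                 const std::vector<float>& grad, float lr, float beta1, float beta2,
                 float eps, float weight_decay, size_t step) {
    std::vector<float> update(param.size());
    double param_norm = 0, update_norm = 0;
    for (size_t i = 0; i < param.size(); ++i) {
        m[i] = beta1 * m[i] + (1 - beta1) * grad[i];
        v[i] = beta2 * v[i] + (1 - beta2) * grad[i] * grad[i];
        float m_hat = static_cast<float>(m[i] / (1 - std::pow(beta1, double(step))));
        float v_hat = static_cast<float>(v[i] / (1 - std::pow(beta2, double(step))));
        update[i] = m_hat / (std::sqrt(v_hat) + eps) + weight_decay * param[i];
        param_norm += double(param[i]) * param[i];
        update_norm += double(update[i]) * update[i];
    }
    // Layer-wise trust ratio: scale the step by ||param|| / ||update||.
    double trust = update_norm > 0 ? std::sqrt(param_norm) / std::sqrt(update_norm) : 1.0;
    for (size_t i = 0; i < param.size(); ++i)
        param[i] -= lr * static_cast<float>(trust) * update[i];
}
```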
| @@ -29,11 +29,11 @@ cg::OperatorNodeBase* apply_on_var_node(const OpDef& def, const VarNodeArray& in | |||||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| auto&& op_def = def.cast_final_safe<LayerNorm>(); | |||||
| auto&& layer_norm = def.cast_final_safe<LayerNorm>(); | |||||
| size_t nr_inp = inputs.size(); | size_t nr_inp = inputs.size(); | ||||
| auto p = op_def.param(); | |||||
| auto affine = layer_norm.affine; | |||||
| mgb_assert( | mgb_assert( | ||||
| (nr_inp == 3 && p.affine) || (nr_inp == 1 && !p.affine), | |||||
| (nr_inp == 3 && affine) || (nr_inp == 1 && !affine), | |||||
| "num of inputs of pooling should be 1 or 3 but you give %zu", | "num of inputs of pooling should be 1 or 3 but you give %zu", | ||||
| inputs.size()); | inputs.size()); | ||||
| @@ -47,9 +47,9 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| false}; | false}; | ||||
| } | } | ||||
| TensorLayout oup_layout, mean_layout, rstd_layout; | |||||
| megdnn::LayerNorm::deduce_layout_fwd_impl( | |||||
| inp.layout, p, oup_layout, mean_layout, rstd_layout); | |||||
| DnnOprHelper<megdnn::LayerNorm> dnn_opr(layer_norm.param()); | |||||
| auto&& [oup_layout, mean_layout, rstd_layout] = | |||||
| dnn_opr.deduce_layouts<3>(inp.layout, TensorLayout{}, TensorLayout{}); | |||||
| return {{{oup_layout, inp_cn, {}}, | return {{{oup_layout, inp_cn, {}}, | ||||
| {mean_layout, inp_cn, {}}, | {mean_layout, inp_cn, {}}, | ||||
| {rstd_layout, inp_cn, {}}}, | {rstd_layout, inp_cn, {}}}, | ||||
| @@ -69,32 +69,21 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| inputs.size()); | inputs.size()); | ||||
| auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
| DnnOprCaller<megdnn::LayerNorm> caller(cn); | |||||
| auto&& dnn_opr = caller.op; | |||||
| dnn_opr->param() = p; | |||||
| DnnOprCaller<megdnn::LayerNorm> caller(cn, op_def.param()); | |||||
| TensorLayout oup_layout, mean_layout, rstd_layout; | |||||
| megdnn::LayerNorm::deduce_layout_fwd_impl( | |||||
| inputs[0]->dnn_tensor().layout, p, oup_layout, mean_layout, rstd_layout); | |||||
| auto&& [oup_layout, mean_layout, rstd_layout] = caller.deduce_layouts<3>( | |||||
| inputs[0]->layout(), TensorLayout{}, TensorLayout{}); | |||||
| auto out = Tensor::make(oup_layout, cn); | auto out = Tensor::make(oup_layout, cn); | ||||
| auto mean = Tensor::make(mean_layout, cn); | auto mean = Tensor::make(mean_layout, cn); | ||||
| auto rstd = Tensor::make(rstd_layout, cn); | auto rstd = Tensor::make(rstd_layout, cn); | ||||
| auto wk_size = caller.op->get_workspace_in_bytes( | |||||
| inputs[0]->dnn_tensor().layout, | |||||
| p.affine ? inputs[1]->dnn_tensor().layout : TensorLayout(), | |||||
| p.affine ? inputs[2]->dnn_tensor().layout : TensorLayout(), oup_layout, | |||||
| mean_layout, rstd_layout); | |||||
| auto dnn_wk = caller.create_workspace(wk_size); | |||||
| caller.op->exec( | |||||
| inputs[0]->dnn_tensor(), | |||||
| p.affine ? inputs[1]->dnn_tensor() : megdnn::TensorND(), | |||||
| p.affine ? inputs[2]->dnn_tensor() : megdnn::TensorND(), out->dnn_tensor(), | |||||
| mean->dnn_tensor(), rstd->dnn_tensor(), dnn_wk); | |||||
| if (p.affine) { | |||||
| caller.exec_with_ws(inputs[0], inputs[1], inputs[2], out, mean, rstd); | |||||
| } else { | |||||
| megdnn::TensorND empty_dnn; | |||||
| caller.exec_with_ws(inputs[0], empty_dnn, empty_dnn, out, mean, rstd); | |||||
| } | |||||
| return {out, mean, rstd}; | return {out, mean, rstd}; | ||||
| } | } | ||||
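deduce_layouts<3> above only yields the shapes of (out, mean, rstd); the values themselves follow the usual layer normalization over the trailing normalized axes. A reference sketch, assuming rstd stores 1/sqrt(var + eps) and that weight and bias are either both present (affine) or both absent:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Layer norm over the trailing `norm_size` elements of each row.
void layer_norm_fwd(const std::vector<float>& x, std::vector<float>& y,
                    std::vector<float>& mean, std::vector<float>& rstd, size_t rows,
                    size_t norm_size, const float* weight, const float* bias, float eps) {
    for (size_t r = 0; r < rows; ++r) {
        const float* xr = &x[r * norm_size];
        double sum = 0, sqsum = 0;
        for (size_t i = 0; i < norm_size; ++i) {
            sum += xr[i];
            sqsum += double(xr[i]) * xr[i];
        }
        double mu = sum / norm_size;
        double var = sqsum / norm_size - mu * mu;
        mean[r] = static_cast<float>(mu);
        rstd[r] = static_cast<float>(1.0 / std::sqrt(var + eps));
        for (size_t i = 0; i < norm_size; ++i) {
            float v = (xr[i] - mean[r]) * rstd[r];
            y[r * norm_size + i] = weight ? v * weight[i] + bias[i] : v;
        }
    }
}
```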
| @@ -105,4 +94,4 @@ OP_TRAIT_REG(LayerNorm, LayerNorm) | |||||
| .fallback(); | .fallback(); | ||||
| } // namespace layer_norm | } // namespace layer_norm | ||||
| } // namespace mgb::imperative | |||||
| } // namespace mgb::imperative | |||||
| @@ -24,7 +24,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| auto dim1 = matmul.dimA, dim2 = matmul.dimB; | auto dim1 = matmul.dimA, dim2 = matmul.dimB; | ||||
| auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
| using Desc = opr::AxisAddRemove::AxisDesc; | |||||
| using IndexDesc = opr::Subtensor::IndexDesc; | using IndexDesc = opr::Subtensor::IndexDesc; | ||||
| OperatorNodeConfig config{matmul.make_name(), cn}; | OperatorNodeConfig config{matmul.make_name(), cn}; | ||||
| @@ -104,9 +103,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| dim1 = dim2 = 2; | dim1 = dim2 = 2; | ||||
| } | } | ||||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||||
| dnn_opr.op->param() = matmul.param(); | |||||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||||
| dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| if (dim1 == 0 || dim2 == 0) { | if (dim1 == 0 || dim2 == 0) { | ||||
| return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | ||||
| @@ -143,8 +141,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| SmallVector<TensorND> inp_tensornds(inputs.size()); | SmallVector<TensorND> inp_tensornds(inputs.size()); | ||||
| TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | ||||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn); | |||||
| dnn_opr.op->param() = matmul.param(); | |||||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||||
| if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | if (matmul.dimA == matmul.dimB && matmul.dimB >= 3) { // only happens in backward | ||||
| for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | for (size_t i = 1; i + 1 < layout1.ndim; ++i) { | ||||
| @@ -160,7 +157,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| } | } | ||||
| DType dst_dtype; | DType dst_dtype; | ||||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| // only matters when layout1 has dim 2 | // only matters when layout1 has dim 2 | ||||
| if (matmul.transposeA) | if (matmul.transposeA) | ||||
| @@ -229,13 +226,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| inp_tensornds[0].layout = layout_a; | inp_tensornds[0].layout = layout_a; | ||||
| inp_tensornds[1].layout = layout_b; | inp_tensornds[1].layout = layout_b; | ||||
| } | } | ||||
| size_t sz = setup_algo<megdnn::MatrixMul>( | |||||
| {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||||
| matmul.policy(), false, &inp_tensornds); | |||||
| auto out = Tensor::make(dst_layout, cn); | auto out = Tensor::make(dst_layout, cn); | ||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||||
| return {out->sub(0, real_dst_layout)}; | return {out->sub(0, real_dst_layout)}; | ||||
| } | } | ||||
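For MatrixMul the same refactor also absorbs algorithm selection: the removed setup_algo call and the explicit workspace disappear into exec_fastrun, which uses the execution policy passed to the constructor. A rough old-to-new mapping, with exec_fastrun's behavior inferred from this hunk rather than verified:

    // setup_algo<megdnn::MatrixMul>(...)        -> performed inside exec_fastrun() via matmul.policy()
    // dnn_opr.create_workspace(sz)              -> allocated inside exec_fastrun()
    // dnn_opr.op->exec(a, b, out, workspace)    -> dnn_opr.exec_fastrun(a, b, out)
    DnnOprCaller<megdnn::MatrixMul> dnn_opr(cn, matmul.param(), matmul.policy());
    dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out);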
| @@ -266,7 +258,6 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| auto dim1 = matmul.dimA, dim2 = matmul.dimB; | auto dim1 = matmul.dimA, dim2 = matmul.dimB; | ||||
| auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
| using Desc = opr::AxisAddRemove::AxisDesc; | |||||
| using IndexDesc = opr::Subtensor::IndexDesc; | using IndexDesc = opr::Subtensor::IndexDesc; | ||||
| OperatorNodeConfig config{matmul.make_name(), cn}; | OperatorNodeConfig config{matmul.make_name(), cn}; | ||||
| @@ -343,9 +334,8 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| DType dst_dtype; | DType dst_dtype; | ||||
| DnnOprCaller<megdnn::MatrixMul> dnn_opr(inputs[0].comp_node); | |||||
| dnn_opr.op->param() = matmul.param(); | |||||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| DnnOprHelper<megdnn::MatrixMul> dnn_opr(matmul.param()); | |||||
| dnn_opr.opr().deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| if (dim1 == 0 || dim2 == 0) { | if (dim1 == 0 || dim2 == 0) { | ||||
| return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | return {{{TensorLayout(dst_dtype), inputs[0].comp_node}}, false}; | ||||
| @@ -386,10 +376,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | TensorLayout layout1 = inputs[0]->layout(), layout2 = inputs[1]->layout(); | ||||
| size_t dim1 = layout1.ndim, dim2 = layout2.ndim; | size_t dim1 = layout1.ndim, dim2 = layout2.ndim; | ||||
| DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn); | |||||
| dnn_opr.op->param() = matmul.param(); | |||||
| DnnOprCaller<megdnn::BatchedMatrixMul> dnn_opr(cn, matmul.param(), matmul.policy()); | |||||
| DType dst_dtype; | DType dst_dtype; | ||||
| dnn_opr.op->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| dnn_opr.op()->deduce_dtype(layout1.dtype, layout1.dtype, dst_dtype); | |||||
| TensorShape tshp, batch_shp; | TensorShape tshp, batch_shp; | ||||
| size_t j = 0; | size_t j = 0; | ||||
| @@ -473,14 +462,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| inp_tensornds[1] = inp2->dnn_tensor(); | inp_tensornds[1] = inp2->dnn_tensor(); | ||||
| inp_tensornds[1].layout = layout2; | inp_tensornds[1].layout = layout2; | ||||
| size_t sz = setup_algo<megdnn::BatchedMatrixMul>( | |||||
| {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, | |||||
| matmul.policy(), false, &inp_tensornds); | |||||
| auto out = Tensor::make(dst_layout, cn); | auto out = Tensor::make(dst_layout, cn); | ||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_fastrun(inp_tensornds[0], inp_tensornds[1], out); | |||||
| shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; | shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; | ||||
| shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; | shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; | ||||
| @@ -533,7 +517,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| TensorLayout oup_layout{inputs[0]->dtype()}; | TensorLayout oup_layout{inputs[0]->dtype()}; | ||||
| auto inp1_tensor = inputs[0]->dnn_tensor(); | auto inp1_tensor = inputs[0]->dnn_tensor(); | ||||
| auto inp2_tensor = inputs[1]->dnn_tensor(); | auto inp2_tensor = inputs[1]->dnn_tensor(); | ||||
| dnn_opr.op->deduce_layout(inp1_tensor.layout, inp2_tensor.layout, oup_layout); | |||||
| oup_layout = dnn_opr.deduce_layout(inp1_tensor.layout, inp2_tensor.layout); | |||||
| if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) { | if (inputs[0]->layout().is_empty() || inputs[1]->layout().is_empty()) { | ||||
| auto out = Tensor::make(oup_layout, comp_node); | auto out = Tensor::make(oup_layout, comp_node); | ||||
| @@ -543,14 +527,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| auto sz = dnn_opr.op->get_workspace_in_bytes( | |||||
| inp_tensornds[0].layout, inp_tensornds[1].layout, output_descs[0].layout); | |||||
| auto out = Tensor::make(oup_layout, comp_node); | auto out = Tensor::make(oup_layout, comp_node); | ||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_with_ws(inp_tensornds[0], inp_tensornds[1], out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -17,27 +17,18 @@ SymbolVarArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| size_t size = inputs.size(); | |||||
| auto&& op = def.cast_final_safe<CheckNonFinite>(); | auto&& op = def.cast_final_safe<CheckNonFinite>(); | ||||
| SmallVector<TensorPtr> outputs(size + 1); | |||||
| outputs[size] = Tensor::make( | |||||
| TensorLayout(TensorShape({1}), dtype::Int32()), inputs[0]->comp_node()); | |||||
| auto dest = outputs[size]; | |||||
| auto cn = dest->comp_node(); | |||||
| DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(cn); | |||||
| SmallVector<megdnn::TensorND> srcs(size); | |||||
| // copy an outputs to the dnn for inplace | |||||
| for (size_t i = 0; i < size; ++i) { | |||||
| outputs[i] = Tensor::make(inputs[i]->layout(), inputs[0]->comp_node()); | |||||
| outputs[i]->dev_tensor().copy_from_fixlayout(inputs[i]->dev_tensor()); | |||||
| srcs[i] = outputs[i]->dev_tensor().as_megdnn(); | |||||
| auto comp_node = inputs[0]->comp_node(); | |||||
| auto dest = Tensor::make(TensorLayout({1}, dtype::Int32()), comp_node); | |||||
| SmallVector<TensorPtr> outputs; | |||||
| outputs.reserve(inputs.size() + 1); | |||||
| for (auto&& input : inputs) { | |||||
| outputs.push_back(Tensor::make(input->layout(), comp_node)); | |||||
| outputs.back()->dev_tensor().copy_from_fixlayout(input->dev_tensor()); | |||||
| } | } | ||||
| megdnn::CheckNonFinite::Param param({op.scale}); | |||||
| dnn_opr.op->param() = param; | |||||
| size_t sz = dnn_opr.op->get_workspace_in_bytes(srcs, dest->layout()); | |||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| dnn_opr.op->exec(srcs, dest->dnn_tensor(), dnn_wk); | |||||
| DnnOprCaller<megdnn::CheckNonFinite> dnn_opr(comp_node, {op.scale}); | |||||
| dnn_opr.exec_with_ws(outputs, dest); | |||||
| outputs.push_back(dest); | |||||
| return outputs; | return outputs; | ||||
| } | } | ||||
| @@ -45,13 +36,15 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| size_t size = inputs.size(); | size_t size = inputs.size(); | ||||
| SmallVector<LogicalTensorDesc> dests(size + 1); | SmallVector<LogicalTensorDesc> dests(size + 1); | ||||
| bool validated = true; | |||||
| for (size_t i = 0; i < size; ++i) { | for (size_t i = 0; i < size; ++i) { | ||||
| dests[i].comp_node = inputs[i].comp_node; | dests[i].comp_node = inputs[i].comp_node; | ||||
| dests[i].layout = inputs[i].layout; | dests[i].layout = inputs[i].layout; | ||||
| validated &= bool(dests[i].layout.ndim); | |||||
| } | } | ||||
| dests[size].comp_node = inputs[0].comp_node; | dests[size].comp_node = inputs[0].comp_node; | ||||
| dests[size].layout = TensorLayout(TensorShape({1}), dtype::Int32()); | |||||
| return {dests, true}; | |||||
| dests[size].layout = TensorLayout({1}, dtype::Int32()); | |||||
| return {dests, validated}; | |||||
| } | } | ||||
| OP_TRAIT_REG(CheckNonFinite, CheckNonFinite) | OP_TRAIT_REG(CheckNonFinite, CheckNonFinite) | ||||
| @@ -27,40 +27,31 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| auto comp_node = inputs[0]->comp_node(); | auto comp_node = inputs[0]->comp_node(); | ||||
| auto&& op_def = def.cast_final_safe<Padding>(); | auto&& op_def = def.cast_final_safe<Padding>(); | ||||
| DnnOprCaller<megdnn::Padding> dnn_op(comp_node); | |||||
| dnn_op.op->param() = op_def.param(); | |||||
| TensorLayout dst = output_descs[0].layout; | |||||
| if (!validated) { | |||||
| megdnn::Padding::deduce_layout_impl( | |||||
| inputs[0]->dnn_tensor().layout, dst, op_def.param()); | |||||
| } | |||||
| DeviceTensorND out = | |||||
| BlobManager::inst()->alloc_workspace_with_defrag(comp_node, dst); | |||||
| dnn_op.op->exec(inputs[0]->dnn_tensor(), out.as_megdnn()); | |||||
| return {Tensor::make(out)}; | |||||
| DnnOprCaller<megdnn::Padding> dnn_op(comp_node, op_def.param()); | |||||
| auto dst = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| return dnn_op.deduce_layout(inputs[0]->layout()); | |||||
| } | |||||
| }(); | |||||
| auto out = Tensor::make(dst, comp_node); | |||||
| dnn_op.exec(inputs[0], out); | |||||
| return {out}; | |||||
| } | } | ||||
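The Padding hunk initializes dst with an immediately-invoked lambda, so the layout is either taken from the validated output desc or freshly deduced, and is never left default-constructed. The idiom itself is plain C++; a standalone, compilable illustration unrelated to MegEngine types:

    #include <cstdio>

    int pick(bool validated) {
        auto dst = [&] {
            if (validated) {
                return 42;   // reuse the cached value
            } else {
                return 7;    // recompute it
            }
        }();                 // note the (): the lambda runs immediately, dst is set exactly once
        return dst;
    }

    int main() {
        std::printf("%d %d\n", pick(true), pick(false));  // prints "42 7"
    }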
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| auto&& op_def = def.cast_final_safe<Padding>(); | auto&& op_def = def.cast_final_safe<Padding>(); | ||||
| size_t nr_inp = inputs.size(); | |||||
| auto p = op_def.param(); | |||||
| auto&& inp = inputs[0]; | auto&& inp = inputs[0]; | ||||
| auto& inp_cn = inp.comp_node; | |||||
| if (inp.layout.ndim == 0) { | if (inp.layout.ndim == 0) { | ||||
| return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||||
| return {{{TensorLayout{inp.layout.dtype}, inp.comp_node, {}}}, false}; | |||||
| } | } | ||||
| TensorLayout oup_layout; | |||||
| megdnn::Padding::deduce_layout_impl(inp.layout, oup_layout, p); | |||||
| return {{{oup_layout, inp_cn, {}}}, true}; | |||||
| DnnOprHelper<megdnn::Padding> dnn_op(op_def.param()); | |||||
| auto oup_layout = dnn_op.deduce_layout(inp.layout); | |||||
| return {{{oup_layout, inp.comp_node}}, true}; | |||||
| } | } | ||||
| OP_TRAIT_REG(Padding, Padding, opr::Padding) | OP_TRAIT_REG(Padding, Padding, opr::Padding) | ||||
| @@ -74,4 +65,4 @@ OP_TRAIT_REG(Padding, Padding, opr::Padding) | |||||
| } // namespace imperative | } // namespace imperative | ||||
| } // namespace mgb | } // namespace mgb | ||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
| @@ -25,19 +25,13 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| mgb_assert( | mgb_assert( | ||||
| inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | ||||
| inputs.size()); | inputs.size()); | ||||
| auto&& op_def = def.cast_final_safe<Pooling>(); | auto&& op_def = def.cast_final_safe<Pooling>(); | ||||
| auto&& inp = inputs[0]; | |||||
| auto& inp_cn = inp.comp_node; | |||||
| if (inp.layout.ndim == 0) { | |||||
| return {{{TensorLayout{inp.layout.dtype}, inp_cn, {}}}, false}; | |||||
| if (!inputs[0].layout.ndim) { | |||||
| return {{{inputs[0].layout, inputs[0].comp_node}}, false}; | |||||
| } | } | ||||
| TensorLayout oup_layout; | |||||
| megdnn::Pooling::deduce_layout_impl(inp.layout, op_def.param(), oup_layout); | |||||
| return {{{oup_layout, inp_cn, {}}}, true}; | |||||
| DnnOprHelper<megdnn::Pooling> dnn_opr(op_def.param()); | |||||
| auto oup_layout = dnn_opr.deduce_layout(inputs[0].layout); | |||||
| return {{{oup_layout, inputs[0].comp_node}}, true}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| @@ -47,30 +41,18 @@ SmallVector<TensorPtr> apply_on_physical_tensor( | |||||
| inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | inputs.size() == 1, "num of inputs of pooling should be 1 but you give %zu", | ||||
| inputs.size()); | inputs.size()); | ||||
| auto&& op_def = def.cast_final_safe<Pooling>(); | |||||
| auto&& pooling = def.cast_final_safe<Pooling>(); | |||||
| auto cn = inputs[0]->comp_node(); | auto cn = inputs[0]->comp_node(); | ||||
| DnnOprCaller<megdnn::Pooling> caller(cn); | |||||
| auto&& dnn_opr = caller.op; | |||||
| dnn_opr->param() = op_def.param(); | |||||
| SmallVector<megdnn::TensorND> inp_tensornds(inputs.size()); | |||||
| inp_tensornds[0] = inputs[0]->dnn_tensor(); | |||||
| TensorLayout& oup_layout = output_descs[0].layout; | |||||
| if (!validated) { | |||||
| megdnn::Pooling::deduce_layout_impl( | |||||
| inp_tensornds[0].layout, op_def.param(), oup_layout); | |||||
| } | |||||
| size_t wk_size = setup_algo<megdnn::Pooling>( | |||||
| {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, | |||||
| op_def.policy(), false, &inp_tensornds); | |||||
| DnnOprCaller<megdnn::Pooling> dnn_opr(cn, pooling.param(), pooling.policy()); | |||||
| auto oup_layout = [&] { | |||||
| if (validated) { | |||||
| return output_descs[0].layout; | |||||
| } else { | |||||
| return dnn_opr.deduce_layout(inputs[0]->layout()); | |||||
| } | |||||
| }(); | |||||
| auto out = Tensor::make(oup_layout, cn); | auto out = Tensor::make(oup_layout, cn); | ||||
| auto dnn_wk = caller.create_workspace(wk_size); | |||||
| caller.op->exec(inp_tensornds[0], out->dnn_tensor(), dnn_wk); | |||||
| dnn_opr.exec_fastrun(inputs[0], out); | |||||
| return {out}; | return {out}; | ||||
| } | } | ||||
| @@ -18,33 +18,31 @@ namespace reduce { | |||||
| auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | ||||
| auto&& reduce = static_cast<const Reduce&>(def); | auto&& reduce = static_cast<const Reduce&>(def); | ||||
| auto comp_node = inputs[0]->comp_node(); | auto comp_node = inputs[0]->comp_node(); | ||||
| OperatorNodeConfig config{reduce.make_name(), comp_node, inputs[0]->dtype()}; | |||||
| auto name = reduce.make_name(); | |||||
| if (inputs.size() > 1) { | |||||
| return opr::Reduce::make(inputs[0], reduce.param(), inputs[1], config); | |||||
| } | |||||
| using Param = megdnn::param::Reduce; | |||||
| auto param = reduce.param(); | auto param = reduce.param(); | ||||
| if (param.axis < 0) { | |||||
| param.axis = inputs[0]->shape().ndim + param.axis; | |||||
| auto axis = param.axis; | |||||
| auto keepdim = reduce.keepdim; | |||||
| if (inputs.size() == 2) { | |||||
| return opr::Reduce::make(inputs[0], param, inputs[1], {name}); | |||||
| } | } | ||||
| mgb_assert(inputs.size() == 1); | |||||
| SymbolVar target_shape = (cg::VarNode*)nullptr; | |||||
| if (param.axis == INT_MAX) { | |||||
| DTypeScalar vi{1}; | |||||
| // auto graph = ComputingGraph::make(); | |||||
| if (axis == INT_MAX) { | |||||
| // keepdim can be ignored here: reducing to a scalar already yields ndim == 1 | |||||
| auto graph = inputs[0]->owner_graph(); | auto graph = inputs[0]->owner_graph(); | ||||
| target_shape = opr::ImmutableTensor::make(*graph, vi, config); | |||||
| auto scalar_shape = | |||||
| opr::ImmutableTensor::make(*graph, DTypeScalar(1), {name, comp_node}); | |||||
| return opr::Reduce::make(inputs[0], param, scalar_shape, {name}); | |||||
| } | } | ||||
| auto res = opr::Reduce::make(inputs[0], param, target_shape, config); | |||||
| if (!reduce.keepdim && param.axis != INT_MAX) { | |||||
| // mgb::opr::Reduce supports negative axis | |||||
| auto res = opr::Reduce::make(inputs[0], param, {}, {name}); | |||||
| if (!keepdim) { | |||||
| using Desc = opr::AxisAddRemove::AxisDesc; | using Desc = opr::AxisAddRemove::AxisDesc; | ||||
| std::vector<Desc> remove_param; | |||||
| remove_param.push_back(Desc::make_remove(param.axis)); | |||||
| OperatorNodeConfig remove_config{ | |||||
| def.make_name(), comp_node, inputs[0]->dtype()}; | |||||
| return opr::AxisAddRemove::make(res, remove_param, remove_config); | |||||
| std::vector<Desc> remove_axis_param; | |||||
| remove_axis_param.push_back(Desc::make_remove(axis)); | |||||
| res = opr::AxisAddRemove::make(res, remove_axis_param, {name}); | |||||
| } | } | ||||
| return res; | return res; | ||||
| } | } | ||||
| @@ -71,111 +69,104 @@ bool memory_forward_success(const OpDef& def, SmallVector<TensorPtr> inputs) { | |||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| // memory forward | |||||
| if (memory_forward_success(def, inputs)) { | if (memory_forward_success(def, inputs)) { | ||||
| // forward inputs[0]'s blob directly; no computation is needed | |||||
| return {Tensor::make( | return {Tensor::make( | ||||
| inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())}; | inputs[0]->blob(), inputs[0]->offset(), inputs[0]->layout())}; | ||||
| } | } | ||||
| auto size = inputs.size(); | |||||
| if (size > 1) { | |||||
| if (inputs.size() == 2) { | |||||
| // reduce to a target shape: fall back to proxy_graph | |||||
| return proxy_graph_detail::apply_on_physical_tensor( | return proxy_graph_detail::apply_on_physical_tensor( | ||||
| def, inputs, output_descs, validated); | def, inputs, output_descs, validated); | ||||
| } | } | ||||
| mgb_assert(inputs.size() == 1); | |||||
| auto comp_node = inputs[0]->comp_node(); | auto comp_node = inputs[0]->comp_node(); | ||||
| using TensorND = megdnn::TensorND; | |||||
| auto&& op_def = def.cast_final_safe<Reduce>(); | auto&& op_def = def.cast_final_safe<Reduce>(); | ||||
| SmallVector<TensorND> inp_tensornds; | |||||
| inp_tensornds.reserve(inputs.size()); | |||||
| auto src = inputs[0]->layout(); | |||||
| DnnOprCaller<megdnn::Reduce> dnn_op(comp_node); | |||||
| dnn_op.op->param() = op_def.param(); | |||||
| auto axis = op_def.param().axis; | |||||
| DnnOprCaller<megdnn::Reduce> dnn_op(comp_node, op_def.param()); | |||||
| auto&& mode = dnn_op.param().mode; | |||||
| auto& axis = dnn_op.param().axis; | |||||
| auto keepdim = op_def.keepdim; | auto keepdim = op_def.keepdim; | ||||
| if (axis < 0) { | |||||
| axis = inputs[0]->layout().ndim + axis; | |||||
| } | |||||
| dnn_op.op->param().axis = axis == INT_MAX ? 0 : axis; | |||||
| if (axis == INT_MAX) { | |||||
| src.shape[0] = src.total_nr_elems(); | |||||
| src.ndim = 1; | |||||
| src.init_contiguous_stride(); | |||||
| } | |||||
| TensorLayout layout{src.dtype}; | |||||
| dnn_op.op->deduce_layout(src, layout); | |||||
| if (inputs[0]->layout().is_empty()) { | |||||
| inputs[0]->dev_tensor().reset(inputs[0]->dev_tensor().storage(), src); | |||||
| auto mode = op_def.param().mode; | |||||
| if (!keepdim && src.ndim > 1) { | |||||
| layout.remove_axis_inplace(axis); | |||||
| layout.init_contiguous_stride(); | |||||
| DnnTensorND dnn_input = [&] { | |||||
| if (axis == INT_MAX) { // reduce to scalar | |||||
| axis = 0; | |||||
| // flatten input | |||||
| return inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()}); | |||||
| } else { | |||||
| if (axis < 0) { | |||||
| axis = inputs[0]->layout().ndim + axis; | |||||
| } | |||||
| mgb_assert(axis >= 0 && axis < inputs[0]->layout().ndim); | |||||
| return inputs[0]->dnn_tensor(); | |||||
| } | } | ||||
| auto out = Tensor::make(layout, comp_node); | |||||
| }(); | |||||
| auto output_layout = dnn_op.deduce_layout(dnn_input.layout); | |||||
| auto resolve_keepdim = [&] { | |||||
| if (!keepdim) { | |||||
| if (output_layout.ndim > 1) { | |||||
| mgb_assert(output_layout.shape[axis] == 1); | |||||
| output_layout.remove_axis_inplace(axis); | |||||
| } | |||||
| } | |||||
| }; | |||||
| std::string err_msg; | |||||
| TensorPtr output; | |||||
| if (output_layout.is_empty()) { | |||||
| // output empty, no computation | |||||
| resolve_keepdim(); | |||||
| output = Tensor::make(output_layout, comp_node); | |||||
| } else if (dnn_input.layout.is_empty()) { | |||||
| // input is empty but the output is not: fill the output according to the mode | |||||
| resolve_keepdim(); | |||||
| output = Tensor::make(output_layout, comp_node); | |||||
| auto on_bad_empty_reduce = [](const char* name) { | |||||
| mgb_throw( | |||||
| MegBrainError, "empty input is not allowed for reduce mode: %s", | |||||
| name); | |||||
| }; | |||||
| switch (mode) { | switch (mode) { | ||||
| case Reduce::Mode::SUM: | case Reduce::Mode::SUM: | ||||
| if (!out->empty()) { | |||||
| dev_tensor_memset(out->dev_tensor(), 0); | |||||
| } | |||||
| // fill 0 | |||||
| dev_tensor_memset(output->dev_tensor(), 0); | |||||
| break; | break; | ||||
| case Reduce::Mode::PRODUCT: | |||||
| if (!out->empty()) { | |||||
| DnnOprCaller<megdnn::Fill> fill_op(comp_node); | |||||
| fill_op.op->param() = 1; | |||||
| fill_op.op->exec(out->dnn_tensor(), {}); | |||||
| } | |||||
| case Reduce::Mode::PRODUCT: { | |||||
| // fill 1 | |||||
| DnnOprCaller<megdnn::Fill> fill_op(comp_node, {1}); | |||||
| fill_op.exec_with_ws(output); | |||||
| break; | break; | ||||
| } | |||||
| case Reduce::Mode::MEAN: | case Reduce::Mode::MEAN: | ||||
| err_msg = "mean"; | |||||
| on_bad_empty_reduce("mean"); | |||||
| break; | break; | ||||
| case Reduce::Mode::MIN: | case Reduce::Mode::MIN: | ||||
| err_msg = "min"; | |||||
| on_bad_empty_reduce("min"); | |||||
| break; | break; | ||||
| case Reduce::Mode::MAX: | case Reduce::Mode::MAX: | ||||
| err_msg = "max"; | |||||
| on_bad_empty_reduce("max"); | |||||
| break; | break; | ||||
| case Reduce::Mode::SUM_SQR: | case Reduce::Mode::SUM_SQR: | ||||
| err_msg = "sum_sqr"; | |||||
| on_bad_empty_reduce("sum_sqr"); | |||||
| break; | break; | ||||
| default: | default: | ||||
| mgb_throw(MegBrainError, "bad reduce mode"); | mgb_throw(MegBrainError, "bad reduce mode"); | ||||
| } | } | ||||
| if (!err_msg.empty()) { | |||||
| mgb_throw( | |||||
| MegBrainError, "empty input is not allowed for reduce mode: %s", | |||||
| err_msg.c_str()); | |||||
| } else { | |||||
| // common reduction | |||||
| if (keepdim) { | |||||
| output = Tensor::make(output_layout, comp_node); | |||||
| dnn_op.exec_with_ws(dnn_input, output); | |||||
| } else { | |||||
| // used by megdnn::exec | |||||
| auto output_layout_keepdim = output_layout; | |||||
| resolve_keepdim(); | |||||
| output = Tensor::make(output_layout, comp_node); | |||||
| dnn_op.exec_with_ws(dnn_input, output->dnn_tensor(output_layout_keepdim)); | |||||
| } | } | ||||
| return {out}; | |||||
| } | } | ||||
| auto dnn_ten = inputs[0]->dnn_tensor(); | |||||
| dnn_ten.layout = src; | |||||
| inp_tensornds.push_back(dnn_ten); | |||||
| auto wk_size = dnn_op.op->get_workspace_in_bytes(src, layout); | |||||
| auto dnn_wk = dnn_op.create_workspace(wk_size); | |||||
| TensorLayout ori_layout = layout; | |||||
| if (!keepdim && src.ndim > 1) { | |||||
| layout.remove_axis_inplace(axis); | |||||
| layout.init_contiguous_stride(); | |||||
| } | |||||
| auto out = Tensor::make(layout, comp_node); | |||||
| auto dnn_out = out->dnn_tensor(); | |||||
| dnn_out.layout = ori_layout; | |||||
| dnn_op.op->exec(inp_tensornds[0], dnn_out, dnn_wk); | |||||
| return {out}; | |||||
| return {output}; | |||||
| } | } | ||||
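When the input of a Reduce is empty but the deduced output is not, the switch above writes the reduction's identity element instead of running the kernel: 0 for SUM, 1 for PRODUCT, and a MegBrainError for MEAN/MIN/MAX/SUM_SQR, which this diff chooses not to define on empty inputs. A standalone check of why 0 and 1 are the right fill values:

    // Reducing an empty range yields the identity element of the operation.
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> empty;
        int sum = std::accumulate(empty.begin(), empty.end(), 0);       // identity of +  -> 0
        int prod = std::accumulate(empty.begin(), empty.end(), 1,
                                   [](int a, int b) { return a * b; }); // identity of *  -> 1
        std::printf("sum=%d prod=%d\n", sum, prod);                     // prints "sum=0 prod=1"
    }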
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| @@ -184,16 +175,12 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| auto axis = op_def.param().axis; | auto axis = op_def.param().axis; | ||||
| auto keepdim = op_def.keepdim; | auto keepdim = op_def.keepdim; | ||||
| size_t size = inputs.size(); | |||||
| SmallVector<LogicalTensorDesc> dests(size); | |||||
| mgb_assert(inputs.size() > 0); | |||||
| auto&& comp_node = inputs[0].comp_node; | |||||
| auto&& input_layout = inputs[0].layout; | |||||
| for (size_t i = 0; i < size; i++) { | |||||
| if (inputs[i].layout.ndim == 0) { | |||||
| return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}}, | |||||
| false}; | |||||
| } | |||||
| } | |||||
| if (size > 1) { | |||||
| if (inputs.size() == 2) { | |||||
| // fall back to proxy_graph; this path matters for backward | |||||
| auto [output_descs, validated] = | auto [output_descs, validated] = | ||||
| proxy_graph_detail::infer_output_attrs_fallible(def, inputs); | proxy_graph_detail::infer_output_attrs_fallible(def, inputs); | ||||
| if (!inputs[1].value.empty()) { | if (!inputs[1].value.empty()) { | ||||
| @@ -203,30 +190,37 @@ std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||||
| return {output_descs, validated}; | return {output_descs, validated}; | ||||
| } | } | ||||
| mgb_assert(inputs.size() == 1); | |||||
| if (axis == INT_MAX) { | |||||
| // reduce to scalar | |||||
| // ignore keepdim because ndim is 1 | |||||
| auto&& dtype = input_layout.dtype; | |||||
| auto&& format = input_layout.format; | |||||
| auto output_layout = TensorLayout{{1}, dtype, format}; | |||||
| return {{{output_layout, comp_node}}, true}; | |||||
| } | |||||
| if (input_layout.ndim == 0) { | |||||
| // shape incomplete | |||||
| return {{{TensorLayout(input_layout.dtype, input_layout.format), comp_node}}, | |||||
| false}; | |||||
| } | |||||
| if (axis < 0) { | if (axis < 0) { | ||||
| axis = inputs[0].layout.ndim + axis; | |||||
| axis = input_layout.ndim + axis; | |||||
| } | } | ||||
| mgb_assert(axis >= 0 && axis < input_layout.ndim); | |||||
| if (axis == INT_MAX || inputs[0].layout.ndim == 1) { | |||||
| TensorLayout layout{inputs[0].layout.dtype}; | |||||
| layout.shape[0] = 1; | |||||
| layout.ndim = 1; | |||||
| dests[0].layout = layout; | |||||
| dests[0].comp_node = inputs[0].comp_node; | |||||
| TensorLayout output_layout = input_layout; | |||||
| bool remove_axis = (!keepdim) && input_layout.ndim > 1; | |||||
| if (remove_axis) { | |||||
| output_layout.remove_axis_inplace(axis); | |||||
| } else { | } else { | ||||
| for (size_t i = 0; i < size; ++i) { | |||||
| dests[i].comp_node = inputs[i].comp_node; | |||||
| dests[i].layout = inputs[i].layout; | |||||
| if (!keepdim && dests[i].layout.ndim > 1) { | |||||
| dests[i].layout.remove_axis_inplace(axis); | |||||
| } else { | |||||
| dests[i].layout.shape[axis] = 1; | |||||
| } | |||||
| dests[i].layout.init_contiguous_stride(); | |||||
| } | |||||
| output_layout.shape[axis] = 1; | |||||
| } | } | ||||
| return {dests, true}; | |||||
| output_layout.init_contiguous_stride(); | |||||
| return {{{output_layout, comp_node}}, true}; | |||||
| } | } | ||||
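The rewritten shape inference derives the output layout directly from the input layout: with keepdim the reduced axis is kept with extent 1, otherwise it is removed (unless the input is already 1-d). A standalone sketch of that shape logic, using plain vectors instead of TensorLayout:

    #include <cstddef>
    #include <vector>

    // Mirrors the keepdim handling in the hunk above (illustration only).
    std::vector<size_t> reduce_shape(std::vector<size_t> shp, size_t axis, bool keepdim) {
        if (!keepdim && shp.size() > 1) {
            shp.erase(shp.begin() + axis);   // drop the reduced axis
        } else {
            shp[axis] = 1;                   // keep it with extent 1
        }
        return shp;
    }
    // reduce_shape({2, 3, 4}, 1, true)  -> {2, 1, 4}
    // reduce_shape({2, 3, 4}, 1, false) -> {2, 4}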
| SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | ||||
| @@ -230,31 +230,19 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor( | |||||
| } | } | ||||
| auto dest_layout = TensorLayout({nr_elems}, dtype); | auto dest_layout = TensorLayout({nr_elems}, dtype); | ||||
| auto output = Tensor::make(dest_layout, comp_node); | auto output = Tensor::make(dest_layout, comp_node); | ||||
| auto caller = DnnOprCaller<megdnn::ParamPackConcat>(comp_node); | |||||
| size_t srcs_size = sizeof(void*) * nr_inputs; | |||||
| void** srcs_raw_ptr = (void**)comp_node.alloc_host(srcs_size); | |||||
| std::shared_ptr<dt_byte> srcs_ptr = { | |||||
| (dt_byte*)srcs_raw_ptr, | |||||
| [comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }}; | |||||
| // FIXME: add param to ParamPackConcat | |||||
| DnnOprCaller<megdnn::ParamPackConcat> caller{comp_node}; | |||||
| HostTensorStorage srcs_storage{comp_node}; | |||||
| srcs_storage.ensure_size(sizeof(void*) * nr_inputs); | |||||
| TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()}; | TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()}; | ||||
| size_t ws_size; | |||||
| { | |||||
| TensorShapeArray src_shapes; | |||||
| for (size_t i = 0; i < nr_inputs; ++i) { | |||||
| src_shapes.push_back(inputs[i]->shape()); | |||||
| } | |||||
| ws_size = caller.op->get_workspace_in_bytes( | |||||
| src_shapes, inputs.back()->shape(), TensorShape{}); | |||||
| } | |||||
| HostTensorND srcs_tensornd; | |||||
| srcs_tensornd.reset(srcs_storage, srcs_layout); | |||||
| auto* srcs_raw_ptr = reinterpret_cast<void**>(srcs_storage.ptr()); | |||||
| for (size_t i = 0; i < nr_inputs; ++i) { | for (size_t i = 0; i < nr_inputs; ++i) { | ||||
| srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr(); | |||||
| srcs_raw_ptr[i] = inputs[i]->dnn_tensor().raw_ptr(); | |||||
| } | } | ||||
| HostTensorStorage srcs_storage; | |||||
| srcs_storage.reset(comp_node, srcs_size, srcs_ptr); | |||||
| caller.op->exec( | |||||
| {srcs_raw_ptr, srcs_layout}, inputs.back()->dnn_tensor(), | |||||
| output->dnn_tensor(), caller.create_workspace(ws_size)); | |||||
| async_release(HostTensorND{comp_node, srcs_layout}.storage(srcs_storage)); | |||||
| caller.exec_with_ws(srcs_tensornd.as_megdnn(), inputs.back(), output); | |||||
| async_release(srcs_tensornd); | |||||
| return {output}; | return {output}; | ||||
| } | } | ||||
| @@ -33,69 +33,39 @@ VarNodeArray apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { | |||||
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | ||||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | ||||
| auto&& op = static_cast<const ROIAlign&>(def); | |||||
| if (inputs[0].layout.is_empty() || inputs[1].layout.is_empty()) { | |||||
| return {{{TensorLayout(inputs[0].layout.dtype), inputs[0].comp_node}, | |||||
| {TensorLayout(dtype::Int32()), inputs[1].comp_node}}, | |||||
| false}; | |||||
| } | |||||
| SmallVector<LogicalTensorDesc> descs(2u); | |||||
| size_t n = inputs[1].layout[0]; | |||||
| size_t c = inputs[0].layout[1]; | |||||
| descs[0].layout = TensorLayout( | |||||
| {n, c, op.pooled_height, op.pooled_width}, inputs[0].layout.dtype); | |||||
| descs[0].layout.init_contiguous_stride(); | |||||
| descs[0].comp_node = inputs[0].comp_node; | |||||
| descs[1].layout = | |||||
| TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||||
| descs[1].layout.init_contiguous_stride(); | |||||
| descs[1].comp_node = descs[0].comp_node; | |||||
| return {descs, true}; | |||||
| auto&& op = def.cast_final_safe<ROIAlign>(); | |||||
| DnnOprHelper<megdnn::ROIAlign> dnn_opr(op.param()); | |||||
| auto cn = inputs[0].comp_node; | |||||
| auto&& [out_layout, ind_layout] = | |||||
| dnn_opr.deduce_layouts<2>(inputs[0].layout, inputs[1].layout); | |||||
| bool validated = out_layout.ndim != 0 && ind_layout.ndim != 0; | |||||
| return {{{out_layout, cn}, {ind_layout, cn}}, validated}; | |||||
| } | } | ||||
| SmallVector<TensorPtr> apply_on_physical_tensor( | SmallVector<TensorPtr> apply_on_physical_tensor( | ||||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | const OpDef& def, const SmallVector<TensorPtr>& inputs, | ||||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | ||||
| auto&& op = static_cast<const ROIAlign&>(def); | |||||
| CompNode cn = inputs[0]->comp_node(); | |||||
| auto&& op = def.cast_final_safe<ROIAlign>(); | |||||
| auto cn = inputs[0]->comp_node(); | |||||
| TensorLayout out_layout = output_descs[0].layout; | |||||
| TensorLayout ind_layout = output_descs[1].layout; | |||||
| if (!validated) { | |||||
| size_t n = inputs[1]->layout()[0]; | |||||
| size_t c = inputs[0]->layout()[1]; | |||||
| out_layout = TensorLayout( | |||||
| {n, c, op.pooled_height, op.pooled_width}, inputs[0]->layout().dtype); | |||||
| out_layout.init_contiguous_stride(); | |||||
| ind_layout = | |||||
| TensorLayout({n, c, op.pooled_height, op.pooled_width}, dtype::Int32()); | |||||
| ind_layout.init_contiguous_stride(); | |||||
| } | |||||
| DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn, op.param()); | |||||
| auto&& [out_layout, ind_layout] = [&]() -> std::array<TensorLayout, 2> { | |||||
| if (validated) { | |||||
| return {output_descs[0].layout, output_descs[1].layout}; | |||||
| } else { | |||||
| return dnn_opr.deduce_layouts<2>(inputs[0]->layout(), inputs[1]->layout()); | |||||
| } | |||||
| }(); | |||||
| DeviceTensorND out = | |||||
| BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); | |||||
| DeviceTensorND inds = | |||||
| BlobManager::inst()->alloc_workspace_with_defrag(cn, ind_layout); | |||||
| auto out = Tensor::make(out_layout, cn); | |||||
| auto ind = Tensor::make(ind_layout, cn); | |||||
| if (out_layout.is_empty() || ind_layout.is_empty()) { | if (out_layout.is_empty() || ind_layout.is_empty()) { | ||||
| return {Tensor::make(out), Tensor::make(inds)}; | |||||
| return {out, ind}; | |||||
| } | } | ||||
| DnnOprCaller<megdnn::ROIAlign> dnn_opr(cn); | |||||
| dnn_opr.op->param() = op.param(); | |||||
| size_t sz = dnn_opr.op->get_workspace_in_bytes( | |||||
| inputs[0]->layout(), inputs[1]->layout(), out_layout, ind_layout); | |||||
| auto dnn_wk = dnn_opr.create_workspace(sz); | |||||
| dnn_opr.op->exec( | |||||
| inputs[0]->dnn_tensor(), inputs[1]->dnn_tensor(), out.as_megdnn(), | |||||
| inds.as_megdnn(), dnn_wk); | |||||
| return {Tensor::make(out), Tensor::make(inds)}; | |||||
| dnn_opr.exec_with_ws(inputs[0], inputs[1], out, ind); | |||||
| return {out, ind}; | |||||
| } | } | ||||
| SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | SmallVector<VarNode::LayoutConstraintCallback> get_input_layout_constraint( | ||||
| @@ -570,11 +570,17 @@ bool Tensor::empty() { | |||||
| return !m_blob->size(); | return !m_blob->size(); | ||||
| } | } | ||||
| megdnn::TensorND Tensor::dnn_tensor() { | |||||
| DnnTensorND Tensor::dnn_tensor() { | |||||
| mgb_assert(m_blob, "uninitialized tensor."); | mgb_assert(m_blob, "uninitialized tensor."); | ||||
| mgb_assert(m_layout.ndim, "dnn don't support scalar"); | |||||
| return DnnTensorND{m_layout, m_blob->storage(), m_offset}; | return DnnTensorND{m_layout, m_blob->storage(), m_offset}; | ||||
| } | } | ||||
| DnnTensorND Tensor::dnn_tensor(TensorShape new_shape) { | |||||
| mgb_assert(m_blob, "uninitialized tensor."); | |||||
| return DnnTensorND{m_layout.reshape(new_shape), m_blob->storage(), m_offset}; | |||||
| } | |||||
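The new dnn_tensor(TensorShape) overload returns a reshaped view over the same blob and offset; the Reduce hunk above uses it to flatten the input before a reduce-to-scalar. A sketch of that use, assuming the surrounding MegEngine types:

    // Flatten an N-d tensor into a 1-d megdnn view without copying (from the Reduce hunk).
    // Relies on TensorLayout::reshape(), so the layout is expected to be reshape-compatible
    // (typically contiguous).
    auto flat = inputs[0]->dnn_tensor({inputs[0]->shape().total_nr_elems()});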
| void Tensor::fetch_value() { | void Tensor::fetch_value() { | ||||
| MGB_LOCK_GUARD(m_value_mtx); | MGB_LOCK_GUARD(m_value_mtx); | ||||
| if (m_value.empty()) { | if (m_value.empty()) { | ||||
| @@ -334,9 +334,16 @@ public: | |||||
| size_t j = 0; | size_t j = 0; | ||||
| for (auto&& var : m_opr->output()) { | for (auto&& var : m_opr->output()) { | ||||
| if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { | if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { | ||||
| TensorLayout layout{var->shape(), var->dtype(), var->format()}; | |||||
| var->m_dev_tensor = BlobManager::inst()->alloc_workspace_with_defrag( | |||||
| var->comp_node(), layout); | |||||
| auto comp_node = var->comp_node(); | |||||
| auto dtype = var->dtype(); | |||||
| auto&& shape = var->shape(); | |||||
| size_t size = dtype.size(shape.total_nr_elems()); | |||||
| mgb_assert( | |||||
| var->format().is_default(), "non default format for workspace"); | |||||
| auto raw_storage = Blob::make(comp_node, size)->storage(); | |||||
| DeviceTensorStorage storage; | |||||
| storage.reset(comp_node, size, raw_storage); | |||||
| var->m_dev_tensor.reset(storage, {shape, dtype}); | |||||
| } else { | } else { | ||||
| mgb_assert(j < outputs.size()); | mgb_assert(j < outputs.size()); | ||||
| auto&& tensor = outputs[j]; | auto&& tensor = outputs[j]; | ||||
| @@ -1,6 +1,7 @@ | |||||
| #pragma once | #pragma once | ||||
| #include "megbrain/imperative/physical_tensor.h" | #include "megbrain/imperative/physical_tensor.h" | ||||
| #include "megbrain/imperative/utils/helper.h" | |||||
| namespace mgb { | namespace mgb { | ||||
| namespace imperative { | namespace imperative { | ||||
| @@ -15,13 +16,19 @@ public: | |||||
| virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0; | virtual void alloc_direct(OwnedBlob* blob, size_t size) = 0; | ||||
| virtual bool try_alloc_direct(OwnedBlob* blob, size_t size) { | |||||
| try { | |||||
| alloc_direct(blob, size); | |||||
| return true; | |||||
| } catch (MemAllocError&) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0; | virtual void alloc_with_defrag(OwnedBlob* blob, size_t size) = 0; | ||||
| virtual void set_allocator(allocator_t allocator) = 0; | virtual void set_allocator(allocator_t allocator) = 0; | ||||
| virtual DeviceTensorND alloc_workspace_with_defrag( | |||||
| CompNode cn, TensorLayout& layout) = 0; | |||||
| virtual void register_blob(OwnedBlob* blob) = 0; | virtual void register_blob(OwnedBlob* blob) = 0; | ||||
| virtual void unregister_blob(OwnedBlob* blob) = 0; | virtual void unregister_blob(OwnedBlob* blob) = 0; | ||||
| @@ -89,24 +89,19 @@ using EventPtr = std::unique_ptr<CompNode::Event, EventDeleter>; | |||||
| class Tensor; | class Tensor; | ||||
| using TensorPtr = std::shared_ptr<Tensor>; | using TensorPtr = std::shared_ptr<Tensor>; | ||||
| /* | |||||
| using DnnTensorND to save the reference count of workspace | |||||
| allocted by blobmanager to prevent invalidation | |||||
| */ | |||||
| struct DnnTensorND : megdnn::TensorND { | struct DnnTensorND : megdnn::TensorND { | ||||
| private: | |||||
| std::shared_ptr<dt_byte> m_reference; | |||||
| // hold an extra reference so defrag cannot invalidate the storage while in use | |||||
| std::shared_ptr<dt_byte> reference; | |||||
| public: | |||||
| DnnTensorND(TensorLayout& layout_, std::shared_ptr<dt_byte> ref_ptr, size_t offset) | |||||
| : megdnn::TensorND(layout_, {ref_ptr.get(), offset}) { | |||||
| m_reference = ref_ptr; | |||||
| DnnTensorND( | |||||
| const TensorLayout& layout_, std::shared_ptr<dt_byte> ptr, size_t offset) | |||||
| : megdnn::TensorND(layout_, {ptr.get(), offset}) { | |||||
| reference = std::move(ptr); | |||||
| } | } | ||||
| }; | }; | ||||
| class Tensor : public NonCopyableObj { | class Tensor : public NonCopyableObj { | ||||
| public: | public: | ||||
| Tensor() = default; | |||||
| Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, | Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, | ||||
| const HostTensorND& hv = {}); | const HostTensorND& hv = {}); | ||||
| Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) | Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) | ||||
| @@ -154,7 +149,9 @@ public: | |||||
| void assign_from_dev_tensor(DeviceTensorND); | void assign_from_dev_tensor(DeviceTensorND); | ||||
| megdnn::TensorND dnn_tensor(); | |||||
| DnnTensorND dnn_tensor(); | |||||
| DnnTensorND dnn_tensor(TensorShape new_shape); | |||||
| static TensorPtr make_scalar(DTypeScalar value, CompNode cn); | static TensorPtr make_scalar(DTypeScalar value, CompNode cn); | ||||
| @@ -3,6 +3,7 @@ | |||||
| #include <iomanip> | #include <iomanip> | ||||
| #include <memory> | #include <memory> | ||||
| #include <mutex> | #include <mutex> | ||||
| #include <optional> | |||||
| #include <sstream> | #include <sstream> | ||||
| #include "megbrain/utils/metahelper.h" | #include "megbrain/utils/metahelper.h" | ||||
| @@ -14,11 +15,28 @@ namespace imperative { | |||||
| template <typename T = std::function<void()>> | template <typename T = std::function<void()>> | ||||
| class CleanupGuard : public NonCopyableObj { | class CleanupGuard : public NonCopyableObj { | ||||
| private: | private: | ||||
| T m_callback; | |||||
| std::optional<T> m_callback; | |||||
| public: | public: | ||||
| CleanupGuard() = default; | |||||
| explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {} | explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {} | ||||
| ~CleanupGuard() { m_callback(); } | |||||
| ~CleanupGuard() { reset(); } | |||||
| CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) { | |||||
| rhs.m_callback.reset(); | |||||
| } | |||||
| CleanupGuard& operator=(CleanupGuard&& rhs) { | |||||
| swap(m_callback, rhs.m_callback); | |||||
| rhs.reset(); | |||||
| return *this; | |||||
| } | |||||
| public: | |||||
| void reset() { | |||||
| if (m_callback) { | |||||
| (*m_callback)(); | |||||
| m_callback.reset(); | |||||
| } | |||||
| } | |||||
| }; | }; | ||||
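CleanupGuard now stores its callback in std::optional, which is what lets the new move operations and reset() leave a guard inert once its callback has been taken or fired. A self-contained version of the class as shown in the hunk (the NonCopyableObj base is omitted here for brevity) together with a usage sketch:

    #include <cstdio>
    #include <functional>
    #include <optional>
    #include <utility>

    template <typename T = std::function<void()>>
    class CleanupGuard {
        std::optional<T> m_callback;

    public:
        CleanupGuard() = default;
        explicit CleanupGuard(T cb) : m_callback{std::move(cb)} {}
        ~CleanupGuard() { reset(); }
        CleanupGuard(CleanupGuard&& rhs) : m_callback(std::move(rhs.m_callback)) {
            rhs.m_callback.reset();  // a moved-from optional stays engaged, so clear it explicitly
        }
        CleanupGuard& operator=(CleanupGuard&& rhs) {
            std::swap(m_callback, rhs.m_callback);
            rhs.reset();             // *this's old callback (now held by rhs) fires immediately
            return *this;
        }
        void reset() {               // run the callback at most once, then disarm
            if (m_callback) {
                (*m_callback)();
                m_callback.reset();
            }
        }
    };

    int main() {
        CleanupGuard<> g([] { std::puts("cleanup runs exactly once"); });
        CleanupGuard<> h(std::move(g));  // g is now inert
        h.reset();                       // prints; destroying g and h afterwards prints nothing
    }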
| inline std::string quoted(std::string str) { | inline std::string quoted(std::string str) { | ||||
| @@ -33,6 +51,19 @@ inline std::string quoted(std::string str) { | |||||
| std::call_once(_once_flag, [&] { __VA_ARGS__; }); \ | std::call_once(_once_flag, [&] { __VA_ARGS__; }); \ | ||||
| } while (false) | } while (false) | ||||
| template <typename T> | |||||
| struct is_small_vector { | |||||
| static constexpr bool value = false; | |||||
| }; | |||||
| template <typename T> | |||||
| struct is_small_vector<SmallVector<T>> { | |||||
| static constexpr bool value = true; | |||||
| }; | |||||
| template <typename T> | |||||
| static constexpr bool is_small_vector_v = is_small_vector<T>::value; | |||||
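is_small_vector is a standard partial-specialization trait; presumably it lets generic helpers (such as the DnnOprCaller used with a SmallVector of tensors in the CheckNonFinite hunk) branch on whether an argument is a SmallVector, though that dispatch site is an assumption, not something this diff shows. The trait pattern itself, demonstrated with a stand-in container so the example compiles on its own:

    #include <vector>

    // Stand-in for megbrain's SmallVector, only to make the example self-contained.
    template <typename T>
    struct SmallVector : std::vector<T> {};

    template <typename T>
    struct is_small_vector {
        static constexpr bool value = false;
    };
    template <typename T>
    struct is_small_vector<SmallVector<T>> {
        static constexpr bool value = true;   // matched by the partial specialization
    };
    template <typename T>
    constexpr bool is_small_vector_v = is_small_vector<T>::value;

    static_assert(is_small_vector_v<SmallVector<int>>);
    static_assert(!is_small_vector_v<std::vector<int>>);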
| } // namespace imperative | } // namespace imperative | ||||
| } // namespace mgb | } // namespace mgb | ||||
| @@ -6,4 +6,10 @@ namespace mgb::imperative { | |||||
| std::string demangle(std::string mangled); | std::string demangle(std::string mangled); | ||||
| template <typename T> | |||||
| const char* demangled_typename() { | |||||
| static auto name = demangle(typeid(T).name()); | |||||
| return name.c_str(); | |||||
| } | } | ||||
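demangled_typename caches the demangled string in a function-local static, so the returned const char* stays valid for the rest of the program and repeated calls do no extra work. A usage sketch, assuming this header is included and demangle() is linked in:

    #include <cstdio>
    #include <vector>

    void log_type() {
        using mgb::imperative::demangled_typename;
        // e.g. prints something like "std::vector<int, std::allocator<int>>"
        // instead of the mangled "St6vectorIiSaIiEE".
        std::printf("%s\n", demangled_typename<std::vector<int>>());
    }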
| } // namespace mgb::imperative | |||||
| @@ -314,7 +314,8 @@ void CondTake::init_output_static_infer_desc() { | |||||
| auto dtype = input(0)->dtype(); | auto dtype = input(0)->dtype(); | ||||
| TensorLayout ily(iv.val[0].shape(), dtype); | TensorLayout ily(iv.val[0].shape(), dtype); | ||||
| dest.ndim = 1; | dest.ndim = 1; | ||||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily); | |||||
| TensorLayout mly(iv.val[0].shape(), dtype::Int32()); | |||||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes(ily, mly); | |||||
| return true; | return true; | ||||
| }; | }; | ||||
| owner_graph()->static_infer_manager().register_shape_infer( | owner_graph()->static_infer_manager().register_shape_infer( | ||||
| @@ -548,9 +549,9 @@ void CheckNonFinite::init_output_static_infer_desc() { | |||||
| auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | ||||
| dest.ndim = 1; | dest.ndim = 1; | ||||
| megdnn::TensorNDArray inp_arr(input().size()); | |||||
| SmallVector<megdnn::TensorLayout> inp_arr(input().size()); | |||||
| for (size_t i = 0; i < input().size(); ++i) { | for (size_t i = 0; i < input().size(); ++i) { | ||||
| inp_arr[i] = {NULL, {inp.val.at(i).shape(), input(0)->dtype()}}; | |||||
| inp_arr[i] = {inp.val.at(i).shape(), input(0)->dtype()}; | |||||
| } | } | ||||
| dest.shape[0] = megdnn_opr()->get_workspace_in_bytes( | dest.shape[0] = megdnn_opr()->get_workspace_in_bytes( | ||||
| inp_arr, {output(input().size() + 1)->shape(), | inp_arr, {output(input().size() + 1)->shape(), | ||||
| @@ -1447,11 +1447,8 @@ void ParamPackConcat::init_output_static_infer_desc() { | |||||
| auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | auto infer_wk = [this](TensorShape& dest, const InpVal& inp) { | ||||
| TensorShapeArray shapes; | TensorShapeArray shapes; | ||||
| auto vals = inp.val; | auto vals = inp.val; | ||||
| shapes.reserve(vals.size() - 1); | |||||
| for (size_t i = 0; i < vals.size() - 1; i++) { | |||||
| shapes.push_back(vals[i].shape()); | |||||
| } | |||||
| dest = {m_opr->get_workspace_in_bytes(shapes, vals.back().shape(), dest)}; | |||||
| size_t nr_params = vals.size() - 1; | |||||
| dest = {m_opr->get_workspace_in_bytes({nr_params}, vals.back().shape(), dest)}; | |||||
| return true; | return true; | ||||
| }; | }; | ||||
| mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out}); | mgr.register_shape_infer(output(0), {SourceType::DEP, shp_deps, infer_out}); | ||||
| @@ -970,8 +970,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( | |||||
| if (!policy.algo.valid()) | if (!policy.algo.valid()) | ||||
| continue; | continue; | ||||
| size_t workspace_needed = get_workspace_size_bytes(policy); | size_t workspace_needed = get_workspace_size_bytes(policy); | ||||
| if (m_inputs != nullptr) | |||||
| if (m_inputs == nullptr) { | |||||
| workspace_needed += data_size; | workspace_needed += data_size; | ||||
| } | |||||
| if (workspace_needed > | if (workspace_needed > | ||||
| m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { | m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { | ||||
| continue; | continue; | ||||
| @@ -18,7 +18,8 @@ failed_files = Manager().list() | |||||
| def process_file(file, clang_format, write): | def process_file(file, clang_format, write): | ||||
| source = open(file, "r").read() | |||||
| original_source = open(file, "r").read() | |||||
| source = original_source | |||||
| source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source) | source = re.sub(r"MGB_DEFINE(?P<r>([^\\]|\n)*?)// *{", r"class MGB_DEFINE\g<r>{", source) | ||||
| source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source) | source, count = re.subn(r"(?<!#define )MGB_DEFINE(.*) +\\", r"class MGB_DEFINE\1{\\", source) | ||||
| @@ -38,7 +39,7 @@ def process_file(file, clang_format, write): | |||||
| result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result) | result = re.sub(r"class MGB_DEFINE(.*){( *)\\", r"MGB_DEFINE\1\2 \\", result) | ||||
| result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result) | result = re.sub(r"class MGB_DEFINE((.|\n)*?){", r"MGB_DEFINE\1// {", result) | ||||
| if write: | |||||
| if write and original_source != result: | |||||
| with tempfile.NamedTemporaryFile( | with tempfile.NamedTemporaryFile( | ||||
| dir=os.path.dirname(file), delete=False | dir=os.path.dirname(file), delete=False | ||||
| ) as tmp_file: | ) as tmp_file: | ||||