GitOrigin-RevId: 1523833fcb
@@ -537,6 +537,11 @@ set(MGB_CUDA ${MGE_WITH_CUDA})
set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})

# ROCM
set(MGB_ROCM ${MGE_WITH_ROCM})
set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM})

# CAMBRICON
set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
@@ -0,0 +1,18 @@
/**
 * \file dnn/include/hcc_detail/hcc_defs_epilogue.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#ifdef __HIP_PLATFORM_HCC__
#undef __HIP_PLATFORM_HCC__
#else
#error "hcc_defs_epilogue.h must be included after hcc_defs_prologue.h"
#endif

// vim: syntax=cpp.doxygen
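Taken together with the prologue added below, these two headers bracket HIP-specific code so that __HIP_PLATFORM_HCC__ never leaks into unrelated translation units. A minimal usage sketch (the function body is illustrative, not part of this patch):

    // hypothetical example: the macro pair brackets any HIP API usage
    #include "hcc_detail/hcc_defs_prologue.h"  // defines __HIP_PLATFORM_HCC__
    #include "hip_header.h"                    // HIP headers require the macro

    void wait_stream(hipStream_t stream) {
        hipStreamSynchronize(stream);  // ordinary HIP runtime call
    }

    #include "hcc_detail/hcc_defs_epilogue.h"  // undefines the macro again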
@@ -0,0 +1,14 @@
/**
 * \file dnn/include/hcc_detail/hcc_defs_prologue.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#define __HIP_PLATFORM_HCC__

// vim: syntax=cpp.doxygen
@@ -0,0 +1,35 @@
/**
 * \file dnn/include/hip_header.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

/**
 * \remarks The files in the subdirectory include/hip are copied from the HIP
 * headers provided by ROCm-Developer-Tools/HIP, which can be found at
 * https://github.com/ROCm-Developer-Tools/HIP. These files are included so
 * that MegDNN can be compiled with both the CUDA and ROCm backends, and the
 * two backends can share the same code.
 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#pragma GCC diagnostic ignored "-Wsign-compare"

#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

#pragma GCC diagnostic pop

#if !defined(__HIP_PLATFORM_HCC__)
#error "platform macro __HIP_PLATFORM_HCC__ must be defined"
#endif

// vim: syntax=cpp.doxygen
@@ -19,6 +19,7 @@
typedef enum {
    megcorePlatformCPU = 1,
    megcorePlatformCUDA = 4,
    megcorePlatformROCM = 6,
    megcorePlatformCambricon = 7,
    megcorePlatformAtlas = 8,
} megcorePlatform_t;
@@ -0,0 +1,70 @@
/**
 * \file dnn/include/megcore_rocm.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "./megcore.h"
#include "hip_header.h"

#include <atomic>

#include "megdnn/internal/visibility_prologue.h"

namespace megcore {
struct ROCMContext {
    hipStream_t stream = nullptr;

    static std::atomic_bool sm_miopen_algo_search;
    static inline bool enable_miopen_algo_search() {
        return sm_miopen_algo_search.load();
    }
    static inline void enable_miopen_algo_search(bool enable_algo_search) {
        sm_miopen_algo_search.store(enable_algo_search);
    }

    //! device pointer to buffer for error reporting from kernels
    AsyncErrorInfo* error_info = nullptr;

    ROCMContext() = default;
    ROCMContext(hipStream_t s, AsyncErrorInfo* e) : stream{s}, error_info{e} {}
};

megcoreStatus_t createComputingHandleWithROCMContext(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, const ROCMContext& ctx);

megcoreStatus_t getROCMContext(megcoreComputingHandle_t handle,
                               ROCMContext* ctx);

// Set MIOpen algo search enabled or disabled
megcoreStatus_t enableMIOpenAlgoSearch(bool enable_algo_search = true);
// Find out whether MIOpen algo search is enabled or disabled
megcoreStatus_t getMIOpenAlgoSearchStatus(bool* algo_search_enabled);
}  // namespace megcore

static inline megcoreStatus_t megcoreCreateComputingHandleWithROCMStream(
        megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
        unsigned int flags, hipStream_t stream) {
    megcore::ROCMContext ctx;
    ctx.stream = stream;
    return megcore::createComputingHandleWithROCMContext(compHandle, devHandle,
                                                         flags, ctx);
}

static inline megcoreStatus_t megcoreGetROCMStream(
        megcoreComputingHandle_t handle, hipStream_t* stream) {
    megcore::ROCMContext ctx;
    auto ret = megcore::getROCMContext(handle, &ctx);
    *stream = ctx.stream;
    return ret;
}

#include "megdnn/internal/visibility_epilogue.h"

// vim: syntax=cpp.doxygen
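For reference, a minimal sketch of how client code would wrap an existing HIP stream into a megcore computing handle with the helpers above (error checking elided; megcoreCreateDeviceHandle and the destroy calls are the pre-existing megcore C API):

    #include "megcore_rocm.h"

    void run_on_stream(int device_id, hipStream_t stream) {
        megcoreDeviceHandle_t dev;
        megcoreCreateDeviceHandle(&dev, megcorePlatformROCM, device_id);

        megcoreComputingHandle_t comp;
        megcoreCreateComputingHandleWithROCMStream(&comp, dev, 0, stream);

        // ... create a megdnn::Handle from `comp` and run operators ...

        megcoreDestroyComputingHandle(comp);
        megcoreDestroyDeviceHandle(dev);
    }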
@@ -33,6 +33,7 @@ class Handle {
        ARMV7 = 4,
        AARCH64 = 5,
        CUDA = 6,
        ROCM = 11,
        ATLAS = 13,
        CAMBRICON = 12,
    };
@@ -71,6 +72,13 @@ class Handle {
        template <typename opr>
        std::unique_ptr<opr> create_cuda_operator();
#endif
#if MEGDNN_WITH_ROCM
        static std::unique_ptr<Handle> make_rocm_handle(
                megcoreComputingHandle_t computing_handle);
        template <typename opr>
        std::unique_ptr<opr> create_rocm_operator();
#endif

        virtual ~Handle();
@@ -11,6 +11,7 @@ def main():
        description='generate elemwise impl files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--type', type=str, choices=['cuda',
                                                     'hip',
                                                     'cpp'],
                        default='cpp', help='generate cuda/hip kernel file')
    parser.add_argument('output', help='output directory')
@@ -21,6 +22,8 @@ def main():
    if args.type == 'cuda':
        cpp_ext = 'cu'
    elif args.type == 'hip':
        cpp_ext = 'cpp.hip'
    else:
        assert args.type == 'cpp'
        cpp_ext = 'cpp'
@@ -11,6 +11,7 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--type', type=str, choices=[
                            'cuda',
                            'hip'
                        ],
                        default='cuda',
                        help='generate cuda/hip elemwise special kernel file')
@@ -22,6 +23,9 @@ def main():
    if args.type == 'cuda':
        cpp_ext = 'cu'
    else:
        assert args.type == 'hip'
        cpp_ext = 'cpp.hip'

    for dtype in DTYPES.keys():
        fname = 'special_{}.{}'.format(dtype, cpp_ext)
@@ -91,6 +91,13 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
            }
        }
        MIDOUT_END();
#endif
    }
    else if (platform == megcorePlatformROCM) {
#if MEGDNN_WITH_ROCM
        return make_rocm_handle(computing_handle);
#else
        return nullptr;
#endif
    }
    else if (platform == megcorePlatformCambricon) {
@@ -193,6 +200,14 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
#if MEGDNN_WITH_ATLAS
        CASE(ATLAS, atlas);
#endif
#if MEGDNN_WITH_ROCM
        case HandleType::ROCM: {
            MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::ROCM)) {
                return create_rocm_operator<Opr>();
            }
            MIDOUT_END();
        }
#endif
#if MEGDNN_WITH_CAMBRICON
        CASE(CAMBRICON, cambricon);
#endif
@@ -18,6 +18,10 @@
#endif

#if MEGDNN_WITH_ROCM
#include "src/rocm/megcore/computing_context.hpp"
#endif

#if MEGDNN_WITH_CAMBRICON
#include "src/cambricon/megcore/cambricon_computing_context.hpp"
#endif
@@ -41,6 +45,10 @@ std::unique_ptr<ComputingContext> ComputingContext::make(
        case megcorePlatformCUDA:
            return make_unique<cuda::CUDAComputingContext>(dev_handle, flags);
#endif
#if MEGDNN_WITH_ROCM
        case megcorePlatformROCM:
            return make_rocm_computing_context(dev_handle, flags);
#endif
#if MEGDNN_WITH_CAMBRICON
        case megcorePlatformCambricon:
            return make_unique<cambricon::CambriconComputingContext>(dev_handle,
@@ -15,6 +15,9 @@
#if MEGDNN_WITH_CUDA
#include "src/cuda/megcore/cuda_device_context.hpp"
#endif
#if MEGDNN_WITH_ROCM
#include "src/rocm/megcore/device_context.hpp"
#endif
#if MEGDNN_WITH_CAMBRICON
#include "src/cambricon/megcore/cambricon_device_context.hpp"
#endif
@@ -36,6 +39,10 @@ std::unique_ptr<DeviceContext> DeviceContext::make(megcorePlatform_t platform,
        case megcorePlatformCUDA:
            return make_unique<cuda::CUDADeviceContext>(deviceID, flags);
#endif
#if MEGDNN_WITH_ROCM
        case megcorePlatformROCM:
            return make_rocm_device_context(deviceID, flags);
#endif
#if MEGDNN_WITH_CAMBRICON
        case megcorePlatformCambricon:
            return make_unique<cambricon::CambriconDeviceContext>(deviceID,
@@ -0,0 +1,28 @@
/**
 * \file src/rocm/add_update/add_update.cpp.hip
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
 */
#include "hcc_detail/hcc_defs_prologue.h"

#include "./add_update.h.hip"

namespace megdnn {
namespace rocm {

#define cb(_dtype)                                                          \
    INST_RUN_ELEMWISE(AddUpdateKernOp<DTypeTrait<_dtype>::ctype>,           \
                      DTypeTrait<_dtype>::ctype, 1);                        \
    INST_RUN_ELEMWISE(AddUpdateKernOpNonContig<DTypeTrait<_dtype>::ctype>,  \
                      DTypeTrait<_dtype>::ctype, 2);

MEGDNN_FOREACH_COMPUTING_DTYPE(cb)

}  // namespace rocm
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen
@@ -0,0 +1,61 @@
/**
 * \file src/rocm/add_update/add_update.h.hip
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
 */
#pragma once

#include "hip_header.h"
#include "src/rocm/elemwise_helper.h.hip"

#if MEGDNN_CC_HOST
#include "megdnn/oprs.h"
#endif

namespace megdnn {
namespace rocm {

template <typename ctype>
struct AddUpdateKernOp {
    ctype* dst;
    ctype alpha, beta, bias;

    __device__ void operator()(uint32_t idx, ctype delta) {
        dst[idx] = dst[idx] * alpha + delta * beta + bias;
    }

#if MEGDNN_CC_HOST
    AddUpdateKernOp(const TensorND& dest, const AddUpdate::Param& param)
            : dst{dest.ptr<ctype>()},
              alpha(param.alpha), beta(param.beta), bias(param.bias) {}
#endif
};

template <typename ctype>
struct AddUpdateKernOpNonContig {
    ctype alpha, beta, bias;

    __device__ void operator()(uint32_t /*idx*/, ctype& dst, ctype delta) {
        dst = dst * alpha + delta * beta + bias;
    }

#if MEGDNN_CC_HOST
    AddUpdateKernOpNonContig(const AddUpdate::Param& param)
            : alpha(param.alpha), beta(param.beta), bias(param.bias) {}
#endif
};

}  // namespace rocm
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen
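Both functors implement the same elementwise rule, dst = dst * alpha + delta * beta + bias; only the addressing differs (the contiguous variant indexes dst directly, while the non-contiguous one receives dst by reference from the elemwise visitor). A host-side reference of the semantics, as a plain C++ sketch (illustrative only, not part of the patch):

    #include <cstddef>

    // Reference semantics of AddUpdate on contiguous host buffers.
    template <typename T>
    void add_update_ref(T* dst, const T* delta, std::size_t n,
                        T alpha, T beta, T bias) {
        for (std::size_t i = 0; i < n; ++i)
            dst[i] = dst[i] * alpha + delta[i] * beta + bias;
    }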
@@ -0,0 +1,67 @@
/**
 * \file dnn/src/rocm/add_update/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"
#include "src/rocm/add_update/add_update.h.hip"

#include "src/common/utils.h"

using namespace megdnn;
using namespace rocm;

void AddUpdateForwardImpl::exec(_megdnn_tensor_inout dest,
                                _megdnn_tensor_in delta) {
    check_exec(dest.layout, delta.layout);
    if (!dest.layout.is_contiguous()) {
        return exec_noncontig(dest, delta);
    }
    ElemwiseOpParamN<1> param;
    param[0] = delta;
    param[0].layout = param[0].layout.broadcast(dest.layout);
    param.init_from_given_tensor();
    auto stream = hip_stream(handle());
    switch (dest.layout.dtype.enumv()) {
#define cb(_dt)                                                     \
    case DTypeTrait<_dt>::enumv: {                                  \
        using ctype = DTypeTrait<_dt>::ctype;                       \
        return run_elemwise<AddUpdateKernOp<ctype>, ctype, 1>(      \
                param, stream, {dest, m_param});                    \
    }
        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
#undef cb
        default:
            megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
    }
}

void AddUpdateForwardImpl::exec_noncontig(_megdnn_tensor_inout dest,
                                          _megdnn_tensor_in delta) {
    ElemwiseOpParamN<2> param = make_param(dest, delta);
    auto stream = hip_stream(handle());
    switch (dest.layout.dtype.enumv()) {
#define cb(_dt)                                                             \
    case DTypeTrait<_dt>::enumv: {                                          \
        using ctype = DTypeTrait<_dt>::ctype;                               \
        return run_elemwise<AddUpdateKernOpNonContig<ctype>, ctype, 2>(     \
                param, stream, {m_param});                                  \
    }
        MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
#undef cb
        default:
            megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
    }
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,35 @@
/**
 * \file dnn/src/rocm/add_update/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megdnn/oprs.h"
#include "src/common/add_update_helper.h"
#include "src/rocm/utils.h"

namespace megdnn {
namespace rocm {

class AddUpdateForwardImpl final : public AddUpdateForwardHelper {
    void exec_noncontig(_megdnn_tensor_inout dest, _megdnn_tensor_in delta);

public:
    using AddUpdateForwardHelper::AddUpdateForwardHelper;

    void exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) override;

    bool is_thread_safe() const override { return true; }
};

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,26 @@
/**
 * \file src/rocm/argmxx/argmxx.cpp.hip
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "hip_header.h"

#include "src/common/argmxx_helper.h"

#include "src/rocm/reduce_helper.h.hip"
#include "megdnn/dtype.h"

namespace megdnn {
namespace rocm {

#define INST(_dt)                                                             \
    INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA false>,  \
                false);                                                       \
    INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA true>,   \
                false);

MEGDNN_FOREACH_COMPUTING_DTYPE(INST)

}  // namespace rocm
}  // namespace megdnn
@@ -0,0 +1,129 @@
/**
 * \file dnn/src/rocm/argmxx/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "src/rocm/argmxx/opr_impl.h"

#include "src/rocm/utils.h"
#include "src/common/reduce_helper.h"
#include "src/common/argmxx_helper.h"
#include "src/rocm/reduce_helper.h.hip"

namespace {

using namespace megdnn;
using namespace rocm;
using namespace argmxx;

template <typename T, bool is_max>
size_t get_workspace_in_bytes_impl(const TensorLayout& src,
                                   const TensorLayout& /* dst */,
                                   size_t axis) {
    size_t A, B, C;
    reduce::get_ABC(src, A, B, C, axis);
    return get_reduce_workspace_in_bytes<argmxx::ArgmxxOp<T, is_max>>(A, B, C);
}

template <typename T, bool is_max>
void exec_impl(const T* src, int* dst, void* workspace,
               size_t A, size_t B, size_t C, hipStream_t stream) {
    argmxx::ArgmxxOp<T, is_max> opr(const_cast<T*>(src), dst, A, B, C);
    run_reduce<argmxx::ArgmxxOp<T, is_max>, false>(
            (typename argmxx::ArgmxxOp<T, is_max>::wtype*)workspace,
            A, B, C, stream, opr);
    after_kernel_launch();
}

}  // anonymous namespace

namespace megdnn {
namespace rocm {

size_t ArgmaxForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
                                                 const TensorLayout& dst) {
#define cb(dt)                                                                \
    if (src.dtype == dt()) {                                                  \
        using ctype = typename DTypeTrait<dt>::ctype;                         \
        return get_workspace_in_bytes_impl<ctype, true>(src, dst,             \
                                                        param().axis);        \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
    megdnn_assert_internal(false);
}

void ArgmaxForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
                             _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    size_t A, B, C;
    reduce::get_ABC(src.layout, A, B, C, param().axis);
    auto stream = hip_stream(handle());
#define cb(dt)                                                                \
    if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) {                  \
        using ctype = typename DTypeTrait<dt>::ctype;                         \
        exec_impl<ctype, true>(src.ptr<ctype>(), dst.ptr<dt_int32>(),         \
                               workspace.raw_ptr, A, B, C, stream);           \
        return;                                                               \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
    megdnn_throw(megdnn_mangle(
            ssprintf("Unsupported DType: %s", src.layout.dtype.name())));
}

size_t ArgminForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
                                                 const TensorLayout& dst) {
#define cb(dt)                                                                \
    if (src.dtype == dt()) {                                                  \
        using ctype = typename DTypeTrait<dt>::ctype;                         \
        return get_workspace_in_bytes_impl<ctype, false>(src, dst,            \
                                                         param().axis);       \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
    megdnn_assert_internal(false);
}

void ArgminForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
                             _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    size_t A, B, C;
    reduce::get_ABC(src.layout, A, B, C, param().axis);
    auto stream = hip_stream(handle());
#define cb(dt)                                                                \
    if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) {                  \
        using ctype = typename DTypeTrait<dt>::ctype;                         \
        exec_impl<ctype, false>(src.ptr<ctype>(), dst.ptr<dt_int32>(),        \
                                workspace.raw_ptr, A, B, C, stream);          \
        return;                                                               \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
    megdnn_throw(megdnn_mangle(
            ssprintf("Unsupported DType: %s", src.layout.dtype.name())));
}

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
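Both entry points lean on reduce::get_ABC to factor the layout around the reduction axis. Assuming the usual MegDNN convention, it views the tensor as an (A, B, C) problem where B is the reduced extent; a sketch of the factorization (hypothetical helper, for illustration only):

    #include <cstddef>

    // View a shape as (A, B, C) around `axis`: A = product of dims before
    // the axis, B = the reduced dim itself, C = product of dims after it.
    void get_abc_ref(const std::size_t* shape, std::size_t ndim,
                     std::size_t axis, std::size_t& A, std::size_t& B,
                     std::size_t& C) {
        A = 1;
        for (std::size_t i = 0; i < axis; ++i) A *= shape[i];
        B = shape[axis];
        C = 1;
        for (std::size_t i = axis + 1; i < ndim; ++i) C *= shape[i];
    }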
@@ -0,0 +1,41 @@
/**
 * \file dnn/src/rocm/argmxx/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megdnn/oprs.h"

namespace megdnn {
namespace rocm {

class ArgmaxForwardImpl final : public ArgmaxForward {
public:
    using ArgmaxForward::ArgmaxForward;
    void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
              _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& dst) override;
};

class ArgminForwardImpl : public ArgminForward {
public:
    using ArgminForward::ArgminForward;
    void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
              _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& dst) override;
};

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,119 @@
/**
 * \file dnn/src/rocm/batched_matrix_mul/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"

#include "src/common/utils.cuh"
#include "src/rocm/handle.h"
#include "src/rocm/utils.h"

namespace megdnn {
namespace rocm {

void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B,
                                       _megdnn_tensor_out C,
                                       _megdnn_workspace workspace) {
    check_exec(A.layout, B.layout, C.layout, workspace.size);
    auto dtype = A.layout.dtype;
    megdnn_assert(dtype.category() == DTypeCategory::FLOAT &&
                  param().format == param::MatrixMul::Format::DEFAULT);
    if (dtype == dtype::Float32() ||
        MEGDNN_FLOAT16_SELECT(dtype == dtype::Float16(), false)) {
        auto batch = A.layout.shape[0];
        auto m = C.layout.shape[1], n = C.layout.shape[2];
        auto k = A.layout.shape[param().transposeA ? 1 : 2];
        auto handle = concrete_handle(this->handle());
        auto rocblas_handle_ = handle->get_rocblas_handle();

        auto io32_c32 = [&]() {
            auto zero = handle->zero_device();
            auto one = handle->one_device();
            rocblas_check(rocblas_sgemm_strided_batched(
                    rocblas_handle_,
                    param().transposeB ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    param().transposeA ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    n, m, k, one, B.ptr<dt_float32>(),
                    (rocblas_int)(B.layout.stride[1]),
                    (rocblas_int)(B.layout.stride[0]), A.ptr<dt_float32>(),
                    (rocblas_int)(A.layout.stride[1]),
                    (rocblas_int)(A.layout.stride[0]), zero,
                    C.ptr<dt_float32>(), (rocblas_int)(C.layout.stride[1]),
                    (rocblas_int)(C.layout.stride[0]), (rocblas_int)(batch)));
        };

#if !MEGDNN_DISABLE_FLOAT16
        //! io float16, compute float32 (the datatype arguments follow the
        //! fp16-in/fp32-compute convention; the i8/i32 values in the raw
        //! patch were inconsistent with the Float16 dtype check above)
        auto io16_c32 = [&]() {
            auto zero = handle->zero_device();
            auto one = handle->one_device();
            int32_t solution_index = 0;
            uint32_t flags = 1;
            size_t ws_size = 0;
            rocblas_check(rocblas_gemm_strided_batched_ex(
                    rocblas_handle_,
                    param().transposeB ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    param().transposeA ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    n, m, k, one, B.raw_ptr, rocblas_datatype_f16_r,
                    B.layout.stride[1], B.layout.stride[0], A.raw_ptr,
                    rocblas_datatype_f16_r, A.layout.stride[1],
                    A.layout.stride[0], zero, C.raw_ptr, rocblas_datatype_f16_r,
                    C.layout.stride[1], C.layout.stride[0], C.raw_ptr,
                    rocblas_datatype_f16_r, C.layout.stride[1],
                    C.layout.stride[0], batch, rocblas_datatype_f32_r,
                    rocblas_gemm_algo_standard, solution_index, flags, &ws_size,
                    nullptr));
        };

        auto io16_c16 = [&]() {
            auto zero_half = handle->zero_device_h();
            auto one_half = handle->one_device_h();
            rocblas_check(rocblas_hgemm_strided_batched(
                    rocblas_handle_,
                    param().transposeB ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    param().transposeA ? rocblas_operation_transpose
                                       : rocblas_operation_none,
                    n, m, k, reinterpret_cast<const rocblas_half*>(one_half),
                    static_cast<const rocblas_half*>(B.raw_ptr),
                    B.layout.stride[1], B.layout.stride[0],
                    static_cast<const rocblas_half*>(A.raw_ptr),
                    A.layout.stride[1], A.layout.stride[0],
                    reinterpret_cast<const rocblas_half*>(zero_half),
                    static_cast<rocblas_half*>(C.raw_ptr),
                    C.layout.stride[1], C.layout.stride[0], batch));
        };
#endif

        if (dtype == dtype::Float32()) {
            io32_c32();
        }
#if !MEGDNN_DISABLE_FLOAT16
        else {
            if (param().compute_mode == Param::ComputeMode::FLOAT32) {
                io16_c32();
            } else {
                io16_c16();
            }
        }
#endif
    }
}

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
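rocBLAS, like cuBLAS, assumes column-major storage, while MegDNN tensors are row-major; the calls above therefore pass B before A with the transpose flags and the m/n extents swapped, so the library computes C^T = B^T x A^T, which in memory is exactly row-major C. A standalone sketch of the same trick for a single non-batched GEMM (illustrative only; host pointer mode assumed for alpha/beta):

    // Row-major C(m x n) = A(m x k) * B(k x n) on a column-major BLAS:
    // viewed column-major, the row-major buffers are B^T and A^T, so
    // requesting B^T * A^T yields C^T, i.e. row-major C.
    rocblas_status gemm_row_major(rocblas_handle h, int m, int n, int k,
                                  const float* A, const float* B, float* C) {
        const float one = 1.f, zero = 0.f;
        return rocblas_sgemm(h, rocblas_operation_none, rocblas_operation_none,
                             n, m, k, &one,
                             B, n,  // leading dim of row-major B is n
                             A, k,  // leading dim of row-major A is k
                             &zero, C, n);
    }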
@@ -0,0 +1,39 @@
/**
 * \file dnn/src/rocm/batched_matrix_mul/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megdnn/oprs.h"

namespace megdnn {
namespace rocm {

class BatchedMatrixMulForwardImpl : public BatchedMatrixMulForward {
public:
    using BatchedMatrixMulForward::BatchedMatrixMulForward;
    BatchedMatrixMulForwardImpl(Handle* handle)
            : BatchedMatrixMul(handle),
              m_opr(handle->create_operator<MatrixMul>()) {}
    void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
              _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
                                  const TensorLayout&) override {
        return 0;
    }
    bool is_thread_safe() const override { return true; }

private:
    std::unique_ptr<MatrixMul> m_opr;
};

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,81 @@
/**
 * \file src/rocm/checksum/kern.cpp.hip
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "hip_header.h"
#include "./kern.h.hip"
#include "src/rocm/reduce_helper.h.hip"

namespace megdnn {
namespace rocm {
namespace checksum {
namespace {

struct ChecksumOp {
    typedef uint32_t wtype;
    const uint32_t* src;
    uint32_t* dst;

    static const uint32_t INIT = 0;

    __host__ __device__ void write(uint32_t idx, uint32_t val) {
        dst[idx] = val;
    }

    __host__ __device__ static uint32_t apply(uint32_t a, uint32_t b) {
        return a + b;
    }
};

struct NonFourAlignedChecksumOp : ChecksumOp {
    __host__ __device__ uint32_t read(uint32_t idx) {
        uint8_t* data = (uint8_t*)(src + idx);
        return (data[0] | ((uint32_t)data[1] << 8) |
                ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24)) *
               (idx + 1);
    }
};

struct FourAlignedChecksumOp : ChecksumOp {
    __host__ __device__ uint32_t read(uint32_t idx) {
        return src[idx] * (idx + 1);
    }
};

}  // anonymous namespace

void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
          size_t nr_elem, hipStream_t stream) {
    if (!nr_elem)
        return;
    if (reinterpret_cast<uint64_t>(buf) & 0b11) {
        NonFourAlignedChecksumOp op;
        op.src = buf;
        op.dst = dest;
        run_reduce<NonFourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
                                                    stream, op);
    } else {
        FourAlignedChecksumOp op;
        op.src = buf;
        op.dst = dest;
        run_reduce<FourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
                                                 stream, op);
    }
}

size_t get_workspace_in_bytes(size_t nr_elem) {
    return get_reduce_workspace_in_bytes<ChecksumOp>(1, nr_elem, 1);
}

}  // namespace checksum
}  // namespace rocm
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen
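The kernels reduce a position-weighted sum: each 32-bit little-endian word of the buffer is multiplied by its 1-based index before being accumulated modulo 2^32, with the unaligned variant reassembling the words byte by byte. A host-side reference of the same checksum (an illustrative sketch; assumes a little-endian host):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Host reference: sum of word * (index + 1) over nr_elem 32-bit words.
    uint32_t checksum_ref(const uint8_t* buf, std::size_t nr_elem) {
        uint32_t sum = 0;
        for (std::size_t i = 0; i < nr_elem; ++i) {
            uint32_t w;
            std::memcpy(&w, buf + i * 4, sizeof(w));  // alignment-safe load
            sum += w * static_cast<uint32_t>(i + 1);
        }
        return sum;
    }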
@@ -0,0 +1,28 @@
/**
 * \file src/rocm/checksum/kern.h.hip
 *
 * This file is part of MegDNN, a deep neural network run-time library
 * developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
 */
#pragma once

#include "hip_header.h"

namespace megdnn {
namespace rocm {
namespace checksum {

void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
          size_t nr_elem, hipStream_t stream);

size_t get_workspace_in_bytes(size_t nr_elem);

}  // namespace checksum
}  // namespace rocm
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen
@@ -0,0 +1,68 @@
/**
 * \file dnn/src/rocm/checksum/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"
#include "src/rocm/checksum/kern.h.hip"

#include "src/common/utils.h"
#include "src/rocm/reduce_helper.h.hip"

#include <algorithm>

using namespace megdnn;
using namespace rocm;

namespace {

WorkspaceBundle get_wbundle(const TensorLayout& data) {
    size_t size_all = data.shape[0], size_ints = size_all / sizeof(uint32_t);
    size_t part1 = checksum::get_workspace_in_bytes(size_ints);
    size_t part2 = sizeof(ChecksumForward::Result::checksum);
    return {nullptr, {part1, part2}};
}

}  // anonymous namespace

size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout& data) {
    auto wbundle = get_wbundle(data);
    return wbundle.total_size_in_bytes();
}

ChecksumForward::Result ChecksumForwardImpl::exec(_megdnn_tensor_in data,
                                                  _megdnn_workspace workspace) {
    auto wbundle = get_wbundle(data.layout);
    wbundle.set(workspace.raw_ptr);
    Result result;
    memset(&result, 0, sizeof(result));
    check_exec(data.layout, workspace.size);
    auto stream = hip_stream(handle());

    auto ptr = static_cast<uint8_t*>(data.raw_ptr);
    size_t size_all = data.layout.shape[0],
           size_ints = size_all / sizeof(uint32_t);
    auto last_val_size = std::min<size_t>(size_all, 4);
    hip_check(hipMemcpyAsync(&result.last_val, ptr + size_all - last_val_size,
                             last_val_size, hipMemcpyDeviceToHost, stream));
    if (size_ints) {
        checksum::calc(static_cast<uint32_t*>(wbundle.get(1)),
                       static_cast<uint32_t*>(data.raw_ptr),
                       static_cast<uint32_t*>(wbundle.get(0)), size_ints,
                       stream);
        hip_check(hipMemcpyAsync(&result.checksum, wbundle.get(1),
                                 sizeof(result.checksum),
                                 hipMemcpyDeviceToHost, stream));
    }
    hip_check(hipStreamSynchronize(stream));
    return result;
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,35 @@
/**
 * \file dnn/src/rocm/checksum/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "megdnn/oprs.h"
#include "src/rocm/utils.h"

namespace megdnn {
namespace rocm {

class ChecksumForwardImpl final : public ChecksumForward {
public:
    using ChecksumForward::ChecksumForward;

    size_t get_workspace_in_bytes(const TensorLayout&) override;

    bool is_thread_safe() const override { return true; }

    Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) override;
};

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,95 @@
/**
 * \file dnn/src/rocm/convolution/backward_data/algo.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"

#include "./algo.h"
#include "src/rocm/utils.h"

using namespace megdnn;
using namespace rocm;

ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
    all_algos.push_back(&miopen);
    all_algos.push_back(&matmul);
    all_algos.push_back(&chanwise);

    non_miopen_algos.push_back(&matmul);
    non_miopen_algos.push_back(&chanwise);
    miopen_algos.push_back(&miopen);
}

ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack;

ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
        ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
        const TensorLayout& diff, const TensorLayout& grad)
        : SizeArgs(o, o->check_layout_fwd(grad, filter, diff), diff, grad) {}

ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
        ConvolutionBackwardDataImpl* o, const CanonizedFilterMeta& filter,
        const TensorLayout& diff, const TensorLayout& grad)
        : handle{concrete_handle(o->handle())},
          filter_meta{filter},
          diff_layout{&diff},
          grad_layout{&grad},
          opr{o} {}

ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs(
        ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
        _megdnn_tensor_in diff, _megdnn_tensor_out grad,
        _megdnn_workspace workspace)
        : SizeArgs(opr, filter.layout, diff.layout, grad.layout),
          filter_tensor{&filter},
          diff_tensor{&diff},
          grad_tensor{&grad},
          workspace{workspace} {}

std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const {
    auto&& fm = filter_meta;
    MEGDNN_MARK_USED_VAR(fm);
    return megdnn_mangle(ssprintf(
            "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, "
            "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s",
            fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1],
            diff_layout->to_string().c_str(), grad_layout->to_string().c_str(),
            fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
            fm.dilation[0], fm.dilation[1], !fm.should_flip,
            diff_layout->dtype.name(), grad_layout->dtype.name()));
}

convolution::MIOpenCacheKey
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key()
        const {
    convolution::MIOpenCacheKey res;
    res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle());
    res.batch = grad_layout->operator[](0);
    res.IC = grad_layout->operator[](1);
    res.IH = grad_layout->operator[](2);
    res.IW = grad_layout->operator[](3);
    res.OH = diff_layout->operator[](2);
    res.OW = diff_layout->operator[](3);
    res.FH = filter_meta.spatial[0];
    res.FW = filter_meta.spatial[1];
    res.SH = filter_meta.stride[0];
    res.SW = filter_meta.stride[1];
    res.PH = filter_meta.padding[0];
    res.PW = filter_meta.padding[1];
    res.DH = filter_meta.dilation[0];
    res.DW = filter_meta.dilation[1];
    res.group = filter_meta.group;
    res.ocpg = filter_meta.ocpg;
    res.icpg = filter_meta.icpg;
    res.dtype_enum = static_cast<uint32_t>(diff_layout->dtype.enumv());
    res.exhaustive_search =
            static_cast<int32_t>(handle->enable_miopen_algo_search());
    res.OC = res.group * res.ocpg;
    return res;
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,155 @@
/**
 * \file dnn/src/rocm/convolution/backward_data/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#pragma once

#include "src/rocm/convolution/helper.h"

namespace megdnn {
namespace rocm {

/*!
 * \brief base class for convolution algos
 */
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
    struct SizeArgs {
        HandleImpl* handle;
        CanonizedFilterMeta filter_meta;
        const TensorLayout *diff_layout, *grad_layout;
        ConvolutionBackwardDataImpl* opr;

        std::string to_string() const;
        convolution::MIOpenCacheKey to_miopen_algo_cache_key() const;
        void init_desc(convolution::MIOpenBwdDataDescs& desc) const {
            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
        }
        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                 const TensorLayout& diff, const TensorLayout& grad);
        SizeArgs(ConvolutionBackwardDataImpl* opr,
                 const CanonizedFilterMeta& filter, const TensorLayout& diff,
                 const TensorLayout& grad);

        convolution::ForwardSizeArgs as_fwd_args() const {
            return {handle, grad_layout, filter_meta, diff_layout};
        }
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

        ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                 _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                 _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }
    bool is_available_reproducible(
            const SizeArgs& args, bool reproducible = true,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return (!reproducible || is_reproducible()) &&
               is_available_wk(args, limit);
    }
    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(req <= workspace.size,
                      "conv bwd data algo %s: "
                      "required workspace %zu bytes, got %zu",
                      name(), req, workspace.size);
        return *this;
    }

    virtual bool is_miopen() const { return false; }
};

class ConvolutionBackwardDataImpl::AlgoMIOpen final : public AlgoBase {
    bool m_is_reproducible;
    const char* m_name;

    miopenConvBwdDataAlgorithm_t find_best_algo(const ExecArgs& args);

public:
    AlgoMIOpen() = delete;
    AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    bool is_reproducible() const override { return m_is_reproducible; }

    const char* name() const override {
        return "MIOpenConvolutionBackwardData";
    }

    bool is_miopen() const override { return true; }
    static convolution::MIOpenCache<SizeArgs, miopenConvBwdDataAlgorithm_t>
            sm_miopen_algo_cache;
    static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache;
};

class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "MATMUL"; }
    bool is_reproducible() const override { return true; }
};

class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE"; }
    bool is_reproducible() const override { return true; }
};

class ConvolutionBackwardDataImpl::AlgoPack {
    // defined in miopen.cpp
    void fill_miopen_algos();

    AlgoPack(const AlgoPack&) = delete;
    AlgoPack& operator=(const AlgoPack&) = delete;

public:
    AlgoPack();

    AlgoMIOpen miopen{true};
    AlgoMatmul matmul;
    AlgoChanwise chanwise;

    std::vector<AlgoBase*>
            //! all algorithms
            all_algos, miopen_algos, non_miopen_algos;
};

}  // namespace rocm
}  // namespace megdnn

// vim: syntax=cpp.doxygen
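The SizeArgs/ExecArgs split lets callers query availability and workspace cost before committing to an algorithm. A sketch of the usual MegDNN selection loop over a pack (illustrative only; the real heuristic lives in the operator implementation):

    #include <cstddef>
    #include <vector>

    using AlgoBase = megdnn::rocm::ConvolutionBackwardDataImpl::AlgoBase;

    // Pick the first usable algorithm under a workspace limit.
    AlgoBase* pick_algo(const std::vector<AlgoBase*>& algos,
                        const AlgoBase::SizeArgs& args,
                        std::size_t workspace_limit, bool reproducible) {
        for (auto algo : algos)
            if (algo->is_available_reproducible(args, reproducible,
                                                workspace_limit))
                return algo;
        return nullptr;  // caller falls back or reports failure
    }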
@@ -0,0 +1,56 @@
/**
 * \file dnn/src/rocm/convolution/backward_data/chanwise.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/chanwise/kern.h.hip"

using namespace megdnn;
using namespace rocm;
using namespace convolution;

bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available(
        const SizeArgs& args) const {
    auto&& fm = args.filter_meta;
    return args.filter_meta.format == Param::Format::NCHW &&
           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
           fm.dilation[1] == 1 && !fm.should_flip;
}

size_t ConvolutionBackwardDataImpl::AlgoChanwise::get_workspace_in_bytes(
        const SizeArgs&) const {
    return 0;
}

void ConvolutionBackwardDataImpl::AlgoChanwise::exec(
        const ExecArgs& args) const {
    auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args());
    auto stream = hip_stream(args.handle);
    switch (args.diff_layout->dtype.enumv()) {
#define cb(_dt)                                                         \
    case DTypeTrait<_dt>::enumv: {                                      \
        using ctype = DTypeTrait<_dt>::ctype;                           \
        return chanwise::run_bwd_data(args.grad_tensor->ptr<ctype>(),   \
                                      args.diff_tensor->ptr<ctype>(),   \
                                      args.filter_tensor->ptr<ctype>(), \
                                      kparam, stream);                  \
    }
        MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
        default:
            break;
    }
    megdnn_assert_internal(0);
}

// vim: syntax=cpp.doxygen
@@ -0,0 +1,94 @@
/**
 * \file dnn/src/rocm/convolution/backward_data/matmul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/helper.h"
#include "src/rocm/convolution/im2col.h.hip"

using namespace megdnn;
using namespace rocm;

bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
        const SizeArgs& args) const {
    auto&& fm = args.filter_meta;
    return args.filter_meta.format == Param::Format::NCHW &&
           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
           args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
           fm.group == 1 && fm.spatial_ndim == 2;
}

size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
        const SizeArgs& args) const {
    return matmul_get_workspace_bundle(args.as_fwd_args())
            .total_size_in_bytes();
}

void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
#define cb(DType)                                          \
    if (args.diff_layout->dtype == DType()) {              \
        using ctype = typename DTypeTrait<DType>::ctype;   \
        exec_internal<ctype>(args);                        \
        return;                                            \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
    megdnn_assert_internal(0);
}

template <typename T>
void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(
        const ExecArgs& args) {
    auto&& fm = args.filter_meta;
    size_t N = args.grad_layout->shape[0], IC = fm.icpg,
           IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
           OC = fm.ocpg, OH = args.diff_layout->shape[2],
           OW = args.diff_layout->shape[3], FH = fm.spatial[0],
           FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
           SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
           DW = fm.dilation[1];
    auto stream = hip_stream(args.handle);
    auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
    wbundle.set(args.workspace.raw_ptr);
    T* diff_t = static_cast<T*>(wbundle.get(0));
    T* col = static_cast<T*>(wbundle.get(1));
    {
        // transpose diff
        TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
                tol(froml);
        froml.stride[0] = args.diff_layout->stride[0];
        tol.stride[0] = 1;
        tol.stride[1] = N;
        TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
        args.handle->relayout_opr()->exec(from, to);
    }
    {
        // take gemm grad
        TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
                Bl({IC * FH * FW, OH * OW * N},
                   typename DTypeTrait<T>::dtype()),
                Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
        TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
        if (fm.should_flip) {
            convolution::flip_filter(args.as_fwd_args(),
                                     wbundle.get_workspace(2), A.raw_ptr);
        }
        args.handle->matmul_aT_opr()->exec(A, C, B, Workspace());
    }
    {
        convolution::col2im<T>(col, args.grad_tensor->ptr<T>(), N,
                               args.grad_layout->stride[0], IC, IH, IW, FH, FW,
                               OH, OW, PH, PW, SH, SW, DH, DW, stream);
    }
}

// vim: syntax=cpp.doxygen
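The matmul algorithm finishes by scattering the column matrix produced by the GEMM back into the gradient image. A simplified single-image col2im reference (illustrative; the real kernel in im2col.h.hip is batched and runs on device):

    #include <algorithm>
    #include <cstddef>

    // Naive col2im: accumulate every (ic, fh, fw, oh, ow) column entry into
    // the input-image position it was originally sampled from.
    void col2im_ref(const float* col, float* img, std::size_t IC,
                    std::size_t IH, std::size_t IW, std::size_t FH,
                    std::size_t FW, std::size_t OH, std::size_t OW,
                    std::size_t PH, std::size_t PW, std::size_t SH,
                    std::size_t SW, std::size_t DH, std::size_t DW) {
        std::fill(img, img + IC * IH * IW, 0.f);
        for (std::size_t ic = 0; ic < IC; ++ic)
        for (std::size_t fh = 0; fh < FH; ++fh)
        for (std::size_t fw = 0; fw < FW; ++fw)
        for (std::size_t oh = 0; oh < OH; ++oh)
        for (std::size_t ow = 0; ow < OW; ++ow) {
            long ih = long(oh * SH + fh * DH) - long(PH);
            long iw = long(ow * SW + fw * DW) - long(PW);
            if (ih < 0 || ih >= long(IH) || iw < 0 || iw >= long(IW))
                continue;  // this tap fell in the zero padding
            std::size_t ci = (((ic * FH + fh) * FW + fw) * OH + oh) * OW + ow;
            img[(ic * IH + std::size_t(ih)) * IW + std::size_t(iw)] += col[ci];
        }
    }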
| @@ -0,0 +1,108 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_data/miopen.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/miopen_wrapper.h" | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs, | |||||
| miopenConvBwdDataAlgorithm_t> | |||||
| ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_algo_cache; | |||||
| MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs, size_t> | |||||
| ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_ws_cache; | |||||
| bool ConvolutionBackwardDataImpl::AlgoMIOpen::is_available( | |||||
| const SizeArgs& args) const { | |||||
| MIOpenBwdDataDescs D; | |||||
| if (!is_miopen_supported(args.as_fwd_args())) | |||||
| return false; | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return true; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionBackwardDataGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc, | |||||
| D.conv_desc.desc, D.grad_desc.desc, &workspace_size); | |||||
| if (status == miopenStatusSuccess) { | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| size_t ConvolutionBackwardDataImpl::AlgoMIOpen::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return got.second; | |||||
| MIOpenBwdDataDescs D; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionBackwardDataGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc, | |||||
| D.conv_desc.desc, D.grad_desc.desc, &workspace_size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv bwd_data get workspace failed: %s; info: %s", | |||||
| miopenGetErrorString(status), args.to_string().c_str()); | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return workspace_size; | |||||
| } | |||||
| miopenConvBwdDataAlgorithm_t | |||||
| ConvolutionBackwardDataImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) { | |||||
| auto find_algo = sm_miopen_algo_cache.get(args); | |||||
| if (find_algo.first) | |||||
| return find_algo.second; | |||||
| bool exhaustive_search = args.handle->enable_miopen_algo_search(); | |||||
| MIOpenBwdDataDescs D; | |||||
| args.init_desc(D); | |||||
| const int req_algo_count = 1; | |||||
| int ret_algo_count; | |||||
| miopenConvAlgoPerf_t algo_perf; | |||||
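| // ask MIOpen for the single best algorithm; the result is memoized in | |||||
| // sm_miopen_algo_cache for subsequent calls with the same args | |||||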
| miopen_check(miopenFindConvolutionBackwardDataAlgorithm( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, | |||||
| args.diff_tensor->raw_ptr, D.filter_desc.desc, | |||||
| args.filter_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc, | |||||
| args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count, | |||||
| &algo_perf, args.workspace.raw_ptr, args.workspace.size, | |||||
| exhaustive_search)); | |||||
| sm_miopen_algo_cache.set(args, algo_perf.bwd_data_algo); | |||||
| return algo_perf.bwd_data_algo; | |||||
| } | |||||
| void ConvolutionBackwardDataImpl::AlgoMIOpen::exec(const ExecArgs& args) const { | |||||
| MIOpenBwdDataDescs D; | |||||
| args.init_desc(D); | |||||
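| // find_best_algo updates the static algo cache, hence the const_cast | |||||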
| auto algo = const_cast<ConvolutionBackwardDataImpl::AlgoMIOpen*>(this) | |||||
| ->find_best_algo(args); | |||||
| float alpha = 1.0f, beta = 0.0f; | |||||
| auto status = miopenConvolutionBackwardData( | |||||
| args.handle->miopen_handle(), &alpha, D.diff_desc.desc, | |||||
| args.diff_tensor->raw_ptr, D.filter_desc.desc, | |||||
| args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, | |||||
| D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr, | |||||
| args.workspace.size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv bwd_data failed: %s; info: %s", | |||||
| miopenGetErrorString(status), args.to_string().c_str()); | |||||
| } | |||||
| void ConvolutionBackwardDataImpl::AlgoPack::fill_miopen_algos() {} | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,98 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_filter/algo.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() { | |||||
| all_algos.push_back(&miopen); | |||||
| all_algos.push_back(&matmul); | |||||
| all_algos.push_back(&chanwise); | |||||
| non_miopen_algos.push_back(&matmul); | |||||
| non_miopen_algos.push_back(&chanwise); | |||||
| miopen_algos.push_back(&miopen); | |||||
| } | |||||
| ConvolutionBackwardFilterImpl::AlgoPack | |||||
| ConvolutionBackwardFilterImpl::sm_algo_pack; | |||||
| ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
| ConvolutionBackwardFilterImpl* o, const TensorLayout& src, | |||||
| const TensorLayout& diff, const TensorLayout& grad) | |||||
| : SizeArgs(o, src, diff, o->check_layout_fwd(src, grad, diff)) {} | |||||
| ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
| ConvolutionBackwardFilterImpl* o, const TensorLayout& src, | |||||
| const TensorLayout& diff, const CanonizedFilterMeta& grad) | |||||
| : handle{concrete_handle(o->handle())}, | |||||
| src_layout{&src}, | |||||
| diff_layout{&diff}, | |||||
| grad_filter_meta{grad}, | |||||
| opr{o} {} | |||||
| ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs( | |||||
| ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src, | |||||
| _megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||||
| _megdnn_workspace workspace) | |||||
| : SizeArgs(opr, src.layout, diff.layout, grad.layout), | |||||
| src_tensor{&src}, | |||||
| diff_tensor{&diff}, | |||||
| grad_tensor{&grad}, | |||||
| workspace{workspace} {} | |||||
| std::string ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_string() | |||||
| const { | |||||
| auto&& fm = grad_filter_meta; | |||||
| MEGDNN_MARK_USED_VAR(fm); | |||||
| return megdnn_mangle(ssprintf( | |||||
| "src=%s diff=%s grad_filter=%u{%u,%u,%u,%u}, " | |||||
| "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", | |||||
| src_layout->to_string().c_str(), diff_layout->to_string().c_str(), | |||||
| fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], | |||||
| fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], | |||||
| fm.dilation[0], fm.dilation[1], !fm.should_flip, | |||||
| src_layout->dtype.name(), diff_layout->dtype.name())); | |||||
| } | |||||
| convolution::MIOpenCacheKey | |||||
| ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key() | |||||
| const { | |||||
| convolution::MIOpenCacheKey res; | |||||
| res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle()); | |||||
| res.batch = src_layout->operator[](0); | |||||
| res.IC = src_layout->operator[](1); | |||||
| res.IH = src_layout->operator[](2); | |||||
| res.IW = src_layout->operator[](3); | |||||
| res.OH = diff_layout->operator[](2); | |||||
| res.OW = diff_layout->operator[](3); | |||||
| res.FH = grad_filter_meta.spatial[0]; | |||||
| res.FW = grad_filter_meta.spatial[1]; | |||||
| res.SH = grad_filter_meta.stride[0]; | |||||
| res.SW = grad_filter_meta.stride[1]; | |||||
| res.PH = grad_filter_meta.padding[0]; | |||||
| res.PW = grad_filter_meta.padding[1]; | |||||
| res.DH = grad_filter_meta.dilation[0]; | |||||
| res.DW = grad_filter_meta.dilation[1]; | |||||
| res.group = grad_filter_meta.group; | |||||
| res.ocpg = grad_filter_meta.ocpg; | |||||
| res.icpg = grad_filter_meta.icpg; | |||||
| res.dtype_enum = static_cast<uint32_t>(src_layout->dtype.enumv()); | |||||
| res.exhaustive_search = | |||||
| static_cast<int32_t>(handle->enable_miopen_algo_search()); | |||||
| res.OC = res.group * res.ocpg; | |||||
| return res; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,154 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_filter/algo.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include <unordered_map> | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| /*! | |||||
| * \brief base class for ROCm convolution backward-filter algos | |||||
| */ | |||||
| class ConvolutionBackwardFilterImpl::AlgoBase : public Algorithm { | |||||
| protected: | |||||
| ~AlgoBase() = default; | |||||
| public: | |||||
| struct SizeArgs { | |||||
| HandleImpl* handle; | |||||
| const TensorLayout *src_layout, *diff_layout; | |||||
| CanonizedFilterMeta grad_filter_meta; | |||||
| ConvolutionBackwardFilterImpl* opr; | |||||
| std::string to_string() const; | |||||
| convolution::MIOpenCacheKey to_miopen_algo_cache_key() const; | |||||
| void init_desc(convolution::MIOpenBwdFilterDescs& desc) const { | |||||
| desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param()); | |||||
| } | |||||
| SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, | |||||
| const TensorLayout& diff, const TensorLayout& grad); | |||||
| SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src, | |||||
| const TensorLayout& diff, const CanonizedFilterMeta& grad); | |||||
| convolution::ForwardSizeArgs as_fwd_args() const { | |||||
| return {handle, src_layout, grad_filter_meta, diff_layout}; | |||||
| } | |||||
| }; | |||||
| struct ExecArgs : public SizeArgs { | |||||
| const TensorND *src_tensor, *diff_tensor, *grad_tensor; | |||||
| Workspace workspace; | |||||
| ExecArgs(ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src, | |||||
| _megdnn_tensor_in diff, _megdnn_tensor_out grad, | |||||
| _megdnn_workspace workspace); | |||||
| }; | |||||
| virtual bool is_available(const SizeArgs& args) const = 0; | |||||
| virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; | |||||
| virtual void exec(const ExecArgs& args) const = 0; | |||||
| bool is_available_wk(const SizeArgs& args, size_t limit) { | |||||
| return is_available(args) && get_workspace_in_bytes(args) <= limit; | |||||
| } | |||||
| bool is_available_reproducible( | |||||
| const SizeArgs& args, bool reproducible = true, | |||||
| size_t limit = std::numeric_limits<size_t>::max()) { | |||||
| return (!reproducible || is_reproducible()) && | |||||
| is_available_wk(args, limit); | |||||
| } | |||||
| AlgoBase& check_workspace(const SizeArgs& args, | |||||
| const Workspace& workspace) { | |||||
| auto req = get_workspace_in_bytes(args); | |||||
| megdnn_assert(req <= workspace.size, | |||||
| "conv bwd filter algo %s: " | |||||
| "required workspace %zu bytes, got %zu", | |||||
| name(), req, workspace.size); | |||||
| return *this; | |||||
| } | |||||
| virtual bool is_miopen() const { return false; } | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoMIOpen final : public AlgoBase { | |||||
| bool m_is_reproducible; | |||||
| const char* m_name; | |||||
| miopenConvBwdWeightsAlgorithm_t find_best_algo(const ExecArgs& args); | |||||
| public: | |||||
| AlgoMIOpen() = delete; | |||||
| AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| bool is_reproducible() const override { return m_is_reproducible; } | |||||
| const char* name() const override { | |||||
| return "MIOpenConvolutionBackwardFilter"; | |||||
| } | |||||
| bool is_miopen() const override { return true; } | |||||
| static convolution::MIOpenCache<SizeArgs, miopenConvBwdWeightsAlgorithm_t> | |||||
| sm_miopen_algo_cache; | |||||
| static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache; | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoMatmul final : public AlgoBase { | |||||
| template <typename T> | |||||
| static void exec_internal(const ExecArgs& args); | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "MATMUL"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoChanwise final : public AlgoBase { | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "CHANNEL_WISE"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoPack { | |||||
| void fill_miopen_algos(); | |||||
| AlgoPack(const AlgoPack&) = delete; | |||||
| AlgoPack& operator=(const AlgoPack&) = delete; | |||||
| public: | |||||
| AlgoPack(); | |||||
| AlgoMIOpen miopen{true}; | |||||
| AlgoMatmul matmul; | |||||
| AlgoChanwise chanwise; | |||||
| std::vector<AlgoBase*> | |||||
| //! all algorithms | |||||
| all_algos, miopen_algos, non_miopen_algos; | |||||
| }; | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,55 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_filter/chanwise.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/convolution/chanwise/kern.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| bool ConvolutionBackwardFilterImpl::AlgoChanwise::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.diff_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && | |||||
| fm.dilation[1] == 1 && !fm.should_flip; | |||||
| } | |||||
| size_t ConvolutionBackwardFilterImpl::AlgoChanwise::get_workspace_in_bytes( | |||||
| const SizeArgs&) const { | |||||
| return 0; | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoChanwise::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); | |||||
| auto stream = hip_stream(args.handle); | |||||
| switch (args.diff_layout->dtype.enumv()) { | |||||
| #define cb(_dt) \ | |||||
| case DTypeTrait<_dt>::enumv: { \ | |||||
| using ctype = DTypeTrait<_dt>::ctype; \ | |||||
| return chanwise::run_bwd_filter( \ | |||||
| args.grad_tensor->ptr<ctype>(), args.src_tensor->ptr<ctype>(), \ | |||||
| args.diff_tensor->ptr<ctype>(), kparam, stream); \ | |||||
| } | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||||
| #undef cb | |||||
| default: | |||||
| break; | |||||
| } | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,102 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_filter/matmul.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| #include "src/rocm/convolution/im2col.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.diff_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| fm.group == 1 && fm.spatial_ndim == 2; | |||||
| } | |||||
| size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| return matmul_get_workspace_bundle(args.as_fwd_args()) | |||||
| .total_size_in_bytes(); | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoMatmul::exec( | |||||
| const ExecArgs& args) const { | |||||
| #define cb(DType) \ | |||||
| if (args.diff_layout->dtype == DType()) { \ | |||||
| using ctype = typename DTypeTrait<DType>::ctype; \ | |||||
| exec_internal<ctype>(args); \ | |||||
| return; \ | |||||
| } | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||||
| #undef cb | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| template <typename T> | |||||
| void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal( | |||||
| const ExecArgs& args) { | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| size_t N = args.src_layout->shape[0], IC = fm.icpg, | |||||
| IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], | |||||
| OC = fm.ocpg, OH = args.diff_layout->shape[2], | |||||
| OW = args.diff_layout->shape[3], FH = fm.spatial[0], | |||||
| FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], | |||||
| SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], | |||||
| DW = fm.dilation[1]; | |||||
| auto stream = hip_stream(args.handle); | |||||
| auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args()); | |||||
| wbundle.set(args.workspace.raw_ptr); | |||||
| T* diff_t = static_cast<T*>(wbundle.get(0)); | |||||
| T* col = static_cast<T*>(wbundle.get(1)); | |||||
| { | |||||
| // transpose diff | |||||
| TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()), | |||||
| tol(froml); | |||||
| froml.stride[0] = args.diff_layout->stride[0]; | |||||
| tol.stride[0] = 1; | |||||
| tol.stride[1] = N; | |||||
| TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol); | |||||
| args.handle->relayout_opr()->exec(from, to); | |||||
| } | |||||
| { | |||||
| convolution::im2col<T>(args.src_tensor->ptr<T>(), col, N, | |||||
| args.src_tensor->layout.stride[0], IC, IH, IW, | |||||
| FH, FW, OH, OW, PH, PW, SH, SW, DH, DW, stream); | |||||
| } | |||||
| { | |||||
| // take gemm grad | |||||
| TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()), | |||||
| Bl({IC * FH * FW, OH * OW * N}, | |||||
| typename DTypeTrait<T>::dtype()), | |||||
| Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype()); | |||||
| TensorND A(args.grad_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl); | |||||
| if (fm.should_flip) { | |||||
| A.raw_ptr = wbundle.get(2); | |||||
| } | |||||
| args.handle->matmul_bT_opr()->exec(C, B, A, Workspace()); | |||||
| if (fm.should_flip) { | |||||
| convolution::flip_filter( | |||||
| args.as_fwd_args(), | |||||
| {static_cast<dt_byte*>(args.grad_tensor->raw_ptr), | |||||
| wbundle.get_size(2)}, | |||||
| A.raw_ptr); | |||||
| } | |||||
| } | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,110 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/backward_filter/miopen.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/miopen_wrapper.h" | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| MIOpenCache<ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs, | |||||
| miopenConvBwdWeightsAlgorithm_t> | |||||
| ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_algo_cache; | |||||
| MIOpenCache<ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs, size_t> | |||||
| ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_ws_cache; | |||||
| bool ConvolutionBackwardFilterImpl::AlgoMIOpen::is_available( | |||||
| const SizeArgs& args) const { | |||||
| MIOpenBwdFilterDescs D; | |||||
| if (!is_miopen_supported(args.as_fwd_args())) | |||||
| return false; | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return true; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc, | |||||
| D.conv_desc.desc, D.grad_desc.desc, &workspace_size); | |||||
| if (status == miopenStatusSuccess) { | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| size_t ConvolutionBackwardFilterImpl::AlgoMIOpen::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return got.second; | |||||
| MIOpenBwdFilterDescs D; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc, | |||||
| D.conv_desc.desc, D.grad_desc.desc, &workspace_size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv bwd_filter get workspace failed: %s; info: %s", | |||||
| miopenGetErrorString(status), args.to_string().c_str()); | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return workspace_size; | |||||
| } | |||||
| miopenConvBwdWeightsAlgorithm_t | |||||
| ConvolutionBackwardFilterImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) { | |||||
| auto find_algo = sm_miopen_algo_cache.get(args); | |||||
| if (find_algo.first) | |||||
| return find_algo.second; | |||||
| bool exhaustive_search = args.handle->enable_miopen_algo_search(); | |||||
| MIOpenBwdFilterDescs D; | |||||
| args.init_desc(D); | |||||
| const int req_algo_count = 1; | |||||
| int ret_algo_count; | |||||
| miopenConvAlgoPerf_t algo_perf; | |||||
| miopen_check(miopenFindConvolutionBackwardWeightsAlgorithm( | |||||
| args.handle->miopen_handle(), D.diff_desc.desc, | |||||
| args.diff_tensor->raw_ptr, D.src_desc.desc, | |||||
| args.src_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc, | |||||
| args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count, | |||||
| &algo_perf, args.workspace.raw_ptr, args.workspace.size, | |||||
| exhaustive_search)); | |||||
| sm_miopen_algo_cache.set(args, algo_perf.bwd_weights_algo); | |||||
| return algo_perf.bwd_weights_algo; | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoMIOpen::exec( | |||||
| const ExecArgs& args) const { | |||||
| MIOpenBwdFilterDescs D; | |||||
| args.init_desc(D); | |||||
| auto algo = const_cast<ConvolutionBackwardFilterImpl::AlgoMIOpen*>(this) | |||||
| ->find_best_algo(args); | |||||
| float alpha = 1.0f, beta = 0.0f; | |||||
| auto status = miopenConvolutionBackwardWeights( | |||||
| args.handle->miopen_handle(), &alpha, D.diff_desc.desc, | |||||
| args.diff_tensor->raw_ptr, D.src_desc.desc, | |||||
| args.src_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, | |||||
| D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr, | |||||
| args.workspace.size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv bwd_filter failed: %s; info: %s", | |||||
| miopenGetErrorString(status), args.to_string().c_str()); | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoPack::fill_miopen_algos() {} | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,173 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/chanwise/bwd_data.cpp.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #include "hip_header.h" | |||||
| #include "./kern.h.hip" | |||||
| #include "./kern_helper.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| using namespace chanwise; | |||||
| namespace { | |||||
| // grid idx is (inp_chl, worker_index) | |||||
| // each y-slice of a block works on an (N, IH, IW) spatial image at given | |||||
| // inp_chl | |||||
| template <typename T, int CHL_MUL_SET, int FH_SET, int FW_SET, int SH_SET, | |||||
| int SW_SET> | |||||
| __global__ void kern_bwd_data(T* src_grad, const T* dst_grad, const T* flt_tot, | |||||
| Param param) { | |||||
| extern __shared__ uint8_t flt_storage[]; | |||||
| T* const flt = reinterpret_cast<T*>(flt_storage); | |||||
| const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, | |||||
| IH = param.src_h, IW = param.src_w, | |||||
| CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, | |||||
| FH = FH_SET ? FH_SET : param.flt_h, | |||||
| FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, | |||||
| PH = param.pad_h, PW = param.pad_w, | |||||
| SH = SH_SET ? SH_SET : param.stride_h, | |||||
| SW = SW_SET ? SW_SET : param.stride_w, OH = param.out_h, | |||||
| OW = param.out_w, TOT_OUT = N * IH * IW; | |||||
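| // stage this channel's filter taps in shared memory once per block | |||||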
| block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); | |||||
| dst_grad += ic * CHL_MUL * OH * OW; | |||||
| src_grad += ic * IH * IW; | |||||
| uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, | |||||
| nr_out_per_launch = blockDim.x * gridDim.y; | |||||
| for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { | |||||
| uint32_t out_idx = out_idx_, n, ih, iw; | |||||
| out_idx = div_mod(out_idx, IW, iw); | |||||
| out_idx = div_mod(out_idx, IH, ih); | |||||
| n = out_idx; | |||||
| const T* dst_grad_base = dst_grad + n * (IC * CHL_MUL * OH * OW); | |||||
| T sum(0); | |||||
| // o >= max(0, ceil_div(i + P - F + 1, S)) | |||||
| uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH, | |||||
| owmin = max(int32_t(iw + PW - FW + SW), 0) / SW, | |||||
| ohmax = min((ih + PH) / SH, OH - 1), | |||||
| owmax = min((iw + PW) / SW, OW - 1); | |||||
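| // for each contributing output position, fh = ih - oh*SH + PH is the | |||||
| // filter row that touched this input element (likewise fw for columns) | |||||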
| if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) { | |||||
| #pragma unroll | |||||
| for (uint32_t doh = 0; doh < FH; ++doh) { | |||||
| uint32_t oh = ohmin + doh; | |||||
| if (oh <= ohmax) { | |||||
| uint32_t fh = ih - oh * SH + PH; | |||||
| #pragma unroll | |||||
| for (uint32_t dow = 0; dow < FW; ++dow) { | |||||
| uint32_t ow = owmin + dow; | |||||
| if (ow <= owmax) { | |||||
| uint32_t fw = iw - ow * SW + PW; | |||||
| const T* pd = dst_grad_base + oh * OW + ow; | |||||
| const T* pf = flt + fh * FW + fw; | |||||
| #pragma unroll | |||||
| for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; | |||||
| ++chl_mul) { | |||||
| sum += *pd * *pf; | |||||
| pd += OH * OW; | |||||
| pf += FSIZE; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for (uint32_t oh = ohmin; oh <= ohmax; ++oh) { | |||||
| uint32_t fh = ih - oh * SH + PH; | |||||
| for (uint32_t ow = owmin; ow <= owmax; ++ow) { | |||||
| uint32_t fw = iw - ow * SW + PW; | |||||
| const T* pd = dst_grad_base + oh * OW + ow; | |||||
| const T* pf = flt + fh * FW + fw; | |||||
| #pragma unroll | |||||
| for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; ++chl_mul) { | |||||
| sum += *pd * *pf; | |||||
| pd += OH * OW; | |||||
| pf += FSIZE; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| src_grad[(n * (IC * IH) + ih) * IW + iw] = sum; | |||||
| } | |||||
| } | |||||
| template <typename T> | |||||
| class KernDispatch { | |||||
| public: | |||||
| typedef void (*kern_ptr_t)(T*, const T*, const T*, Param); | |||||
| static kern_ptr_t dispatch(int chl_mul, int fh, int fw, int sh, int sw) { | |||||
| if (chl_mul == 1) { | |||||
| if (fh == 3 && fw == 3) | |||||
| return d1<1, 3, 3>(sh, sw); | |||||
| if (fh == 4 && fw == 4) | |||||
| return d1<1, 4, 4>(sh, sw); | |||||
| } | |||||
| return d1<0, 0, 0>(sh, sw); | |||||
| } | |||||
| private: | |||||
| template <int chl_mul, int fh, int fw> | |||||
| static kern_ptr_t d1(int sh, int sw) { | |||||
| if (sh == 1 && sw == 1) | |||||
| return kern_bwd_data<T, chl_mul, fh, fw, 1, 1>; | |||||
| if (sh == 1 && sw == 2) | |||||
| return kern_bwd_data<T, chl_mul, fh, fw, 1, 2>; | |||||
| if (sh == 2 && sw == 1) | |||||
| return kern_bwd_data<T, chl_mul, fh, fw, 2, 1>; | |||||
| if (sh == 2 && sw == 2) | |||||
| return kern_bwd_data<T, chl_mul, fh, fw, 2, 2>; | |||||
| return kern_bwd_data<T, chl_mul, fh, fw, 0, 0>; | |||||
| } | |||||
| }; | |||||
| } // anonymous namespace | |||||
| template <typename T> | |||||
| void chanwise::run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | |||||
| const Param& param, hipStream_t stream) { | |||||
| typename KernDispatch<T>::kern_ptr_t kern = | |||||
| KernDispatch<T>::dispatch(param.chl_mul, param.flt_h, param.flt_w, | |||||
| param.stride_h, param.stride_w); | |||||
| int nr_thread = 256, nr_out_dimx = param.src_h * param.src_w * param.batch; | |||||
| dim3 nr_block(param.src_chl, | |||||
| std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | |||||
| uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | |||||
| kern<<<nr_block, nr_thread, shared, stream>>>(src_grad, dst_grad, flt, | |||||
| param); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| #define INST(_dt) \ | |||||
| template void run_bwd_data( \ | |||||
| DTypeTrait<_dt>::ctype*, const DTypeTrait<_dt>::ctype*, \ | |||||
| const DTypeTrait<_dt>::ctype*, const Param&, hipStream_t); | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) | |||||
| #undef INST | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,193 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/chanwise/bwd_filter.cpp.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #include "hip_header.h" | |||||
| #include "./kern.h.hip" | |||||
| #include "./kern_helper.h.hip" | |||||
| const uint32_t WARP_SIZE = 32, BATCH_UNROLL = 4; | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| using namespace chanwise; | |||||
| namespace { | |||||
| /*! | |||||
| * \brief compute grad w.r.t. filter | |||||
| * | |||||
| * block dim: out_id * kern_id | |||||
| * threads with the same out_id compute the grad for the corresponding | |||||
| * kernel element | |||||
| * \tparam nr_thpf number of threads for one element in the filter; must be | |||||
| * a power of 2 | |||||
| */ | |||||
| template <typename T, uint32_t nr_thpf> | |||||
| __global__ void kern_bwd_filter(T* flt_grad, const T* src, const T* dst_grad, | |||||
| Param param) { | |||||
| const uint32_t N = param.batch, IC = param.src_chl, IH = param.src_h, | |||||
| IW = param.src_w, CHL_MUL = param.chl_mul, FH = param.flt_h, | |||||
| FW = param.flt_w, PH = param.pad_h, PW = param.pad_w, | |||||
| SH = param.stride_h, SW = param.stride_w, OH = param.out_h, | |||||
| OW = param.out_w, SRC_BATCH_STRIDE = IC * IH * IW, | |||||
| DST_BATCH_STRIDE = IC * CHL_MUL * OH * OW, | |||||
| BLKDIM_X = blockDim.x / nr_thpf, | |||||
| THREADID_X = threadIdx.x / nr_thpf, | |||||
| OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X; | |||||
| uint32_t ic, chl_mul, fh, fw; | |||||
| { | |||||
| uint32_t i = OUT_IDX; | |||||
| i = div_mod(i, FW, fw); | |||||
| i = div_mod(i, FH, fh); | |||||
| i = div_mod(i, CHL_MUL, chl_mul); | |||||
| ic = i; | |||||
| } | |||||
| if (ic >= IC) { | |||||
| return; | |||||
| } | |||||
| src += ic * IH * IW; | |||||
| dst_grad += (ic * CHL_MUL + chl_mul) * OH * OW; | |||||
| const uint32_t oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH, | |||||
| oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH), | |||||
| ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW, | |||||
| ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW), | |||||
| oblk_h = oh_hi - oh_lo, oblk_w = ow_hi - ow_lo, | |||||
| oblk_tot = oblk_h * oblk_w * | |||||
| ((N + BATCH_UNROLL - 1) / BATCH_UNROLL), | |||||
| tid = threadIdx.x % nr_thpf; | |||||
| if (IH + PH < fh + 1 || oh_lo >= oh_hi || IW + PW < fw + 1 || | |||||
| ow_lo >= ow_hi) { | |||||
| if (!tid) | |||||
| flt_grad[OUT_IDX] = 0; | |||||
| return; | |||||
| } | |||||
| T sum(0); | |||||
| for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) { | |||||
| uint32_t n, oh, ow; | |||||
| n = div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh) * BATCH_UNROLL; | |||||
| oh += oh_lo; | |||||
| ow += ow_lo; | |||||
| uint32_t ih = oh * SH - PH + fh, iw = ow * SW - PW + fw, | |||||
| soff = ih * IW + iw + n * SRC_BATCH_STRIDE, | |||||
| doff = oh * OW + ow + n * DST_BATCH_STRIDE; | |||||
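| // accumulate BATCH_UNROLL consecutive samples; the guard below | |||||
| // handles the tail when N is not a multiple of BATCH_UNROLL | |||||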
| #pragma unroll | |||||
| for (uint32_t i = 0; i < BATCH_UNROLL; ++i) { | |||||
| if (!i || n + i < N) { | |||||
| sum += src[soff] * dst_grad[doff]; | |||||
| } | |||||
| soff += SRC_BATCH_STRIDE; | |||||
| doff += DST_BATCH_STRIDE; | |||||
| } | |||||
| } | |||||
| if (nr_thpf == 1) { | |||||
| flt_grad[OUT_IDX] = sum; | |||||
| } else { | |||||
| // reduce all sums in a block | |||||
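| // tree reduction in shared memory, halving the active threads each | |||||
| // step; __syncthreads() is only needed while the stride spans more | |||||
| // than one warp | |||||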
| extern __shared__ uint8_t shared_storage[]; | |||||
| volatile T* thread_sum = reinterpret_cast<T*>(shared_storage); | |||||
| thread_sum += THREADID_X * nr_thpf; | |||||
| thread_sum[tid] = sum; | |||||
| #pragma unroll | |||||
| for (uint32_t i = nr_thpf / 2; i; i >>= 1) { | |||||
| bool cond = nr_thpf >= i * 2 && tid < i; | |||||
| if (i >= WARP_SIZE) { | |||||
| __syncthreads(); | |||||
| } | |||||
| if (cond) { | |||||
| T v0 = thread_sum[tid], v1 = v0 + thread_sum[tid + i]; | |||||
| thread_sum[tid] = v1; | |||||
| } | |||||
| } | |||||
| if (!tid) | |||||
| flt_grad[OUT_IDX] = thread_sum[0]; | |||||
| } | |||||
| } | |||||
| } // anonymous namespace | |||||
| template <typename T> | |||||
| void convolution::chanwise::run_bwd_filter(T* filter_grad, const T* src, | |||||
| const T* dst_grad, | |||||
| const Param& param, | |||||
| hipStream_t stream) { | |||||
| void (*kern)(T*, const T*, const T*, Param) = NULL; | |||||
| uint32_t nr_thread = 256, | |||||
| nr_thpf = std::min( | |||||
| nr_thread, | |||||
| std::max<uint32_t>(1, param.out_h * param.out_w * | |||||
| param.batch / | |||||
| (BATCH_UNROLL * 16))); | |||||
| // round nr_thpf down to the nearest power of 2 and pick the matching kernel | |||||
| do { | |||||
| #define CK(_n) \ | |||||
| if (nr_thpf >= _n) { \ | |||||
| kern = kern_bwd_filter<T, _n>; \ | |||||
| nr_thpf = _n; \ | |||||
| break; \ | |||||
| } | |||||
| CK(1 << 10); | |||||
| CK(1 << 9); | |||||
| CK(1 << 8); | |||||
| CK(1 << 7); | |||||
| CK(1 << 6); | |||||
| CK(1 << 5); | |||||
| CK(1 << 4); | |||||
| CK(1 << 3); | |||||
| CK(1 << 2); | |||||
| CK(1 << 1); | |||||
| CK(1 << 0); | |||||
| #undef CK | |||||
| } while (0); | |||||
| megdnn_assert(kern); | |||||
| nr_thread = 256; | |||||
| uint32_t nr_flt_per_blk = nr_thread / nr_thpf; | |||||
| while (nr_flt_per_blk * nr_thpf % WARP_SIZE) | |||||
| --nr_flt_per_blk; | |||||
| megdnn_assert(nr_flt_per_blk); | |||||
| int nr_block = | |||||
| DIVUP(param.flt_h * param.flt_w * param.src_chl * param.chl_mul, | |||||
| nr_flt_per_blk); | |||||
| nr_thread = nr_flt_per_blk * nr_thpf; | |||||
| uint32_t shared = nr_thread * 2 * sizeof(T); | |||||
| hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, filter_grad, | |||||
| src, dst_grad, param); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| #define DO_INST(_ct) \ | |||||
| template void run_bwd_filter(_ct*, const _ct*, const _ct*, const Param&, \ | |||||
| hipStream_t); | |||||
| #define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) | |||||
| #undef INST | |||||
| #undef DO_INST | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,132 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/chanwise/fwd.cpp.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #include "hip_header.h" | |||||
| #include "./kern.h.hip" | |||||
| #include "./kern_helper.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| using namespace chanwise; | |||||
| namespace { | |||||
| // grid idx is (inp_chl, worker_index) | |||||
| // each y-slice of a block works on an (N, CHL_MUL, OH, OW) spatial image at | |||||
| // given inp_chl | |||||
| template <typename T, int CHL_MUL_SET, int FH_SET, int FW_SET> | |||||
| __global__ void kern_fwd(T* dst, const T* src, const T* flt_tot, Param param) { | |||||
| extern __shared__ uint8_t flt_storage[]; | |||||
| T* const flt = reinterpret_cast<T*>(flt_storage); | |||||
| const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x, | |||||
| IH = param.src_h, IW = param.src_w, | |||||
| CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul, | |||||
| FH = FH_SET ? FH_SET : param.flt_h, | |||||
| FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW, | |||||
| PH = param.pad_h, PW = param.pad_w, SH = param.stride_h, | |||||
| SW = param.stride_w, OH = param.out_h, OW = param.out_w, | |||||
| TOT_OUT = N * CHL_MUL * OH * OW; | |||||
| block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL); | |||||
| uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x, | |||||
| nr_out_per_launch = blockDim.x * gridDim.y; | |||||
| for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) { | |||||
| uint32_t out_idx = out_idx_, n, chl_mul, oh, ow; | |||||
| out_idx = div_mod(out_idx, OW, ow); | |||||
| out_idx = div_mod(out_idx, OH, oh); | |||||
| if (CHL_MUL_SET == 1) { | |||||
| chl_mul = 0; | |||||
| n = out_idx; | |||||
| } else { | |||||
| n = div_mod(out_idx, CHL_MUL, chl_mul); | |||||
| } | |||||
| int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW); | |||||
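| // ih/iw may be negative near the border; the unsigned comparisons | |||||
| // below reject both underflow and overflow in a single test | |||||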
| const T* flt_base = flt + chl_mul * FSIZE; | |||||
| const T* src_base = src + int(((n * IC + ic) * IH + ih) * IW + iw); | |||||
| T sum(0); | |||||
| if (FH_SET && FW_SET) { | |||||
| #pragma unroll | |||||
| for (uint32_t fh = 0; fh < FH; ++fh) { | |||||
| if (static_cast<uint32_t>(fh + ih) < IH) { | |||||
| #pragma unroll | |||||
| for (uint32_t fw = 0; fw < FW; ++fw) { | |||||
| if (static_cast<uint32_t>(fw + iw) < IW) { | |||||
| sum += flt_base[fh * FW + fw] * | |||||
| src_base[fh * IW + fw]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| int fhmax = min(int(FH), int(IH - ih)), | |||||
| fwmax = min(int(FW), int(IW - iw)); | |||||
| for (int fh = max(0, -ih); fh < fhmax; ++fh) { | |||||
| for (int fw = max(0, -iw); fw < fwmax; ++fw) { | |||||
| sum += flt_base[fh * FW + fw] * src_base[fh * IW + fw]; | |||||
| } | |||||
| } | |||||
| } | |||||
| dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = sum; | |||||
| } | |||||
| } | |||||
| } // anonymous namespace | |||||
| template <typename T> | |||||
| void chanwise::run_fwd(T* dst, const T* src, const T* flt, const Param& param, | |||||
| hipStream_t stream) { | |||||
| void (*kern)(T*, const T*, const T*, Param); | |||||
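| // pick a specialization with compile-time filter size for the common | |||||
| // depthwise shapes; fall back to the fully dynamic kernel otherwise | |||||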
| if (param.chl_mul == 1) { | |||||
| if (param.flt_h == 3 && param.flt_w == 3) { | |||||
| kern = kern_fwd<T, 1, 3, 3>; | |||||
| } else if (param.flt_h == 4 && param.flt_w == 4) { | |||||
| kern = kern_fwd<T, 1, 4, 4>; | |||||
| } else { | |||||
| kern = kern_fwd<T, 1, 0, 0>; | |||||
| } | |||||
| } else { | |||||
| kern = kern_fwd<T, 0, 0, 0>; | |||||
| } | |||||
| int nr_thread = 256, | |||||
| nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; | |||||
| dim3 nr_block(param.src_chl, | |||||
| std::min(512, max(nr_out_dimx / (nr_thread * 4), 1))); | |||||
| uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T); | |||||
| kern<<<nr_block, nr_thread, shared, stream>>>(dst, src, flt, param); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| #define DO_INST(_ct) \ | |||||
| template void run_fwd(_ct*, const _ct*, const _ct*, const Param&, \ | |||||
| hipStream_t); | |||||
| #define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST) | |||||
| #undef INST | |||||
| #undef DO_INST | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,71 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/chanwise/kern.h.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #pragma once | |||||
| #include "src/rocm/utils.h.hip" | |||||
| #include <stdint.h> | |||||
| #include "hip_header.h" | |||||
| #if MEGDNN_CC_HOST | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| #endif | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| struct Param { | |||||
| uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, | |||||
| pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w; | |||||
| #if MEGDNN_CC_HOST | |||||
| static Param from_fwd_args(const ForwardSizeArgs& args) { | |||||
| #define U(v) static_cast<uint32_t>(v) | |||||
| auto&& src = args.src_layout->shape; | |||||
| auto&& dst = args.dst_layout->shape; | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t c_pos, hw_pos; | |||||
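| // NCHW keeps channels at dim 1 and spatial dims at 2/3; any other | |||||
| // format is treated as NHWC here | |||||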
| if (fm.format == param::Convolution::Format::NCHW) { | |||||
| c_pos = 1; | |||||
| hw_pos = 2; | |||||
| } else { | |||||
| c_pos = 3; | |||||
| hw_pos = 1; | |||||
| } | |||||
| return { | |||||
| U(src[0]), U(src[c_pos]), U(src[hw_pos]), | |||||
| U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), | |||||
| U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), | |||||
| U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), | |||||
| U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), | |||||
| }; | |||||
| #undef U | |||||
| } | |||||
| #endif | |||||
| }; | |||||
| template <typename T> | |||||
| void run_fwd(T* dst, const T* src, const T* flt, const Param& param, | |||||
| hipStream_t stream); | |||||
| template <typename T> | |||||
| void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | |||||
| const Param& param, hipStream_t stream); | |||||
| template <typename T> | |||||
| void run_bwd_filter(T* filter_grad, const T* src, const T* dst_grad, | |||||
| const Param& param, hipStream_t stream); | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: ft=cpp syntax=cpp.doxygen | |||||
| @@ -0,0 +1,51 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/chanwise/kern_helper.h.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megdnn/dtype.h" | |||||
| #include "src/rocm/utils.h.hip" | |||||
| #include <stdint.h> | |||||
| #include <algorithm> | |||||
| #include "hip_header.h" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| /*! | |||||
| * \brief return a / b and set mod to a % b | |||||
| */ | |||||
| __device__ __forceinline__ uint32_t div_mod(uint32_t a, uint32_t b, | |||||
| uint32_t& mod) { | |||||
| uint32_t ret = a / b; | |||||
| mod = a - ret * b; | |||||
| return ret; | |||||
| } | |||||
| /*! | |||||
| * \brief copy \p size contiguous elements by all threads in a block, | |||||
| * followed by a block-wide synchronization | |||||
| */ | |||||
| template <typename T> | |||||
| __device__ __forceinline__ void block_memcpy(T* dst, const T* src, | |||||
| uint32_t size) { | |||||
| for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) { | |||||
| dst[i] = src[i]; | |||||
| } | |||||
| __syncthreads(); | |||||
| } | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,130 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/1x1.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/handle.h" | |||||
| #include "src/rocm/utils.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| bool ConvolutionForwardImpl::Algo1x1::is_available(const SizeArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| const size_t MAX_WORKSPACE_SIZE = 2147483648; // 2 * 1024^3 | |||||
| if (!(fm.format == Param::Format::NCHW && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| (fm.dtype.enumv() == DTypeEnum::Float32 || | |||||
| fm.dtype.enumv() == DTypeEnum::Float16) && | |||||
| fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && | |||||
| fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && | |||||
| fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && | |||||
| fm.stride[1] == 1)) | |||||
| return false; | |||||
| if (get_workspace_in_bytes(args) > MAX_WORKSPACE_SIZE) { | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
| void ConvolutionForwardImpl::Algo1x1::extract_matmul_layouts( | |||||
| const SizeArgs& args, TensorLayout& A, TensorLayout& B, | |||||
| TensorLayout& C) { | |||||
| auto&& fm = args.filter_meta; | |||||
| A = {{fm.ocpg, fm.icpg}, fm.dtype}; | |||||
| B.ndim = 2; | |||||
| B.shape[0] = args.src_layout->shape[1]; | |||||
| B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3]; | |||||
| B.stride[0] = args.src_layout->stride[1]; | |||||
| B.stride[1] = 1; | |||||
| B.dtype = args.src_layout->dtype; | |||||
| C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype}; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::Algo1x1::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| TensorLayout A, B, C; | |||||
| extract_matmul_layouts(args, A, B, C); | |||||
| return args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C); | |||||
| } | |||||
| void ConvolutionForwardImpl::Algo1x1::exec(const ExecArgs& args) const { | |||||
| TensorND A, B, C; | |||||
| extract_matmul_layouts(args, A.layout, B.layout, C.layout); | |||||
| A.raw_ptr = args.filter_tensor->raw_ptr; | |||||
| B.raw_ptr = args.src_tensor->raw_ptr; | |||||
| C.raw_ptr = args.dst_tensor->raw_ptr; | |||||
| size_t batch = args.src_layout->shape[0]; | |||||
| auto mm = args.handle->matmul_opr(); | |||||
| auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(), | |||||
| strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size(); | |||||
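| // one GEMM per batch element, advancing the src/dst pointers by | |||||
| // their batch strides (in bytes) between iterations | |||||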
| for (size_t i = 0; i < batch; ++i) { | |||||
| mm->exec(A, B, C, args.workspace); | |||||
| incr_voidp(B.raw_ptr, strd_B); | |||||
| incr_voidp(C.raw_ptr, strd_C); | |||||
| } | |||||
| } | |||||
| /* | |||||
| * Functions to handle large batches | |||||
| */ | |||||
| bool ConvolutionForwardImpl::Algo1x1LargeBatch::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| (fm.dtype.enumv() == DTypeEnum::Float32 || | |||||
| fm.dtype.enumv() == DTypeEnum::Float16) && | |||||
| fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 && | |||||
| fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 && | |||||
| fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 && | |||||
| fm.stride[1] == 1; | |||||
| } | |||||
| void ConvolutionForwardImpl::Algo1x1LargeBatch::extract_matmul_layouts( | |||||
| const SizeArgs& args, TensorLayout& A, TensorLayout& B, | |||||
| TensorLayout& C) { | |||||
| auto&& fm = args.filter_meta; | |||||
| // A {N, OC, IC} | |||||
| // B {N, IC, H * W} | |||||
| // C {N, OC, H * W} | |||||
| size_t batched = args.src_layout->shape[0]; | |||||
| A = {{batched, fm.ocpg, fm.icpg}, fm.dtype}; | |||||
| A.stride[0] = 0; | |||||
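| // zero batch stride broadcasts the single filter to every GEMM in | |||||
| // the batch | |||||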
| B.ndim = 3; | |||||
| B.shape[1] = args.src_layout->shape[1]; | |||||
| B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3]; | |||||
| B.shape[0] = batched; | |||||
| B.stride[2] = 1; | |||||
| B.stride[1] = args.src_layout->stride[1]; | |||||
| B.stride[0] = args.src_layout->stride[0]; | |||||
| B.dtype = args.src_layout->dtype; | |||||
| C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]}, | |||||
| args.dst_layout->dtype}; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::Algo1x1LargeBatch::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| TensorLayout A, B, C; | |||||
| extract_matmul_layouts(args, A, B, C); | |||||
| return args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C); | |||||
| } | |||||
| void ConvolutionForwardImpl::Algo1x1LargeBatch::exec( | |||||
| const ExecArgs& args) const { | |||||
| TensorND A, B, C; | |||||
| extract_matmul_layouts(args, A.layout, B.layout, C.layout); | |||||
| A.raw_ptr = args.filter_tensor->raw_ptr; | |||||
| B.raw_ptr = args.src_tensor->raw_ptr; | |||||
| C.raw_ptr = args.dst_tensor->raw_ptr; | |||||
| auto mm = args.handle->batched_matrix_mul(); | |||||
| mm->exec(A, B, C, args.workspace); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,100 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/algo.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| ConvolutionForwardImpl::AlgoPack::AlgoPack() { | |||||
| miopen_algos.push_back(&miopen); | |||||
| non_miopen_algos.push_back(&matmul); | |||||
| non_miopen_algos.push_back(&inplace_matmul); | |||||
| non_miopen_algos.push_back(&a1x1); | |||||
| non_miopen_algos.push_back(&batched_matrix_mul); | |||||
| non_miopen_algos.push_back(&chanwise); | |||||
| all_algos.push_back(&matmul); | |||||
| all_algos.push_back(&inplace_matmul); | |||||
| all_algos.push_back(&a1x1); | |||||
| all_algos.push_back(&batched_matrix_mul); | |||||
| all_algos.push_back(&chanwise); | |||||
| all_algos.push_back(&miopen); | |||||
| } | |||||
| ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack; | |||||
| ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o, | |||||
| const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst) | |||||
| : SizeArgs(o, src, o->check_layout_fwd(src, filter, dst), dst) {} | |||||
| ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs( | |||||
| ConvolutionForwardImpl* o, const TensorLayout& src, | |||||
| const CanonizedFilterMeta& filter, const TensorLayout& dst) | |||||
| : ForwardSizeArgs{concrete_handle(o->handle()), &src, filter, &dst}, | |||||
| opr{o} {} | |||||
| ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs( | |||||
| ConvolutionForwardImpl* opr, _megdnn_tensor_in src, | |||||
| _megdnn_tensor_in filter, _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace) | |||||
| : SizeArgs(opr, src.layout, filter.layout, dst.layout), | |||||
| src_tensor{&src}, | |||||
| filter_tensor{&filter}, | |||||
| dst_tensor{&dst}, | |||||
| workspace{workspace} {} | |||||
| std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const { | |||||
| auto&& fm = filter_meta; | |||||
| MEGDNN_MARK_USED_VAR(fm); | |||||
| return megdnn_mangle(ssprintf( | |||||
| "src=%s, filter=%u{%u,%u,%u,%u}, dst=%s, " | |||||
| "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", | |||||
| src_layout->to_string().c_str(), fm.group, fm.ocpg, fm.icpg, | |||||
| fm.spatial[0], fm.spatial[1], dst_layout->to_string().c_str(), | |||||
| fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], | |||||
| fm.dilation[0], fm.dilation[1], !fm.should_flip, | |||||
| src_layout->dtype.name(), dst_layout->dtype.name())); | |||||
| } | |||||
| convolution::MIOpenCacheKey | |||||
| ConvolutionForwardImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key() const { | |||||
| convolution::MIOpenCacheKey res; | |||||
| res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle()); | |||||
| res.batch = src_layout->operator[](0); | |||||
| res.IC = src_layout->operator[](1); | |||||
| res.IH = src_layout->operator[](2); | |||||
| res.IW = src_layout->operator[](3); | |||||
| res.OH = dst_layout->operator[](2); | |||||
| res.OW = dst_layout->operator[](3); | |||||
| res.FH = filter_meta.spatial[0]; | |||||
| res.FW = filter_meta.spatial[1]; | |||||
| res.SH = filter_meta.stride[0]; | |||||
| res.SW = filter_meta.stride[1]; | |||||
| res.PH = filter_meta.padding[0]; | |||||
| res.PW = filter_meta.padding[1]; | |||||
| res.DH = filter_meta.dilation[0]; | |||||
| res.DW = filter_meta.dilation[1]; | |||||
| res.group = filter_meta.group; | |||||
| res.ocpg = filter_meta.ocpg; | |||||
| res.icpg = filter_meta.icpg; | |||||
| res.dtype_enum = static_cast<uint32_t>(src_layout->dtype.enumv()); | |||||
| res.exhaustive_search = | |||||
| static_cast<int32_t>(handle->enable_miopen_algo_search()); | |||||
| res.OC = res.group * res.ocpg; | |||||
| return res; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,194 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/algo.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megdnn/oprs.h" | |||||
| #include "src/common/utils.h" | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| #include "src/rocm/convolution/opr_impl.h" | |||||
| #include "src/rocm/handle.h" | |||||
| #include <unordered_map> | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| /*! | |||||
| * \brief base class for convolution algos | |||||
| */ | |||||
| class ConvolutionForwardImpl::AlgoBase : public Algorithm { | |||||
| protected: | |||||
| ~AlgoBase() = default; | |||||
| public: | |||||
| struct SizeArgs : public convolution::ForwardSizeArgs { | |||||
| ConvolutionForwardImpl* opr; | |||||
| std::string to_string() const; | |||||
| convolution::MIOpenCacheKey to_miopen_algo_cache_key() const; | |||||
| void init_desc(convolution::MIOpenForwardDescs& desc) const { | |||||
| desc.set(*src_layout, filter_meta, *dst_layout, opr->param()); | |||||
| } | |||||
| SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src, | |||||
| const TensorLayout& filter, const TensorLayout& dst); | |||||
| SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src, | |||||
| const CanonizedFilterMeta& filter, const TensorLayout& dst); | |||||
| }; | |||||
| struct ExecArgs : public SizeArgs { | |||||
| const TensorND *src_tensor, *filter_tensor, *dst_tensor; | |||||
| Workspace workspace; | |||||
| ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src, | |||||
| _megdnn_tensor_in filter, _megdnn_tensor_out dst, | |||||
| _megdnn_workspace workspace); | |||||
| }; | |||||
| virtual bool is_available(const SizeArgs& args) const = 0; | |||||
| virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0; | |||||
| virtual void exec(const ExecArgs& args) const = 0; | |||||
| bool is_available_wk(const SizeArgs& args, size_t limit) { | |||||
| return is_available(args) && get_workspace_in_bytes(args) <= limit; | |||||
| } | |||||
| bool is_available_reproducible( | |||||
| const SizeArgs& args, bool reproducible = true, | |||||
| size_t limit = std::numeric_limits<size_t>::max()) { | |||||
| return (!reproducible || is_reproducible()) && | |||||
| is_available_wk(args, limit); | |||||
| } | |||||
| AlgoBase& check_workspace(const SizeArgs& args, | |||||
| const Workspace& workspace) { | |||||
| auto req = get_workspace_in_bytes(args); | |||||
| megdnn_assert(req <= workspace.size, | |||||
| "conv fwd algo %s: required workspace %zu bytes, got %zu", | |||||
| name(), req, workspace.size); | |||||
| return *this; | |||||
| } | |||||
| virtual bool is_miopen() const { return false; } | |||||
| }; | |||||
| class ConvolutionForwardImpl::AlgoMIOpen final : public AlgoBase { | |||||
| bool m_is_reproducible; | |||||
| const char* m_name; | |||||
| miopenConvFwdAlgorithm_t find_best_algo(const ExecArgs& args); | |||||
| public: | |||||
| AlgoMIOpen() = delete; | |||||
| AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| bool is_reproducible() const override { return m_is_reproducible; } | |||||
| const char* name() const override { return "MIOpenConvolutionForward"; } | |||||
| bool is_miopen() const override { return true; } | |||||
| static convolution::MIOpenCache<SizeArgs, miopenConvFwdAlgorithm_t> | |||||
| sm_miopen_algo_cache; | |||||
| static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache; | |||||
| }; | |||||
| class ConvolutionForwardImpl::AlgoMatmul final : public AlgoBase { | |||||
| template <typename T> | |||||
| static void exec_internal(const ExecArgs& args); | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "MATMUL"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| //! compute small matmul in the kernel | |||||
| class ConvolutionForwardImpl::AlgoInplaceMatmul final : public AlgoBase { | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "INPLACE_MATMUL"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| //! optimized 1x1 conv | |||||
| class ConvolutionForwardImpl::Algo1x1 final : public AlgoBase { | |||||
| static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, | |||||
| TensorLayout& B, TensorLayout& C); | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "1x1"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| //! optimized 1x1 conv when the input batch size is larger than 32 | |||||
| class ConvolutionForwardImpl::Algo1x1LargeBatch final : public AlgoBase { | |||||
| static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A, | |||||
| TensorLayout& B, TensorLayout& C); | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "LARGE_BATCH_1x1"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| class ConvolutionForwardImpl::AlgoChanwise final : public AlgoBase { | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "CHANNEL_WISE"; } | |||||
| bool is_reproducible() const override { return true; } | |||||
| }; | |||||
| class ConvolutionForwardImpl::AlgoPack { | |||||
| // defined in miopen.cpp | |||||
| void fill_miopen_algos(); | |||||
| AlgoPack(const AlgoPack&) = delete; | |||||
| AlgoPack& operator=(const AlgoPack&) = delete; | |||||
| public: | |||||
| AlgoPack(); | |||||
| AlgoMIOpen miopen{true}; | |||||
| AlgoMatmul matmul; | |||||
| AlgoInplaceMatmul inplace_matmul; | |||||
| Algo1x1 a1x1; | |||||
| Algo1x1LargeBatch batched_matrix_mul; | |||||
| AlgoChanwise chanwise; | |||||
| std::vector<AlgoBase*> | |||||
| //! all algorithms | |||||
| all_algos, miopen_algos, non_miopen_algos; | |||||
| }; | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,54 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/chanwise.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/convolution/chanwise/kern.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| bool ConvolutionForwardImpl::AlgoChanwise::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.src_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && | |||||
| fm.dilation[1] == 1 && !fm.should_flip; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::AlgoChanwise::get_workspace_in_bytes( | |||||
| const SizeArgs&) const { | |||||
| return 0; | |||||
| } | |||||
| void ConvolutionForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const { | |||||
| auto kparam = chanwise::Param::from_fwd_args(args); | |||||
| auto stream = hip_stream(args.handle); | |||||
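| // dtype dispatch: the cb macro expands to one case per floating-point | |||||
| // dtype listed in MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT | |||||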
| switch (args.src_layout->dtype.enumv()) { | |||||
| #define cb(_dt) \ | |||||
| case DTypeTrait<_dt>::enumv: { \ | |||||
| using ctype = DTypeTrait<_dt>::ctype; \ | |||||
| return chanwise::run_fwd( \ | |||||
| args.dst_tensor->ptr<ctype>(), args.src_tensor->ptr<ctype>(), \ | |||||
| args.filter_tensor->ptr<ctype>(), kparam, stream); \ | |||||
| } | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||||
| #undef cb | |||||
| default: | |||||
| break; | |||||
| } | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,49 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/inplace_matmul.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "./inplace_matmul_impl.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| bool ConvolutionForwardImpl::AlgoInplaceMatmul::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.src_layout->dtype == dtype::Float32() && fm.group == 1 && | |||||
| fm.spatial_ndim == 2 && fm.dilation[0] == 1 && fm.dilation[1] == 1; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes( | |||||
| const SizeArgs&) const { | |||||
| return 0; | |||||
| } | |||||
| void ConvolutionForwardImpl::AlgoInplaceMatmul::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t N = args.src_layout->shape[0], IC = fm.icpg, | |||||
| IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], | |||||
| OC = fm.ocpg, OH = args.dst_layout->shape[2], | |||||
| OW = args.dst_layout->shape[3], FH = fm.spatial[0], | |||||
| FW = fm.spatial[1]; | |||||
| auto stream = args.handle->stream(); | |||||
| convolution::exec_inplace_matmul_fwd( | |||||
| args.src_tensor->ptr<dt_float32>(), | |||||
| args.filter_tensor->ptr<dt_float32>(), | |||||
| args.dst_tensor->ptr<dt_float32>(), N, args.src_layout->stride[0], | |||||
| args.dst_layout->stride[0], IC, IH, IW, OC, OH, OW, FH, FW, | |||||
| fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], | |||||
| !fm.should_flip, stream); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,377 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #include "./inplace_matmul_impl.h.hip" | |||||
| #include "src/rocm/utils.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| namespace { | |||||
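| //! input/filter loads go through one of two fetchers: a 1-D texture | |||||
| //! object (served by the texture cache) or a plain pointer fallback; | |||||
| //! both expose the same get(), so the kernel is templated on the fetcher | |||||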
| struct BufferFetcherTexture { | |||||
| hipTextureObject_t tex; | |||||
| __device__ __forceinline__ float get(uint32_t offset) { | |||||
| return tex1Dfetch<float>(tex, offset); | |||||
| } | |||||
| }; | |||||
| struct BufferFetcherRaw { | |||||
| const float* ptr; | |||||
| __device__ __forceinline__ float get(uint32_t offset) { | |||||
| return ptr[offset]; | |||||
| } | |||||
| }; | |||||
| struct BufferFetcherTextureHost { | |||||
| bool init_succ; | |||||
| BufferFetcherTexture val; | |||||
| BufferFetcherTextureHost(float* p, const size_t n); | |||||
| ~BufferFetcherTextureHost() { reset(); } | |||||
| void reset() { | |||||
| if (init_succ) { | |||||
| hip_check(hipDestroyTextureObject(val.tex)); | |||||
| init_succ = false; | |||||
| } | |||||
| } | |||||
| }; | |||||
| BufferFetcherTextureHost::BufferFetcherTextureHost(float* p, const size_t n) { | |||||
| init_succ = false; | |||||
| hipTextureObject_t tex_obj; | |||||
| hipResourceDesc res_desc; | |||||
| memset(&res_desc, 0, sizeof(hipResourceDesc)); | |||||
| res_desc.resType = hipResourceTypeLinear; | |||||
| res_desc.res.linear.devPtr = static_cast<void*>(p); | |||||
| res_desc.res.linear.sizeInBytes = n * sizeof(float); | |||||
| res_desc.res.linear.desc = | |||||
| hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat); | |||||
| hipTextureDesc tex_desc; | |||||
| memset(&tex_desc, 0, sizeof(hipTextureDesc)); | |||||
| if (hipCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) == | |||||
| hipSuccess) { | |||||
| val.tex = tex_obj; | |||||
| init_succ = true; | |||||
| } else { | |||||
| hipGetLastError(); // reset error | |||||
| } | |||||
| } | |||||
| template <class BufferFetcher> | |||||
| struct KernelPtr { | |||||
| typedef void (*type)(BufferFetcher, BufferFetcher, float*, uint32_t, | |||||
| uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, | |||||
| uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, | |||||
| uint32_t, uint32_t, uint32_t); | |||||
| }; | |||||
| //! 1 -> 0xffffffff, 0 -> 0x00000000 | |||||
| __device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) { | |||||
| return (!cond) - 1u; | |||||
| } | |||||
| union FloatAndU32 { | |||||
| float f; | |||||
| uint32_t u; | |||||
| }; | |||||
| //! \p mask must be either all 1 or 0 bits | |||||
| template <class BufferFetcher> | |||||
| __device__ __forceinline__ float visit_with_mask(BufferFetcher buf, | |||||
| uint32_t offset, | |||||
| uint32_t mask) { | |||||
| FloatAndU32 f; | |||||
| f.f = buf.get(offset & mask); | |||||
| f.u &= mask; | |||||
| return f.f; | |||||
| } | |||||
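| //! Implicit-GEMM convolution: the filter acts as an OC x (IC*FH*FW) | |||||
| //! matrix A and the input as an implicitly-im2col'd (IC*FH*FW) x (OH*OW) | |||||
| //! matrix B for each batch sample. Thread blocks stage float4 tiles of | |||||
| //! A and B in shared memory (localA/localB), each thread accumulates a | |||||
| //! 4x4 output tile, and out-of-bound input reads are zeroed through the | |||||
| //! mask helpers above instead of branching. | |||||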
| template <uint32_t BY, uint32_t BX, bool is_xcorr, class BufferFetcher> | |||||
| __global__ void conv_kernel(BufferFetcher src, BufferFetcher filter, float* dst, | |||||
| const uint32_t INP_BS, const uint32_t OUT_BS, | |||||
| const uint32_t IC, const uint32_t IH, | |||||
| const uint32_t IW, const uint32_t OC, | |||||
| const uint32_t OH, const uint32_t OW, | |||||
| const uint32_t FH, const uint32_t FW, | |||||
| const uint32_t SH, const uint32_t SW, | |||||
| const uint32_t PH, const uint32_t PW) { | |||||
| const uint32_t BM = BY < BX ? BY : BX; | |||||
| const uint32_t n = blockIdx.z; | |||||
| const uint32_t tidx = threadIdx.x; | |||||
| const uint32_t tidy = threadIdx.y; | |||||
| const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y; | |||||
| const uint32_t posx2 = posx << 2; | |||||
| const uint32_t posy2 = posy << 2; | |||||
| const uint32_t heightA = OC; | |||||
| const uint32_t widthA = IC * FH * FW; | |||||
| const uint32_t heightB = widthA; | |||||
| const uint32_t widthB = OH * OW; | |||||
| const uint32_t oh0 = (posx2 + 0) / OW * SH; | |||||
| const uint32_t ow0 = (posx2 + 0) % OW * SW; | |||||
| const uint32_t op0 = oh0 * IW + ow0; | |||||
| const uint32_t oh1 = (posx2 + 1) / OW * SH; | |||||
| const uint32_t ow1 = (posx2 + 1) % OW * SW; | |||||
| const uint32_t op1 = oh1 * IW + ow1; | |||||
| const uint32_t oh2 = (posx2 + 2) / OW * SH; | |||||
| const uint32_t ow2 = (posx2 + 2) % OW * SW; | |||||
| const uint32_t op2 = oh2 * IW + ow2; | |||||
| const uint32_t oh3 = (posx2 + 3) / OW * SH; | |||||
| const uint32_t ow3 = (posx2 + 3) % OW * SW; | |||||
| const uint32_t op3 = oh3 * IW + ow3; | |||||
| const uint32_t FP = FH * FW; | |||||
| __shared__ float4 localA[BY][BM]; | |||||
| __shared__ float4 localB[BM][BX]; | |||||
| uint32_t i = 0u; | |||||
| uint32_t offsetA = posy2 * widthA + tidx; | |||||
| uint32_t offsetB = n * INP_BS - PH * IW - PW; | |||||
| float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, sum1 = {0.0f, 0.0f, 0.0f, 0.0f}, | |||||
| sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, sum3 = {0.0f, 0.0f, 0.0f, 0.0f}; | |||||
| uint32_t fh = tidy / FW % FH; | |||||
| uint32_t fw = tidy % FW; | |||||
| uint32_t ic = tidy / (FH * FW); | |||||
| uint32_t icm = tidy % (FH * FW); | |||||
| const uint32_t fhs = BM / FW % FH; | |||||
| const uint32_t fws = BM % FW; | |||||
| const uint32_t ics = BM / (FH * FW); | |||||
| const uint32_t icms = BM % (FH * FW); | |||||
| for (; i < widthA; i += BM, offsetA += BM) { | |||||
| // load localA | |||||
| if (tidx < BM) { | |||||
| localA[tidy][tidx].x = filter.get(offsetA + 0 * widthA); | |||||
| localA[tidy][tidx].y = filter.get(offsetA + 1 * widthA); | |||||
| localA[tidy][tidx].z = filter.get(offsetA + 2 * widthA); | |||||
| localA[tidy][tidx].w = filter.get(offsetA + 3 * widthA); | |||||
| } | |||||
| // load localB | |||||
| uint32_t fh2, fw2; | |||||
| if (is_xcorr) { | |||||
| fh2 = fh; | |||||
| fw2 = fw; | |||||
| } else { | |||||
| fh2 = FH - fh - 1; | |||||
| fw2 = FW - fw - 1; | |||||
| } | |||||
| if (tidy < BM) { | |||||
| uint32_t tmp = offsetB + (ic * IH + (fh2)) * IW + (fw2), | |||||
| ok = bool_as_mask(tidy + i < heightB), | |||||
| p0 = bool_as_mask(fh2 + oh0 >= PH && fh2 + oh0 < IH + PH && | |||||
| fw2 + ow0 >= PW && fw2 + ow0 < IW + PW), | |||||
| p1 = bool_as_mask(fh2 + oh1 >= PH && fh2 + oh1 < IH + PH && | |||||
| fw2 + ow1 >= PW && fw2 + ow1 < IW + PW), | |||||
| p2 = bool_as_mask(fh2 + oh2 >= PH && fh2 + oh2 < IH + PH && | |||||
| fw2 + ow2 >= PW && fw2 + ow2 < IW + PW), | |||||
| p3 = bool_as_mask(fh2 + oh3 >= PH && fh2 + oh3 < IH + PH && | |||||
| fw2 + ow3 >= PW && fw2 + ow3 < IW + PW); | |||||
| localB[tidy][tidx].x = visit_with_mask(src, tmp + op0, ok & p0); | |||||
| localB[tidy][tidx].y = visit_with_mask(src, tmp + op1, ok & p1); | |||||
| localB[tidy][tidx].z = visit_with_mask(src, tmp + op2, ok & p2); | |||||
| localB[tidy][tidx].w = visit_with_mask(src, tmp + op3, ok & p3); | |||||
| } | |||||
| __syncthreads(); | |||||
| for (uint32_t j = 0u; j < BM; ++j) { | |||||
| float4 tmpA = localA[tidy][j]; | |||||
| float4 tmpB = localB[j][tidx]; | |||||
| sum0.x += tmpA.x * tmpB.x; | |||||
| sum0.y += tmpA.x * tmpB.y; | |||||
| sum0.z += tmpA.x * tmpB.z; | |||||
| sum0.w += tmpA.x * tmpB.w; | |||||
| sum1.x += tmpA.y * tmpB.x; | |||||
| sum1.y += tmpA.y * tmpB.y; | |||||
| sum1.z += tmpA.y * tmpB.z; | |||||
| sum1.w += tmpA.y * tmpB.w; | |||||
| sum2.x += tmpA.z * tmpB.x; | |||||
| sum2.y += tmpA.z * tmpB.y; | |||||
| sum2.z += tmpA.z * tmpB.z; | |||||
| sum2.w += tmpA.z * tmpB.w; | |||||
| sum3.x += tmpA.w * tmpB.x; | |||||
| sum3.y += tmpA.w * tmpB.y; | |||||
| sum3.z += tmpA.w * tmpB.z; | |||||
| sum3.w += tmpA.w * tmpB.w; | |||||
| } | |||||
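| // advance (ic, fh, fw) by BM filter positions without div/mod inside | |||||
| // the loop: fhs/fws/ics/icms are precomputed steps and the comparisons | |||||
| // below apply the carries branchlessly | |||||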
| fw += fws; | |||||
| fh += fhs; | |||||
| fh += (fw >= FW); | |||||
| fh -= (fh >= FH) * FH; | |||||
| fw -= (fw >= FW) * FW; | |||||
| ic += ics; | |||||
| icm += icms; | |||||
| ic += (icm >= FP); | |||||
| icm -= (icm >= FP) * FP; | |||||
| __syncthreads(); | |||||
| } | |||||
| const uint32_t dst_idx = n * OUT_BS + posy2 * widthB + posx2; | |||||
| bool y0 = (posy2 + 0 < heightA); | |||||
| bool y1 = (posy2 + 1 < heightA); | |||||
| bool y2 = (posy2 + 2 < heightA); | |||||
| bool y3 = (posy2 + 3 < heightA); | |||||
| bool x0 = (posx2 + 0 < widthB); | |||||
| bool x1 = (posx2 + 1 < widthB); | |||||
| bool x2 = (posx2 + 2 < widthB); | |||||
| bool x3 = (posx2 + 3 < widthB); | |||||
| if (y0) { | |||||
| if (x0) | |||||
| dst[dst_idx + 0 * widthB + 0] = sum0.x; | |||||
| if (x1) | |||||
| dst[dst_idx + 0 * widthB + 1] = sum0.y; | |||||
| if (x2) | |||||
| dst[dst_idx + 0 * widthB + 2] = sum0.z; | |||||
| if (x3) | |||||
| dst[dst_idx + 0 * widthB + 3] = sum0.w; | |||||
| } | |||||
| if (y1) { | |||||
| if (x0) | |||||
| dst[dst_idx + 1 * widthB + 0] = sum1.x; | |||||
| if (x1) | |||||
| dst[dst_idx + 1 * widthB + 1] = sum1.y; | |||||
| if (x2) | |||||
| dst[dst_idx + 1 * widthB + 2] = sum1.z; | |||||
| if (x3) | |||||
| dst[dst_idx + 1 * widthB + 3] = sum1.w; | |||||
| } | |||||
| if (y2) { | |||||
| if (x0) | |||||
| dst[dst_idx + 2 * widthB + 0] = sum2.x; | |||||
| if (x1) | |||||
| dst[dst_idx + 2 * widthB + 1] = sum2.y; | |||||
| if (x2) | |||||
| dst[dst_idx + 2 * widthB + 2] = sum2.z; | |||||
| if (x3) | |||||
| dst[dst_idx + 2 * widthB + 3] = sum2.w; | |||||
| } | |||||
| if (y3) { | |||||
| if (x0) | |||||
| dst[dst_idx + 3 * widthB + 0] = sum3.x; | |||||
| if (x1) | |||||
| dst[dst_idx + 3 * widthB + 1] = sum3.y; | |||||
| if (x2) | |||||
| dst[dst_idx + 3 * widthB + 2] = sum3.z; | |||||
| if (x3) | |||||
| dst[dst_idx + 3 * widthB + 3] = sum3.w; | |||||
| } | |||||
| } | |||||
| } // anonymous namespace | |||||
| void convolution::exec_inplace_matmul_fwd( | |||||
| const float* src, const float* filter, float* dst, size_t N, | |||||
| size_t INP_BS, size_t OUT_BS, size_t IC, size_t IH, size_t IW, | |||||
| size_t OC, size_t OH, size_t OW, size_t FH, size_t FW, size_t PH, | |||||
| size_t PW, size_t SH, size_t SW, bool is_xcorr, hipStream_t stream) { | |||||
| BufferFetcherTextureHost src_tex(const_cast<float*>(src), N * INP_BS), | |||||
| filter_tex(const_cast<float*>(filter), OC * IC * FH * FW); | |||||
| BufferFetcherRaw src_buf, filter_buf; | |||||
| src_buf.ptr = src; | |||||
| filter_buf.ptr = filter; | |||||
| if (!src_tex.init_succ || !filter_tex.init_succ) { | |||||
| src_tex.reset(); | |||||
| filter_tex.reset(); | |||||
| } | |||||
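| // choose a thread-block shape with BX * BY == 256 threads, biased | |||||
| // towards the larger GEMM dimension (m = OC rows, n = OH * OW columns); | |||||
| // each thread then covers a 4x4 output tile | |||||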
| int m = OC; | |||||
| int n = OH * OW; | |||||
| int BY = 1; | |||||
| int BX = 1; | |||||
| if (m <= 64) { | |||||
| while (BY < 16 && (BY << 2) < m) | |||||
| BY <<= 1; | |||||
| BX = 256 / BY; | |||||
| } else if (n <= 64) { | |||||
| while (BX < 16 && (BX << 2) < n) | |||||
| BX <<= 1; | |||||
| BY = 256 / BX; | |||||
| } else { | |||||
| BX = BY = 16; | |||||
| } | |||||
| dim3 blocks((OH * OW + BX * 4 - 1) / (BX * 4), (OC + BY * 4 - 1) / (BY * 4), | |||||
| N); | |||||
| dim3 threads(BX, BY); | |||||
| #define DISPATCH_BX_BY(BX, BY) \ | |||||
| do { \ | |||||
| if (src_tex.init_succ) { \ | |||||
| KernelPtr<BufferFetcherTexture>::type kptr; \ | |||||
| if (is_xcorr) { \ | |||||
| kptr = conv_kernel<BY, BX, true, BufferFetcherTexture>; \ | |||||
| } else { \ | |||||
| kptr = conv_kernel<BY, BX, false, BufferFetcherTexture>; \ | |||||
| } \ | |||||
| kptr<<<blocks, threads, 0, stream>>>( \ | |||||
| src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \ | |||||
| IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \ | |||||
| } else { \ | |||||
| KernelPtr<BufferFetcherRaw>::type kptr; \ | |||||
| if (is_xcorr) { \ | |||||
| kptr = conv_kernel<BY, BX, true, BufferFetcherRaw>; \ | |||||
| } else { \ | |||||
| kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \ | |||||
| } \ | |||||
| kptr<<<blocks, threads, 0, stream>>>( \ | |||||
| src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \ | |||||
| OH, OW, FH, FW, SH, SW, PH, PW); \ | |||||
| } \ | |||||
| } while (0) | |||||
| #define DISPATCH_BX(BX) \ | |||||
| do { \ | |||||
| DISPATCH_BX_BY(BX, 256 / BX); \ | |||||
| } while (0) | |||||
| #define DISPATCH() \ | |||||
| do { \ | |||||
| switch (BX) { \ | |||||
| case 1: \ | |||||
| DISPATCH_BX(1); \ | |||||
| break; \ | |||||
| case 2: \ | |||||
| DISPATCH_BX(2); \ | |||||
| break; \ | |||||
| case 4: \ | |||||
| DISPATCH_BX(4); \ | |||||
| break; \ | |||||
| case 8: \ | |||||
| DISPATCH_BX(8); \ | |||||
| break; \ | |||||
| case 16: \ | |||||
| DISPATCH_BX(16); \ | |||||
| break; \ | |||||
| case 32: \ | |||||
| DISPATCH_BX(32); \ | |||||
| break; \ | |||||
| case 64: \ | |||||
| DISPATCH_BX(64); \ | |||||
| break; \ | |||||
| case 128: \ | |||||
| DISPATCH_BX(128); \ | |||||
| break; \ | |||||
| case 256: \ | |||||
| DISPATCH_BX(256); \ | |||||
| break; \ | |||||
| default: \ | |||||
| report_error("no usable kernel"); \ | |||||
| } \ | |||||
| } while (0) | |||||
| DISPATCH(); | |||||
| #undef DISPATCH | |||||
| #undef DISPATCH_BX | |||||
| #undef DISPATCH_BX_BY | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,30 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/forward/inplace_matmul_impl.h.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #pragma once | |||||
| #include <stddef.h> | |||||
| #include <stdint.h> | |||||
| #include "hip_header.h" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| void exec_inplace_matmul_fwd(const float* src, const float* filter, float* dst, | |||||
| size_t N, size_t INP_BS, size_t OUT_BS, size_t IC, | |||||
| size_t IH, size_t IW, size_t OC, size_t OH, | |||||
| size_t OW, size_t FH, size_t FW, size_t PH, | |||||
| size_t PW, size_t SH, size_t SW, bool is_xcorr, | |||||
| hipStream_t stream); | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: ft=cpp syntax=cpp.doxygen | |||||
| @@ -0,0 +1,83 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/matmul.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./algo.h" | |||||
| #include "src/rocm/utils.h" | |||||
| #include "src/rocm/utils.h.hip" | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| #include "src/rocm/convolution/im2col.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| bool ConvolutionForwardImpl::AlgoMatmul::is_available( | |||||
| const SizeArgs& args) const { | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.format == Param::Format::NCHW && | |||||
| args.src_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 && | |||||
| fm.group == 1 && fm.spatial_ndim == 2; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::AlgoMatmul::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| return matmul_get_workspace_bundle(args).total_size_in_bytes(); | |||||
| } | |||||
| void ConvolutionForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const { | |||||
| #define cb(DType) \ | |||||
| if (args.src_layout->dtype == DType()) { \ | |||||
| using ctype = typename DTypeTrait<DType>::ctype; \ | |||||
| exec_internal<ctype>(args); \ | |||||
| return; \ | |||||
| } | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb) | |||||
| #undef cb | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| template <typename T> | |||||
| void ConvolutionForwardImpl::AlgoMatmul::exec_internal(const ExecArgs& args) { | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t N = args.src_layout->shape[0], IC = fm.icpg, | |||||
| IH = args.src_layout->shape[2], IW = args.src_layout->shape[3], | |||||
| OC = fm.ocpg, OH = args.dst_layout->shape[2], | |||||
| OW = args.dst_layout->shape[3], FH = fm.spatial[0], | |||||
| FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1], | |||||
| SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0], | |||||
| DW = fm.dilation[1]; | |||||
| auto stream = hip_stream(args.handle); | |||||
| auto wbundle = matmul_get_workspace_bundle(args); | |||||
| wbundle.set(args.workspace.raw_ptr); | |||||
| T* dst_t = static_cast<T*>(wbundle.get(0)); | |||||
| T* col = static_cast<T*>(wbundle.get(1)); | |||||
| convolution::im2col<T>(args.src_tensor->ptr<T>(), col, N, | |||||
| args.src_layout->stride[0], IC, IH, IW, FH, FW, OH, | |||||
| OW, PH, PW, SH, SW, DH, DW, stream); | |||||
| TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()), | |||||
| Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait<T>::dtype()), | |||||
| Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype()); | |||||
| TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(dst_t, Cl); | |||||
| if (fm.should_flip) { | |||||
| convolution::flip_filter(args, wbundle.get_workspace(2), A.raw_ptr); | |||||
| } | |||||
| args.handle->matmul_opr()->exec(A, B, C, Workspace()); | |||||
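| // C is in (OC, OH*OW*N) layout; view it as (OC*OH*OW, N) and relayout | |||||
| // into dst, whose strides send column n to batch n of the output | |||||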
| TensorLayout C2l({OC * OH * OW, N}, typename DTypeTrait<T>::dtype()), | |||||
| C3l = C2l; | |||||
| C3l.stride[0] = 1; | |||||
| C3l.stride[1] = args.dst_tensor->layout.stride[0]; | |||||
| TensorND C2(dst_t, C2l); | |||||
| TensorND C3(args.dst_tensor->ptr<T>(), C3l); | |||||
| args.handle->relayout_opr()->exec(C2, C3); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,111 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/forward/miopen.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./algo.h" | |||||
| #include <mutex> | |||||
| #include "src/rocm/convolution/helper.h" | |||||
| #include "src/rocm/miopen_wrapper.h" | |||||
| #include "src/rocm/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| MIOpenCache<ConvolutionForwardImpl::AlgoBase::SizeArgs, | |||||
| miopenConvFwdAlgorithm_t> | |||||
| ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_algo_cache; | |||||
| MIOpenCache<ConvolutionForwardImpl::AlgoBase::SizeArgs, size_t> | |||||
| ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_ws_cache; | |||||
| bool ConvolutionForwardImpl::AlgoMIOpen::is_available( | |||||
| const SizeArgs& args) const { | |||||
| if (!is_miopen_supported(args)) | |||||
| return false; | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return true; | |||||
| MIOpenForwardDescs D; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionForwardGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc, | |||||
| D.conv_desc.desc, D.dst_desc.desc, &workspace_size); | |||||
| if (status == miopenStatusSuccess) { | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return true; | |||||
| } | |||||
| return false; | |||||
| } | |||||
| size_t ConvolutionForwardImpl::AlgoMIOpen::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| auto got = sm_miopen_ws_cache.get(args); | |||||
| if (got.first) | |||||
| return got.second; | |||||
| MIOpenForwardDescs D; | |||||
| args.init_desc(D); | |||||
| size_t workspace_size; | |||||
| auto status = miopenConvolutionForwardGetWorkSpaceSize( | |||||
| args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc, | |||||
| D.conv_desc.desc, D.dst_desc.desc, &workspace_size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv fwd get workspace failed: %s; info: %s", | |||||
| miopenGetErrorString(status), args.to_string().c_str()); | |||||
| sm_miopen_ws_cache.set(args, workspace_size); | |||||
| return workspace_size; | |||||
| } | |||||
| miopenConvFwdAlgorithm_t ConvolutionForwardImpl::AlgoMIOpen::find_best_algo( | |||||
| const ExecArgs& args) { | |||||
| auto find_algo = sm_miopen_algo_cache.get(args); | |||||
| if (find_algo.first) | |||||
| return find_algo.second; | |||||
| bool exhaustive_search = args.handle->enable_miopen_algo_search(); | |||||
| MIOpenForwardDescs D; | |||||
| args.init_desc(D); | |||||
| const int req_algo_count = 1; | |||||
| int ret_algo_count; | |||||
| miopenConvAlgoPerf_t algo_perf; | |||||
| miopen_check(miopenFindConvolutionForwardAlgorithm( | |||||
| args.handle->miopen_handle(), D.src_desc.desc, | |||||
| args.src_tensor->raw_ptr, D.filter_desc.desc, | |||||
| args.filter_tensor->raw_ptr, D.conv_desc.desc, D.dst_desc.desc, | |||||
| args.dst_tensor->raw_ptr, req_algo_count, &ret_algo_count, | |||||
| &algo_perf, args.workspace.raw_ptr, args.workspace.size, | |||||
| exhaustive_search)); | |||||
| sm_miopen_algo_cache.set(args, algo_perf.fwd_algo); | |||||
| return algo_perf.fwd_algo; | |||||
| } | |||||
| void ConvolutionForwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const { | |||||
| MIOpenForwardDescs D; | |||||
| args.init_desc(D); | |||||
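| // find_best_algo() is non-const (it fills the static algo cache), so | |||||
| // constness of this algo object is cast away here | |||||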
| auto algo = const_cast<ConvolutionForwardImpl::AlgoMIOpen*>(this) | |||||
| ->find_best_algo(args); | |||||
| float alpha = 1.0f, beta = 0.0f; | |||||
| auto status = miopenConvolutionForward( | |||||
| args.handle->miopen_handle(), &alpha, D.src_desc.desc, | |||||
| args.src_tensor->raw_ptr, D.filter_desc.desc, | |||||
| args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta, | |||||
| D.dst_desc.desc, args.dst_tensor->raw_ptr, args.workspace.raw_ptr, | |||||
| args.workspace.size); | |||||
| megdnn_assert(status == miopenStatusSuccess, | |||||
| "conv fwd failed: %s; info: %s", miopenGetErrorString(status), | |||||
| args.to_string().c_str()); | |||||
| } | |||||
| void ConvolutionForwardImpl::AlgoPack::fill_miopen_algos() { | |||||
| megdnn_throw("MIOpen has implemented auto-tuning in the framework, so we do not need to choose algorithms manually"); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,102 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/helper.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./helper.h" | |||||
| #include "./forward/algo.h" | |||||
| #include "./backward_data/algo.h" | |||||
| #include "./backward_filter/algo.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| using namespace convolution; | |||||
| bool convolution::is_miopen_supported(const ForwardSizeArgs& args) { | |||||
| //! TODO: only the NCHW format is supported now; MIOpen does not seem | |||||
| //! to support NHWC or NCHW4 yet | |||||
| if (args.filter_meta.format != param::Convolution::Format::NCHW) { | |||||
| return false; | |||||
| } | |||||
| auto& fm = args.filter_meta; | |||||
| //! TODO: MIOpen does not seem to support non-xcorr convolution | |||||
| return !fm.should_flip; | |||||
| } | |||||
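| //! serialize the POD key byte-for-byte so it can serve directly as an | |||||
| //! unordered_map key (note: the string also captures struct padding) | |||||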
| std::string MIOpenCacheKey::to_string_binary() const { | |||||
| std::string ret(sizeof(MIOpenCacheKey), '\0'); | |||||
| auto ptr = reinterpret_cast<MIOpenCacheKey*>(&ret[0]); | |||||
| *ptr = *this; | |||||
| return ret; | |||||
| } | |||||
| template <typename Args, typename ValueType> | |||||
| void MIOpenCache<Args, ValueType>::set(const Args& args, ValueType val) { | |||||
| std::string key = args.to_miopen_algo_cache_key().to_string_binary(); | |||||
| std::lock_guard<std::mutex> guard{m_mtx}; | |||||
| m_cache[key] = val; | |||||
| } | |||||
| template <typename Args, typename ValueType> | |||||
| std::pair<bool, ValueType> MIOpenCache<Args, ValueType>::get(const Args& args) { | |||||
| std::string key = args.to_miopen_algo_cache_key().to_string_binary(); | |||||
| std::lock_guard<std::mutex> guard{m_mtx}; | |||||
| auto search = m_cache.find(key); | |||||
| bool find = search != m_cache.end(); | |||||
| ValueType val = ValueType(); | |||||
| if (find) { | |||||
| val = search->second; | |||||
| } | |||||
| return std::make_pair(find, val); | |||||
| } | |||||
| #define INST(_opr, _miopen_algo) \ | |||||
| template class megdnn::rocm::convolution::MIOpenCache< \ | |||||
| _opr::AlgoBase::SizeArgs, _miopen_algo>; \ | |||||
| template class megdnn::rocm::convolution::MIOpenCache< \ | |||||
| _opr::AlgoBase::SizeArgs, size_t>; | |||||
| INST(ConvolutionForwardImpl, miopenConvFwdAlgorithm_t); | |||||
| INST(ConvolutionBackwardDataImpl, miopenConvBwdDataAlgorithm_t); | |||||
| INST(ConvolutionBackwardFilterImpl, miopenConvBwdWeightsAlgorithm_t); | |||||
| WorkspaceBundle convolution::matmul_get_workspace_bundle( | |||||
| const ForwardSizeArgs& args) { | |||||
| auto dtype = args.src_layout->dtype; | |||||
| auto&& fm = args.filter_meta; | |||||
| megdnn_assert(fm.group == 1); | |||||
| auto N = args.src_layout->shape[0]; | |||||
| auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; | |||||
| auto OH = args.dst_layout->shape[2], OW = args.dst_layout->shape[3]; | |||||
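| // bundle parts: (0) matmul output in (OC, OH*OW*N) layout, (1) im2col | |||||
| // matrix of shape (IC*FH*FW, OH*OW*N), (2) flipped filter, allocated | |||||
| // only for non-xcorr convolution | |||||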
| SmallVector<size_t> sizes{dtype.size() * args.dst_layout->total_nr_elems(), | |||||
| dtype.size() * IC * FH * FW * OH * OW * N}; | |||||
| if (fm.should_flip) { | |||||
| sizes.push_back(dtype.size() * OC * IC * FH * FW); | |||||
| } | |||||
| return {nullptr, std::move(sizes)}; | |||||
| } | |||||
| void convolution::flip_filter(const ForwardSizeArgs& args, | |||||
| const Workspace& workspace, void*& raw_ptr) { | |||||
| auto&& fm = args.filter_meta; | |||||
| megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2); | |||||
| auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1]; | |||||
| auto dtype = fm.dtype; | |||||
| megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW); | |||||
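| // dst points at the last spatial element and uses negated spatial | |||||
| // strides, so the relayout below writes each FH x FW window in reverse | |||||
| // order, i.e. it flips the filter | |||||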
| TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}}, | |||||
| dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout}; | |||||
| dst.layout.stride[2] = -dst.layout.stride[2]; | |||||
| dst.layout.stride[3] = -dst.layout.stride[3]; | |||||
| args.handle->relayout_opr()->exec(src, dst); | |||||
| raw_ptr = workspace.raw_ptr; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,139 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/helper.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "./opr_impl.h" | |||||
| #include "src/rocm/miopen_wrapper.h" | |||||
| #include "src/rocm/handle.h" | |||||
| #include "src/common/utils.h" | |||||
| #include "src/common/algo_chooser.h" | |||||
| #include <unordered_map> | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| struct MIOpenCacheKey { | |||||
| int64_t miopen_handle; | |||||
| uint32_t batch, IC, IH, IW, OC, OH, OW, FH, FW, SH, SW, PH, PW, DH, DW, | |||||
| group, ocpg, icpg, dtype_enum; | |||||
| int exhaustive_search; | |||||
| std::string to_string_binary() const; | |||||
| }; | |||||
| //! cache of MIOpen algorithms and workspace sizes, used to avoid | |||||
| //! calling find() and GetWorkSpaceSize() redundantly | |||||
| template <typename Args, typename ValueType> | |||||
| class MIOpenCache { | |||||
| using HashMap = std::unordered_map<std::string, ValueType>; | |||||
| HashMap m_cache; | |||||
| std::mutex m_mtx; | |||||
| public: | |||||
| MIOpenCache() = default; | |||||
| ~MIOpenCache() noexcept = default; | |||||
| void set(const Args& args, ValueType val); | |||||
| std::pair<bool, ValueType> get(const Args& args); | |||||
| }; | |||||
| using CanonizedFilterMeta = ConvolutionForward::CanonizedFilterMeta; | |||||
| //! conv size descriptor in the forward view | |||||
| struct ForwardSizeArgs { | |||||
| HandleImpl* handle; | |||||
| const TensorLayout* src_layout; | |||||
| CanonizedFilterMeta filter_meta; | |||||
| const TensorLayout* dst_layout; | |||||
| }; | |||||
| //! whether miopen is supported for a filter meta | |||||
| bool is_miopen_supported(const ForwardSizeArgs& args); | |||||
| //! get workspace bundle for matmul algo | |||||
| WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs& args); | |||||
| /*! | |||||
| * \brief flip conv filter | |||||
| * | |||||
| * Flip the conv filter pointed to by \p raw_ptr, store the result in the | |||||
| * workspace, and redirect \p raw_ptr to the workspace. | |||||
| */ | |||||
| void flip_filter(const ForwardSizeArgs& args, const Workspace& workspace, | |||||
| void*& raw_ptr); | |||||
| struct MIOpenForwardDescs { | |||||
| TensorDesc src_desc, filter_desc, dst_desc; | |||||
| ConvDesc conv_desc; | |||||
| void set(const TensorLayout& src, const CanonizedFilterMeta& filter, | |||||
| const TensorLayout& dst, const param::Convolution& param) { | |||||
| src_desc.set(src, param.format); | |||||
| auto&& group = filter.group; | |||||
| auto&& ocpg = filter.ocpg; | |||||
| auto&& icpg = filter.icpg; | |||||
| auto&& fh = filter.spatial[0]; | |||||
| auto&& fw = filter.spatial[1]; | |||||
| TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype}; | |||||
| filter_desc.set(filter_layout, param.format); | |||||
| dst_desc.set(dst, param.format); | |||||
| bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && | |||||
| (icpg == 1) && (ocpg == 1); | |||||
| conv_desc.set(param, filter.group, is_depthwise); | |||||
| } | |||||
| }; | |||||
| struct MIOpenBwdDataDescs { | |||||
| TensorDesc diff_desc, filter_desc, grad_desc; | |||||
| ConvDesc conv_desc; | |||||
| void set(const CanonizedFilterMeta& filter, const TensorLayout& diff, | |||||
| const TensorLayout& grad, const param::Convolution& param) { | |||||
| auto&& group = filter.group; | |||||
| auto&& ocpg = filter.ocpg; | |||||
| auto&& icpg = filter.icpg; | |||||
| auto&& fh = filter.spatial[0]; | |||||
| auto&& fw = filter.spatial[1]; | |||||
| TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype}; | |||||
| filter_desc.set(filter_layout, param.format); | |||||
| diff_desc.set(diff, param.format); | |||||
| grad_desc.set(grad, param.format); | |||||
| bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && | |||||
| (icpg == 1) && (ocpg == 1); | |||||
| conv_desc.set(param, filter.group, is_depthwise); | |||||
| } | |||||
| }; | |||||
| struct MIOpenBwdFilterDescs { | |||||
| TensorDesc diff_desc, src_desc, grad_desc; | |||||
| ConvDesc conv_desc; | |||||
| void set(const TensorLayout& src, const TensorLayout& diff, | |||||
| const CanonizedFilterMeta& grad, const param::Convolution& param) { | |||||
| src_desc.set(src, param.format); | |||||
| diff_desc.set(diff, param.format); | |||||
| auto&& group = grad.group; | |||||
| auto&& ocpg = grad.ocpg; | |||||
| auto&& icpg = grad.icpg; | |||||
| auto&& fh = grad.spatial[0]; | |||||
| auto&& fw = grad.spatial[1]; | |||||
| TensorLayout grad_layout{{group * ocpg, icpg, fh, fw}, grad.dtype}; | |||||
| grad_desc.set(grad_layout, param.format); | |||||
| bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP && | |||||
| (icpg == 1) && (ocpg == 1); | |||||
| conv_desc.set(param, grad.group, is_depthwise); | |||||
| } | |||||
| }; | |||||
| //! TODO: MIOpen does not support non-xcorr convolution for now; support | |||||
| //! is expected in the future. | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,129 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/im2col.cpp.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #include "./im2col.h.hip" | |||||
| #include "megdnn/dtype.h" | |||||
| #include "src/rocm/utils.h.hip" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| namespace { | |||||
| template <typename T> | |||||
| __global__ void im2col_kernel(const T* im, T* col, uint32_t N, uint32_t INP_BS, | |||||
| uint32_t IC, uint32_t IH, uint32_t IW, | |||||
| uint32_t FH, uint32_t FW, uint32_t OH, | |||||
| uint32_t OW, uint32_t PH, uint32_t PW, | |||||
| uint32_t SH, uint32_t SW, uint32_t DH, | |||||
| uint32_t DW) { | |||||
| uint32_t n = threadIdx.x + blockIdx.y * blockDim.x; | |||||
| uint32_t ow = threadIdx.y + blockIdx.z * blockDim.y; | |||||
| uint32_t oh = blockIdx.x % OH; | |||||
| uint32_t fw = blockIdx.x / OH % FW; | |||||
| uint32_t fh = blockIdx.x / OH / FW % FH; | |||||
| uint32_t ic = blockIdx.x / OH / FW / FH; | |||||
| if (n < N && ow < OW) { | |||||
| uint32_t didx = blockIdx.x * OW * N + ow * N + n; | |||||
| uint32_t ih = -PH + oh * SH + fh * DH; | |||||
| uint32_t iw = -PW + ow * SW + fw * DW; | |||||
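| // ih and iw are unsigned: positions inside the left/top padding | |||||
| // underflow to huge values, so the ih < IH && iw < IW test below also | |||||
| // rejects "negative" coordinates | |||||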
| col[didx] = (ih < IH && iw < IW | |||||
| ? im[n * INP_BS + ic * IH * IW + ih * IW + iw] | |||||
| : T(0.0f)); | |||||
| } | |||||
| } | |||||
| template <typename T> | |||||
| __global__ void col2im_kernel(const T* col, T* im, uint32_t N, uint32_t INP_BS, | |||||
| uint32_t IC, uint32_t IH, uint32_t IW, | |||||
| uint32_t FH, uint32_t FW, uint32_t OH, | |||||
| uint32_t OW, uint32_t PH, uint32_t PW, | |||||
| uint32_t SH, uint32_t SW, uint32_t DH, | |||||
| uint32_t DW) { | |||||
| uint32_t iw = threadIdx.x + blockIdx.y * blockDim.x; | |||||
| uint32_t ih = threadIdx.y + blockIdx.z * blockDim.y; | |||||
| uint32_t ic = blockIdx.x % IC; | |||||
| uint32_t n = blockIdx.x / IC; | |||||
| if (iw < IW && ih < IH) { | |||||
| T res(0); | |||||
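| // anchorh/anchorw are unsigned: output windows not covering this input | |||||
| // pixel wrap around and fail the range test, and % SH / % SW keep only | |||||
| // positions aligned with the stride grid | |||||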
| for (uint32_t fh = 0; fh < FH; ++fh) { | |||||
| uint32_t anchorh = ih + PH - fh * DH; | |||||
| if (anchorh < OH * SH && anchorh % SH == 0) { | |||||
| uint32_t oh = anchorh / SH; | |||||
| for (uint32_t fw = 0; fw < FW; ++fw) { | |||||
| uint32_t anchorw = iw + PW - fw * DW; | |||||
| if (anchorw < OW * SW && anchorw % SW == 0) { | |||||
| uint32_t ow = anchorw / SW; | |||||
| res += col[ic * FH * FW * OH * OW * N + | |||||
| fh * FW * OH * OW * N + fw * OH * OW * N + | |||||
| oh * OW * N + ow * N + n]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| im[n * INP_BS + ic * IH * IW + ih * IW + iw] = res; | |||||
| } | |||||
| } | |||||
| } // anonymous namespace | |||||
| template <typename T> | |||||
| void convolution::im2col(const T* im, T* col, size_t N, size_t INP_BS, | |||||
| size_t IC, size_t IH, size_t IW, size_t FH, size_t FW, | |||||
| size_t OH, size_t OW, size_t PH, size_t PW, size_t SH, | |||||
| size_t SW, size_t DH, size_t DW, hipStream_t stream) { | |||||
| dim3 threads(NR_THREADS_X, NR_THREADS_Y); | |||||
| dim3 blocks(IC * FH * FW * OH, DIVUP(N, NR_THREADS_X), | |||||
| DIVUP(OW, NR_THREADS_Y)); | |||||
| hipLaunchKernelGGL(im2col_kernel<T>, blocks, threads, 0, stream, im, col, N, | |||||
| INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH, | |||||
| DW); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| template <typename T> | |||||
| void convolution::col2im(const T* col, T* im, size_t N, size_t INP_BS, | |||||
| size_t IC, size_t IH, size_t IW, size_t FH, size_t FW, | |||||
| size_t OH, size_t OW, size_t PH, size_t PW, size_t SH, | |||||
| size_t SW, size_t DH, size_t DW, hipStream_t stream) { | |||||
| dim3 threads(NR_THREADS_X, NR_THREADS_Y); | |||||
| dim3 blocks(N * IC, DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y)); | |||||
| hipLaunchKernelGGL(col2im_kernel<T>, blocks, threads, 0, stream, col, im, N, | |||||
| INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH, | |||||
| DW); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| #define DO_INST(T) \ | |||||
| template void im2col<T>(const T* im, T* col, size_t N, size_t INP_BS, \ | |||||
| size_t IC, size_t IH, size_t IW, size_t FH, \ | |||||
| size_t FW, size_t OH, size_t OW, size_t PH, \ | |||||
| size_t PW, size_t SH, size_t SW, size_t DH, \ | |||||
| size_t DW, hipStream_t stream); \ | |||||
| template void col2im<T>(const T* col, T* im, size_t N, size_t INP_BS, \ | |||||
| size_t IC, size_t IH, size_t IW, size_t FH, \ | |||||
| size_t FW, size_t OH, size_t OW, size_t PH, \ | |||||
| size_t PW, size_t SH, size_t SW, size_t DH, \ | |||||
| size_t DW, hipStream_t stream); | |||||
| #define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype) | |||||
| MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST); | |||||
| #undef DO_INST | |||||
| #undef INST | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,34 @@ | |||||
| /** | |||||
| * \file src/rocm/convolution/im2col.h.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #pragma once | |||||
| #include "hip_header.h" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| namespace convolution { | |||||
| //! col is of shape (ic*fh*fw, oh*ow*n) | |||||
| template <typename T> | |||||
| void im2col(const T* im, T* col, size_t N, size_t INP_BS, size_t IC, size_t IH, | |||||
| size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, | |||||
| size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation | |||||
| hipStream_t stream); | |||||
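| //! e.g. with illustrative sizes N=2, IC=3, FH=FW=3, OH=OW=8, col is a | |||||
| //! 27 x 128 matrix, and a GEMM against the (OC, 27) filter matrix | |||||
| //! yields all convolution outputs at once | |||||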
| template <typename T> | |||||
| void col2im(const T* col, T* im, size_t N, size_t INP_BS, size_t IC, size_t IH, | |||||
| size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH, | |||||
| size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation | |||||
| hipStream_t stream); | |||||
| } // namespace convolution | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: ft=cpp syntax=cpp.doxygen | |||||
| @@ -0,0 +1,284 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/opr_impl.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "hcc_detail/hcc_defs_prologue.h" | |||||
| #include "./backward_data/algo.h" | |||||
| #include "./backward_filter/algo.h" | |||||
| #include "./forward/algo.h" | |||||
| #include "./opr_impl.h" | |||||
| #include "src/common/algo_chooser.h" | |||||
| #include "src/rocm/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace rocm; | |||||
| #define TO_STRING2(v) #v | |||||
| #define TO_STRING(v) TO_STRING2(v) | |||||
| #define MIOPEN_VERSION_STR \ | |||||
| TO_STRING(MIOPEN_VERSION_MAJOR) \ | |||||
| "." TO_STRING(MIOPEN_VERSION_MINOR) "." TO_STRING(MIOPEN_VERSION_PATCH) | |||||
| /* ============== ConvolutionForwardImpl ============== */ | |||||
| ConvolutionForwardImpl::Algorithm* | |||||
| ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| auto fm = check_layout_fwd(src, filter, dst); | |||||
| return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes, | |||||
| reproducible); | |||||
| } | |||||
| ConvolutionForwardImpl::Algorithm* | |||||
| ConvolutionForwardImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const CanonizedFilterMeta& filter, | |||||
| const TensorLayout& dst, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| AlgoBase::SizeArgs args(this, src, filter, dst); | |||||
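| //! heuristic order: MIOpen (when supported) > channel-wise for grouped | |||||
| //! conv > 1x1 matmul for small batches > batched matmul for large | |||||
| //! batches > generic fallback over the remaining algorithms | |||||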
| //! MIOpen auto-tuning needs to run with actual tensors, so the best | |||||
| //! algorithm cannot be determined here. | |||||
| if (is_miopen_supported(args)) { | |||||
| auto algo = megdnn::get_reproducible_algo<ConvolutionForwardImpl>( | |||||
| sm_algo_pack.miopen_algos[0], reproducible); | |||||
| if (algo) | |||||
| return algo; | |||||
| } | |||||
| if (args.filter_meta.group > 1) { | |||||
| if (sm_algo_pack.chanwise.is_available_reproducible( | |||||
| args, reproducible, workspace_limit_in_bytes)) { | |||||
| return &sm_algo_pack.chanwise; | |||||
| } | |||||
| } | |||||
| auto prefer_1x1 = [&args, reproducible, workspace_limit_in_bytes]() { | |||||
| const size_t MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO = 4; | |||||
| size_t batch_size = args.src_layout->shape[0]; | |||||
| if (batch_size > MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO) { | |||||
| return false; | |||||
| } | |||||
| return sm_algo_pack.a1x1.is_available_reproducible( | |||||
| args, reproducible, workspace_limit_in_bytes); | |||||
| }; | |||||
| if (prefer_1x1()) { | |||||
| return &sm_algo_pack.a1x1; | |||||
| } | |||||
| auto prefer_1x1_large_batch = [&args, reproducible, | |||||
| workspace_limit_in_bytes]() { | |||||
| const size_t MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO = 32; | |||||
| size_t batch_size = args.src_layout->shape[0]; | |||||
| if (batch_size < MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO) { | |||||
| return false; | |||||
| } | |||||
| return sm_algo_pack.batched_matrix_mul.is_available_reproducible( | |||||
| args, reproducible, workspace_limit_in_bytes); | |||||
| }; | |||||
| if (prefer_1x1_large_batch()) { | |||||
| return &sm_algo_pack.batched_matrix_mul; | |||||
| } | |||||
| if (reproducible) { | |||||
| return megdnn::get_reproducible_algo<ConvolutionForwardImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv fwd"); | |||||
| } else { | |||||
| return megdnn::get_usable_algo<ConvolutionForwardImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv fwd"); | |||||
| } | |||||
| } | |||||
| std::vector<ConvolutionForwardImpl::Algorithm*> | |||||
| ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst) { | |||||
| return megdnn::get_all_algorithms<ConvolutionForwardImpl>( | |||||
| {this, src, filter, dst}); | |||||
| } | |||||
| size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst, const PreprocessedFilter*) { | |||||
| AlgoBase::SizeArgs args(this, src, filter, dst); | |||||
| return get_algorithm(this, src, args.filter_meta, dst) | |||||
| ->get_workspace_in_bytes(args); | |||||
| } | |||||
| void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||||
| _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_out dst, | |||||
| const PreprocessedFilter*, | |||||
| _megdnn_workspace workspace) { | |||||
| AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||||
| auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout); | |||||
| algo->check_workspace(args, workspace).exec(args); | |||||
| } | |||||
| const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||||
| return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; | |||||
| } | |||||
| /* ============== ConvolutionBackwardDataImpl ============== */ | |||||
| void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||||
| _megdnn_tensor_in diff, | |||||
| _megdnn_tensor_out grad, | |||||
| _megdnn_workspace workspace) { | |||||
| AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | |||||
| auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); | |||||
| algo->check_workspace(args, workspace).exec(args); | |||||
| } | |||||
| std::vector<ConvolutionBackwardDataImpl::Algorithm*> | |||||
| ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>( | |||||
| {this, filter, diff, grad}); | |||||
| } | |||||
| ConvolutionBackwardDataImpl::Algorithm* | |||||
| ConvolutionBackwardDataImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& filter, const TensorLayout& diff, | |||||
| const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| auto fm = check_layout_fwd(grad, filter, diff); | |||||
| return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes, | |||||
| reproducible); | |||||
| } | |||||
| ConvolutionBackwardDataImpl::Algorithm* | |||||
| ConvolutionBackwardDataImpl::get_algorithm_heuristic( | |||||
| const CanonizedFilterMeta& filter, const TensorLayout& diff, | |||||
| const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| AlgoBase::SizeArgs args(this, filter, diff, grad); | |||||
| if (is_miopen_supported(args.as_fwd_args())) { | |||||
| auto algo = megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>( | |||||
| sm_algo_pack.miopen_algos[0], reproducible); | |||||
| if (algo) | |||||
| return algo; | |||||
| } | |||||
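| // prefer special chanwise impl | |||||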
| if (args.filter_meta.group > 1 && | |||||
| sm_algo_pack.chanwise.is_available_reproducible( | |||||
| args, reproducible, workspace_limit_in_bytes)) { | |||||
| return &sm_algo_pack.chanwise; | |||||
| } | |||||
| if (reproducible) { | |||||
| return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv bwd_data"); | |||||
| } else { | |||||
| return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv bwd_data"); | |||||
| } | |||||
| } | |||||
| size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( | |||||
| const TensorLayout& filter, const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| AlgoBase::SizeArgs args(this, filter, diff, grad); | |||||
| return get_algorithm(this, args.filter_meta, diff, grad) | |||||
| ->get_workspace_in_bytes(args); | |||||
| } | |||||
| const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { | |||||
| return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; | |||||
| } | |||||
| /* ============== ConvolutionBackwardFilterImpl ============== */ | |||||
| void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||||
| _megdnn_tensor_in diff, | |||||
| _megdnn_tensor_out grad, | |||||
| _megdnn_workspace workspace) { | |||||
| AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | |||||
| auto algo = | |||||
| get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta); | |||||
| algo->check_workspace(args, workspace).exec(args); | |||||
| } | |||||
| std::vector<ConvolutionBackwardFilterImpl::Algorithm*> | |||||
| ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>( | |||||
| {this, src, diff, grad}); | |||||
| } | |||||
| ConvolutionBackwardFilterImpl::Algorithm* | |||||
| ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& diff, | |||||
| const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| auto fm = check_layout_fwd(src, grad, diff); | |||||
| return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes, | |||||
| reproducible); | |||||
| } | |||||
| ConvolutionBackwardFilterImpl::Algorithm* | |||||
| ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||||
| const TensorLayout& src, const TensorLayout& diff, | |||||
| const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, | |||||
| bool reproducible) { | |||||
| AlgoBase::SizeArgs args(this, src, diff, grad); | |||||
| if (is_miopen_supported(args.as_fwd_args())) { | |||||
| auto algo = | |||||
| megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>( | |||||
| sm_algo_pack.miopen_algos[0], reproducible); | |||||
| if (algo) | |||||
| return algo; | |||||
| } | |||||
| if (args.grad_filter_meta.group > 1 && | |||||
| sm_algo_pack.chanwise.is_available_reproducible( | |||||
| args, reproducible, workspace_limit_in_bytes)) { | |||||
| // prefer special chanwise impl | |||||
| return &sm_algo_pack.chanwise; | |||||
| } | |||||
| if (reproducible) { | |||||
| return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv bwd_filter"); | |||||
| } else { | |||||
| return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>( | |||||
| sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes, | |||||
| "rocm conv bwd_filter"); | |||||
| } | |||||
| } | |||||
| size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||||
| const TensorLayout& src, const TensorLayout& diff, | |||||
| const TensorLayout& grad) { | |||||
| AlgoBase::SizeArgs args(this, src, diff, grad); | |||||
| return get_algorithm(this, src, diff, args.grad_filter_meta) | |||||
| ->get_workspace_in_bytes(args); | |||||
| } | |||||
| const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { | |||||
| return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR; | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,154 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/convolution/opr_impl.h | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "megdnn/oprs/nn.h" | |||||
| #include "src/common/utils.h" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| class ConvolutionForwardImpl : public ConvolutionForward { | |||||
| public: | |||||
| using ConvolutionForward::ConvolutionForward; | |||||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| _megdnn_tensor_out dst, | |||||
| const PreprocessedFilter* preprocessed_filter, | |||||
| _megdnn_workspace workspace) override; | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& filter, | |||||
| const TensorLayout& dst) override; | |||||
| Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible) override; | |||||
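| // overload taking an already-canonized filter meta; the TensorLayout | |||||
| // overload above canonizes the filter layout and delegates to this one | |||||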
| Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||||
| const CanonizedFilterMeta& filter, | |||||
| const TensorLayout& dst, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible); | |||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& filter, | |||||
| const TensorLayout& dst, | |||||
| const PreprocessedFilter*) override; | |||||
| size_t get_preprocess_workspace_in_bytes(const TensorLayout&, | |||||
| const TensorLayout&, | |||||
| const TensorLayout&) override { | |||||
| return 0; | |||||
| } | |||||
| void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, | |||||
| const TensorLayout&, PreprocessedFilter*, | |||||
| _megdnn_workspace) override { | |||||
| megdnn_throw("convolution exec_preprocess has not implemented yet"); | |||||
| } | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| const TensorLayout&, const TensorLayout&, | |||||
| const TensorLayout&) override { | |||||
| return {}; | |||||
| } | |||||
| const char* get_algorithm_set_name() const override; | |||||
| class AlgoBase; | |||||
| class AlgoMIOpen; | |||||
| class AlgoMatmul; | |||||
| class AlgoInplaceMatmul; | |||||
| class Algo1x1; | |||||
| class Algo1x1LargeBatch; | |||||
| class AlgoChanwise; | |||||
| class AlgoPack; | |||||
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||||
| private: | |||||
| static AlgoPack sm_algo_pack; | |||||
| }; | |||||
| class ConvolutionBackwardDataImpl : public ConvolutionBackwardData { | |||||
| public: | |||||
| using ConvolutionBackwardData::ConvolutionBackwardData; | |||||
| void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, | |||||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& filter, const TensorLayout& diff, | |||||
| const TensorLayout& grad) override; | |||||
| Algorithm* get_algorithm_heuristic(const TensorLayout& filter, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible) override; | |||||
| Algorithm* get_algorithm_heuristic(const CanonizedFilterMeta& filter, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible); | |||||
| size_t get_workspace_in_bytes(const TensorLayout& filter, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) override; | |||||
| const char* get_algorithm_set_name() const override; | |||||
| class AlgoBase; | |||||
| class AlgoMIOpen; | |||||
| class AlgoMatmul; | |||||
| class AlgoChanwise; | |||||
| class AlgoPack; | |||||
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||||
| private: | |||||
| static AlgoPack sm_algo_pack; | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl : public ConvolutionBackwardFilter { | |||||
| public: | |||||
| using ConvolutionBackwardFilter::ConvolutionBackwardFilter; | |||||
| void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, | |||||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||||
| std::vector<Algorithm*> get_all_algorithms( | |||||
| const TensorLayout& src, const TensorLayout& diff, | |||||
| const TensorLayout& grad) override; | |||||
| Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible) override; | |||||
| Algorithm* get_algorithm_heuristic(const TensorLayout& src, | |||||
| const TensorLayout& diff, | |||||
| const CanonizedFilterMeta& grad, | |||||
| size_t workspace_limit_in_bytes, | |||||
| bool reproducible); | |||||
| size_t get_workspace_in_bytes(const TensorLayout& src, | |||||
| const TensorLayout& diff, | |||||
| const TensorLayout& grad) override; | |||||
| const char* get_algorithm_set_name() const override; | |||||
| class AlgoBase; | |||||
| class AlgoMIOpen; | |||||
| class AlgoMatmul; | |||||
| class AlgoChanwise; | |||||
| class AlgoPack; | |||||
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||||
| private: | |||||
| static AlgoPack sm_algo_pack; | |||||
| }; | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,36 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/elemwise/kern_impl.inl | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #ifndef KERN_IMPL_MODE | |||||
| #error "KERN_IMPL_MODE, KERN_IMPL_ARITY and KERN_IMPL_CTYPE must be defined" | |||||
| #endif | |||||
| #include "src/rocm/elemwise/kern_wrapper.h.hip" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
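| // Each generated impl file (see the files below) defines KERN_IMPL_MODE, | |||||
| // KERN_IMPL_ARITY and KERN_IMPL_CTYPE before including this file. For | |||||
| // example, with cb(ABS), arity 1 and dt_float32, cb expands to roughly: | |||||
| //   typedef ElemwiseKern<megcorePlatformROCM, | |||||
| //                        param_enumv::Elemwise::Mode::ABS, dt_float32> | |||||
| //           KernImplABS; | |||||
| //   typedef ElemArithKernWrapper<1, KernImplABS> WrapperABS; | |||||
| //   INST_RUN_ELEMWISE(WrapperABS, dt_float32, 1); | |||||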
| #define cb(_mode) \ | |||||
| typedef ElemwiseKern<megcorePlatformROCM, \ | |||||
| param_enumv::Elemwise::Mode::_mode, KERN_IMPL_CTYPE> \ | |||||
| KernImpl##_mode; \ | |||||
| typedef ElemArithKernWrapper<KERN_IMPL_ARITY, KernImpl##_mode> \ | |||||
| Wrapper##_mode; \ | |||||
| INST_RUN_ELEMWISE(Wrapper##_mode, KERN_IMPL_CTYPE, KERN_IMPL_ARITY); | |||||
| KERN_IMPL_MODE(cb) | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,62 @@ | |||||
| /** | |||||
| * \file dnn/src/rocm/elemwise/kern_wrapper.h.hip | |||||
| * | |||||
| * This file is part of MegDNN, a deep neural network run-time library | |||||
| * developed by Megvii. | |||||
| * | |||||
| * \brief helper for implementing elemwise oprs | |||||
| * | |||||
| * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. | |||||
| */ | |||||
| #pragma once | |||||
| #include "src/rocm/elemwise_helper.h.hip" | |||||
| #include "src/common/elemwise/kern_defs.cuh" | |||||
| namespace megdnn { | |||||
| namespace rocm { | |||||
| template <int arity, class KernImpl> | |||||
| struct ElemArithKernWrapper; | |||||
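| // Each specialization below holds only the output pointer; the element-wise | |||||
| // launcher instantiated via INST_RUN_ELEMWISE (see kern_impl.inl) supplies | |||||
| // the flat element index and the input values. The __device__ body is | |||||
| // guarded by MEGDNN_CC_CUDA so it is only compiled in the device pass. | |||||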
| template <class KernImpl> | |||||
| struct ElemArithKernWrapper<1, KernImpl> { | |||||
| typedef typename KernImpl::ctype ctype; | |||||
| ctype* dst; | |||||
| #if MEGDNN_CC_CUDA | |||||
| __device__ void operator()(uint32_t idx, ctype x) { | |||||
| dst[idx] = KernImpl::apply(x); | |||||
| } | |||||
| #endif | |||||
| }; | |||||
| template <class KernImpl> | |||||
| struct ElemArithKernWrapper<2, KernImpl> { | |||||
| typedef typename KernImpl::ctype ctype; | |||||
| ctype* dst; | |||||
| #if MEGDNN_CC_CUDA | |||||
| __device__ void operator()(uint32_t idx, ctype x, ctype y) { | |||||
| dst[idx] = KernImpl::apply(x, y); | |||||
| } | |||||
| #endif | |||||
| }; | |||||
| template <class KernImpl> | |||||
| struct ElemArithKernWrapper<3, KernImpl> { | |||||
| typedef typename KernImpl::ctype ctype; | |||||
| ctype* dst; | |||||
| #if MEGDNN_CC_CUDA | |||||
| __device__ void operator()(uint32_t idx, ctype x, ctype y, ctype z) { | |||||
| dst[idx] = KernImpl::apply(x, y, z); | |||||
| } | |||||
| #endif | |||||
| }; | |||||
| } // namespace rocm | |||||
| } // namespace megdnn | |||||
| // vim: ft=cpp syntax=cpp.doxygen | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int16 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_uint8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_int16 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_int32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_int8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_uint8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int16 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_uint8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_int16 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_int32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_int8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb) | |||||
| #define KERN_IMPL_ARITY 3 | |||||
| #define KERN_IMPL_CTYPE dt_uint8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int16 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_int8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) | |||||
| #define KERN_IMPL_ARITY 2 | |||||
| #define KERN_IMPL_CTYPE dt_uint8 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||
| @@ -0,0 +1,5 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float32 | |||||
| #include "../kern_impl.inl" | |||||
| @@ -0,0 +1,7 @@ | |||||
| // generated by gen_elemwise_kern_impls.py | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | |||||
| #define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb) | |||||
| #define KERN_IMPL_ARITY 1 | |||||
| #define KERN_IMPL_CTYPE dt_float16 | |||||
| #include "../kern_impl.inl" | |||||
| #endif | |||||