You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

opr_impl.cpp 19 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. /**
  2. * \file dnn/src/cuda/convolution/opr_impl.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/cuda/convolution/opr_impl.h"
  12. #include "megdnn/dtype.h"
  13. #include "src/cuda/convolution/helper.h"
  14. #include "src/cuda/convolution/backward_data/algo.h"
  15. #include "src/cuda/convolution/backward_filter/algo.h"
  16. #include "src/cuda/conv_bias/opr_impl.h"
  17. #include "src/cuda/utils.h"
  18. using namespace megdnn;
  19. using namespace cuda;
  20. using namespace convolution;
  21. #define TO_STRING2(v) #v
  22. #define TO_STRING(v) TO_STRING2(v)
  23. #define CUDNN_VERSION_STR TO_STRING(CUDNN_MAJOR) "." \
  24. TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL)
  25. /* ============== ConvolutionForwardImpl ============== */
/**
 * \brief Build the ConvBiasForward operator (plus placeholder bias/z layouts)
 * used to route a plain convolution through the conv_bias implementation.
 *
 * The bias dtype is derived from the src/filter dtypes:
 *   - QuantizedS8 / Quantized8Asymm / Quantized4Asymm -> QuantizedS32 whose
 *     scale is the product of the src and filter scales;
 *   - Uint8 / Int8 -> Int32;
 *   - otherwise the dtype must be a float type and is reused as-is.
 *
 * The bias/z layouts carry only a dtype (no shape); together with the
 * IDENTITY nonlinearity this makes the conv_bias operator behave as a plain
 * convolution (see exec(), which passes null-data tensors for bias/z).
 */
ConvolutionForwardImpl::ConvBiasExtraData
ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) {
    auto conv_param = param();
    DType bias_type;
    if (src.dtype.enumv() == DTypeEnum::QuantizedS8) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::QuantizedS8>().scale *
                filter.dtype.param<dtype::QuantizedS8>().scale);
    } else if (src.dtype.enumv() == DTypeEnum::Quantized8Asymm) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::Quantized8Asymm>().scale *
                filter.dtype.param<dtype::Quantized8Asymm>().scale);
    } else if (src.dtype.enumv() == DTypeEnum::Uint8 ||
               src.dtype.enumv() == DTypeEnum::Int8) {
        bias_type = dtype::Int32{};
    } else if (src.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        bias_type = dtype::QuantizedS32(
                src.dtype.param<dtype::Quantized4Asymm>().scale *
                filter.dtype.param<dtype::Quantized4Asymm>().scale);
    } else {
        megdnn_assert(src.dtype.category() == DTypeCategory::FLOAT);
        bias_type = src.dtype;
    }
    ConvBiasExtraData ret = {this->handle()->create_operator<ConvBiasForward>(),
                             TensorLayout(bias_type), TensorLayout(dst.dtype)};
    // NOTE: the initializer order below must match the field order of
    // param::ConvBias; only NonlineMode is fixed (IDENTITY), everything else
    // is copied from this convolution's param.
    ret.convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
                                 conv_param.mode,
                                 conv_param.sparse,
                                 conv_param.format,
                                 conv_param.pad_h,
                                 conv_param.pad_w,
                                 conv_param.stride_h,
                                 conv_param.stride_w,
                                 conv_param.dilate_h,
                                 conv_param.dilate_w,
                                 conv_param.compute_mode};
    // propagate the externally-selected algorithm to the inner operator
    ret.convbias_opr->execution_policy() = {this->execution_policy().algo};
    return ret;
}
  67. ConvolutionForwardImpl::Algorithm*
  68. ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
  69. const TensorLayout& filter,
  70. const TensorLayout& dst,
  71. size_t workspace_limit_in_bytes,
  72. bool reproducible) {
  73. auto extra_data = conv_bias_extra_data(src, filter, dst);
  74. return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
  75. ->get_algorithm_heuristic(src, filter, extra_data.bias_layout,
  76. extra_data.z_layout, dst,
  77. workspace_limit_in_bytes, reproducible);
  78. }
  79. ConvolutionForwardImpl::Algorithm*
  80. ConvolutionForwardImpl::get_algorithm_from_desc(
  81. const ConvolutionForward::AlgorithmDesc& desc) {
  82. auto conv_param = param();
  83. auto convbias_opr = this->handle()->create_operator<ConvBiasForward>();
  84. convbias_opr->param() = {param::ConvBias::NonlineMode::IDENTITY,
  85. conv_param.mode,
  86. conv_param.sparse,
  87. conv_param.format,
  88. conv_param.pad_h,
  89. conv_param.pad_w,
  90. conv_param.stride_h,
  91. conv_param.stride_w,
  92. conv_param.dilate_h,
  93. conv_param.dilate_w,
  94. conv_param.compute_mode};
  95. convbias_opr->execution_policy() = {this->execution_policy().algo};
  96. return static_cast<ConvBiasForwardImpl*>(convbias_opr.get())
  97. ->get_algorithm_from_desc(desc);
  98. }
  99. std::vector<ConvolutionForwardImpl::Algorithm*>
  100. ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
  101. const TensorLayout& filter,
  102. const TensorLayout& dst) {
  103. auto extra_data = conv_bias_extra_data(src, filter, dst);
  104. return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
  105. ->get_all_algorithms(src, filter, extra_data.bias_layout,
  106. extra_data.z_layout, dst);
  107. }
  108. size_t ConvolutionForwardImpl::get_workspace_in_bytes(
  109. const TensorLayout& src, const TensorLayout& filter,
  110. const TensorLayout& dst,
  111. const PreprocessedFilter* preprocessed_filter) {
  112. auto extra_data = conv_bias_extra_data(src, filter, dst);
  113. return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
  114. ->get_workspace_in_bytes(
  115. src, filter, extra_data.bias_layout, extra_data.z_layout,
  116. dst,
  117. reinterpret_cast<const ConvolutionBase<
  118. param::ConvBias>::PreprocessedFilter*>(
  119. preprocessed_filter));
  120. }
  121. void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
  122. _megdnn_tensor_in filter,
  123. _megdnn_tensor_out dst,
  124. const PreprocessedFilter* preprocessed_filter,
  125. _megdnn_workspace workspace) {
  126. auto extra_data =
  127. conv_bias_extra_data(src.layout, filter.layout, dst.layout);
  128. TensorND bias(nullptr, extra_data.bias_layout);
  129. TensorND z(nullptr, extra_data.z_layout);
  130. return static_cast<ConvBiasForwardImpl*>(extra_data.convbias_opr.get())
  131. ->exec(src, filter, bias, z, dst,
  132. reinterpret_cast<const ConvolutionBase<
  133. param::ConvBias>::PreprocessedFilter*>(
  134. preprocessed_filter),
  135. workspace);
  136. }
//! Name of the algorithm set; embeds the cuDNN version string, presumably so
//! that persisted algorithm choices are keyed per cuDNN release -- TODO
//! confirm against the cache consumer.
const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}
  140. /* ============== ConvolutionBackwardDataImpl ============== */
  141. void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
  142. _megdnn_tensor_in diff,
  143. _megdnn_tensor_out grad,
  144. _megdnn_workspace workspace) {
  145. AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
  146. auto algo = get_algorithm(this, filter.layout, args.filter_meta,
  147. diff.layout, grad.layout);
  148. algo->check_workspace(args, workspace).exec(args);
  149. }
  150. std::vector<ConvolutionBackwardDataImpl::Algorithm *>
  151. ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout &filter,
  152. const TensorLayout &diff,
  153. const TensorLayout &grad) {
  154. return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>(
  155. {this, filter, diff, grad});
  156. }
  157. ConvolutionBackwardDataImpl::Algorithm*
  158. ConvolutionBackwardDataImpl::get_algorithm_heuristic(
  159. const TensorLayout& filter, const TensorLayout& diff,
  160. const TensorLayout& grad, size_t workspace_limit_in_bytes,
  161. bool reproducible) {
  162. auto fm = check_layout_fwd(grad, filter, diff);
  163. return get_algorithm_heuristic(filter, fm, diff, grad,
  164. workspace_limit_in_bytes, reproducible);
  165. }
//! Heuristic backward-data algorithm selection. Order of preference:
//!   1. the special chanwise implementation (grouped conv only);
//!   2. cuDNN's own heuristic, within the workspace limit;
//!   3. for grouped conv, cuDNN on the single-group-equivalent problem,
//!      mapped back through the group-conv wrapper;
//!   4. the non-cuDNN fallback pack (or the bfloat16 pack for BFloat16).
ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter,
        const CanonizedFilterMeta& filter_meta, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        bool reproducible) {
    AlgoBase::SizeArgs args(this, filter, filter_meta, diff, grad);
    if (args.filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes)) {
        // prefer special chanwise impl
        return &sm_algo_pack.chanwise;
    }
    // Ask cuDNN for a usable algorithm; returns nullptr when nothing fits
    // the workspace limit (and, when `reproducible`, the determinism bound).
    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes,
             reproducible]() -> ConvolutionBackwardDataImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNBwdDataDescs desc;
        args.init_desc(desc);
#if CUDNN_MAJOR >= 7
        // cuDNN >= 7: enumerate candidates ranked by the v7 heuristic and
        // take the first one satisfying the workspace/determinism limits.
        int max_count = 0;
        cudnn_check(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
                cudnn_handle, &max_count));
        SmallVector<cudnnConvolutionBwdDataAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(
                cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count,
                algo_perf.data()));
        for (int i = 0; i < ret_count; ++i) {
            if (algo_perf[i].memory > workspace_limit_in_bytes)
                continue;
            if (reproducible) {
                if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) {
                    return reinterpret_cast<AlgoBase*>(
                            sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
                }
            } else {
                return reinterpret_cast<AlgoBase*>(
                        sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
            }
        }
        return nullptr;
#else
        // pre-7 cuDNN: single-shot query under an explicit workspace limit
        cudnnConvolutionBwdDataAlgo_t algo;
        cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm(
                cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));
        auto&& cast_algo =
                reinterpret_cast<AlgoBase*>(sm_algo_pack.cudnn_from_enum(algo));
        return reinterpret_cast<AlgoBase*>(
                megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
                        cast_algo, reproducible));
#endif
    };
    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.filter_meta.group > 1) {
        // grouped conv: retry cuDNN on the modified (single-group) args and
        // map the chosen algorithm to its group-conv wrapper
        auto orig_args = args;
        TensorLayout a, b;
        AlgoGroupConvGeneral::modify_size_args(args, a, b);
        if (is_cudnn_supported(args.as_fwd_args())) {
            if (auto algo = get_cudnn_algo())
                return sm_algo_pack.algo2gconv.at(algo);
        }
        args = orig_args;  // restore: fallback paths need the original args
    }
    // last resort: scan the fallback packs (BFloat16 has a dedicated set)
    if (args.filter_layout->dtype.enumv() !=
        DTypeTrait<dtype::BFloat16>::enumv) {
        if (reproducible) {
            return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
                    sm_algo_pack.non_cudnn_algos, args,
                    workspace_limit_in_bytes, "cuda conv bwd_data");
        } else {
            return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>(
                    sm_algo_pack.non_cudnn_algos, args,
                    workspace_limit_in_bytes, "cuda conv bwd_data");
        }
    } else {
        if (reproducible) {
            return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
                    sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                    "cuda conv bwd_data");
        } else {
            return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>(
                    sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                    "cuda conv bwd_data");
        }
    }
}
  259. size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
  260. const TensorLayout &filter,
  261. const TensorLayout &diff,
  262. const TensorLayout &grad) {
  263. AlgoBase::SizeArgs args(this, filter, diff, grad);
  264. return get_algorithm(this, filter, args.filter_meta, diff, grad)->
  265. get_workspace_in_bytes(args);
  266. }
//! Name of the algorithm set; embeds the cuDNN version string, presumably so
//! that persisted algorithm choices are keyed per cuDNN release -- TODO
//! confirm against the cache consumer.
const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}
  270. /* ============== ConvolutionBackwardFilterImpl ============== */
  271. void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src,
  272. _megdnn_tensor_in diff,
  273. _megdnn_tensor_out grad,
  274. _megdnn_workspace workspace) {
  275. AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
  276. auto algo = get_algorithm(this, src.layout, diff.layout,
  277. grad.layout, args.grad_filter_meta);
  278. algo->check_workspace(args, workspace).exec(args);
  279. }
  280. std::vector<ConvolutionBackwardFilterImpl::Algorithm *>
  281. ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout &src,
  282. const TensorLayout &diff,
  283. const TensorLayout &grad) {
  284. return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>(
  285. {this, src, diff, grad});
  286. }
  287. ConvolutionBackwardFilterImpl::Algorithm*
  288. ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
  289. const TensorLayout& src, const TensorLayout& diff,
  290. const TensorLayout& grad, size_t workspace_limit_in_bytes,
  291. bool reproducible) {
  292. auto fm = check_layout_fwd(src, grad, diff);
  293. return get_algorithm_heuristic(src, diff, grad, fm,
  294. workspace_limit_in_bytes, reproducible);
  295. }
//! Heuristic backward-filter algorithm selection. Order of preference:
//!   1. the special chanwise implementation (grouped conv only);
//!   2. cuDNN's own heuristic, within the workspace limit;
//!   3. for grouped conv, cuDNN on the single-group-equivalent problem,
//!      mapped back through the group-conv wrapper;
//!   4. the non-cuDNN fallback pack (or the bfloat16 pack for BFloat16).
ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad, const CanonizedFilterMeta& grad_meta,
        size_t workspace_limit_in_bytes, bool reproducible) {
    AlgoBase::SizeArgs args(this, src, diff, grad, grad_meta);
    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes)) {
        // prefer special chanwise impl
        return &sm_algo_pack.chanwise;
    }
    // Ask cuDNN for a usable algorithm; returns nullptr when nothing fits
    // the workspace limit (and, when `reproducible`, the determinism bound).
    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes,
             reproducible]() -> ConvolutionBackwardFilterImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNBwdFilterDescs desc;
        args.init_desc(desc);
        // disabled: the performance-model heuristic segfaults in megbrain;
        // needs further investigation before it can be re-enabled
#if 0
        auto is_heuristic_success =
                convolution::PerformanceModelBackwardFilter::
                        get_algo_backward_filter_success(
                                args, desc, workspace_limit_in_bytes, &algo);
        if (is_heuristic_success) {
            return sm_algo_pack.cudnn_from_enum(algo);
        }
#endif
#if CUDNN_MAJOR >= 7
        // cuDNN >= 7: enumerate candidates ranked by the v7 heuristic and
        // take the first one satisfying the workspace/determinism limits.
        int max_count = 0;
        cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
                cudnn_handle, &max_count));
        SmallVector<cudnnConvolutionBwdFilterAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
                cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count,
                algo_perf.data()));
        for (int i = 0; i < ret_count; ++i) {
            if (algo_perf[i].memory > workspace_limit_in_bytes)
                continue;
            if (reproducible) {
                if (algo_perf[i].determinism == CUDNN_DETERMINISTIC) {
                    return reinterpret_cast<AlgoBase*>(
                            sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
                }
            } else {
                return reinterpret_cast<AlgoBase*>(
                        sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
            }
        }
        return nullptr;
#else
        // pre-7 cuDNN: single-shot query under an explicit workspace limit
        cudnnConvolutionBwdFilterAlgo_t algo;
        cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm(
                cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));
        auto&& cast_algo =
                reinterpret_cast<AlgoBase*>(sm_algo_pack.cudnn_from_enum(algo));
        return reinterpret_cast<AlgoBase*>(
                megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
                        cast_algo, reproducible));
#endif
    };
    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.grad_filter_meta.group > 1) {
        // grouped conv: retry cuDNN on the modified (single-group) args and
        // map the chosen algorithm to its group-conv wrapper
        auto orig_args = args;
        TensorLayout a, b;
        AlgoGroupConvGeneral::modify_size_args(args, a, b);
        if (is_cudnn_supported(args.as_fwd_args())) {
            if (auto algo = get_cudnn_algo())
                return sm_algo_pack.algo2gconv.at(algo);
        }
        args = orig_args;  // restore: fallback paths need the original args
    }
    // last resort: scan the fallback packs (BFloat16 has a dedicated set)
    if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
        if (reproducible) {
            return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
                    sm_algo_pack.non_cudnn_algos, args,
                    workspace_limit_in_bytes, "cuda conv bwd_filter");
        } else {
            return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>(
                    sm_algo_pack.non_cudnn_algos, args,
                    workspace_limit_in_bytes, "cuda conv bwd_filter");
        }
    } else {
        if (reproducible) {
            return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
                    sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                    "cuda conv bwd_filter");
        } else {
            return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>(
                    sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                    "cuda conv bwd_filter");
        }
    }
}
  398. size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes(
  399. const TensorLayout &src,
  400. const TensorLayout &diff,
  401. const TensorLayout &grad) {
  402. AlgoBase::SizeArgs args(this, src, diff, grad);
  403. return get_algorithm(this, src, diff, grad, args.grad_filter_meta)->
  404. get_workspace_in_bytes(args);
  405. }
//! Name of the algorithm set; embeds the cuDNN version string, presumably so
//! that persisted algorithm choices are keyed per cuDNN release -- TODO
//! confirm against the cache consumer.
const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}
  409. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台