You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

tensor_format.cpp 24 kB

feat(bazel/windows/xp/sp2/inference): implement inference on windows xp (os version >= sp2) build with bazel * bazel build support(define __DEPLOY_ON_XP_SP2__ when deploy on xp sp2): (dbg)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c dbg --copt "-D__DEPLOY_ON_XP_SP2__=1" (opt)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c opt --copt "-D__DEPLOY_ON_XP_SP2__=1" * internal behavior: will define MGB_HAVE_THREAD=0 when enable __DEPLOY_ON_XP_SP2__ * refer to https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 xp sp2(x86) does not support the vc runtime fully, caused by KERNEL32.dll not implementing some base apis for c++ std functions, for example, std::mutex/std::thread/std::condition_variable; as a workaround, we will disable some MegEngine features on the xp sp2 env, for example, multi-thread etc! * about DNN_MUTEX/MGB_MUTEX, if your code will build in inference code (even CPU backends), please replace std::mutex with DNN_MUTEX/MGB_MUTEX, * about multi-thread, if your code needs multi-thread support, please enable it when MGB_HAVE_THREAD=1 * about test build env status 1: Visual Studio 2019(MSVC version <= 14.26.28801)---- pass 2: Visual Studio 2019(MSVC version > 14.26.28801) ---- failed caused by this 'new' version will put VCR depends on win7 KERNEL32.DLL, this may be fixed at Visual Studio 2019 later version but we do not test at this MR merge point 3: Visual Studio 2017 ---------- pass 4: Visual Studio 2014 ---------- pass GitOrigin-RevId: 65ac48b95e99f2c510fe5db449cc8182d682e113
4 years ago
feat(bazel/windows/xp/sp2/inference): implement inference on windows xp (os version >= sp2) build with bazel * bazel build support(define __DEPLOY_ON_XP_SP2__ when deploy on xp sp2): (dbg)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c dbg --copt "-D__DEPLOY_ON_XP_SP2__=1" (opt)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c opt --copt "-D__DEPLOY_ON_XP_SP2__=1" * internal behavior: will define MGB_HAVE_THREAD=0 when enable __DEPLOY_ON_XP_SP2__ * refer to https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 xp sp2(x86) does not support the vc runtime fully, caused by KERNEL32.dll not implementing some base apis for c++ std functions, for example, std::mutex/std::thread/std::condition_variable; as a workaround, we will disable some MegEngine features on the xp sp2 env, for example, multi-thread etc! * about DNN_MUTEX/MGB_MUTEX, if your code will build in inference code (even CPU backends), please replace std::mutex with DNN_MUTEX/MGB_MUTEX, * about multi-thread, if your code needs multi-thread support, please enable it when MGB_HAVE_THREAD=1 * about test build env status 1: Visual Studio 2019(MSVC version <= 14.26.28801)---- pass 2: Visual Studio 2019(MSVC version > 14.26.28801) ---- failed caused by this 'new' version will put VCR depends on win7 KERNEL32.DLL, this may be fixed at Visual Studio 2019 later version but we do not test at this MR merge point 3: Visual Studio 2017 ---------- pass 4: Visual Studio 2014 ---------- pass GitOrigin-RevId: 65ac48b95e99f2c510fe5db449cc8182d682e113
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665
  1. /**
  2. * \file dnn/src/common/tensor_format.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megdnn/tensor_format.h"
  12. #include "megdnn/basic_types.h"
  13. #include "src/common/utils.h"
  14. #include <unordered_map>
  15. using namespace megdnn;
  16. using namespace megdnn::detail;
namespace {
// Singleton instance created by DefaultTensorFormat::make(); used by
// TensorFormat::is_default() for a cheap pointer-identity check.
DefaultTensorFormat* default_tensor_format_obj;
}  // anonymous namespace
  20. /* ===================== TensorFormat ===================== */
  21. TensorFormat TensorFormat::deserialize(const std::string& bin,
  22. const Handle* handle) {
  23. using Type = TensorFormat::Type;
  24. auto type = reinterpret_cast<const Type*>(bin.data());
  25. switch (*type) {
  26. case Type::DEFAULT:
  27. return DefaultTensorFormat::deserialize(handle, type + 1,
  28. bin.size() - sizeof(Type));
  29. case Type::IMAGE2D_PACK4:
  30. return Image2DPack4TensorFormat::deserialize(
  31. handle, type + 1, bin.size() - sizeof(Type));
  32. case Type::LOWBITS_ALIGNED_TO_BYTE:
  33. return LowbitsAlignedToBytesTensorFormat::deserialize(
  34. handle, type + 1, bin.size() - sizeof(Type));
  35. default:
  36. megdnn_throw("invalid tensor format type in deserialize");
  37. }
  38. }
//! default ctor: use the process-wide default (contiguous) tensor format
TensorFormat::Format() : m_impl{DefaultTensorFormat::make().m_impl} {}
  40. TensorFormat::Format(DType dtype) {
  41. if (dtype.valid() &&
  42. dtype.is_quantized_lowbit()) { // quantized lowbit, by default
  43. // aligned to bytes
  44. size_t size_nbits = dtype.low_bit();
  45. megdnn_assert(size_nbits == 1 || size_nbits == 2 || size_nbits == 4,
  46. "unsupported lowbits data type(%s, size in bits: %zu)",
  47. dtype.name(), size_nbits);
  48. m_impl = LowbitsAlignedToBytesTensorFormat::make(size_nbits).m_impl;
  49. } else { // non parameterized lowbit, default format
  50. m_impl = DefaultTensorFormat::make().m_impl;
  51. }
  52. }
//! human-readable description, delegated to the concrete impl
std::string TensorFormat::to_string() const {
    return m_impl->to_string();
}
  56. std::string TensorFormat::serialize() const {
  57. std::string ret;
  58. ret.reserve(32);
  59. ret.assign(sizeof(Type), '\0');
  60. *reinterpret_cast<Type*>(&ret[0]) = type();
  61. m_impl->serialize_append(ret);
  62. return ret;
  63. }
  64. void TensorFormat::on_bad_cvt(Type dst_type) const {
  65. MEGDNN_MARK_USED_VAR(dst_type);
  66. megdnn_throw(ssprintf("can not convert tensor format %s to %d",
  67. impl()->to_string().c_str(),
  68. static_cast<int>(dst_type)));
  69. }
bool TensorFormat::is_default() const {
    // pointer-identity check against the interned singleton created in
    // DefaultTensorFormat::make()
    return m_impl == default_tensor_format_obj;
}
//! whether this is the byte-aligned low-bit format
bool TensorFormat::is_lowbit_aligned() const {
    return type() == TensorFormat::Type::LOWBITS_ALIGNED_TO_BYTE;
}
  76. /* ===================== DefaultFormat ===================== */
void DefaultTensorFormat::assert_valid(const TensorLayout& layout) const {
    // the plain contiguous format cannot represent sub-byte elements; those
    // must use LowbitsAlignedToBytesTensorFormat instead
    megdnn_assert(
            !layout.dtype.valid() || !layout.dtype.is_quantized_lowbit(),
            "DefaultTensorFormat does not support quantized lowbit tensor(dtype:%s)",
            layout.dtype.name());
}
  83. size_t DefaultTensorFormat::init_contiguous_stride(TensorLayout& layout) const {
  84. assert_valid(layout);
  85. if (!layout.ndim)
  86. return 0;
  87. megdnn_assert(layout.ndim <= TensorLayout::MAX_NDIM);
  88. size_t accum = 1;
  89. SafeMultiplies<size_t> mul;
  90. for (size_t i = layout.ndim; i; --i) {
  91. layout.stride[i - 1] = accum;
  92. accum = mul(accum, layout.shape[i - 1]);
  93. }
  94. return accum;
  95. }
bool DefaultTensorFormat::is_contiguous_spec(const TensorLayout& layout) const {
    // contiguity under the default format is plain physical contiguity
    assert_valid(layout);
    return layout.is_physical_contiguous();
}
/*!
 * \brief produce an equivalent layout with as few axes as possible:
 *      drop all size-1 axes (keeping at least one), then merge adjacent
 *      axes whose strides address a contiguous region together
 */
TensorLayout DefaultTensorFormat::collapse_contiguous_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    megdnn_assert(layout.ndim);
    TensorLayout res{layout};
    // remove all dims with shape 1
    for (int i = static_cast<int>(res.ndim) - 1; i >= 0 && res.ndim >= 2; --i) {
        if (!res.shape[i]) {
            // empty tensor: canonicalize to a single zero-sized axis
            res.ndim = 1;
            res.shape[0] = 0;
            res.stride[0] = 1;
            return res;
        }
        if (res.shape[i] == 1)
            res.remove_axis_inplace(i);
    }
    if (res.ndim == 1) {
        if (res.shape[0] <= 1) {
            // make it the "most canonical" contiguous layout for scalars or
            // empty tensors
            res.stride[0] = 1;
        }
        return res;
    }
    megdnn_assert(res.ndim && res.shape[res.ndim - 1]);
    // merge axis i with i+1 when stride[i] == stride[i+1] * shape[i+1],
    // i.e. the pair is internally contiguous
    for (int i = static_cast<int>(res.ndim) - 2; i >= 0; --i) {
        megdnn_assert(res.shape[i]);
        if (res.stride[i] ==
            res.stride[i + 1] * static_cast<ptrdiff_t>(res.shape[i + 1])) {
            res.shape[i] *= res.shape[i + 1];
            res.stride[i] = res.stride[i + 1];
            res.remove_axis_inplace(i + 1);
        }
    }
    return res;
}
/*!
 * \brief compute the address span touched by \p layout, in elements and
 *      bytes; negative strides extend the low bound, positive strides the
 *      high bound
 */
TensorLayout::Span DefaultTensorFormat::span_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    if (layout.ndim == 0)
        return {0, 0, 0, 0};
    ptrdiff_t low_elem = 0;
    size_t high_elem = 0;
    for (size_t i = 0; i < layout.ndim; ++i) {
        auto shape_val = layout.shape[i];
        if (!shape_val) {
            // empty tensor occupies no memory
            return {0, 0, 0, 0};
        }
        auto stride_val = layout.stride[i];
        if (stride_val > 0) {
            high_elem += (shape_val - 1) * stride_val;
        } else {
            low_elem += (shape_val - 1) * stride_val;
        }
    }
    ++high_elem;  // make the high bound exclusive
    ptrdiff_t low_byte;
    if (low_elem < 0) {
        low_byte = low_elem * layout.dtype.size();
    } else {
        low_byte = 0;
    }
    size_t high_byte = layout.dtype.size(high_elem);
    return TensorLayout::Span(low_elem, low_byte, high_elem, high_byte);
}
//! the default format has no parameters to report
std::string DefaultTensorFormat::to_string() const {
    return "default{}";
}
//! the default format serializes to an empty payload
void DefaultTensorFormat::serialize_append(std::string&) const {}
TensorFormat DefaultTensorFormat::deserialize(const Handle* handle,
                                              const void* buf, size_t size) {
    // no payload is expected: a non-empty buffer indicates corruption
    MEGDNN_MARK_USED_VAR(handle);
    MEGDNN_MARK_USED_VAR(buf);
    megdnn_assert(!size);
    return make();
}
TensorFormat DefaultTensorFormat::make() {
    // use static storage so the object is accessible in global destructing
    // phase; the singleton is placement-new'ed into raw storage and
    // deliberately never destroyed
    static std::aligned_storage_t<sizeof(DefaultTensorFormat),
                                  alignof(DefaultTensorFormat)>
            storage;
    // also publishes the pointer for TensorFormat::is_default()
    static DefaultTensorFormat* obj = default_tensor_format_obj =
            new (&storage) DefaultTensorFormat{};
    return impl_to_tensor_format(obj);
}
  187. /* ===================== Image2DTensorFormatBase ===================== */
Image2DTensorFormatBase::Image2DTensorFormatBase(Type type, size_t align_axis,
                                                size_t align_size_in_elements)
        : ImplBase(type), m_align_axis(align_axis) {
    megdnn_assert(align_size_in_elements && align_axis);
    // alignment must be a power of two; only its log2 is stored
    m_align_size_in_elements_log2 = __builtin_ctz(align_size_in_elements);
    megdnn_assert(
            (1u << m_align_size_in_elements_log2) == align_size_in_elements,
            "align size not power of 2: %zu", align_size_in_elements);
}
void Image2DTensorFormatBase::serialize_append(std::string& result) const {
    // payload: just the alignment axis packed into SerializePack
    SerializePack pack;
    pack.align_axis = m_align_axis;
    megdnn_assert(pack.align_axis == m_align_axis);  // detect overflow
    result.append(reinterpret_cast<char*>(&pack), sizeof(pack));
}
  203. size_t Image2DTensorFormatBase::image_height(const TensorLayout& layout) const {
  204. size_t accum = 1;
  205. for (int i = m_align_axis - 1; i >= 0; --i) {
  206. if (layout.stride[i] == 0) {
  207. // this dimension is broadcasted
  208. } else {
  209. accum *= layout.shape[i];
  210. }
  211. }
  212. return accum;
  213. }
  214. size_t Image2DTensorFormatBase::image_width_elems(
  215. const TensorLayout& layout) const {
  216. size_t high_elem = 0;
  217. for (size_t i = m_align_axis; i < layout.ndim; ++i) {
  218. high_elem += (layout.shape[i] - 1) * layout.stride[i];
  219. }
  220. return high_elem + 1;
  221. }
std::string Image2DTensorFormatBase::to_string() const {
    // report alignment axis and alignment size (in elements)
    return ssprintf("I2D{%zu,%d}", m_align_axis,
                    1 << m_align_size_in_elements_log2);
}
  226. /* ===================== Image2DPackedTensorFormatBase ===================== */
  227. template <size_t PIXEL_SIZE>
  228. size_t Image2DPackedTensorFormatBase<PIXEL_SIZE>::image_width(
  229. const TensorLayout& layout) const {
  230. auto ret = image_width_elems(layout);
  231. megdnn_assert(ret % PIXEL_SIZE == 0);
  232. return ret / PIXEL_SIZE;
  233. }
/*!
 * \brief validate an image2d layout: whole pixels in the innermost axis,
 *      non-lowbit dtype, non-negative strides, and a properly aligned row
 *      pitch (the innermost non-broadcast stride below the alignment axis)
 */
template <size_t PIXEL_SIZE>
void Image2DPackedTensorFormatBase<PIXEL_SIZE>::assert_valid(
        const TensorLayout& layout) const {
    auto m_align_axis = align_axis();
    // the innermost axis must contain whole pixels
    megdnn_assert(!(layout.shape[layout.ndim - 1] % PIXEL_SIZE),
                  "bad shape: %zu", layout.shape[layout.ndim - 1]);
    megdnn_assert(layout.dtype.valid() && !layout.dtype.is_quantized_lowbit() &&
                  layout.ndim > m_align_axis);
    // scanning from innermost outwards, the last non-zero stride seen below
    // the alignment axis is the row pitch
    ptrdiff_t first_non_zero_stride = 0;
    for (int i = layout.ndim - 1; i >= 0; --i) {
        megdnn_assert(layout.shape[i] && layout.stride[i] >= 0);
        if (i < static_cast<int>(m_align_axis) && !first_non_zero_stride) {
            first_non_zero_stride = layout.stride[i];
        }
    }
    // pitch alignment is a power of two, so it can be checked with a mask
    size_t mask =
            image_pitch_alignment_in_bytes(
                    align_size_in_elements(layout.dtype.size_log()), layout) -
            1;
    megdnn_assert(!(first_non_zero_stride & mask),
                  "first stride is %d, but alignment is %zu",
                  static_cast<int>(first_non_zero_stride), mask + 1);
}
/*!
 * \brief row pitch in bytes: normally the innermost non-broadcast stride
 *      below the alignment axis, converted to bytes; when every such axis
 *      is broadcasted, fall back to the aligned row width
 */
template <size_t PIXEL_SIZE>
size_t Image2DPackedTensorFormatBase<PIXEL_SIZE>::image_row_pitch(
        const TensorLayout& layout) const {
    for (int i = align_axis() - 1; i >= 0; --i) {
        // find a non-broadcast axis
        if (auto s = layout.stride[i]) {
            return layout.dtype.size(s);
        }
    }
    // use width for all broadcasted case
    size_t alignment_in_bytes_log2 = align_size_in_elements_log2();
    if (m_vendor_type == Handle::HandleVendorType::MALI) {
        // MALI expresses alignment in pixels; scale it up to bytes
        alignment_in_bytes_log2 +=
                __builtin_ctz(layout.dtype.size() * PIXEL_SIZE);
    }
    return get_aligned_power2<size_t>(
            layout.dtype.size(image_width_elems(layout)),
            1 << alignment_in_bytes_log2);
}
  276. template <size_t PIXEL_SIZE>
  277. size_t
  278. Image2DPackedTensorFormatBase<PIXEL_SIZE>::image_pitch_alignment_in_bytes(
  279. size_t align_size_in_elements, const TensorLayout& layout) const {
  280. return m_vendor_type == Handle::HandleVendorType::MALI
  281. ? (align_size_in_elements * layout.dtype.size() * PIXEL_SIZE)
  282. : align_size_in_elements;
  283. }
template <size_t PIXEL_SIZE>
TensorLayout::Span Image2DPackedTensorFormatBase<PIXEL_SIZE>::span_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    // total image size in bytes = rows * row pitch
    size_t size = image_height(layout) * image_row_pitch(layout);
    // the byte size must be a whole number of dtype elements
    auto mask = (1 << layout.dtype.size_log()) - 1;
    megdnn_assert(!(size & mask), "unaligned size: %zu", size);
    return {0, 0, size >> layout.dtype.size_log(), size};
}
  293. template <size_t PIXEL_SIZE>
  294. size_t Image2DPackedTensorFormatBase<PIXEL_SIZE>::init_contiguous_stride(
  295. TensorLayout& layout) const {
  296. auto m_align_axis = align_axis();
  297. if (!layout.ndim)
  298. return 0;
  299. megdnn_assert(layout.dtype.valid() && layout.ndim > m_align_axis,
  300. "dtype=%s ndim=%zu align=%zu", layout.dtype.name(),
  301. layout.ndim, m_align_axis);
  302. size_t align_size = image_pitch_alignment_in_bytes(
  303. align_size_in_elements(layout.dtype.size_log()), layout);
  304. size_t accum = 1;
  305. SafeMultiplies<size_t> mul;
  306. for (size_t i = layout.ndim; i; --i) {
  307. if (i == m_align_axis) {
  308. accum = get_aligned_power2<size_t>(accum, align_size);
  309. }
  310. layout.stride[i - 1] = accum;
  311. accum = mul(accum, layout.shape[i - 1]);
  312. }
  313. assert_valid(layout);
  314. return accum;
  315. };
/*!
 * \brief contiguity check: every axis must follow the previous one densely,
 *      except the height axis (just below the alignment boundary), whose
 *      stride may include padding up to the required pitch alignment
 */
template <size_t PIXEL_SIZE>
bool Image2DPackedTensorFormatBase<PIXEL_SIZE>::is_contiguous_spec(
        const TensorLayout& layout) const {
    megdnn_assert(layout.dtype.valid());
    size_t align_size = image_pitch_alignment_in_bytes(
            align_size_in_elements(layout.dtype.size_log()), layout);
    ptrdiff_t expected = 1;
    int height_axis = static_cast<int>(align_axis() - 1);
    for (int i = layout.ndim - 1; i >= 0; --i) {
        if (i == height_axis) {
            // crossing into the height part: the minimal stride is the
            // pitch-aligned row extent
            expected = megdnn::get_aligned_power2<size_t>(expected, align_size);
        }
        if (layout.shape[i] != 1 && layout.stride[i] != expected) {
            if (i == height_axis) {
                // allow row pitch to be larger than minimal required
                auto s = layout.stride[i];
                if (!s) {
                    // broadcast is not contiguous
                    return false;
                }
                size_t mask =
                        image_pitch_alignment_in_bytes(
                                align_size_in_elements(layout.dtype.size_log()),
                                layout) -
                        1;
                megdnn_assert(s > expected && !(s & mask),
                              "invalid row pitch: %d; layout: %s",
                              static_cast<int>(s), layout.to_string().c_str());
                expected = s;
            } else {
                return false;
            }
        }
        expected *= layout.shape[i];
    }
    // empty tensors are not contiguous
    return expected != 0;
}
/*!
 * \brief collapse adjacent contiguous axes while preserving the
 *      height/width split at the alignment axis; the axis index is tracked
 *      through removals and the format is re-created via change_axis()
 */
template <size_t PIXEL_SIZE>
TensorLayout Image2DPackedTensorFormatBase<PIXEL_SIZE>::collapse_contiguous_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    TensorLayout res{layout};
    int new_axis = align_axis();
    // remove all dims with shape 1, but never the last width or height dim
    for (int i = static_cast<int>(res.ndim) - 1; i >= 0 && res.ndim >= 3; --i) {
        if (i == new_axis && static_cast<int>(res.ndim) == new_axis + 1) {
            // i is the only width dim
            continue;
        }
        if (i == new_axis - 1 && !i) {
            // new_axis == 1 && i == 0, i is the only height dim
            continue;
        }
        if (res.shape[i] == 1) {
            res.remove_axis_inplace(i);
            if (i < new_axis)
                new_axis -= 1;
        }
    }
    megdnn_assert(res.ndim >= 2);
    // whether axis i and i+1 jointly address contiguous memory
    auto contig_with_next = [&](size_t i) {
        return res.stride[i] ==
               res.stride[i + 1] * static_cast<ptrdiff_t>(res.shape[i + 1]);
    };
    // merge within the width part (axes at or after new_axis)
    for (int i = static_cast<int>(res.ndim) - 2; i >= new_axis; --i) {
        megdnn_assert(res.shape[i]);
        if (contig_with_next(i)) {
            // remove next axis
            res.shape[i] *= res.shape[i + 1];
            res.stride[i] = res.stride[i + 1];
            res.remove_axis_inplace(i + 1);
        }
    }
    // merge within the height part (axes before new_axis)
    for (int i = new_axis - 2; i >= 0; --i) {
        megdnn_assert(res.shape[i]);
        if (contig_with_next(i)) {
            res.shape[i] *= res.shape[i + 1];
            res.stride[i] = res.stride[i + 1];
            res.remove_axis_inplace(i + 1);
            if (i <= new_axis - 2)
                new_axis -= 1;
        }
    }
    res.format = change_axis(new_axis);
    return res;
}
namespace megdnn {
namespace detail {
// explicit instantiation: only the 4-elements-per-pixel packed variant is
// used (Image2DPack4TensorFormat)
template class Image2DPackedTensorFormatBase<4>;
}  // namespace detail
}  // namespace megdnn
  408. /* =============== LowbitsAlignedTensorFormatBase ============== */
LowbitsAlignedTensorFormatBase::LowbitsAlignedTensorFormatBase(
        Type type, size_t size_nbits, size_t align_size_in_bits)
        : ImplBase(type),
          m_size_nbits(size_nbits),
          m_align_size_in_bits(align_size_in_bits) {
    megdnn_assert(!(m_align_size_in_bits % m_size_nbits),
                  "align size(%zu) must be a multiple of element size(%zu)",
                  m_align_size_in_bits, m_size_nbits);
    // precompute the alignment expressed in elements
    m_align_size_in_elements = m_align_size_in_bits / m_size_nbits;
}
//! report element size and alignment, both in bits
std::string LowbitsAlignedTensorFormatBase::to_string() const {
    return ssprintf("LOWBITS{%zu,%zu}", m_size_nbits, m_align_size_in_bits);
}
void LowbitsAlignedTensorFormatBase::assert_valid(
        const TensorLayout& layout) const {
    // the dtype must be exactly the low-bit type this format was built for
    megdnn_assert(layout.dtype.valid() && layout.dtype.is_low_bit() &&
                  layout.dtype.low_bit() == m_size_nbits);
    bool has_dim_unity_stride = false;
    bool has_dim_aligned_stride = false;
    for (int i = layout.ndim - 1; i >= 0; --i) {
        if (!has_dim_unity_stride && layout.stride[i] == 1)
            has_dim_unity_stride = true;
        // every stride must be non-negative and either 1 or a multiple of
        // the alignment, so each outer step starts on an aligned boundary
        megdnn_assert(
                layout.stride[i] >= 0 &&
                        (layout.stride[i] % m_align_size_in_elements == 0 ||
                         layout.stride[i] == 1),
                "bad stride:%s, %ld", layout.to_string().c_str(),
                static_cast<long>(layout.stride[i]));
        if (!has_dim_aligned_stride &&
            static_cast<size_t>(layout.stride[i]) == m_align_size_in_elements)
            has_dim_aligned_stride = true;
    }
    // some axis must be (unity or alignment-stride) innermost-contiguous
    megdnn_assert(
            layout.ndim == 0 || has_dim_unity_stride || has_dim_aligned_stride,
            "innermost dim not contiguous");
}
void LowbitsAlignedTensorFormatBase::serialize_append(
        std::string& result) const {
    // payload: element size and alignment (both in bits)
    SerializePack pack;
    pack.size_nbits = m_size_nbits;
    pack.align_size_in_bits = m_align_size_in_bits;
    megdnn_assert(pack.align_size_in_bits ==
                  m_align_size_in_bits);  // detect overflow
    result.append(reinterpret_cast<char*>(&pack), sizeof(pack));
}
/*!
 * \brief address span of a lowbit layout; strides are never negative here,
 *      so the low bound is always zero
 */
TensorLayout::Span LowbitsAlignedTensorFormatBase::span_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    if (layout.ndim == 0)
        return {0, 0, 0, 0};
    size_t high_elem = 0;
    for (size_t i = 0; i < layout.ndim; ++i) {
        auto shape_val = layout.shape[i];
        if (!shape_val) {
            // empty tensor occupies no memory
            return {0, 0, 0, 0};
        }
        auto stride_val = layout.stride[i];
        megdnn_assert(stride_val >= 0,
                      "lowbit tensors shouldn't have negative strides");
        high_elem += (shape_val - 1) * stride_val;
    }
    ++high_elem;  // make the high bound exclusive
    size_t high_byte = layout.dtype.size(high_elem);
    return TensorLayout::Span(0, 0, high_elem, high_byte);
}
/*!
 * \brief like the default contiguous stride, except the innermost extent is
 *      rounded up to the alignment so each outer step starts aligned
 * \return total span in elements (including padding); 0 if ndim == 0
 */
size_t LowbitsAlignedTensorFormatBase::init_contiguous_stride(
        TensorLayout& layout) const {
    if (!layout.ndim)
        return 0;
    megdnn_assert(layout.ndim <= TensorLayout::MAX_NDIM);
    size_t accum = 1;
    SafeMultiplies<size_t> mul;  // throws on overflow
    for (size_t i = layout.ndim; i; --i) {
        layout.stride[i - 1] = accum;
        auto multiplier = layout.shape[i - 1];
        if (i == layout.ndim)
            multiplier = round_up(multiplier, m_align_size_in_elements);
        accum = mul(accum, multiplier);
    }
    assert_valid(layout);
    return accum;
}
bool LowbitsAlignedTensorFormatBase::is_contiguous_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    ptrdiff_t expected = 1;
    for (int i = static_cast<int>(layout.ndim) - 1; i >= 0; --i) {
        // the innermost stride may be either exactly contiguous (1) or
        // padded up to the alignment in elements
        bool is_valid_stride =
                (layout.stride[i] == expected) ||
                (expected == 1 &&
                 (int)layout.stride[i] ==
                         round_up(1, (int)m_align_size_in_elements));
        if (layout.shape[i] != 1 && !is_valid_stride)
            return false;
        auto multiplier = layout.shape[i];
        if (i == static_cast<int>(layout.ndim) - 1)
            multiplier = round_up(multiplier, m_align_size_in_elements);
        expected *= multiplier;
    }
    // empty tensors are not contiguous
    return expected != 0;
}
/*!
 * \brief same collapse rules as the default format: drop size-1 axes, then
 *      merge adjacent axes that are mutually contiguous
 */
TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec(
        const TensorLayout& layout) const {
    assert_valid(layout);
    TensorLayout res{layout};
    // NOTE(review): unlike DefaultTensorFormat::collapse_contiguous_spec,
    // this loop has no `res.ndim >= 2` guard, so an all-ones shape would
    // drop every axis and trip the assert below — confirm callers never
    // pass such a layout.
    for (int i = static_cast<int>(res.ndim) - 1; i >= 0; --i) {
        if (!res.shape[i]) {
            // empty tensor: canonicalize to a single zero-sized axis
            res.ndim = 1;
            res.shape[0] = 0;
            res.stride[0] = 1;
            return res;
        }
        if (res.shape[i] == 1) {
            res.remove_axis_inplace(i);
        }
    }
    megdnn_assert(res.ndim && res.shape[res.ndim - 1]);
    for (int i = static_cast<int>(res.ndim) - 2; i >= 0; --i) {
        megdnn_assert(res.shape[i]);
        // merge axis i with i+1 when the pair is internally contiguous
        if (res.stride[i] ==
            res.stride[i + 1] * static_cast<ptrdiff_t>(res.shape[i + 1])) {
            res.shape[i] *= res.shape[i + 1];
            res.stride[i] = res.stride[i + 1];
            res.remove_axis_inplace(i + 1);
        }
    }
    return res;
}
  538. /* ===================== Image2DPack4TensorFormat ===================== */
/*!
 * \brief get the interned Image2DPack4TensorFormat for the given alignment
 *      axis and size; identical pairs share one instance kept alive for
 *      the whole process
 */
TensorFormat Image2DPack4TensorFormat::make_raw(
        size_t align_axis, size_t align_size_in_elements,
        Handle::HandleVendorType vendor_type) {
    static DNN_MUTEX mtx;
    static std::unordered_map<uint64_t,
                              std::unique_ptr<Image2DPack4TensorFormat>>
            cache;
    // both values must fit in 32 bits to form the packed cache key
    megdnn_assert(std::max(align_axis, align_size_in_elements) <=
                  std::numeric_limits<uint32_t>::max());
    MEGDNN_LOCK_GUARD(mtx);
    // NOTE(review): vendor_type is not part of the cache key, so the first
    // caller's vendor wins for a given (axis, alignment) pair — presumably
    // one process only ever uses one vendor; confirm before reusing this
    // cache across vendors.
    auto&& ptr = cache[(static_cast<uint64_t>(align_axis) << 32) |
                       align_size_in_elements];
    if (!ptr) {
        ptr.reset(new Image2DPack4TensorFormat{
                align_axis, align_size_in_elements, vendor_type});
    }
    return impl_to_tensor_format(ptr.get());
}
TensorFormat Image2DPack4TensorFormat::make(size_t align_axis,
                                            const Handle* handle) {
    // pitch alignment and vendor are taken from the execution handle
    return make_raw(align_axis, handle->image2d_pitch_alignment(),
                    handle->vendor_type());
}
TensorFormat Image2DPack4TensorFormat::deserialize(const Handle* handle,
                                                   const void* buf,
                                                   size_t size) {
    // only align_axis is serialized; pitch alignment and vendor are
    // re-derived from the current handle
    megdnn_assert(size == sizeof(SerializePack));
    auto pack = *static_cast<const SerializePack*>(buf);
    return make(pack.align_axis, handle);
}
//! same alignment size and vendor, different alignment axis
TensorFormat Image2DPack4TensorFormat::change_axis(size_t axis) const {
    return make_raw(axis, align_size_in_elements(), vendor());
}
  572. /* ===================== LowbitsitsAlignedToBytesTensorFormat
  573. * ===================== */
/*!
 * \brief get the interned byte-aligned lowbit format for an element size of
 *      \p size_nbits bits; instances live for the whole process
 */
TensorFormat LowbitsAlignedToBytesTensorFormat::make(size_t size_nbits) {
    static DNN_MUTEX mtx;
    static std::unordered_map<
            uint64_t, std::unique_ptr<LowbitsAlignedToBytesTensorFormat>>
            cache;
    megdnn_assert(!(8 % size_nbits));  // element size must divide one byte
    MEGDNN_LOCK_GUARD(mtx);
    auto&& ptr = cache[static_cast<uint32_t>(size_nbits)];
    if (!ptr) {
        ptr.reset(new LowbitsAlignedToBytesTensorFormat{size_nbits});
    }
    return impl_to_tensor_format(ptr.get());
}
TensorFormat LowbitsAlignedToBytesTensorFormat::deserialize(const Handle*,
                                                            const void* buf,
                                                            size_t size) {
    // payload carries the element size in bits; byte alignment is implied
    megdnn_assert(size == sizeof(SerializePack));
    auto pack = *static_cast<const SerializePack*>(buf);
    return make(pack.size_nbits);
}
  594. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台