GitOrigin-RevId: 39f29ec990
tags/v1.5.0
| @@ -568,7 +568,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec( | |||||
| res.stride[0] = 1; | res.stride[0] = 1; | ||||
| return res; | return res; | ||||
| } | } | ||||
| if (res.shape[i] == 1) { | |||||
| if (res.shape[i] == 1 && res.stride[i] != 1) { | |||||
| res.remove_axis_inplace(i); | res.remove_axis_inplace(i); | ||||
| } | } | ||||
| } | } | ||||
| @@ -16,6 +16,7 @@ | |||||
| #include "src/cuda/int_fastdiv.cuh" | #include "src/cuda/int_fastdiv.cuh" | ||||
| #include "src/cuda/query_blocksize.cuh" | #include "src/cuda/query_blocksize.cuh" | ||||
| #include "src/cuda/utils.cuh" | #include "src/cuda/utils.cuh" | ||||
| #include "src/cuda/integer_subbyte_utils.cuh" | |||||
| /* | /* | ||||
| * please note that all arithmetics on GPU are 32-bit for best performance; this | * please note that all arithmetics on GPU are 32-bit for best performance; this | ||||
| @@ -633,7 +634,7 @@ public: | |||||
| int vec_idx = offset_ >> 1; | int vec_idx = offset_ >> 1; | ||||
| int lane_idx = offset_ & 0x1; | int lane_idx = offset_ & 0x1; | ||||
| Storage item = Storage(unpack_integer_4bits<true>( | |||||
| Storage item = Storage(integer_subbyte::unpack_integer_4bits<true>( | |||||
| *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); | *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); | ||||
| dt_qint4 result(item); | dt_qint4 result(item); | ||||
| @@ -664,7 +665,7 @@ public: | |||||
| int vec_idx = offset_ >> 1; | int vec_idx = offset_ >> 1; | ||||
| int lane_idx = offset_ & 0x1; | int lane_idx = offset_ & 0x1; | ||||
| Storage item = Storage(unpack_integer_4bits<false>( | |||||
| Storage item = Storage(integer_subbyte::unpack_integer_4bits<false>( | |||||
| *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); | *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); | ||||
| dt_quint4 result(item); | dt_quint4 result(item); | ||||
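The vec_idx/lane_idx arithmetic above follows directly from the 4-bit packing: two elements share one byte, so the byte index is offset >> 1, the nibble lane is offset & 0x1, and the lane selects a shift of 0 or 4 bits for unpack_integer_4bits. A minimal host-side restatement of the same indexing (unpack_s4 and load_int4 are illustrative names, not part of this patch):

    #include <cstdint>

    // Sign-extend one 4-bit field of a packed byte: move the selected nibble
    // into the top 4 bits of a 32-bit word, then arithmetic-shift it back down.
    static inline int unpack_s4(uint8_t byte, int bit) {
        return int32_t(uint32_t(byte) << (28 - bit)) >> 28;
    }

    // Fetch the signed 4-bit element at a linear index from a packed buffer,
    // mirroring the vec_idx / lane_idx computation used by the visitors above.
    static inline int load_int4(const uint8_t* packed, int offset) {
        int vec_idx = offset >> 1;    // byte that holds the element
        int lane_idx = offset & 0x1;  // 0 -> low nibble, 1 -> high nibble
        return unpack_s4(packed[vec_idx], lane_idx * 4);
    }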
| @@ -15,6 +15,7 @@ | |||||
| #include "src/cuda/elemwise_helper_q4.cuh" | #include "src/cuda/elemwise_helper_q4.cuh" | ||||
| #include "src/cuda/elemwise_multi_type/kern.cuh" | #include "src/cuda/elemwise_multi_type/kern.cuh" | ||||
| #include "src/cuda/utils.cuh" | #include "src/cuda/utils.cuh" | ||||
| #include "src/cuda/integer_subbyte_utils.cuh" | |||||
| namespace megdnn { | namespace megdnn { | ||||
| namespace cuda { | namespace cuda { | ||||
| @@ -380,10 +381,10 @@ struct QuantizedMultiTypeOp< | |||||
| } | } | ||||
| __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { | __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { | ||||
| dst_storage x = apply( | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 0))); | |||||
| dst_storage y = apply( | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 4))); | |||||
| dst_storage x = apply(src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0))); | |||||
| dst_storage y = apply(src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4))); | |||||
| *(dst_vect_type*)(&dst[idx]) = | *(dst_vect_type*)(&dst[idx]) = | ||||
| elemwise_intl::VectTypeTrait<ctype_dst>::make_vector(x, y); | elemwise_intl::VectTypeTrait<ctype_dst>::make_vector(x, y); | ||||
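Each src_vect_type element here carries two 4-bit values, so the operator unpacks both nibbles, applies the quantization functor to each, and repacks the two results into one destination byte via make_vector. A scalar, host-side sketch of the same pattern (apply_per_nibble and op are illustrative stand-ins, not names from this patch):

    #include <cstdint>

    // Unpack both nibbles of a packed byte, apply an element-wise functor,
    // and repack the two 4-bit results; the final OR/shift plays the role of
    // VectTypeTrait::make_vector(x, y).
    template <typename Op>
    uint8_t apply_per_nibble(uint8_t packed, Op op) {
        int lo = int32_t(uint32_t(packed) << 28) >> 28;  // sign-extended bits 0..3
        int hi = int32_t(uint32_t(packed) << 24) >> 28;  // sign-extended bits 4..7
        return uint8_t((op(lo) & 0xf) | ((op(hi) & 0xf) << 4));
    }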
| @@ -470,14 +471,14 @@ struct QuantizedMultiTypeOp< | |||||
| __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, | __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, | ||||
| src_vect_type b) { | src_vect_type b) { | ||||
| src_storage a_x = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 0)); | |||||
| src_storage a_y = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 4)); | |||||
| src_storage b_x = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(b.x, 0)); | |||||
| src_storage b_y = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(b.x, 4)); | |||||
| src_storage a_x = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0)); | |||||
| src_storage a_y = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4)); | |||||
| src_storage b_x = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 0)); | |||||
| src_storage b_y = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 4)); | |||||
| dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); | dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); | ||||
| @@ -572,18 +573,18 @@ struct QuantizedMultiTypeOp< | |||||
| __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, | __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, | ||||
| src_vect_type b, | src_vect_type b, | ||||
| src_vect_type c) { | src_vect_type c) { | ||||
| src_storage a_x = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 0)); | |||||
| src_storage a_y = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(a.x, 4)); | |||||
| src_storage b_x = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(b.x, 0)); | |||||
| src_storage b_y = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(b.x, 4)); | |||||
| src_storage c_x = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(c.x, 0)); | |||||
| src_storage c_y = | |||||
| src_storage(unpack_integer_4bits<src_signedness>(c.x, 4)); | |||||
| src_storage a_x = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0)); | |||||
| src_storage a_y = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4)); | |||||
| src_storage b_x = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 0)); | |||||
| src_storage b_y = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 4)); | |||||
| src_storage c_x = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(c.x, 0)); | |||||
| src_storage c_y = src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(c.x, 4)); | |||||
| dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); | dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); | ||||
| @@ -0,0 +1,146 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/integer_subbyte_utils.cuh | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #if MEGDNN_CC_CUDA | |||||
| #pragma once | |||||
| #include "src/cuda/utils.cuh" | |||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace integer_subbyte { | |||||
| template <bool signedness> | |||||
| struct integer_trait; | |||||
| template <> | |||||
| struct integer_trait<true> { | |||||
| using type = int; | |||||
| }; | |||||
| template <> | |||||
| struct integer_trait<false> { | |||||
| using type = unsigned; | |||||
| }; | |||||
| MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( | |||||
| int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { | |||||
| unsigned out; | |||||
| #if __CUDA_ARCH__ >= 750 && \ | |||||
| ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) | |||||
| asm volatile( | |||||
| "{ .reg .u32 r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" | |||||
| "}" | |||||
| : "=r"(out) | |||||
| : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), | |||||
| "r"(s7)); | |||||
| #else | |||||
| #define CVT_SAT_S4_S32(r, bits) \ | |||||
| r = r <= -8 ? -8 : r; \ | |||||
| r = r > 7 ? 7 : r; \ | |||||
| r = (((unsigned)r & 0xf) << bits); | |||||
| CVT_SAT_S4_S32(s0, 0) | |||||
| CVT_SAT_S4_S32(s1, 4) | |||||
| CVT_SAT_S4_S32(s2, 8) | |||||
| CVT_SAT_S4_S32(s3, 12) | |||||
| CVT_SAT_S4_S32(s4, 16) | |||||
| CVT_SAT_S4_S32(s5, 20) | |||||
| CVT_SAT_S4_S32(s6, 24) | |||||
| CVT_SAT_S4_S32(s7, 28) | |||||
| out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; | |||||
| #undef CVT_SAT_S4_S32 | |||||
| #endif | |||||
| return reinterpret_cast<int const&>(out); | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( | |||||
| int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { | |||||
| unsigned out; | |||||
| #if __CUDA_ARCH__ >= 750 && \ | |||||
| ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) | |||||
| asm volatile( | |||||
| "{ .reg .u32 r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" | |||||
| "}" | |||||
| : "=r"(out) | |||||
| : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), | |||||
| "r"(s7)); | |||||
| #else | |||||
| #define CVT_SAT_U4_S32(r, bits) \ | |||||
| r = r <= 0 ? 0 : r; \ | |||||
| r = r > 15 ? 15 : r; \ | |||||
| r = (((unsigned)r & 0xf) << bits); | |||||
| CVT_SAT_U4_S32(s0, 0) | |||||
| CVT_SAT_U4_S32(s1, 4) | |||||
| CVT_SAT_U4_S32(s2, 8) | |||||
| CVT_SAT_U4_S32(s3, 12) | |||||
| CVT_SAT_U4_S32(s4, 16) | |||||
| CVT_SAT_U4_S32(s5, 20) | |||||
| CVT_SAT_U4_S32(s6, 24) | |||||
| CVT_SAT_U4_S32(s7, 28) | |||||
| out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; | |||||
| #undef CVT_SAT_U4_S32 | |||||
| #endif | |||||
| return reinterpret_cast<int const&>(out); | |||||
| } | |||||
| template <bool signedness, typename T> | |||||
| MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, | |||||
| int bits) { | |||||
| //! shift amount: 32 bits in an int minus the 4-bit element width | |||||
| static constexpr int shift = 28; | |||||
| using type = typename integer_trait<signedness>::type; | |||||
| unsigned intermediate = static_cast<unsigned>(storage); | |||||
| type result = reinterpret_cast<type&>(intermediate); | |||||
| return (result << (shift - bits)) >> shift; | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( | |||||
| int (&result)[8], const int& source) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < 8; i++) { | |||||
| result[i] = unpack_integer_4bits<true>( | |||||
| reinterpret_cast<unsigned const&>(source), (i << 2)); | |||||
| } | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( | |||||
| int (&result)[8], const int& source) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < 8; i++) { | |||||
| result[i] = unpack_integer_4bits<false>( | |||||
| reinterpret_cast<unsigned const&>(source), (i << 2)); | |||||
| } | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_int4x2_to_int8( | |||||
| int (&result)[2], const uint8_t& source) { | |||||
| result[0] = unpack_integer_4bits<true>(source, 0); | |||||
| result[1] = unpack_integer_4bits<true>(source, 4); | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_uint4x2_to_int8( | |||||
| int (&result)[2], const uint8_t& source) { | |||||
| result[0] = unpack_integer_4bits<false>(source, 0); | |||||
| result[1] = unpack_integer_4bits<false>(source, 4); | |||||
| } | |||||
| } // namespace integer_subbyte | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| #endif | |||||
| // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
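On architectures without cvt.pack.sat, the fallback path of transform_int8_to_int4x8 saturates each input to [-8, 7], masks it to 4 bits, shifts it to nibble i, and sums the eight contributions into one 32-bit word; transform_int4x8_to_int8 reverses this with unpack_integer_4bits. A host-side sanity check of that packing arithmetic (illustrative only; the real helpers are device functions):

    #include <cassert>
    #include <cstdint>

    // Mirror of the CVT_SAT_S4_S32 fallback: saturate, mask to a nibble,
    // and place element i at bits [4*i, 4*i+4).
    static uint32_t pack_s4x8(const int (&s)[8]) {
        uint32_t out = 0;
        for (int i = 0; i < 8; ++i) {
            int v = s[i] < -8 ? -8 : (s[i] > 7 ? 7 : s[i]);
            out |= (uint32_t(v) & 0xfu) << (i * 4);
        }
        return out;
    }

    int main() {
        int s[8] = {1, -2, 7, -8, 0, 3, -1, 5};
        assert(pack_s4x8(s) == 0x5F3087E1u);  // s0 in the lowest nibble, s7 in the highest
        return 0;
    }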
| @@ -0,0 +1,253 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/memory_utils.cuh | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #if MEGDNN_CC_CUDA | |||||
| #pragma once | |||||
| #include "src/cuda/utils.cuh" | |||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace memory { | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template <typename AccessType, int LoadBytes> | |||||
| struct global_load; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| // | |||||
| // Specializations | |||||
| // | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| // The redundant mov PTX instructions are used to force the compiler to | |||||
| // initialize data to zero before ld.global | |||||
| template <typename AccessType> | |||||
| struct global_load<AccessType, 32> { | |||||
| MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, | |||||
| bool pred_guard, int val = 0) { | |||||
| uint4* data = reinterpret_cast<uint4*>(&D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %9, 0;\n" | |||||
| " mov.b32 %0, %10;\n" | |||||
| " mov.b32 %1, %10;\n" | |||||
| " mov.b32 %2, %10;\n" | |||||
| " mov.b32 %3, %10;\n" | |||||
| " mov.b32 %4, %10;\n" | |||||
| " mov.b32 %5, %10;\n" | |||||
| " mov.b32 %6, %10;\n" | |||||
| " mov.b32 %7, %10;\n" | |||||
| " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n" | |||||
| " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%11];\n" | |||||
| "}\n" | |||||
| : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), | |||||
| "=r"(data[0].w), "=r"(data[1].x), "=r"(data[1].y), | |||||
| "=r"(data[1].z), "=r"(data[1].w) | |||||
| : "l"(ptr), "r"((int)pred_guard), | |||||
| "r"(reinterpret_cast<unsigned&>(val)), | |||||
| "l"(((uint8_t*)ptr) + 16)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_load<AccessType, 16> { | |||||
| MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, | |||||
| bool pred_guard, int val) { | |||||
| uint4& data = reinterpret_cast<uint4&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %5, 0;\n" | |||||
| " mov.b32 %0, %6;\n" | |||||
| " mov.b32 %1, %6;\n" | |||||
| " mov.b32 %2, %6;\n" | |||||
| " mov.b32 %3, %6;\n" | |||||
| " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" | |||||
| "}\n" | |||||
| : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) | |||||
| : "l"(ptr), "r"((int)pred_guard), | |||||
| "r"(reinterpret_cast<unsigned&>(val))); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_load<AccessType, 8> { | |||||
| MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, | |||||
| bool pred_guard, int val) { | |||||
| uint2& data = reinterpret_cast<uint2&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %3, 0;\n" | |||||
| " mov.b32 %0, %4;\n" | |||||
| " mov.b32 %1, %4;\n" | |||||
| " @p ld.global.v2.u32 {%0, %1}, [%2];\n" | |||||
| "}\n" | |||||
| : "=r"(data.x), "=r"(data.y) | |||||
| : "l"(ptr), "r"((int)pred_guard), | |||||
| "r"(reinterpret_cast<unsigned&>(val))); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_load<AccessType, 4> { | |||||
| MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, | |||||
| bool pred_guard, int val) { | |||||
| unsigned& data = reinterpret_cast<unsigned&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %2, 0;\n" | |||||
| " mov.b32 %0, %3;\n" | |||||
| " @p ld.global.u32 %0, [%1];\n" | |||||
| "}\n" | |||||
| : "=r"(data) | |||||
| : "l"(ptr), "r"((int)pred_guard), | |||||
| "r"(reinterpret_cast<unsigned&>(val))); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_load<AccessType, 1> { | |||||
| MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, | |||||
| bool pred_guard, int val) { | |||||
| if (pred_guard) | |||||
| D = *(reinterpret_cast<AccessType const*>(ptr)); | |||||
| else { | |||||
| unsigned uv = reinterpret_cast<unsigned&>(val); | |||||
| uint8_t& data = reinterpret_cast<uint8_t&>(D); | |||||
| data = uv & 0xff; | |||||
| } | |||||
| } | |||||
| }; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template < | |||||
| /// Fragment type to store loaded data | |||||
| typename AccessType, | |||||
| /// Number of bytes to load | |||||
| int LoadBytes> | |||||
| struct global_store; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| // | |||||
| // Specializations | |||||
| // | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 32> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| uint4 const* data = reinterpret_cast<uint4 const*>(&D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %5, 0;\n" | |||||
| " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" | |||||
| " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), | |||||
| "r"(data[0].w), "r"((int)pred_guard), | |||||
| "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), | |||||
| "r"(data[1].z), "r"(data[1].w)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 16> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| uint4 const& data = reinterpret_cast<uint4 const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %5, 0;\n" | |||||
| " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), | |||||
| "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 8> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| uint2 const& data = reinterpret_cast<uint2 const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %3, 0;\n" | |||||
| " @p st.global.v2.u32 [%0], {%1, %2};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 4> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| uint32_t const& data = reinterpret_cast<uint32_t const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %2, 0;\n" | |||||
| " @p st.global.u32 [%0], %1;\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 2> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| uint16_t const& data = reinterpret_cast<uint16_t const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %2, 0;\n" | |||||
| " @p st.global.u16 [%0], %1;\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "h"(data), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 1> { | |||||
| MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, | |||||
| bool pred_guard) { | |||||
| if (pred_guard) | |||||
| *(reinterpret_cast<AccessType*>(ptr)) = D; | |||||
| } | |||||
| }; | |||||
| } // namespace memory | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| #endif | |||||
| // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
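global_load and global_store wrap predicated PTX so that out-of-bounds lanes neither fault nor write; for loads, the destination fragment is filled with the caller-supplied value when the predicate is false (via the redundant mov instructions noted above). A hedged usage sketch of the 16-byte specializations in a bounds-checked copy kernel; the kernel, Frag type, and fill handling are illustrative and not part of this patch:

    // Assumes this is compiled as CUDA with src/cuda/memory_utils.cuh included.
    struct alignas(16) Frag { uint32_t v[4]; };

    __global__ void guarded_copy(const Frag* src, Frag* dst, int n, int fill) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        bool in_bounds = i < n;
        Frag frag;
        // Load 16 bytes if in bounds, otherwise fill the fragment with `fill`.
        megdnn::cuda::memory::global_load<Frag, 16>(frag, src + i, in_bounds, fill);
        // The store is predicated the same way, so out-of-range threads write nothing.
        megdnn::cuda::memory::global_store<Frag, 16>(frag, dst + i, in_bounds);
    }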
| @@ -83,12 +83,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { | |||||
| return false; | return false; | ||||
| if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) | if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) | ||||
| return false; | return false; | ||||
| size_t copy_size; | |||||
| if (ldst.dtype.is_low_bit()) { | |||||
| copy_size = ldst.access_bytes(); | |||||
| } else { | |||||
| copy_size = ldst.total_nr_elems() * dtype_size(); | |||||
| } | |||||
| size_t copy_size = ldst.span().dist_byte(); | |||||
| cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size, | cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size, | ||||
| cudaMemcpyDeviceToDevice, m_opr->stream())); | cudaMemcpyDeviceToDevice, m_opr->stream())); | ||||
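Computing the copy size as ldst.span().dist_byte() collapses the removed branch into one expression: for ordinary dtypes it matches total_nr_elems() * dtype_size(), while for low-bit dtypes it already accounts for sub-byte packing. An illustrative check of the expected byte count for a fully contiguous 4-bit tensor (assuming two elements per byte, as the packed layout uses):

    #include <cstddef>

    // Byte extent of a fully contiguous qint4/quint4 tensor: 4 bits per
    // element, rounded up to whole bytes.
    size_t contig_int4_bytes(size_t nr_elems) {
        return (nr_elems * 4 + 7) / 8;
    }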
| @@ -191,7 +186,6 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() { | |||||
| } | } | ||||
| void RelayoutForwardImpl::Param::copy_general() { | void RelayoutForwardImpl::Param::copy_general() { | ||||
| copy_noncontig_general(m_dst, m_src, m_opr->stream()); | copy_noncontig_general(m_dst, m_src, m_opr->stream()); | ||||
| } | } | ||||
| @@ -6,14 +6,15 @@ | |||||
| * | * | ||||
| * Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
| * software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | */ | ||||
| #include "megdnn/basic_types.h" | #include "megdnn/basic_types.h" | ||||
| #include "src/cuda/int_fastdiv.cuh" | #include "src/cuda/int_fastdiv.cuh" | ||||
| #include "src/cuda/integer_subbyte_utils.cuh" | |||||
| #include "src/cuda/utils.cuh" | #include "src/cuda/utils.cuh" | ||||
| #pragma once | #pragma once | ||||
| namespace megdnn { | namespace megdnn { | ||||
| @@ -56,13 +57,13 @@ y | |||||
| template <int ndim, typename ctype, ContigType contig_type> | template <int ndim, typename ctype, ContigType contig_type> | ||||
| class ParamElemVisitor; | class ParamElemVisitor; | ||||
| #define PARAM_ELEM_VISITOR_COMMON_DEV \ | #define PARAM_ELEM_VISITOR_COMMON_DEV \ | ||||
| devfunc ctype *ptr() { return m_ptr; } \ | |||||
| devfunc ctype &at(uint32_t idx) { return m_ptr[offset(idx)]; } | |||||
| devfunc ctype* ptr() { return m_ptr; } \ | |||||
| devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } | |||||
| //! specialization for CONTIG_OTHER | //! specialization for CONTIG_OTHER | ||||
| template <int ndim, typename ctype> | template <int ndim, typename ctype> | ||||
| class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> { | class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> { | ||||
| ctype *__restrict m_ptr; | |||||
| ctype* __restrict m_ptr; | |||||
| int m_stride[ndim]; | int m_stride[ndim]; | ||||
| //! m_shape_highdim[i] = original_shape[i + 1] | //! m_shape_highdim[i] = original_shape[i + 1] | ||||
| @@ -75,7 +76,7 @@ class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> { | |||||
| public: | public: | ||||
| static const int NDIM = ndim; | static const int NDIM = ndim; | ||||
| void host_init(const TensorND &rv, int grid_size, int block_size); | |||||
| void host_init(const TensorND& rv, int grid_size, int block_size); | |||||
| #if MEGDNN_CC_CUDA | #if MEGDNN_CC_CUDA | ||||
| devfunc void thread_init(uint32_t) {} | devfunc void thread_init(uint32_t) {} | ||||
| @@ -86,7 +87,7 @@ public: | |||||
| int offset = 0; | int offset = 0; | ||||
| #pragma unroll | #pragma unroll | ||||
| for (int i = ndim - 1; i >= 1; --i) { | for (int i = ndim - 1; i >= 1; --i) { | ||||
| Uint32Fastdiv &shp = m_shape_highdim[i - 1]; | |||||
| Uint32Fastdiv& shp = m_shape_highdim[i - 1]; | |||||
| uint32_t idx_div = idx / shp; | uint32_t idx_div = idx / shp; | ||||
| offset += (idx - idx_div * shp.divisor()) * m_stride[i]; | offset += (idx - idx_div * shp.divisor()) * m_stride[i]; | ||||
| idx = idx_div; | idx = idx_div; | ||||
| @@ -102,12 +103,12 @@ public: | |||||
| //! specialization for CONTIG_FULL | //! specialization for CONTIG_FULL | ||||
| template <int ndim, typename ctype> | template <int ndim, typename ctype> | ||||
| class ParamElemVisitor<ndim, ctype, CONTIG_FULL> { | class ParamElemVisitor<ndim, ctype, CONTIG_FULL> { | ||||
| ctype *__restrict m_ptr; | |||||
| ctype* __restrict m_ptr; | |||||
| public: | public: | ||||
| static const int NDIM = ndim; | static const int NDIM = ndim; | ||||
| void host_init(const TensorND &rv, int grid_size, int block_size); | |||||
| void host_init(const TensorND& rv, int grid_size, int block_size); | |||||
| #if MEGDNN_CC_CUDA | #if MEGDNN_CC_CUDA | ||||
| devfunc void thread_init(uint32_t) {} | devfunc void thread_init(uint32_t) {} | ||||
| @@ -126,7 +127,6 @@ template <int ndim> | |||||
| class ParamElemVisitor<ndim, dt_quint4, CONTIG_OTHER> { | class ParamElemVisitor<ndim, dt_quint4, CONTIG_OTHER> { | ||||
| using Storage = uint8_t; | using Storage = uint8_t; | ||||
| protected: | |||||
| Storage* __restrict m_ptr; | Storage* __restrict m_ptr; | ||||
| int m_stride[ndim]; | int m_stride[ndim]; | ||||
| int m_shape[ndim]; | int m_shape[ndim]; | ||||
| @@ -205,7 +205,6 @@ public: | |||||
| for (int i = 0; i < ndim; ++i) { | for (int i = 0; i < ndim; ++i) { | ||||
| valid &= (shape_idx[i] < m_shape[i]); | valid &= (shape_idx[i] < m_shape[i]); | ||||
| } | } | ||||
| #pragma unroll | |||||
| for (int i = 0; i < ndim - 1; ++i) { | for (int i = 0; i < ndim - 1; ++i) { | ||||
| idx = (idx + shape_idx[i]) * m_shape[i + 1]; | idx = (idx + shape_idx[i]) * m_shape[i + 1]; | ||||
| } | } | ||||
| @@ -213,7 +212,6 @@ public: | |||||
| } | } | ||||
| return idx; | return idx; | ||||
| } | } | ||||
| devfunc Storage* ptr() { return m_ptr; } | devfunc Storage* ptr() { return m_ptr; } | ||||
| devfunc Storage at(uint32_t idx) { | devfunc Storage at(uint32_t idx) { | ||||
| @@ -221,7 +219,7 @@ public: | |||||
| int vec_idx = offset_ >> 1; | int vec_idx = offset_ >> 1; | ||||
| int lane_idx = offset_ & 0x1; | int lane_idx = offset_ & 0x1; | ||||
| Storage item = Storage(unpack_integer_4bits<false>( | |||||
| Storage item = Storage(integer_subbyte::unpack_integer_4bits<false>( | |||||
| *(Storage*)&m_ptr[vec_idx], lane_idx * 4)); | *(Storage*)&m_ptr[vec_idx], lane_idx * 4)); | ||||
| return item; | return item; | ||||
| @@ -235,7 +233,7 @@ public: | |||||
| #endif | #endif | ||||
| }; | }; | ||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| // vim: ft=cpp syntax=cpp.doxygen | // vim: ft=cpp syntax=cpp.doxygen | ||||
| @@ -143,6 +143,109 @@ struct global_load_with_zero_point<AccessType, 1> { | |||||
| } | } | ||||
| }; | }; | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template < | |||||
| /// Fragment type to store loaded data | |||||
| typename AccessType, | |||||
| /// Number of bytes to load | |||||
| int LoadBytes> | |||||
| struct global_store; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| // | |||||
| // Specializations | |||||
| // | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 32> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| uint4 const* data = reinterpret_cast<uint4 const*>(&D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %5, 0;\n" | |||||
| " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" | |||||
| " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), | |||||
| "r"(data[0].w), "r"((int)pred_guard), | |||||
| "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), | |||||
| "r"(data[1].z), "r"(data[1].w)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 16> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| uint4 const& data = reinterpret_cast<uint4 const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %5, 0;\n" | |||||
| " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), | |||||
| "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 8> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| uint2 const& data = reinterpret_cast<uint2 const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %3, 0;\n" | |||||
| " @p st.global.v2.u32 [%0], {%1, %2};\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 4> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| uint32_t const& data = reinterpret_cast<uint32_t const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %2, 0;\n" | |||||
| " @p st.global.u32 [%0], %1;\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "r"(data), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 2> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| uint16_t const& data = reinterpret_cast<uint16_t const&>(D); | |||||
| asm volatile( | |||||
| "{\n" | |||||
| " .reg .pred p;\n" | |||||
| " setp.ne.b32 p, %2, 0;\n" | |||||
| " @p st.global.u16 [%0], %1;\n" | |||||
| "}\n" | |||||
| : | |||||
| : "l"(ptr), "h"(data), "r"((int)pred_guard)); | |||||
| } | |||||
| }; | |||||
| template <typename AccessType> | |||||
| struct global_store<AccessType, 1> { | |||||
| devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { | |||||
| if (pred_guard) | |||||
| *(reinterpret_cast<AccessType*>(ptr)) = D; | |||||
| } | |||||
| }; | |||||
| #undef devfunc | #undef devfunc | ||||
| } // namespace relayout_format | } // namespace relayout_format | ||||
| } // namespace cuda | } // namespace cuda | ||||
| @@ -10,17 +10,14 @@ | |||||
| * implied. | * implied. | ||||
| */ | */ | ||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #include "cutlass/fast_math.h" | |||||
| #include "cutlass/arch/memory.h" | |||||
| #pragma GCC diagnostic pop | |||||
| #include "src/cuda/int_fastdiv.cuh" | |||||
| #include "src/cuda/query_blocksize.cuh" | #include "src/cuda/query_blocksize.cuh" | ||||
| #include "src/cuda/relayout_format/relayout_format.cuh" | #include "src/cuda/relayout_format/relayout_format.cuh" | ||||
| #include "src/cuda/relayout_format/helper.cuh" | |||||
| #include "src/cuda/integer_subbyte_utils.cuh" | |||||
| #include "src/cuda/memory_utils.cuh" | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| using namespace integer_subbyte; | |||||
| namespace { | namespace { | ||||
| @@ -322,26 +319,34 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4, | |||||
| int* dst_frag = reinterpret_cast<int*>(dst_width); | int* dst_frag = reinterpret_cast<int*>(dst_width); | ||||
| #pragma unroll | #pragma unroll | ||||
| for (int i = 0; i < 64; i += 8) { | for (int i = 0; i < 64; i += 8) { | ||||
| #define unpack_int4x2(_idx) \ | |||||
| intermediate[_idx][0] = unpack_integer_4bits<true>( \ | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 0); \ | |||||
| intermediate[_idx][1] = unpack_integer_4bits<true>( \ | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 4); | |||||
| // clang-format off | |||||
| unpack_int4x2(0) | |||||
| unpack_int4x2(1) | |||||
| unpack_int4x2(2) | |||||
| unpack_int4x2(3) | |||||
| unpack_int4x2(4) | |||||
| unpack_int4x2(5) | |||||
| unpack_int4x2(6) | |||||
| unpack_int4x2(7) | |||||
| // clang-format on | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[0], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 0])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[1], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 1])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[2], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 2])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[3], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 3])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[4], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 4])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[5], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 5])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[6], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 6])); | |||||
| transform_int4x2_to_int8( | |||||
| intermediate[7], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 7])); | |||||
| int frag_idx = i / 8; | int frag_idx = i / 8; | ||||
| dst_frag[0 * 8 + frag_idx] = pack_channel(0); | dst_frag[0 * 8 + frag_idx] = pack_channel(0); | ||||
| dst_frag[1 * 8 + frag_idx] = pack_channel(1); | dst_frag[1 * 8 + frag_idx] = pack_channel(1); | ||||
| #undef unpack_int4x2 | |||||
| } | } | ||||
| } | } | ||||
| using Fragment = array_wrapper<SrcType, 64>; | using Fragment = array_wrapper<SrcType, 64>; | ||||
| @@ -429,26 +434,34 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm, | |||||
| int* dst_frag = reinterpret_cast<int*>(dst_width); | int* dst_frag = reinterpret_cast<int*>(dst_width); | ||||
| #pragma unroll | #pragma unroll | ||||
| for (int i = 0; i < 64; i += 8) { | for (int i = 0; i < 64; i += 8) { | ||||
| #define unpack_int4x2(_idx) \ | |||||
| intermediate[_idx][0] = unpack_integer_4bits<false>( \ | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 0); \ | |||||
| intermediate[_idx][1] = unpack_integer_4bits<false>( \ | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 4); | |||||
| // clang-format off | |||||
| unpack_int4x2(0) | |||||
| unpack_int4x2(1) | |||||
| unpack_int4x2(2) | |||||
| unpack_int4x2(3) | |||||
| unpack_int4x2(4) | |||||
| unpack_int4x2(5) | |||||
| unpack_int4x2(6) | |||||
| unpack_int4x2(7) | |||||
| // clang-format on | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[0], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 0])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[1], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 1])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[2], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 2])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[3], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 3])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[4], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 4])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[5], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 5])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[6], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 6])); | |||||
| transform_uint4x2_to_int8( | |||||
| intermediate[7], | |||||
| reinterpret_cast<uint8_t&>(read_channel[i + 7])); | |||||
| int frag_idx = i / 8; | int frag_idx = i / 8; | ||||
| dst_frag[0 * 8 + frag_idx] = pack_channel(0); | dst_frag[0 * 8 + frag_idx] = pack_channel(0); | ||||
| dst_frag[1 * 8 + frag_idx] = pack_channel(1); | dst_frag[1 * 8 + frag_idx] = pack_channel(1); | ||||
| #undef unpack_int4x2 | |||||
| } | } | ||||
| } | } | ||||
| using Fragment = array_wrapper<SrcType, 64>; | using Fragment = array_wrapper<SrcType, 64>; | ||||
| @@ -744,6 +757,16 @@ inline __device__ int4 make_zero_pad<int4>(const uint8_t zero_point) { | |||||
| return {zero_point, zero_point, zero_point, zero_point}; | return {zero_point, zero_point, zero_point, zero_point}; | ||||
| } | } | ||||
| template <int size_nbits> | |||||
| inline __device__ int make_zero(int zero_point); | |||||
| template <> | |||||
| inline __device__ int make_zero<4>(int zero_point) { | |||||
| return transform_int8_to_uint4x8(zero_point, zero_point, zero_point, | |||||
| zero_point, zero_point, zero_point, | |||||
| zero_point, zero_point); | |||||
| } | |||||
| template <typename DstDtype> | template <typename DstDtype> | ||||
| inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { | inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { | ||||
| *ptr = val; | *ptr = val; | ||||
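For 4-bit data the zero point has to be replicated into every nibble of the 32-bit fill word passed to the predicated loads, which is what make_zero<4> does through transform_int8_to_uint4x8. A host-side illustrative equivalent (assuming the zero point is already within the unsigned 4-bit range, which the real helper enforces by saturation):

    #include <cstdint>

    // Broadcast a 4-bit zero point into all eight nibbles of a 32-bit word,
    // e.g. zero_point = 8 yields 0x88888888.
    static inline uint32_t broadcast_u4(uint32_t zero_point) {
        return (zero_point & 0xfu) * 0x11111111u;
    }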
| @@ -1062,11 +1085,11 @@ public: | |||||
| using AccessType = array_wrapper<Type, pack_size_in_type>; | using AccessType = array_wrapper<Type, pack_size_in_type>; | ||||
| using Fragment = array_wrapper<Type, elements_in_type>; | using Fragment = array_wrapper<Type, elements_in_type>; | ||||
| MEGDNN_DEVICE TensorIteratorOverChannel() | |||||
| MEGDNN_HOST TensorIteratorOverChannel() | |||||
| : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} | : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} | ||||
| MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_, | |||||
| int chan_stride_in_elements_, | |||||
| int channel_, int, int) | |||||
| MEGDNN_HOST TensorIteratorOverChannel(Type* pointer_, | |||||
| int chan_stride_in_elements_, | |||||
| int channel_, int, int) | |||||
| : pointer{pointer_}, | : pointer{pointer_}, | ||||
| chan_stride_in_elements{chan_stride_in_elements_}, | chan_stride_in_elements{chan_stride_in_elements_}, | ||||
| channel{channel_} {} | channel{channel_} {} | ||||
| @@ -1093,8 +1116,7 @@ public: | |||||
| (lane_size_in_type / pack_size_in_type) + | (lane_size_in_type / pack_size_in_type) + | ||||
| j; | j; | ||||
| bool guard = i < channel; | bool guard = i < channel; | ||||
| relayout_format::global_load_with_zero_point<AccessType, | |||||
| pack_size_in_byte>( | |||||
| memory::global_load<AccessType, pack_size_in_byte>( | |||||
| frag_ptr[frag_idx], | frag_ptr[frag_idx], | ||||
| reinterpret_cast<void*>(pointer_ + | reinterpret_cast<void*>(pointer_ + | ||||
| j * pack_size_in_type), | j * pack_size_in_type), | ||||
| @@ -1115,7 +1137,7 @@ public: | |||||
| (lane_size_in_type / pack_size_in_type) + | (lane_size_in_type / pack_size_in_type) + | ||||
| j; | j; | ||||
| bool guard = i < channel; | bool guard = i < channel; | ||||
| cutlass::arch::global_store<AccessType, pack_size_in_byte>( | |||||
| memory::global_store<AccessType, pack_size_in_byte>( | |||||
| frag_ptr[frag_idx], | frag_ptr[frag_idx], | ||||
| reinterpret_cast<void*>(pointer_ + | reinterpret_cast<void*>(pointer_ + | ||||
| j * pack_size_in_type), | j * pack_size_in_type), | ||||
| @@ -1160,20 +1182,18 @@ public: | |||||
| using AccessType = array_wrapper<Type, pack_size_in_type>; | using AccessType = array_wrapper<Type, pack_size_in_type>; | ||||
| using Fragment = array_wrapper<Type, elements_in_type>; | using Fragment = array_wrapper<Type, elements_in_type>; | ||||
| MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel() | |||||
| MEGDNN_HOST MaskedTensorIteratorOverChannel() | |||||
| : pointer{nullptr}, | : pointer{nullptr}, | ||||
| chan_stride_in_elements{0}, | chan_stride_in_elements{0}, | ||||
| channel{0} {} | channel{0} {} | ||||
| MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel( | |||||
| MEGDNN_HOST MaskedTensorIteratorOverChannel( | |||||
| Type* pointer_, int chan_stride_in_elements_, int channel_, | Type* pointer_, int chan_stride_in_elements_, int channel_, | ||||
| int bound_, int div_) | int bound_, int div_) | ||||
| : pointer{pointer_}, | : pointer{pointer_}, | ||||
| chan_stride_in_elements{chan_stride_in_elements_}, | chan_stride_in_elements{chan_stride_in_elements_}, | ||||
| channel{channel_}, | channel{channel_}, | ||||
| bound{bound_}, | bound{bound_}, | ||||
| div{div_} { | |||||
| cutlass::find_divisor(mul, shr, div); | |||||
| } | |||||
| div{uint32_t(div_)} {} | |||||
| MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) { | MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) { | ||||
| pointer += (c_idx / pack_size) * chan_stride_in_elements; | pointer += (c_idx / pack_size) * chan_stride_in_elements; | ||||
| @@ -1187,8 +1207,8 @@ public: | |||||
| #pragma unroll | #pragma unroll | ||||
| for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) { | for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) { | ||||
| int offset = hw_idx + j; | int offset = hw_idx + j; | ||||
| int h, w; | |||||
| cutlass::fast_divmod(h, w, offset, div, mul, shr); | |||||
| int h = (int)((uint32_t)(offset) / div); | |||||
| int w = (int)((uint32_t)(offset) % div); | |||||
| bool guard = (i < channel) && (w < bound); | bool guard = (i < channel) && (w < bound); | ||||
| int index = (i / pack_size) * | int index = (i / pack_size) * | ||||
| (lane_size_in_type / pack_size_in_type) + | (lane_size_in_type / pack_size_in_type) + | ||||
| @@ -1219,8 +1239,7 @@ public: | |||||
| int mask_index = (frag_idx >> 5); | int mask_index = (frag_idx >> 5); | ||||
| int mask_shift = (frag_idx & 0x1f); | int mask_shift = (frag_idx & 0x1f); | ||||
| bool guard = (mask[mask_index] & (1 << mask_shift)); | bool guard = (mask[mask_index] & (1 << mask_shift)); | ||||
| relayout_format::global_load_with_zero_point<AccessType, | |||||
| pack_size_in_byte>( | |||||
| memory::global_load<AccessType, pack_size_in_byte>( | |||||
| frag_ptr[frag_idx], | frag_ptr[frag_idx], | ||||
| reinterpret_cast<void*>(pointer_ + stride[j]), guard, | reinterpret_cast<void*>(pointer_ + stride[j]), guard, | ||||
| zero_point); | zero_point); | ||||
| @@ -1242,7 +1261,7 @@ public: | |||||
| int mask_index = (frag_idx >> 5); | int mask_index = (frag_idx >> 5); | ||||
| int mask_shift = (frag_idx & 0x1f); | int mask_shift = (frag_idx & 0x1f); | ||||
| bool guard = (mask[mask_index] & (1 << mask_shift)); | bool guard = (mask[mask_index] & (1 << mask_shift)); | ||||
| cutlass::arch::global_store<AccessType, pack_size_in_byte>( | |||||
| memory::global_store<AccessType, pack_size_in_byte>( | |||||
| frag_ptr[frag_idx], | frag_ptr[frag_idx], | ||||
| reinterpret_cast<void*>(pointer_ + stride[j]), guard); | reinterpret_cast<void*>(pointer_ + stride[j]), guard); | ||||
| } | } | ||||
| @@ -1260,9 +1279,7 @@ private: | |||||
| int chan_stride_in_elements; | int chan_stride_in_elements; | ||||
| int channel; | int channel; | ||||
| int bound; | int bound; | ||||
| int div; | |||||
| uint32_t mul; | |||||
| uint32_t shr; | |||||
| Uint32Fastdiv div; | |||||
| uint32_t mask[mask_size]; | uint32_t mask[mask_size]; | ||||
| size_t stride[lane_size_in_type / pack_size_in_type]; | size_t stride[lane_size_in_type / pack_size_in_type]; | ||||
| }; | }; | ||||
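Replacing the explicit div/mul/shr members with a Uint32Fastdiv lets the host precompute the magic multiplier and shift once, so the device-side division and derived remainder above reduce to a multiply-high plus shift instead of a hardware divide, and cutlass::find_divisor is no longer needed. A sketch of the underlying "magic number" idea with constants for a fixed divisor of 3 (the real class computes the pair for any divisor):

    #include <cstdint>

    // q = (x * M) >> 33 equals x / 3 for every 32-bit x with M = 0xAAAAAAAB;
    // the remainder then follows from the quotient with one multiply and subtract.
    static inline uint32_t div3(uint32_t x) {
        return uint32_t((uint64_t(x) * 0xAAAAAAABull) >> 33);
    }
    static inline uint32_t mod3(uint32_t x) {
        return x - div3(x) * 3u;
    }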
| @@ -1355,8 +1372,7 @@ __global__ void relayout_kern(typename RelayoutProblem_::Param param) { | |||||
| param.dst_iterator.initialize(c_idx, hw_idx); | param.dst_iterator.initialize(c_idx, hw_idx); | ||||
| typename SrcIterator::Fragment src_frag; | typename SrcIterator::Fragment src_frag; | ||||
| typename DstIterator::Fragment dst_frag; | typename DstIterator::Fragment dst_frag; | ||||
| int zp = relayout_format::make_zero<SrcIterator::size_nbits>( | |||||
| param.zero_point); | |||||
| int zp = make_zero<SrcIterator::size_nbits>(param.zero_point); | |||||
| param.src_iterator.load(src_frag, zp); | param.src_iterator.load(src_frag, zp); | ||||
| RelayoutProblem_::Transpose::trans( | RelayoutProblem_::Transpose::trans( | ||||
| reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag), | reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag), | ||||
| @@ -1456,7 +1472,7 @@ void relayout_format::relayout_format_cuda_nchw_nchwx( | |||||
| megdnn_assert(src_layout.dtype.is_low_bit()); | megdnn_assert(src_layout.dtype.is_low_bit()); | ||||
| int n = src.layout[0]; | int n = src.layout[0]; | ||||
| int ic = src.layout[1]; | int ic = src.layout[1]; | ||||
| int oc = dst.layout[1] * 64; | |||||
| int oc = dst.layout[1] * pack_oc; | |||||
| int h = src.layout[2]; | int h = src.layout[2]; | ||||
| // align to byte | // align to byte | ||||
| int w = src.layout[3]; | int w = src.layout[3]; | ||||
| @@ -223,10 +223,12 @@ struct TypeCvtOpFromQuantizedToQuantized4bit< | |||||
| } | } | ||||
| __device__ __forceinline__ void operator()(uint32_t idx, | __device__ __forceinline__ void operator()(uint32_t idx, | ||||
| src_vect_type src) { | src_vect_type src) { | ||||
| dst_storage x = apply( | |||||
| src_storage(unpack_integer_4bits<src_signedness>(src.x, 0))); | |||||
| dst_storage y = apply( | |||||
| src_storage(unpack_integer_4bits<src_signedness>(src.x, 4))); | |||||
| dst_storage x = apply(src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(src.x, | |||||
| 0))); | |||||
| dst_storage y = apply(src_storage( | |||||
| integer_subbyte::unpack_integer_4bits<src_signedness>(src.x, | |||||
| 4))); | |||||
| *(dst_vect_type*)(&dest[idx]) = | *(dst_vect_type*)(&dest[idx]) = | ||||
| VectTypeTrait<ctype_dest>::make_vector(x, y); | VectTypeTrait<ctype_dest>::make_vector(x, y); | ||||
| @@ -21,7 +21,6 @@ | |||||
| #include "cuda.h" | #include "cuda.h" | ||||
| #include "src/cuda/cudnn_with_check.h" | #include "src/cuda/cudnn_with_check.h" | ||||
| #include "cutlass/cutlass.h" | #include "cutlass/cutlass.h" | ||||
| #include "cutlass/platform/platform.h" | |||||
| #define cuda_check(_x) \ | #define cuda_check(_x) \ | ||||
| do { \ | do { \ | ||||
| @@ -376,104 +375,6 @@ MEGDNN_DEVICE __forceinline__ static float4 operator+(float4 lval, | |||||
| lval.w + rval.w); | lval.w + rval.w); | ||||
| } | } | ||||
| MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( | |||||
| int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { | |||||
| unsigned out; | |||||
| #if __CUDA_ARCH__ >= 750 && \ | |||||
| ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) | |||||
| asm volatile( | |||||
| "{ .reg .u32 r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" | |||||
| "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" | |||||
| "}" | |||||
| : "=r"(out) | |||||
| : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), | |||||
| "r"(s7)); | |||||
| #else | |||||
| #define CVT_SAT_S4_S32(r, bits) \ | |||||
| r = r <= -8 ? -8 : r; \ | |||||
| r = r > 7 ? 7 : r; \ | |||||
| r = (((unsigned)r & 0xf) << bits); | |||||
| CVT_SAT_S4_S32(s0, 0) | |||||
| CVT_SAT_S4_S32(s1, 4) | |||||
| CVT_SAT_S4_S32(s2, 8) | |||||
| CVT_SAT_S4_S32(s3, 12) | |||||
| CVT_SAT_S4_S32(s4, 16) | |||||
| CVT_SAT_S4_S32(s5, 20) | |||||
| CVT_SAT_S4_S32(s6, 24) | |||||
| CVT_SAT_S4_S32(s7, 28) | |||||
| out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; | |||||
| #undef CVT_SAT_S4_S32 | |||||
| #endif | |||||
| return reinterpret_cast<int const&>(out); | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( | |||||
| int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { | |||||
| unsigned out; | |||||
| #if __CUDA_ARCH__ >= 750 && \ | |||||
| ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) | |||||
| asm volatile( | |||||
| "{ .reg .u32 r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" | |||||
| "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" | |||||
| "}" | |||||
| : "=r"(out) | |||||
| : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), | |||||
| "r"(s7)); | |||||
| #else | |||||
| #define CVT_SAT_U4_S32(r, bits) \ | |||||
| r = r <= 0 ? 0 : r; \ | |||||
| r = r > 15 ? 15 : r; \ | |||||
| r = (((unsigned)r & 0xf) << bits); | |||||
| CVT_SAT_U4_S32(s0, 0) | |||||
| CVT_SAT_U4_S32(s1, 4) | |||||
| CVT_SAT_U4_S32(s2, 8) | |||||
| CVT_SAT_U4_S32(s3, 12) | |||||
| CVT_SAT_U4_S32(s4, 16) | |||||
| CVT_SAT_U4_S32(s5, 20) | |||||
| CVT_SAT_U4_S32(s6, 24) | |||||
| CVT_SAT_U4_S32(s7, 28) | |||||
| out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; | |||||
| #undef CVT_SAT_U4_S32 | |||||
| #endif | |||||
| return reinterpret_cast<int const&>(out); | |||||
| } | |||||
| template <bool signedness, typename T> | |||||
| MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, | |||||
| int bits) { | |||||
| static constexpr int shift = 28; | |||||
| using type = typename cutlass::platform::conditional<signedness, int, | |||||
| unsigned>::type; | |||||
| unsigned intermediate = static_cast<unsigned>(storage); | |||||
| type result = reinterpret_cast<type&>(intermediate); | |||||
| return (result << (shift - bits)) >> shift; | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( | |||||
| int (&result)[8], const int& source) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < 8; i++) { | |||||
| result[i] = unpack_integer_4bits<true>( | |||||
| reinterpret_cast<unsigned const&>(source), (i << 2)); | |||||
| } | |||||
| } | |||||
| MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( | |||||
| int (&result)[8], const int& source) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < 8; i++) { | |||||
| result[i] = unpack_integer_4bits<false>( | |||||
| reinterpret_cast<unsigned const&>(source), (i << 2)); | |||||
| } | |||||
| } | |||||
| #endif | #endif | ||||
| } // namespace cuda | } // namespace cuda | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| @@ -348,6 +348,7 @@ void WarpPerspectiveForwardImpl::exec(_megdnn_tensor_in ssrc, | |||||
| RelayoutFormat::Param trans_param; | RelayoutFormat::Param trans_param; | ||||
| trans_param.mode = | trans_param.mode = | ||||
| RelayoutFormat::Param::Mode::NCHW64_NCHW; | RelayoutFormat::Param::Mode::NCHW64_NCHW; | ||||
| trans_param.oc = sdst.layout[1]; | |||||
| relayout_opr->param() = trans_param; | relayout_opr->param() = trans_param; | ||||
| relayout_opr->exec(dst, sdst, {}); | relayout_opr->exec(dst, sdst, {}); | ||||
| } | } | ||||
| @@ -17,10 +17,12 @@ | |||||
| #include "src/common/rounding_converter.cuh" | #include "src/common/rounding_converter.cuh" | ||||
| #include "megdnn/dtype.h" | #include "megdnn/dtype.h" | ||||
| #include <cstdio> | #include <cstdio> | ||||
| #include "src/cuda/integer_subbyte_utils.cuh" | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| using namespace warp_perspective; | using namespace warp_perspective; | ||||
| using namespace integer_subbyte; | |||||
| namespace { | namespace { | ||||
| @@ -247,14 +247,13 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { | |||||
| using Param = param::Pooling; | using Param = param::Pooling; | ||||
| Checker<Pooling> checker(handle_cuda()); | Checker<Pooling> checker(handle_cuda()); | ||||
| Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2}; | Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2}; | ||||
| checker.set_dtype(0, dtype::QuantizedS4(0.1f)); | |||||
| checker.set_dtype(0, dtype::QuantizedS4(3.1415926f)); | |||||
| param.format = Param::Format::NCHW; | param.format = Param::Format::NCHW; | ||||
| checker.set_epsilon(1 + 1e-3); | |||||
| checker.set_param(param).exec({{20, 64, 22, 33}, {}}); | checker.set_param(param).exec({{20, 64, 22, 33}, {}}); | ||||
| param.mode = Param::Mode::AVERAGE; | param.mode = Param::Mode::AVERAGE; | ||||
| checker.set_param(param).exec({{20, 64, 22, 33}, {}}); | |||||
| checker.set_param(param).exec({{20, 96, 22, 33}, {}}); | |||||
| param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | ||||
| checker.set_param(param).exec({{20, 64, 22, 33}, {}}); | |||||
| checker.set_param(param).exec({{20, 24, 22, 33}, {}}); | |||||
| } | } | ||||
| TEST_F(CUDA, POOLING_FORWARD_NCHW4) { | TEST_F(CUDA, POOLING_FORWARD_NCHW4) { | ||||
| @@ -107,7 +107,7 @@ TEST_F(CUDA, QUANTIZED_TYPECVT) { | |||||
| } | } | ||||
| TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { | TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { | ||||
| UniformIntRNG int_rng{0, 8}; | |||||
| UniformIntRNG int_rng{-8, 8}; | |||||
| Checker<TypeCvt> checker(handle_cuda()); | Checker<TypeCvt> checker(handle_cuda()); | ||||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng); | checker.set_rng(0, &int_rng).set_rng(1, &int_rng); | ||||
| @@ -627,9 +627,9 @@ TEST_F(CUDA, WARP_PERSPECTIVE_FORWARD_QINT4) { | |||||
| Checker<WarpPerspectiveForward> checker(handle_cuda()); | Checker<WarpPerspectiveForward> checker(handle_cuda()); | ||||
| WarpPerspectiveMatRNG rng; | WarpPerspectiveMatRNG rng; | ||||
| checker.set_rng(1, &rng); | checker.set_rng(1, &rng); | ||||
| checker.set_dtype(0, dtype::QuantizedS4(0.1f)) | |||||
| checker.set_dtype(0, dtype::QuantizedS4(1.25f)) | |||||
| .set_dtype(1, dtype::Float32()) | .set_dtype(1, dtype::Float32()) | ||||
| .set_dtype(2, dtype::QuantizedS4(0.1f)); | |||||
| .set_dtype(2, dtype::QuantizedS4(1.25f)); | |||||
| for (auto bmode : {WarpPerspective::BorderMode::WRAP, | for (auto bmode : {WarpPerspective::BorderMode::WRAP, | ||||
| WarpPerspective::BorderMode::REFLECT, | WarpPerspective::BorderMode::REFLECT, | ||||
| WarpPerspective::BorderMode::REPLICATE, | WarpPerspective::BorderMode::REPLICATE, | ||||
| @@ -68,7 +68,7 @@ using namespace gopt; | |||||
| * oprs should not get involved in any actual computing. | * oprs should not get involved in any actual computing. | ||||
| */ | */ | ||||
| MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder, | MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder, | ||||
| cg::SingleCNOperatorNodeBase) // { | |||||
| cg::SingleCNOperatorNodeBase) // { | |||||
| public: | public: | ||||
| //! relayout type of this opr | //! relayout type of this opr | ||||
| enum class LayoutType { | enum class LayoutType { | ||||
| @@ -124,14 +124,14 @@ public: | |||||
| NCHW4_TO_NCHW64, //! <from nchw4 layout to nchw64 layout | NCHW4_TO_NCHW64, //! <from nchw4 layout to nchw64 layout | ||||
| }; | }; | ||||
| RelayoutPlaceholder(VarNode * src_var, LayoutType layout_type); | |||||
| RelayoutPlaceholder(VarNode* src_var, LayoutType layout_type); | |||||
| /*! | /*! | ||||
| * \param src_var the input var | * \param src_var the input var | ||||
| * \param layout_type tensor layout transform type of this relayout | * \param layout_type tensor layout transform type of this relayout | ||||
| * placeholder as described in LayoutType | * placeholder as described in LayoutType | ||||
| */ | */ | ||||
| static SymbolVar make(VarNode * src_var, LayoutType layout_type); | |||||
| static SymbolVar make(VarNode* src_var, LayoutType layout_type); | |||||
| LayoutType layout_type() const { return m_layout_type; } | LayoutType layout_type() const { return m_layout_type; } | ||||
| @@ -141,7 +141,6 @@ private: | |||||
| void init_output_comp_node() override; | void init_output_comp_node() override; | ||||
| const LayoutType m_layout_type; | const LayoutType m_layout_type; | ||||
| }; | }; | ||||
| MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorReformatPass::RelayoutPlaceholder); | MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorReformatPass::RelayoutPlaceholder); | ||||
| TensorReformatPass::RelayoutPlaceholder::RelayoutPlaceholder( | TensorReformatPass::RelayoutPlaceholder::RelayoutPlaceholder( | ||||
| @@ -3866,8 +3865,12 @@ void PaddingChannelPass::apply(OptState& opt) const { | |||||
| }; | }; | ||||
| auto extract_subtensor = [](VarNode* inp, | auto extract_subtensor = [](VarNode* inp, | ||||
| size_t orig_channels) -> VarNode* { | |||||
| const TensorShape& orig_shape) -> VarNode* { | |||||
| mgb_assert(inp->shape().ndim == 4); | mgb_assert(inp->shape().ndim == 4); | ||||
| mgb_assert(inp->shape()[0] == orig_shape[0]); | |||||
| mgb_assert(inp->shape()[2] == orig_shape[2]); | |||||
| mgb_assert(inp->shape()[3] == orig_shape[3]); | |||||
| size_t orig_channels = orig_shape[1]; | |||||
| auto x = SymbolVar(inp); | auto x = SymbolVar(inp); | ||||
| auto cv = [&x](int v) { return x.make_scalar(v); }; | auto cv = [&x](int v) { return x.make_scalar(v); }; | ||||
| using AIdx = opr::Subtensor::AxisIndexer; | using AIdx = opr::Subtensor::AxisIndexer; | ||||
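extract_subtensor now takes the full original shape so the batch and spatial dims can be asserted before the channel crop. A minimal standalone sketch of the cropping idea on a plain NCHW buffer (hypothetical helper, not the pass's graph-level Subtensor):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Keep only the first orig_c channels of a channel-padded NCHW tensor.
    std::vector<float> crop_channels(const std::vector<float>& padded, size_t n,
                                     size_t padded_c, size_t orig_c, size_t h,
                                     size_t w) {
        assert(orig_c <= padded_c && padded.size() == n * padded_c * h * w);
        std::vector<float> out(n * orig_c * h * w);
        for (size_t i = 0; i < n; ++i)
            for (size_t c = 0; c < orig_c; ++c)
                for (size_t y = 0; y < h; ++y)
                    for (size_t x = 0; x < w; ++x)
                        out[((i * orig_c + c) * h + y) * w + x] =
                                padded[((i * padded_c + c) * h + y) * w + x];
        return out;
    }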
| @@ -4108,8 +4111,7 @@ void PaddingChannelPass::apply(OptState& opt) const { | |||||
| bool padding_cur_inp = | bool padding_cur_inp = | ||||
| padding_oprs.count(cur_inp->owner_opr()) > 0; | padding_oprs.count(cur_inp->owner_opr()) > 0; | ||||
| if (padding_cur_inp) { | if (padding_cur_inp) { | ||||
| size_t orig_channels = cur_inp->shape()[1]; | |||||
| inps[i] = extract_subtensor(inps[i], orig_channels); | |||||
| inps[i] = extract_subtensor(inps[i], cur_inp->shape()); | |||||
| } | } | ||||
| } | } | ||||
| return serialization::copy_opr_shallow(*opr, inps, opr->config()); | return serialization::copy_opr_shallow(*opr, inps, opr->config()); | ||||
| @@ -4133,8 +4135,7 @@ void PaddingChannelPass::apply(OptState& opt) const { | |||||
| auto cur_inp = opr->input(i); | auto cur_inp = opr->input(i); | ||||
| bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; | bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; | ||||
| if (padding_cur_inp) { | if (padding_cur_inp) { | ||||
| size_t orig_channels = cur_inp->shape()[1]; | |||||
| inps[i] = extract_subtensor(inps[i], orig_channels); | |||||
| inps[i] = extract_subtensor(inps[i], cur_inp->shape()); | |||||
| } | } | ||||
| } | } | ||||
| return serialization::copy_opr_shallow(*opr, inps, opr->config()); | return serialization::copy_opr_shallow(*opr, inps, opr->config()); | ||||
| @@ -4142,6 +4143,8 @@ void PaddingChannelPass::apply(OptState& opt) const { | |||||
| opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs; | opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs; | ||||
| opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs; | opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs; | ||||
| opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs; | opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs; | ||||
| opr_replace_funcs[opr::Reduce::typeinfo()] = replace_nonpadding_oprs; | |||||
| opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_nonpadding_oprs; | |||||
| auto on_opr = [&opt, &rewriter, &opr_replace_funcs, | auto on_opr = [&opt, &rewriter, &opr_replace_funcs, | ||||
| &extract_subtensor](OperatorNodeBase* opr) { | &extract_subtensor](OperatorNodeBase* opr) { | ||||
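Reduce and Subtensor join the set of shape-sensitive operators routed to replace_nonpadding_oprs. A simplified standalone sketch of that dispatch-table pattern (std::unordered_map and a string key stand in for the pass's real Typeinfo-keyed container):

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <unordered_map>

    // Placeholder operator record; the real pass keys on each opr's Typeinfo*.
    struct Opr { std::string type; };
    using ReplaceFn = std::function<Opr(const Opr&)>;

    int main() {
        std::unordered_map<std::string, ReplaceFn> replace_funcs;
        auto replace_nonpadding = [](const Opr& o) {
            // crop padded channels from every input, then rebuild the operator
            return Opr{o.type + " (rebuilt on cropped inputs)"};
        };
        for (const char* t : {"Reshape", "GetVarShape", "Concat", "Reduce", "Subtensor"})
            replace_funcs[t] = replace_nonpadding;
        std::printf("%s\n", replace_funcs["Reduce"](Opr{"Reduce"}).type.c_str());
    }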
| @@ -4169,8 +4172,7 @@ void PaddingChannelPass::apply(OptState& opt) const { | |||||
| auto dst = out1[i]; | auto dst = out1[i]; | ||||
| if (opt.graph().endpoint_contain(src) && | if (opt.graph().endpoint_contain(src) && | ||||
| !src->shape().eq_shape(dst->shape())) { | !src->shape().eq_shape(dst->shape())) { | ||||
| size_t orig_channels = src->shape()[1]; | |||||
| dst = extract_subtensor(dst, orig_channels); | |||||
| dst = extract_subtensor(dst, src->shape()); | |||||
| } | } | ||||
| rewriter.replace_var(src, dst, nullptr); | rewriter.replace_var(src, dst, nullptr); | ||||
| } | } | ||||
| @@ -4183,14 +4183,7 @@ TEST(TestGoptInference, PaddingChannels) { | |||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
| cn.activate(); | cn.activate(); | ||||
| auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| auto sm_ver = prop.major * 10 + prop.minor; | |||||
| if (sm_ver < 61) { | |||||
| printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| "expected: %d)\n", | |||||
| sm_ver, 61); | |||||
| return; | |||||
| } | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| @@ -4263,15 +4256,8 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { | |||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
| cn.activate(); | cn.activate(); | ||||
| auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| auto sm_ver = prop.major * 10 + prop.minor; | |||||
| if (sm_ver < 61) { | |||||
| printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| "expected: %d)\n", | |||||
| sm_ver, 61); | |||||
| return; | |||||
| } | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| graph->options().graph_opt_level = 0; | graph->options().graph_opt_level = 0; | ||||
| @@ -4332,19 +4318,11 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { | |||||
| MGB_ASSERT_TENSOR_EQ(t1, t2); | MGB_ASSERT_TENSOR_EQ(t1, t2); | ||||
| } | } | ||||
| // FIXME replace cpu with gpu to enable gpu validation | |||||
| TEST(TestGoptInference, PaddingChannelsWithPooling) { | TEST(TestGoptInference, PaddingChannelsWithPooling) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
| cn.activate(); | cn.activate(); | ||||
| auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| auto sm_ver = prop.major * 10 + prop.minor; | |||||
| if (sm_ver < 61) { | |||||
| printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| "expected: %d)\n", | |||||
| sm_ver, 61); | |||||
| return; | |||||
| } | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| @@ -4408,17 +4386,7 @@ TEST(TestGoptInference, PaddingChannelsWithPooling) { | |||||
| // FIXME replace cpu with gpu to enable gpu validation | // FIXME replace cpu with gpu to enable gpu validation | ||||
| TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { | TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { | ||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("cpu0"); | auto cn = CompNode::load("cpu0"); | ||||
| // cn.activate(); | |||||
| // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| // auto sm_ver = prop.major * 10 + prop.minor; | |||||
| // if (sm_ver < 61) { | |||||
| // printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| // "expected: %d)\n", | |||||
| // sm_ver, 61); | |||||
| // return; | |||||
| // } | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| @@ -4488,16 +4456,9 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { | |||||
| TEST(TestGoptInference, EnableNCHW64Basic) { | TEST(TestGoptInference, EnableNCHW64Basic) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("cpu0"); | |||||
| // cn.activate(); | |||||
| // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| // auto sm_ver = prop.major * 10 + prop.minor; | |||||
| // if (sm_ver < 61) { | |||||
| // printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| // "expected: %d)\n", | |||||
| // sm_ver, 61); | |||||
| // return; | |||||
| // } | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| @@ -4517,8 +4478,8 @@ TEST(TestGoptInference, EnableNCHW64Basic) { | |||||
| }; | }; | ||||
| auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), | auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), | ||||
| w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); | |||||
| w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); | |||||
| opr::ConvBias::Param param; | opr::ConvBias::Param param; | ||||
| param.format = opr::ConvBias::Param::Format::NCHW; | param.format = opr::ConvBias::Param::Format::NCHW; | ||||
| param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | ||||
| @@ -4527,7 +4488,7 @@ TEST(TestGoptInference, EnableNCHW64Basic) { | |||||
| auto y = opr::ConvBias::make(x, w, b, param, {}, | auto y = opr::ConvBias::make(x, w, b, param, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | ||||
| auto w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); | b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); | ||||
| auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | ||||
| @@ -4541,14 +4502,14 @@ TEST(TestGoptInference, EnableNCHW64Basic) { | |||||
| auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, | auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS4{40.f}}); | OperatorNodeConfig{dtype::QuantizedS4{40.f}}); | ||||
| y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f}); | y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f}); | ||||
| auto w4 = mkcvar("w4", {32, 64, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b4 = mkcvar("b4", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); | |||||
| auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); | |||||
| auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, | auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | ||||
| using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; | using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; | ||||
| auto y5 = opr::ElemwiseMultiType::make( | auto y5 = opr::ElemwiseMultiType::make( | ||||
| {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, | {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8{1.2f}}); | |||||
| OperatorNodeConfig{dtype::QuantizedS8{1.3f}}); | |||||
| y5 = opr::TypeCvt::make(y5, dtype::Float32()); | y5 = opr::TypeCvt::make(y5, dtype::Float32()); | ||||
| SymbolVar y5_pad; | SymbolVar y5_pad; | ||||
| unpack_vector( | unpack_vector( | ||||
| @@ -4573,10 +4534,10 @@ TEST(TestGoptInference, EnableNCHW64Basic) { | |||||
| ASSERT_EQ(o.param().format, Format::_fmt); \ | ASSERT_EQ(o.param().format, Format::_fmt); \ | ||||
| } | } | ||||
| CHECK(0, NCHW4); | CHECK(0, NCHW4); | ||||
| CHECK(1, NCHW32); | |||||
| CHECK(1, NCHW4); | |||||
| CHECK(2, NCHW32); | CHECK(2, NCHW32); | ||||
| CHECK(3, NCHW64); | CHECK(3, NCHW64); | ||||
| CHECK(4, NCHW32); | |||||
| CHECK(4, NCHW4); | |||||
| #undef CHECK | #undef CHECK | ||||
| HostTensorND t1, t2; | HostTensorND t1, t2; | ||||
| auto func1 = graph->compile({make_callback_copy(y5, t1)}); | auto func1 = graph->compile({make_callback_copy(y5, t1)}); | ||||
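The expected per-ConvBias formats now alternate between NCHW4, NCHW32 and NCHW64. For reference, a hedged sketch of how an (n, c, h, w) index maps into such channel-packed layouts (standalone helper, name is illustrative; C is assumed padded to a multiple of x):

    #include <cstddef>

    // NCHW<x> groups channels into blocks of x: element (n, c, h, w) lives in
    // channel block c / x at lane c % x.
    size_t nchwx_offset(size_t n, size_t c, size_t h, size_t w,
                        size_t C, size_t H, size_t W, size_t x) {
        size_t blocks = C / x;
        return (((n * blocks + c / x) * H + h) * W + w) * x + c % x;
    }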
| @@ -4588,16 +4549,9 @@ TEST(TestGoptInference, EnableNCHW64Basic) { | |||||
| TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("cpu0"); | |||||
| // cn.activate(); | |||||
| // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| // auto sm_ver = prop.major * 10 + prop.minor; | |||||
| // if (sm_ver < 61) { | |||||
| // printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| // "expected: %d)\n", | |||||
| // sm_ver, 61); | |||||
| // return; | |||||
| // } | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| @@ -4616,8 +4570,8 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | |||||
| dtype); | dtype); | ||||
| }; | }; | ||||
| auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), | |||||
| w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), | |||||
| w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)), | |||||
| b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); | b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); | ||||
| opr::ConvBias::Param param; | opr::ConvBias::Param param; | ||||
| param.format = opr::ConvBias::Param::Format::NCHW; | param.format = opr::ConvBias::Param::Format::NCHW; | ||||
| @@ -4630,7 +4584,7 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | |||||
| opr::Pooling::Param pool; | opr::Pooling::Param pool; | ||||
| pool.format = opr::Pooling::Param::Format::NCHW; | pool.format = opr::Pooling::Param::Format::NCHW; | ||||
| y = opr::Pooling::make(y, pool); | y = opr::Pooling::make(y, pool); | ||||
| auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), | auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), | ||||
| b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); | b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); | ||||
| auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | ||||
| @@ -4657,11 +4611,12 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | |||||
| deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW; | deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW; | ||||
| deconv.stride_h = deconv.stride_w = 2; | deconv.stride_h = deconv.stride_w = 2; | ||||
| deconv.pad_h = deconv.pad_w = 1; | deconv.pad_h = deconv.pad_w = 1; | ||||
| auto w6 = mkcvar("w6", {20, 20, 4, 4}, dtype::QuantizedS8{2.5f}); | |||||
| auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f}); | |||||
| auto y6 = opr::ConvolutionBackwardData::make( | auto y6 = opr::ConvolutionBackwardData::make( | ||||
| w6, y5, deconv, {}, | w6, y5, deconv, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8(2.0f)}); | OperatorNodeConfig{dtype::QuantizedS8(2.0f)}); | ||||
| y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f}); | |||||
| std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>( | std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>( | ||||
| cn, TensorShape{16, 3, 3}, dtype::Float32()); | cn, TensorShape{16, 3, 3}, dtype::Float32()); | ||||
| warp_perspective_mat_gen(*mat, 16, 14, 14); | warp_perspective_mat_gen(*mat, 16, 14, 14); | ||||
| @@ -4676,25 +4631,31 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | |||||
| opt.enable_nchw64(); | opt.enable_nchw64(); | ||||
| unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad); | unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad); | ||||
| EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape())); | EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape())); | ||||
| SmallVector<cg::OperatorNodeBase*> oprs; | |||||
| auto cb = [&oprs](cg::OperatorNodeBase* opr) { | |||||
| if (opr->same_type<opr::ConvBias>()) { | |||||
| oprs.push_back(opr); | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); | |||||
| ASSERT_EQ(oprs.size(), 5); | |||||
| HostTensorND t1, t2; | |||||
| auto func1 = graph->compile({make_callback_copy(y7, t1)}); | |||||
| func1->execute(); | |||||
| auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); | |||||
| func2->execute(); | |||||
| MGB_ASSERT_TENSOR_EQ(t1, t2); | |||||
| using Format = opr::ConvBiasForward::Param::Format; | using Format = opr::ConvBiasForward::Param::Format; | ||||
| SmallVector<cg::OperatorNodeBase*> oprs; | |||||
| auto cb = [&oprs](cg::OperatorNodeBase* opr) { | |||||
| if (opr->same_type<opr::ConvBias>()) { | |||||
| oprs.push_back(opr); | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); | |||||
| ASSERT_EQ(oprs.size(), 5); | |||||
| #define CHECK(_i, _fmt) \ | #define CHECK(_i, _fmt) \ | ||||
| { \ | { \ | ||||
| const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \ | const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \ | ||||
| ASSERT_EQ(o.param().format, Format::_fmt); \ | ASSERT_EQ(o.param().format, Format::_fmt); \ | ||||
| } | } | ||||
| CHECK(0, NCHW4); | |||||
| CHECK(1, NCHW32); | |||||
| CHECK(2, NCHW32); | |||||
| CHECK(3, NCHW64); | |||||
| CHECK(4, NCHW64); | |||||
| CHECK(0, NCHW4); | |||||
| CHECK(1, NCHW32); | |||||
| CHECK(2, NCHW32); | |||||
| CHECK(3, NCHW64); | |||||
| CHECK(4, NCHW64); | |||||
| #undef CHECK | #undef CHECK | ||||
| { | { | ||||
| const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad); | const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad); | ||||
| @@ -4702,30 +4663,19 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { | |||||
| const auto& pool = find_opr<opr::PoolingForward>(y7_pad); | const auto& pool = find_opr<opr::PoolingForward>(y7_pad); | ||||
| ASSERT_EQ(pool.param().format, Format::NCHW4); | ASSERT_EQ(pool.param().format, Format::NCHW4); | ||||
| const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad); | const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad); | ||||
| ASSERT_EQ(warp.param().format, Format::NCHW4); | |||||
| ASSERT_EQ(warp.param().format, Format::NCHW64); | |||||
| } | } | ||||
| size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad); | size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad); | ||||
| HostTensorND t1, t2; | |||||
| auto func1 = graph->compile({make_callback_copy(y7, t1)}); | |||||
| func1->execute(); | |||||
| auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); | |||||
| func2->execute(); | |||||
| MGB_ASSERT_TENSOR_EQ(t1, t2); | |||||
| ASSERT_EQ(nr_dimshuffle, 8); | |||||
| } | } | ||||
| TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { | TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("cpu0"); | |||||
| // cn.activate(); | |||||
| // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| // auto sm_ver = prop.major * 10 + prop.minor; | |||||
| // if (sm_ver < 61) { | |||||
| // printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| // "expected: %d)\n", | |||||
| // sm_ver, 61); | |||||
| // return; | |||||
| // } | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||||
| HostTensorND t1, t2; | |||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| graph->options().graph_opt_level = 0; | graph->options().graph_opt_level = 0; | ||||
| @@ -4757,7 +4707,7 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { | |||||
| auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), | auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), | ||||
| b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); | b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); | ||||
| auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, | ||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | |||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | |||||
| y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); | y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); | ||||
| auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), | auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), | ||||
| b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); | b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); | ||||
| @@ -4772,6 +4722,9 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { | |||||
| {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU}, | {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU}, | ||||
| OperatorNodeConfig{dtype::QuantizedS4{40.f}}); | OperatorNodeConfig{dtype::QuantizedS4{40.f}}); | ||||
| y4 = opr::TypeCvt::make(y4, dtype::Float32()); | y4 = opr::TypeCvt::make(y4, dtype::Float32()); | ||||
| auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {}, | |||||
| OperatorNodeConfig{dtype::QuantizedS4(40.f)}); | |||||
| y5 = opr::TypeCvt::make(y5, dtype::Float32()); | |||||
| SymbolVar y4_pad; | SymbolVar y4_pad; | ||||
| auto opt = gopt::OptimizeForInferenceOptions{}; | auto opt = gopt::OptimizeForInferenceOptions{}; | ||||
| opt.enable_nchw64(); | opt.enable_nchw64(); | ||||
| @@ -4779,15 +4732,35 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { | |||||
| EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape())); | EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape())); | ||||
| size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad); | size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad); | ||||
| ASSERT_EQ(nr_elem_mult_type, 0); | ASSERT_EQ(nr_elem_mult_type, 0); | ||||
| // FIXME need impl of elemwise/elemwise_multi_type on CUDA | |||||
| #if 0 | |||||
| HostTensorND t1, t2; | |||||
| auto func1 = graph->compile({make_callback_copy(y4, t1)}); | |||||
| func1->execute(); | |||||
| auto func2 = graph->compile({make_callback_copy(y4_pad, t2)}); | |||||
| func2->execute(); | |||||
| auto func = graph->compile({make_callback_copy(y4_pad, t1)}); | |||||
| func->execute(); | |||||
| { | |||||
| opr::ConvBias::Param param; | |||||
| param.format = opr::ConvBias::Param::Format::NCHW; | |||||
| param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | |||||
| param.stride_h = param.stride_w = 1; | |||||
| param.pad_h = param.pad_w = 1; | |||||
| auto y = opr::ConvBias::make( | |||||
| x, w, b, param, {}, | |||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | |||||
| auto y1 = opr::ConvBias::make( | |||||
| y, w1, b1, param, {}, | |||||
| OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); | |||||
| y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); | |||||
| auto y2 = opr::ConvBias::make( | |||||
| y1, w2, b2, param, {}, | |||||
| OperatorNodeConfig{dtype::QuantizedS4{40.f}}); | |||||
| param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; | |||||
| auto y3 = opr::ConvBias::make( | |||||
| y2, w3, b3, y1, param, {}, | |||||
| OperatorNodeConfig{dtype::QuantizedS4(40.f)}); | |||||
| y3 = opr::TypeCvt::make(y3, dtype::Float32()); | |||||
| auto func = graph->compile({make_callback_copy(y3, t2)}); | |||||
| func->execute(); | |||||
| } | |||||
| MGB_ASSERT_TENSOR_EQ(t1, t2); | MGB_ASSERT_TENSOR_EQ(t1, t2); | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
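The rewritten check builds an explicit NCHW reference graph where the last ConvBias consumes y1 as its fused z input, standing in for the removed QFUSE_ADD_RELU ElemwiseMultiType. A hedged scalar sketch of what that fused add-plus-ReLU computes in the dequantized domain (scales and saturation bounds are assumptions, e.g. [-8, 7] for int4):

    #include <algorithm>
    #include <cmath>

    // Dequantize both inputs, add, apply ReLU, requantize to the output scale,
    // then saturate to the destination integer range.
    int qfuse_add_relu(int a, float scale_a, int b, float scale_b,
                       float scale_dst, int qmin, int qmax) {
        float v = a * scale_a + b * scale_b;
        v = std::max(v, 0.f);                                  // ReLU
        int q = static_cast<int>(std::lround(v / scale_dst));  // requantize
        return std::min(qmax, std::max(qmin, q));              // saturate
    }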
| @@ -102,7 +102,8 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape( | |||||
| default: | default: | ||||
| size_t height_idx = 0; | size_t height_idx = 0; | ||||
| if (param().format == Param::Format::NCHW || | if (param().format == Param::Format::NCHW || | ||||
| param().format == Param::Format::NCHW4) { | |||||
| param().format == Param::Format::NCHW4 || | |||||
| param().format == Param::Format::NCHW64) { | |||||
| height_idx = 2; | height_idx = 2; | ||||
| } else { | } else { | ||||
| height_idx = 1; | height_idx = 1; | ||||
| @@ -2604,11 +2604,21 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { | |||||
| #endif | #endif | ||||
| namespace { | namespace { | ||||
| // FIXME change comp node from "cpu0" to "gpu0" | |||||
| TEST(TestOprDNN, ConvBiasInt4NCHW) { | TEST(TestOprDNN, ConvBiasInt4NCHW) { | ||||
| auto run = [](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, | |||||
| size_t P) { | |||||
| auto cn = CompNode::load("cpu0"); | |||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| auto sm_ver = prop.major * 10 + prop.minor; | |||||
| if (sm_ver != 75) { | |||||
| printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| "expected: %d)\n", | |||||
| sm_ver, 75); | |||||
| return; | |||||
| } | |||||
| auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, | |||||
| size_t P) { | |||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| @@ -2671,6 +2681,18 @@ TEST(TestOprDNN, ConvBiasInt4NCHW) { | |||||
| } | } | ||||
| TEST(TestOprDNN, ConvBiasInt4NCHW64) { | TEST(TestOprDNN, ConvBiasInt4NCHW64) { | ||||
| REQUIRE_GPU(1); | |||||
| auto cn = CompNode::load("gpu0"); | |||||
| cn.activate(); | |||||
| auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; | |||||
| auto sm_ver = prop.major * 10 + prop.minor; | |||||
| if (sm_ver != 75) { | |||||
| printf("This testcast ignored due to insufficient cuda cap(got: %d, " | |||||
| "expected: %d)\n", | |||||
| sm_ver, 75); | |||||
| return; | |||||
| } | |||||
| auto nchw2nchw64 = [](SymbolVar x) { | auto nchw2nchw64 = [](SymbolVar x) { | ||||
| auto y = opr::RelayoutFormat::make( | auto y = opr::RelayoutFormat::make( | ||||
| x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64); | x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64); | ||||
| @@ -2685,7 +2707,6 @@ TEST(TestOprDNN, ConvBiasInt4NCHW64) { | |||||
| auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, | auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, | ||||
| size_t P) { | size_t P) { | ||||
| auto cn = CompNode::load("cpu0"); | |||||
| auto graph = ComputingGraph::make(); | auto graph = ComputingGraph::make(); | ||||
| HostTensorGenerator<dtype::Int8> gen; | HostTensorGenerator<dtype::Int8> gen; | ||||
| @@ -138,15 +138,19 @@ uint64_t eval_conv_computation(const TensorShape& src_shape, | |||||
| src_shape[1] / group * 2; | src_shape[1] / group * 2; | ||||
| return hybird_nchwx ? computation : computation * 4; | return hybird_nchwx ? computation : computation * 4; | ||||
| } | } | ||||
| if (param.format == Param::Format::NCHW32 || | |||||
| param.format == Param::Format::NCHW32_NCHW4) { | |||||
| return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 / | |||||
| group * 2; | |||||
| size_t packed_size; | |||||
| if (param.format == Param::Format::NCHW64) { | |||||
| packed_size = 64; | |||||
| } else if (param.format == Param::Format::NCHW32 || | |||||
| param.format == Param::Format::NCHW32_NCHW4) { | |||||
| packed_size = 32; | |||||
| } else { | |||||
| mgb_assert(param.format == Param::Format::NCHW4 || | |||||
| param.format == Param::Format::NCHW4_NCHW || | |||||
| param.format == Param::Format::NCHW4_NCHW32, | |||||
| "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); | |||||
| packed_size = 4; | |||||
| } | } | ||||
| mgb_assert(param.format == Param::Format::NCHW4 || | |||||
| param.format == Param::Format::NCHW4_NCHW || | |||||
| param.format == Param::Format::NCHW4_NCHW32, | |||||
| "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); | |||||
| return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group * | return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group * | ||||
| 2; | 2; | ||||
| }; | }; | ||||
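The per-format branches collapse into a single packed_size, which presumably also replaces the trailing * 4 factor in the shared return above. A standalone sketch of that computation count under this assumption (src_c is the stored channel-block count, so multiplying by packed_size restores the full channel count; the factor 2 counts multiply and add separately):

    #include <cstddef>

    size_t conv_computation(size_t dst_elems, size_t fh, size_t fw,
                            size_t src_c, size_t packed_size, size_t group) {
        return dst_elems * fh * fw * src_c * packed_size / group * 2;
    }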
| @@ -390,7 +390,37 @@ bool mgb::check_compute_capability(int major, int minor) { | |||||
| MGB_CUDA_CHECK(cudaGetDevice(&dev)); | MGB_CUDA_CHECK(cudaGetDevice(&dev)); | ||||
| cudaDeviceProp prop; | cudaDeviceProp prop; | ||||
| MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); | MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); | ||||
| return prop.major > major || (prop.major == major && prop.minor >= minor); | |||||
| bool available = prop.major > major || (prop.major == major && prop.minor >= minor); | |||||
| if (!available) { | |||||
| mgb_log_warn( | |||||
| "This testcase is ignored due to insufficient cuda cap(got: " | |||||
| "%d.%d, " | |||||
| "expected: %d.%d)", | |||||
| prop.major, prop.minor, major, minor); | |||||
| } | |||||
| return available; | |||||
| #else | |||||
| MGB_MARK_USED_VAR(major); | |||||
| MGB_MARK_USED_VAR(minor); | |||||
| return false; | |||||
| #endif | |||||
| } | |||||
| bool mgb::check_compute_capability_eq(int major, int minor) { | |||||
| #if MGB_CUDA | |||||
| int dev; | |||||
| MGB_CUDA_CHECK(cudaGetDevice(&dev)); | |||||
| cudaDeviceProp prop; | |||||
| MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); | |||||
| bool available = prop.major == major && prop.minor == minor; | |||||
| if (!available) { | |||||
| mgb_log_warn( | |||||
| "This testcase is ignored due to insufficient cuda cap(got: " | |||||
| "%d.%d, " | |||||
| "expected: %d.%d)", | |||||
| prop.major, prop.minor, major, minor); | |||||
| } | |||||
| return available; | |||||
| #else | #else | ||||
| MGB_MARK_USED_VAR(major); | MGB_MARK_USED_VAR(major); | ||||
| MGB_MARK_USED_VAR(minor); | MGB_MARK_USED_VAR(minor); | ||||
| @@ -504,6 +504,9 @@ bool check_cambricon_device_available(size_t num); | |||||
| //! check current capability >= major.minor | //! check current capability >= major.minor | ||||
| bool check_compute_capability(int major, int minor); | bool check_compute_capability(int major, int minor); | ||||
| //! check current capability == major.minor | |||||
| bool check_compute_capability_eq(int major, int minor); | |||||
| //! check compnode available | //! check compnode available | ||||
| bool check_device_type_avaiable(CompNode::DeviceType device_type); | bool check_device_type_avaiable(CompNode::DeviceType device_type); | ||||
| @@ -540,6 +543,12 @@ public: | |||||
| return; \ | return; \ | ||||
| } while (0) | } while (0) | ||||
| #define REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(major, minor) \ | |||||
| do { \ | |||||
| if (!check_compute_capability_eq(major, minor)) \ | |||||
| return; \ | |||||
| } while (0) | |||||
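A hedged usage sketch of the new equality macro (test name illustrative): placed right after activating the comp node, it returns from the test body unless the device reports exactly the requested compute capability.

    // Everything after the macro runs only on an sm_75 (Turing) device, which
    // is what the NCHW64 / int4 tests above require.
    TEST(TestExample, NeedsExactlySM75) {
        REQUIRE_GPU(1);
        auto cn = CompNode::load("gpu0");
        cn.activate();
        REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
        // ... device-specific assertions would go here ...
    }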
| //! skip a testcase if amd gpu not available | //! skip a testcase if amd gpu not available | ||||
| #define REQUIRE_AMD_GPU(n) do { \ | #define REQUIRE_AMD_GPU(n) do { \ | ||||
| if (!check_amd_gpu_available(n)) \ | if (!check_amd_gpu_available(n)) \ | ||||