@@ -12,7 +12,7 @@ using BcastType = megdnn::elemwise::BcastType;
///////////////////////////////// ParamElemVisitor ///////////////////////////
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix) \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitor<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
@@ -24,29 +24,61 @@ using BcastType = megdnn::elemwise::BcastType;
_neon_type operator()(const _ctype* src) const { \
return vdupq_n_##_fun_suffix(*reinterpret_cast<const _inner_ctype*>(src)); \
} \
}; \
template <> \
struct ParamElemVisitorV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src_1)); \
return ret; \
} \
}; \
template <> \
struct ParamElemVisitorDupV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = vdupq_n_##_fun_suffix( \
*reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb(dt_quint8, uint8_t, uint8x16_t, u8);
cb(dt_quint8, uint8_t, uint8x16_t, u8, uint8x16x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, __fp16, float16x8_t, f16);
cb(__fp16, __fp16, float16x8_t, f16, float16x8x2_t);
#endif
cb(dt_int16, int16_t, int16x8_t, s16);
cb(dt_int16, int16_t, int16x8_t, s16, int16x8x2_t);
#undef cb
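// For reference, a hedged sketch of what the new cb(dt_int16, ...) expansion
// above produces: the V2 visitor loads two adjacent NEON vectors from two
// pointers into one paired value. Illustrative only; assumes <arm_neon.h>.
#if 0  // sketch, not part of the patch
#include <arm_neon.h>
struct ParamElemVisitorV2_int16_sketch {
    int16x8x2_t operator()(const int16_t* src, const int16_t* src_1) const {
        int16x8x2_t ret;
        ret.val[0] = vld1q_s16(src);    // first 8 int16 lanes
        ret.val[1] = vld1q_s16(src_1);  // second 8 int16 lanes
        return ret;
    }
};
#endif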
template <typename ctype>
struct ParamElemVisitorBcast101x4;
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
}; \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb(dt_quint8, uint32_t, uint8x16_t, u8, u32);
cb(dt_int16, int64_t, int16x8_t, s16, s64);
cb(dt_quint8, uint32_t, uint8x16_t, u8, u32, uint8x16x2_t);
cb(dt_int16, int64_t, int16x8_t, s16, s64, int16x8x2_t);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb(__fp16, uint64_t, float16x8_t, f16, u64);
cb(__fp16, uint64_t, float16x8_t, f16, u64, float16x8x2_t);
#endif
#undef cb
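// The Bcast101x4 visitors above rely on a reinterpret trick: the four values
// to broadcast are loaded as one wider scalar (u32 for u8x4, s64 for s16x4)
// and duplicated across the register, so the 4-element pattern repeats in
// every lane group. A hedged sketch for the quint8 case, assuming <arm_neon.h>:
#if 0  // sketch, not part of the patch
#include <arm_neon.h>
static inline uint8x16_t broadcast_4x_u8_sketch(const uint8_t* src) {
    // vld1q_dup_u32 loads one 32-bit value (4 bytes) into every 32-bit lane;
    // reinterpreting as u8x16 yields {b0,b1,b2,b3, b0,b1,b2,b3, ...}.
    return vreinterpretq_u8_u32(
            vld1q_dup_u32(reinterpret_cast<const uint32_t*>(src)));
}
#endif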
@@ -283,7 +283,7 @@ v4sf GiCosPsFloat32(v4sf x) {
v4sf GiTanPsFloat32(v4sf x) {
v4sf ysin, ycos;
GiSinCosPsFloat32(x, &ysin, &ycos);
return ysin / ycos;
return GiDivFloat32(ysin, ycos);
}
#undef c_exp_hi
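// The ysin / ycos -> GiDivFloat32 change matters because the GI vector type is
// a plain struct on some backends, where operator/ is not defined. A minimal
// scalar sketch of what a lane-wise division fallback could look like,
// assuming a 4-lane float vector (names are illustrative, not the real GI
// implementation):
#if 0  // sketch, not part of the patch
struct v4sf_sketch {
    float lane[4];
};
static inline v4sf_sketch div_ps_sketch(v4sf_sketch a, v4sf_sketch b) {
    v4sf_sketch r;
    for (int i = 0; i < 4; ++i)
        r.lane[i] = a.lane[i] / b.lane[i];  // lane-wise division
    return r;
}
#endif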
@@ -20,22 +20,28 @@ struct AbsOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct AbsOp;
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = GiAbs##_func_suffix(src.val[0]); \
auto vitem1 = GiAbs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_gi_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(dt_float32))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(dt_int32))
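// The rewrite above swaps direct src.val[i] member access for
// GiGetSubVector/GiSetSubVector accessors, so the paired V2 type no longer
// has to be a plain struct with a val[2] array on every backend. A hedged
// sketch of the accessor pattern for a struct-based fallback (the real
// macros and types may differ):
#if 0  // sketch, not part of the patch
typedef struct {
    float lane[4];
} F32Sketch;
typedef struct {
    F32Sketch sub[2];
} F32V2Sketch;
static inline F32Sketch get_sub_vector_sketch(const F32V2Sketch& v, int i) {
    return v.sub[i];
}
static inline void set_sub_vector_sketch(F32V2Sketch& v, int i, F32Sketch s) {
    v.sub[i] = s;
}
#endif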
@@ -64,11 +70,18 @@ struct AbsOp<dt_qint8, dt_qint8> : AbsOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK;
}
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale));
vitem0 = GiAbsFloat32(vitem0);
vitem1 = GiAbsFloat32(vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
@@ -33,13 +33,21 @@ struct AddOp;
void operator()( \
const _gi_type2& src0, const _gi_type2& src1, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type2 operator()(const _gi_type2& src0, const _gi_type2& src1) const { \
auto vitem0 = GiAdd##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiAdd##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_gi_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _gi_type& src0, const _gi_type& src1, dst_ctype* dst) const { \
@@ -82,13 +90,24 @@ struct AddOp<dt_qint8, dt_qint8> : AddOpBase<dt_qint8, dt_qint8> {
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
@@ -119,12 +138,24 @@ struct AddOp<dt_qint32, dt_qint8> : AddOpBase<dt_qint32, dt_qint8> {
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
@@ -23,22 +23,28 @@ struct ExpOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct ExpOp;
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = GiExpPs##_func_suffix(src.val[0]); \
auto vitem1 = GiExpPs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
#undef OP
@@ -32,14 +32,15 @@ struct FastTanhOp;
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_27 = GiBroadcast##_func_suffix(27.f); \
auto val_9 = GiBroadcast##_func_suffix(9.f); \
auto valx = src.val[0]; \
auto valx1 = src.val[1]; \
auto valx = GiGetSubVector##_func_suffix##V2(src, 0); \
auto valx1 = GiGetSubVector##_func_suffix##V2(src, 1); \
auto valxp2 = GiMultiply##_fix_func_suffix(valx, valx); \
auto valx1p2 = GiMultiply##_fix_func_suffix(valx1, valx1); \
auto denominator = GiAdd##_fix_func_suffix(valxp2, val_27); \
@@ -58,7 +59,10 @@ struct FastTanhOp;
r_denominator1); \
valx = GiMultiply##_fix_func_suffix(valx, r_denominator); \
valx1 = GiMultiply##_fix_func_suffix(valx1, r_denominator1); \
return {{valx, valx1}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, valx); \
GiSetSubVector##_func_suffix##V2(ret, 1, valx1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
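// FastTanhOp above evaluates the rational approximation
// tanh(x) ~= x * (27 + x^2) / (27 + 9 * x^2), with the division performed
// through a Newton-refined reciprocal estimate. A scalar sketch of the same
// math, for reference:
#if 0  // sketch, not part of the patch
static inline float fast_tanh_sketch(float x) {
    float x2 = x * x;
    return x * (27.f + x2) / (27.f + 9.f * x2);
}
#endif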
@@ -36,19 +36,23 @@ struct FuseAddHSwishOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -98,15 +102,28 @@ struct FuseAddHSwishOp<dt_qint32, dt_qint8> : FuseAddHSwishOpBase<dt_qint32, dt_
GI_FLOAT32_t vitem0, vitem1;
vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src1)));
H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
@@ -35,17 +35,21 @@ struct FuseAddReluOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
FUSE_ADD_RELU_SIMD_PACK2_FALLBACK(val1, val2, val3, val4, _func_suffix); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -105,15 +109,26 @@ struct FuseAddReluOp<dt_qint8, dt_qint8> : FuseAddReluOpBase<dt_qint8, dt_qint8>
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
@@ -144,15 +159,26 @@ struct FuseAddReluOp<dt_qint32, dt_qint8> : FuseAddReluOpBase<dt_qint32, dt_qint
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiAddFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
vitem0 = GiMaximumFloat32(vitem0, this->vzero());
vitem1 = GiMaximumFloat32(vitem1, this->vzero());
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t ret;
GiSetSubVectorFloat32V2(ret, 0, vitem0);
GiSetSubVectorFloat32V2(ret, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret);
}
};
@@ -36,19 +36,23 @@ struct FuseAddSigmoidOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
val1 = GiSigmoidPs##_func_suffix(val1); \
val2 = GiSigmoidPs##_func_suffix(val2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
@@ -35,14 +35,15 @@ struct FuseAddTanhOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \
auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \
auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
auto exp1 = GiExpPs##_func_suffix(val1); \
@@ -65,7 +66,10 @@ struct FuseAddTanhOp;
GiRecpeS##_func_suffix(exp2, rexp2), rexp2); \
val1 = GiMultiply##_func_suffix(val1, rexp1); \
val2 = GiMultiply##_func_suffix(val2, rexp2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
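// FuseAddTanhOp computes tanh of the element-wise sum through GiExpPs and a
// Newton-refined reciprocal (GiRecpeS). A scalar sketch of one common
// formulation, tanh(x) = (e^x - e^-x) / (e^x + e^-x); the exact factoring
// used by the macro may differ:
#if 0  // sketch, not part of the patch
#include <cmath>
static inline float fuse_add_tanh_sketch(float a, float b) {
    float x = a + b;
    float ep = std::exp(x);
    float en = std::exp(-x);
    return (ep - en) / (ep + en);
}
#endif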
@@ -26,28 +26,36 @@ struct FuseMulAdd3OpBase : TernaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct FuseMulAdd3Op;
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
src2.val[0], src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
src2.val[1], src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 0), \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 1), \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP(dt_float32, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
OP(dt_int32, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t))
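// FuseMulAdd3 computes dst = src0 * src1 + src2. Note the argument order of
// the GiMultiplyAdd call above: the accumulator (src2) comes first, matching
// NEON's vmlaq convention. A hedged scalar model of one lane:
#if 0  // sketch, not part of the patch
static inline float multiply_add_sketch(float acc, float a, float b) {
    return acc + a * b;  // so vitem0 = src2 + src0 * src1, lane-wise
}
#endif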
@@ -26,39 +26,43 @@ struct HSwishOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_ctype = src_ctype>
struct HSwishOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
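// Both the paired and single-vector branches above implement the same scalar
// definition used by H_SWISH_KERN_FALLBACK:
// h_swish(x) = x * clamp(x + 3, 0, 6) / 6. A scalar sketch:
#if 0  // sketch, not part of the patch
#include <algorithm>
static inline float h_swish_sketch(float x) {
    float clip = std::min(std::max(x + 3.f, 0.f), 6.f);
    return x * clip * (1.f / 6.f);
}
#endif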
@@ -90,14 +94,23 @@ struct HSwishOp<dt_qint32, dt_qint8> : HSwishOpBase<dt_qint32, dt_qint8> {
}
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const {
auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src);
auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src);
auto vitem0 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src));
auto vitem1 = GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src));
H_SWISH_KERN_FALLBACK(Float32, vitem0, vitem1);
vitem0 = GiMultiplyFloat32(vitem0, this->vscale_dst);
vitem1 = GiMultiplyFloat32(vitem1, this->vscale_dst);
vitem0 =
GiMultiplyFloat32(vitem0, GiFixLenType2GiFloat32Type(this->vscale_dst));
vitem1 =
GiMultiplyFloat32(vitem1, GiFixLenType2GiFloat32Type(this->vscale_dst));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
@@ -32,14 +32,22 @@ struct MaxOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMaximum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMaximum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -87,12 +95,23 @@ struct MaxOp<dt_qint8, dt_qint8> : MaxOpBase<dt_qint8, dt_qint8> {
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMaximumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
@@ -33,14 +33,22 @@ struct MinOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMinimum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMinimum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -84,12 +92,23 @@ struct MinOp<dt_qint8, dt_qint8> : MinOpBase<dt_qint8, dt_qint8> {
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMinimumFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
@@ -33,14 +33,22 @@ struct MulOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMultiply##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiply##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
@@ -83,13 +91,24 @@ struct MulOp<dt_qint8, dt_qint8> : MulOpBase<dt_qint8, dt_qint8> {
GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const {
auto vitem0 = GiMultiplyFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)),
GiFixLenType2GiFloat32Type(this->vscale1)));
auto vitem1 = GiMultiplyFloat32(
GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0),
GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1));
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)),
GiFixLenType2GiFloat32Type(this->vscale_src0)),
GiMultiplyFloat32(
GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)),
GiFixLenType2GiFloat32Type(this->vscale1)));
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}});
GI_FLOAT32_V2_t tmp;
GiSetSubVectorFloat32V2(tmp, 0, vitem0);
GiSetSubVectorFloat32V2(tmp, 1, vitem1);
return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp);
}
};
@@ -16,23 +16,24 @@ struct NoneOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template <typename src_ctype, typename dst_type = src_ctype>
struct NoneOp;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, src.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(src, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(src, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
};
OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float))
@@ -61,8 +62,8 @@ struct NoneOp<dt_qint32, dt_qint8> : NoneOpBase<dt_qint32, dt_qint8> {
constexpr static size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t);
void operator()(const GI_INT32_V2_t& vsrc, dt_qint8* dst) const {
GiStoreInt32(dst, vsrc.val[0]);
GiStoreInt32(dst + 16, vsrc.val[1]);
GiStoreInt32(dst, GiGetSubVectorInt32V2(vsrc, 0));
GiStoreInt32(dst + 16, GiGetSubVectorInt32V2(vsrc, 1));
}
void operator()(const GI_INT32_t& src, dt_qint8* dst) const {
GiStoreInt32(dst, src);
@@ -31,24 +31,24 @@ struct UnaryOpBase : OpBase<src_ctype, dst_ctype> {
UnaryOpBase(DType /*src_dtype*/, DType /*dst_dtype*/) {}
};
#define OPERATOR_UNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), operator()( \
{{GiMoveLowLongInt16(vsrct0), \
GiMoveHighLongInt16(vsrct0)}})); \
GI_INT16_t vsrct1 = GiMoveHighLongInt8(vsrc.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()({{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}})); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()({{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
GI_INT16_t vsrct3 = GiMoveHighLongInt8(vsrc.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()({{GiMoveLowLongInt16(vsrct3), GiMoveHighLongInt16(vsrct3)}}))
#define OPERATOR_UNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \
GI_INT32_V2_t tmp; \
GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct0)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp)); \
GI_INT16_t vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 0)); \
GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct1)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp)); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \
GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct2)); \
GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct2)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp)); \
GI_INT16_t vsrct3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc, 1)); \
GiSetSubVectorInt32V2(tmp, 0, GiMoveLowLongInt16(vsrct3)); \
GiSetSubVectorInt32V2(tmp, 1, GiMoveHighLongInt16(vsrct3)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp))
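// OPERATOR_UNARY_QINT8_FALLBACK widens 32 qint8 lanes to int32 in four groups
// of 8: int8 -> int16 via GiMoveLow/HighLongInt8, then int16 -> int32 via
// GiMoveLow/HighLongInt16, applies the int32 operator(), and narrows back on
// store. A scalar model of one 8-lane group (lane counts assume 128-bit SIMD):
#if 0  // sketch, not part of the patch
#include <cstdint>
static inline void widen_8xq8_sketch(const int8_t src[8], int32_t dst[8]) {
    for (int i = 0; i < 8; ++i)
        dst[i] = static_cast<int32_t>(src[i]);  // two sign-extending widenings, fused
}
#endif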
//! scale_src = src.scale; scale_dst = 1.f / dst.scale (div -> mul)
//! scale = src.scale / dst.scale
@@ -56,17 +56,17 @@ template <>
struct UnaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {
using OpBase::OpBase;
float scale_src, scale_dst;
GI_FLOAT32_t vscale_src, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst;
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;
void init(float src_scale, float dst_scale) {
scale_src = src_scale;
vscale_src = GiBroadcastFloat32(scale_src);
vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src));
scale_dst = 1.f / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
UnaryOpBase(DType src_dtype, DType dst_dtype) {
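// Why the scale members become GI_FLOAT32_FIXLEN_t: on backends whose native
// vector types are sizeless and cannot be stored as struct members, the
// broadcast scale is kept in a fixed-length representation and converted back
// with GiFixLenType2GiFloat32Type at each use site, as in the kernels above.
// A hedged sketch of such a round-trip for a 4-lane backend:
#if 0  // sketch, not part of the patch
typedef struct {
    float raw[4];
} FixLenF32Sketch;
static inline FixLenF32Sketch to_fixlen_sketch(const float* native_vec) {
    FixLenF32Sketch r;
    for (int i = 0; i < 4; ++i)
        r.raw[i] = native_vec[i];  // analogue of GiFloat32Type2FixLenType
    return r;
}
#endif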
@@ -83,17 +83,17 @@ struct UnaryOpBase<dt_qint32, dt_qint8> : OpBase<dt_qint32, dt_qint8> {
using src_ctype = dt_qint32;
using dst_ctype = dt_qint8;
float scale;
GI_FLOAT32_t vscale;
GI_FLOAT32_FIXLEN_t vscale;
float scale_src, scale_dst;
GI_FLOAT32_t vscale_src, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src, vscale_dst;
void init(float src_scale, float dst_scale) {
scale_src = src_scale;
vscale_src = GiBroadcastFloat32(src_scale);
vscale_src = GiFloat32Type2FixLenType(GiBroadcastFloat32(src_scale));
scale_dst = 1 / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale = src_scale / dst_scale;
vscale = GiBroadcastFloat32(scale);
vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale));
}
UnaryOpBase(DType src_dtype, DType dst_dtype) {
@@ -115,35 +115,36 @@ struct BinaryOpBase : OpBase<src_ctype, dst_ctype> {
/* ================= binary op for quantized types ================== */
#define OPERATOR_BINARY_QINT8_FALLBACK \
GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(vsrc1.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_0), GiMoveHighLongInt16(vsrct0_0)}}, \
{{GiMoveLowLongInt16(vsrct1_0), GiMoveHighLongInt16(vsrct1_0)}})); \
GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(vsrc1.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_1), GiMoveHighLongInt16(vsrct0_1)}}, \
{{GiMoveLowLongInt16(vsrct1_1), GiMoveHighLongInt16(vsrct1_1)}})); \
GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(vsrc0.val[1]); \
GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(vsrc1.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_2), GiMoveHighLongInt16(vsrct0_2)}}, \
{{GiMoveLowLongInt16(vsrct1_2), GiMoveHighLongInt16(vsrct1_2)}})); \
GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(vsrc0.val[1]); \
GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(vsrc1.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()( \
{{GiMoveLowLongInt16(vsrct0_3), GiMoveHighLongInt16(vsrct0_3)}}, \
{{GiMoveLowLongInt16(vsrct1_3), GiMoveHighLongInt16(vsrct1_3)}}))
#define OPERATOR_BINARY_QINT8_FALLBACK \
GI_INT16_t vsrct0_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \
GI_INT16_t vsrct1_0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \
GI_INT32_V2_t tmp0, tmp1; \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_0)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_0)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_0)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_0)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp0, tmp1)); \
GI_INT16_t vsrct0_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \
GI_INT16_t vsrct1_1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_1)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_1)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_1)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_1)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp0, tmp1)); \
GI_INT16_t vsrct0_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \
GI_INT16_t vsrct1_2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_2)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_2)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_2)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_2)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp0, tmp1)); \
GI_INT16_t vsrct0_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \
GI_INT16_t vsrct1_3 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0_3)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0_3)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1_3)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1_3)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp0, tmp1));
//! scale_src0 = src0.scale; scale_src1 = src1.scale; scale_dst = 1.f /
//! dst.scale scale0 = src0.scale / dst.scale; scale1 = src1.scale / dst.scale
@@ -153,21 +154,21 @@ struct BinaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> {
using src_ctype = dt_qint8;
using dst_ctype = dt_qint8;
float scale_src0, scale_src1, scale_dst;
GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst;
float scale0, scale1;
GI_FLOAT32_t vscale0, vscale1;
GI_FLOAT32_FIXLEN_t vscale0, vscale1;
void init(float src0_scale, float src1_scale, float dst_scale) {
scale_src0 = src0_scale;
vscale_src0 = GiBroadcastFloat32(scale_src0);
vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0));
scale_src1 = src1_scale;
vscale_src1 = GiBroadcastFloat32(scale_src1);
vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1));
scale_dst = 1.f / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale0 = src0_scale / dst_scale;
vscale0 = GiBroadcastFloat32(scale0);
vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0));
scale1 = src1_scale / dst_scale;
vscale1 = GiBroadcastFloat32(scale1);
vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1));
}
BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) {
@@ -188,21 +189,21 @@ struct BinaryOpBase<dt_qint32, dt_qint8> : OpBase<dt_qint32, dt_qint8> {
using src_ctype = dt_qint32;
using dst_ctype = dt_qint8;
float scale0, scale1;
GI_FLOAT32_t vscale0, vscale1;
GI_FLOAT32_FIXLEN_t vscale0, vscale1;
float scale_src0, scale_src1, scale_dst;
GI_FLOAT32_t vscale_src0, vscale_src1, vscale_dst;
GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_dst;
void init(float src0_scale, float src1_scale, float dst_scale) {
scale_src0 = src0_scale;
vscale_src0 = GiBroadcastFloat32(src0_scale);
vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src0_scale));
scale_src1 = src1_scale;
vscale_src1 = GiBroadcastFloat32(src1_scale);
vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(src1_scale));
scale_dst = 1 / dst_scale;
vscale_dst = GiBroadcastFloat32(scale_dst);
vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst));
scale0 = src0_scale / dst_scale;
vscale0 = GiBroadcastFloat32(scale0);
vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0));
scale1 = src1_scale / dst_scale;
vscale1 = GiBroadcastFloat32(scale1);
vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1));
}
BinaryOpBase(DType src0_dtype, DType src1_dtype, DType dst_dtype) {
@@ -227,43 +228,48 @@ struct TernaryOpBase : OpBase<src_ctype, dst_ctype> {
DType /*dst_dtype*/) {}
};
#define OPERATOR_TERNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(vsrc0.val[0]); \
GI_INT16_t vsrct1 = GiMoveLowLongInt8(vsrc1.val[0]); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(vsrc2.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveHighLongInt8(vsrc0.val[0]); \
vsrct1 = GiMoveHighLongInt8(vsrc1.val[0]); \
vsrct2 = GiMoveHighLongInt8(vsrc2.val[0]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 8), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveLowLongInt8(vsrc0.val[1]); \
vsrct1 = GiMoveLowLongInt8(vsrc1.val[1]); \
vsrct2 = GiMoveLowLongInt8(vsrc2.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 16), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}})); \
vsrct0 = GiMoveHighLongInt8(vsrc0.val[1]); \
vsrct1 = GiMoveHighLongInt8(vsrc1.val[1]); \
vsrct2 = GiMoveHighLongInt8(vsrc2.val[1]); \
GiStoreLowInt8( \
reinterpret_cast<int8_t*>(dst + 24), \
operator()( \
{{GiMoveLowLongInt16(vsrct0), GiMoveHighLongInt16(vsrct0)}}, \
{{GiMoveLowLongInt16(vsrct1), GiMoveHighLongInt16(vsrct1)}}, \
{{GiMoveLowLongInt16(vsrct2), GiMoveHighLongInt16(vsrct2)}}))
#define OPERATOR_TERNARY_QINT8_FALLBACK \
GI_INT16_t vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \
GI_INT16_t vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \
GI_INT16_t vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 0)); \
GI_INT32_V2_t tmp0, tmp1, tmp2; \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \
GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst), operator()(tmp0, tmp1, tmp2)); \
vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 0)); \
vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 0)); \
vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 0)); \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \
GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 8), operator()(tmp0, tmp1, tmp2)); \
vsrct0 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \
vsrct1 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \
vsrct2 = GiMoveLowLongInt8(GiGetSubVectorInt8V2(vsrc2, 1)); \
GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \
GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \
GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \
GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \
GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 16), operator()(tmp0, tmp1, tmp2)); \
vsrct0 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc0, 1)); \
vsrct1 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc1, 1)); \
vsrct2 = GiMoveHighLongInt8(GiGetSubVectorInt8V2(vsrc2, 1)); \
| GiSetSubVectorInt32V2(tmp0, 0, GiMoveLowLongInt16(vsrct0)); \ | |||
| GiSetSubVectorInt32V2(tmp0, 1, GiMoveHighLongInt16(vsrct0)); \ | |||
| GiSetSubVectorInt32V2(tmp1, 0, GiMoveLowLongInt16(vsrct1)); \ | |||
| GiSetSubVectorInt32V2(tmp1, 1, GiMoveHighLongInt16(vsrct1)); \ | |||
| GiSetSubVectorInt32V2(tmp2, 0, GiMoveLowLongInt16(vsrct2)); \ | |||
| GiSetSubVectorInt32V2(tmp2, 1, GiMoveHighLongInt16(vsrct2)); \ | |||
| GiStoreLowInt8(reinterpret_cast<int8_t*>(dst + 24), operator()(tmp0, tmp1, tmp2)); | |||
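For reference, the fallback macro above widens each 16-lane int8 vector in quadrants: a half is first extended to int16, and each int16 half then to a pair of int32 vectors that the float op consumes. A minimal NEON sketch of one such quadrant, assuming <arm_neon.h> and reading GiMoveLowLongInt8 / GiMoveLowLongInt16 as their usual vmovl counterparts on the ARM backend:

#include <arm_neon.h>

// Widen lanes 0..7 of an int8x16_t to two int32x4_t vectors -- the same
// low-quadrant step OPERATOR_TERNARY_QINT8_FALLBACK performs via
// GiMoveLowLongInt8 / GiMoveLowLongInt16.
static inline void widen_low_quadrant(int8x16_t v, int32x4_t out[2]) {
    int16x8_t lo16 = vmovl_s8(vget_low_s8(v));  // int8 lanes 0..7 -> int16
    out[0] = vmovl_s16(vget_low_s16(lo16));     // lanes 0..3 -> int32
    out[1] = vmovl_s16(vget_high_s16(lo16));    // lanes 4..7 -> int32
}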
| /*========================= ternary op for quantized ====================*/ | |||
| template <> | |||
| @@ -272,24 +278,24 @@ struct TernaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> { | |||
| using src_ctype = dt_qint8; | |||
| using dst_ctype = dt_qint8; | |||
| float scale_src0, scale_src1, scale_src2, scale_dst; | |||
| GI_FLOAT32_t vscale_src0, vscale_src1, vscale_src2, vscale_dst; | |||
| GI_FLOAT32_FIXLEN_t vscale_src0, vscale_src1, vscale_src2, vscale_dst; | |||
| float scale0, scale1, scale2; | |||
| GI_FLOAT32_t vscale0, vscale1, vscale2; | |||
| GI_FLOAT32_FIXLEN_t vscale0, vscale1, vscale2; | |||
| void init(float src0_scale, float src1_scale, float src2_scale, float dst_scale) { | |||
| scale_src0 = src0_scale; | |||
| scale_src1 = src1_scale; | |||
| scale_src2 = src2_scale; | |||
| scale_dst = 1.f / dst_scale; | |||
| vscale_src0 = GiBroadcastFloat32(scale_src0); | |||
| vscale_src1 = GiBroadcastFloat32(scale_src1); | |||
| vscale_src2 = GiBroadcastFloat32(scale_src2); | |||
| vscale_dst = GiBroadcastFloat32(scale_dst); | |||
| vscale_src0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src0)); | |||
| vscale_src1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src1)); | |||
| vscale_src2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_src2)); | |||
| vscale_dst = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale_dst)); | |||
| scale0 = src0_scale / dst_scale; | |||
| scale1 = src1_scale / dst_scale; | |||
| scale2 = src2_scale / dst_scale; | |||
| vscale0 = GiBroadcastFloat32(scale0); | |||
| vscale1 = GiBroadcastFloat32(scale1); | |||
| vscale2 = GiBroadcastFloat32(scale2); | |||
| vscale0 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale0)); | |||
| vscale1 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale1)); | |||
| vscale2 = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale2)); | |||
| } | |||
| TernaryOpBase( | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype) { | |||
| @@ -307,7 +313,7 @@ struct TernaryOpBase<dt_qint8, dt_qint8> : OpBase<dt_qint8, dt_qint8> { | |||
| ////////////////////////// fixup ////////////////////////// | |||
| struct FixupBase { | |||
| GI_INT32_t vmultiplier, vshift; | |||
| GI_INT32_FIXLEN_t vmultiplier, vshift; | |||
| FixupBase(float scale) { | |||
| //! skip the Fixup path if scale >= 0.5: use typecvt instead of shift & | |||
| //! multiplier, since the fixed-point path may introduce errors. | |||
| @@ -317,9 +323,9 @@ struct FixupBase { | |||
| int shift = static_cast<int>(::ceilf(::log2f(0.5 / scale))); | |||
| scale *= ::powf(2, shift); | |||
| //! Using double gets full precision here; any remaining error can be ignored. | |||
| vmultiplier = GiBroadcastInt32( | |||
| std::round(static_cast<double>(scale) * ((2LL) << 30))); | |||
| vshift = GiBroadcastInt32(-shift); | |||
| vmultiplier = GiInt32Type2FixLenType(GiBroadcastInt32( | |||
| std::round(static_cast<double>(scale) * ((2LL) << 30)))); | |||
| vshift = GiInt32Type2FixLenType(GiBroadcastInt32(-shift)); | |||
| } | |||
| }; | |||
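As a sanity check on the constants above: for scale < 0.5 the constructor picks shift so that scale * 2^shift lands in [0.5, 1), then stores that value as a Q31 multiplier (note (2LL) << 30 equals 1LL << 31). A scalar model of the scheme, with apply() standing in for vqrdmulhq_s32 plus the arithmetic right shift; illustrative only:

#include <cmath>
#include <cstdint>

// Scalar model of FixupBase: x * scale is approximated by a Q31
// rounding-doubling high multiply followed by an arithmetic right shift.
struct FixupModel {
    int32_t multiplier;
    int shift;
    explicit FixupModel(float scale) {
        shift = static_cast<int>(std::ceil(std::log2(0.5f / scale)));
        float s = scale * std::pow(2.f, shift);  // now in [0.5, 1)
        multiplier = static_cast<int32_t>(
                std::round(static_cast<double>(s) * (1LL << 31)));
    }
    int32_t apply(int32_t x) const {
        // Scalar analogue of vqrdmulhq_s32: round(2 * x * m / 2^32).
        int64_t prod = 2 * static_cast<int64_t>(x) * multiplier + (1LL << 31);
        return static_cast<int32_t>(prod >> 32) >> shift;
    }
};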
| @@ -349,11 +355,25 @@ struct UnaryQuantizationOp<dt_qint8, dt_qint8, Op> : UnaryOpBase<dt_qint8, dt_qi | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale_src); | |||
| auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale_src); | |||
| auto val = this->op({{vitem0, vitem1}}); | |||
| val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); | |||
| val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src)); | |||
| auto vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src)); | |||
| GI_FLOAT32_V2_t tmp; | |||
| GiSetSubVectorFloat32V2(tmp, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(tmp, 1, vitem1); | |||
| auto val = this->op(tmp); | |||
| GI_FLOAT32_t a = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 0), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GI_FLOAT32_t b = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 1), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GiSetSubVectorFloat32V2(val, 0, a); | |||
| GiSetSubVectorFloat32V2(val, 1, b); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val); | |||
| } | |||
| }; | |||
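All the quantization ops here share one shape: dequantize each input with its source scale, run the float op, multiply by the cached 1/dst_scale, then saturate back to int8. The binary and ternary variants below differ only in the number of dequantized inputs. A scalar model of the unary case (the round-to-nearest in QConverter is an assumption of this sketch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar model of the quantized unary pipeline: dequantize, apply the
// float op, requantize with saturation to int8.
template <typename FloatOp>
int8_t unary_quantized(int32_t x, float src_scale, float dst_scale, FloatOp op) {
    float f = op(x * src_scale);      // dequantize + compute in float
    float q = f * (1.f / dst_scale);  // vscale_dst caches 1 / dst_scale
    long r = std::lround(q);          // assumed round-to-nearest
    return static_cast<int8_t>(std::min(127L, std::max(-128L, r)));
}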
| @@ -385,13 +405,32 @@ struct BinaryQuantizationOp<dt_qint8, dt_qint8, Op> : BinaryOpBase<dt_qint8, dt_ | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { | |||
| auto val0 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0); | |||
| auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0); | |||
| auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1); | |||
| auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1); | |||
| auto val = op({{val0, val1}}, {{val2, val3}}); | |||
| val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); | |||
| val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); | |||
| auto val0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src0)); | |||
| auto val1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src0)); | |||
| auto val2 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src1)); | |||
| auto val3 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src1)); | |||
| GI_FLOAT32_V2_t tmp0, tmp1; | |||
| GiSetSubVectorFloat32V2(tmp0, 0, val0); | |||
| GiSetSubVectorFloat32V2(tmp0, 1, val1); | |||
| GiSetSubVectorFloat32V2(tmp1, 0, val2); | |||
| GiSetSubVectorFloat32V2(tmp1, 1, val3); | |||
| auto val = op(tmp0, tmp1); | |||
| GI_FLOAT32_t a = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 0), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GI_FLOAT32_t b = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 1), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GiSetSubVectorFloat32V2(val, 0, a); | |||
| GiSetSubVectorFloat32V2(val, 1, b); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val); | |||
| } | |||
| }; | |||
| @@ -431,15 +470,40 @@ struct TernaryQuantizationOp<dt_qint8, dt_qint8, Op> | |||
| GI_INT8_t operator()( | |||
| const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1, | |||
| const GI_INT32_V2_t& vsrc2) const { | |||
| auto val0 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale_src0); | |||
| auto val1 = GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale_src0); | |||
| auto val2 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale_src1); | |||
| auto val3 = GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale_src1); | |||
| auto val4 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[0]), this->vscale_src2); | |||
| auto val5 = GiMultiplyFloat32(GiCastToFloat32(vsrc2.val[1]), this->vscale_src2); | |||
| auto val = op({{val0, val1}}, {{val2, val3}}, {{val4, val5}}); | |||
| val.val[0] = GiMultiplyFloat32(val.val[0], this->vscale_dst); | |||
| val.val[1] = GiMultiplyFloat32(val.val[1], this->vscale_dst); | |||
| auto val0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src0)); | |||
| auto val1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src0)); | |||
| auto val2 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src1)); | |||
| auto val3 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src1)); | |||
| auto val4 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src2)); | |||
| auto val5 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc2, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale_src2)); | |||
| GI_FLOAT32_V2_t tmp0, tmp1, tmp2; | |||
| GiSetSubVectorFloat32V2(tmp0, 0, val0); | |||
| GiSetSubVectorFloat32V2(tmp0, 1, val1); | |||
| GiSetSubVectorFloat32V2(tmp1, 0, val2); | |||
| GiSetSubVectorFloat32V2(tmp1, 1, val3); | |||
| GiSetSubVectorFloat32V2(tmp2, 0, val4); | |||
| GiSetSubVectorFloat32V2(tmp2, 1, val5); | |||
| auto val = op(tmp0, tmp1, tmp2); | |||
| GI_FLOAT32_t a = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 0), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GI_FLOAT32_t b = GiMultiplyFloat32( | |||
| GiGetSubVectorFloat32V2(val, 1), | |||
| GiFixLenType2GiFloat32Type(this->vscale_dst)); | |||
| GiSetSubVectorFloat32V2(val, 0, a); | |||
| GiSetSubVectorFloat32V2(val, 1, b); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(val); | |||
| } | |||
| }; | |||
| @@ -20,37 +20,43 @@ struct ReluOpBase : UnaryOpBase<src_ctype, dst_ctype> { | |||
| template <typename src_ctype, typename dst_type = src_ctype> | |||
| struct ReluOp; | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero) \ | |||
| template <> \ | |||
| struct ReluOp<_ctype> : ReluOpBase<_ctype> { \ | |||
| using ReluOpBase::ReluOpBase; \ | |||
| using ReluOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem.val[0]); \ | |||
| GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| auto vitem0 = GiMaximum##_func_suffix(src.val[0], zero); \ | |||
| auto vitem1 = GiMaximum##_func_suffix(src.val[1], zero); \ | |||
| return {{vitem0, vitem1}}; \ | |||
| } \ | |||
| void operator()(const _simd_type& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem); \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| return GiMaximum##_func_suffix(src, zero); \ | |||
| } \ | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero_num) \ | |||
| template <> \ | |||
| struct ReluOp<_ctype> : ReluOpBase<_ctype> { \ | |||
| using ReluOpBase::ReluOpBase; \ | |||
| using ReluOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ | |||
| GiStore##_func_suffix( \ | |||
| dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| _simd_type zero = GiBroadcast##_func_suffix(zero_num); \ | |||
| auto vitem0 = GiMaximum##_func_suffix( \ | |||
| GiGetSubVector##_func_suffix##V2(src, 0), zero); \ | |||
| auto vitem1 = GiMaximum##_func_suffix( \ | |||
| GiGetSubVector##_func_suffix##V2(src, 1), zero); \ | |||
| _simd_type2 ret; \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ | |||
| return ret; \ | |||
| } \ | |||
| void operator()(const _simd_type& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem); \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| _simd_type zero = GiBroadcast##_func_suffix(zero_num); \ | |||
| return GiMaximum##_func_suffix(src, zero); \ | |||
| } \ | |||
| }; | |||
| OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float), | |||
| vfzero) | |||
| OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), | |||
| vzero) | |||
| OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), | |||
| vzero_int8) | |||
| 0.0f) | |||
| OP(dt_int32, GI_INT32_t, GI_INT32_V2_t, Int32, GI_SIMD_LEN_BYTE / sizeof(int32_t), 0) | |||
| OP(dt_int8, GI_INT8_t, GI_INT8_V2_t, Int8, GI_SIMD_LEN_BYTE / sizeof(int8_t), 0) | |||
| #undef OP | |||
| template <> | |||
| @@ -76,11 +82,19 @@ struct ReluOp<dt_qint8, dt_qint8> : ReluOpBase<dt_qint8, dt_qint8> { | |||
| OPERATOR_UNARY_QINT8_FALLBACK; | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); | |||
| auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); | |||
| GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| auto vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| vitem0 = GiMaximumFloat32(vitem0, vfzero); | |||
| vitem1 = GiMaximumFloat32(vitem1, vfzero); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| GI_FLOAT32_V2_t tmp; | |||
| GiSetSubVectorFloat32V2(tmp, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(tmp, 1, vitem1); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp); | |||
| } | |||
| }; | |||
| @@ -104,6 +118,8 @@ template <> | |||
| struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase { | |||
| using ReluOpBase::operator(); | |||
| constexpr static size_t SIMD_WIDTH = 4; | |||
| GI_INT32_t vzero = GiBroadcastInt32(0); | |||
| GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||
| ReluOp(DType src_dtype, DType dst_dtype) | |||
| : ReluOpBase(src_dtype, dst_dtype), FixupBase(scale) {} | |||
| @@ -115,8 +131,8 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase | |||
| vst1_s8(reinterpret_cast<int8_t*>(dst), vget_low_s8(operator()(vsrc))); | |||
| } | |||
| int8x16_t operator()(const int32x4x2_t& vsrc) const { | |||
| int32x4_t vitem0 = vqrdmulhq_s32(vsrc.val[0], vmultiplier); | |||
| int32x4_t vitem1 = vqrdmulhq_s32(vsrc.val[1], vmultiplier); | |||
| int32x4_t vitem0 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 0), vmultiplier); | |||
| int32x4_t vitem1 = vqrdmulhq_s32(GiGetSubVectorInt32V2(vsrc, 1), vmultiplier); | |||
| vitem0 = vmaxq_s32(vitem0, vzero); | |||
| vitem1 = vmaxq_s32(vitem1, vzero); | |||
| auto tmp = vqmovn_s16(vcombine_s16( | |||
| @@ -158,24 +174,36 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> { | |||
| } | |||
| void operator()(const GI_INT32_t& src, dt_qint8* dst) const { | |||
| GiStoreLane0Int32( | |||
| reinterpret_cast<int32_t*>(dst), (GI_INT32_t)(operator()(src))); | |||
| reinterpret_cast<int32_t*>(dst), | |||
| GiReinterpretInt8AsInt32(operator()(src))); | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); | |||
| auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| auto vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||
| vitem0 = GiMaximumFloat32(vitem0, vfzero); | |||
| vitem1 = GiMaximumFloat32(vitem1, vfzero); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| GI_FLOAT32_V2_t tmp; | |||
| GiSetSubVectorFloat32V2(tmp, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(tmp, 1, vitem1); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp); | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_t& src) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale)); | |||
| GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||
| vitem0 = GiMaximumFloat32(vitem0, vfzero); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0); | |||
| } | |||
| GI_INT8_t operator()(const GI_FLOAT32_t& src) const { | |||
| auto vitem0 = GiMultiplyFloat32(src, this->vscale); | |||
| auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale)); | |||
| GI_FLOAT32_t vfzero = GiBroadcastFloat32(0.0f); | |||
| vitem0 = GiMaximumFloat32(vitem0, vfzero); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0); | |||
| } | |||
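One detail worth noting in this operator: when only four int8 results are produced, they occupy exactly one 32-bit lane, so the GiStoreLane0Int32(GiReinterpretInt8AsInt32(...)) pair writes just those four bytes. A NEON sketch of the same trick (assumes <arm_neon.h>):

#include <arm_neon.h>

// Store only the first four int8 lanes of v (one int32 lane) to dst,
// mirroring the GiStoreLane0Int32 + GiReinterpretInt8AsInt32 pattern.
static inline void store_low4_s8(int8_t* dst, int8x16_t v) {
    vst1q_lane_s32(reinterpret_cast<int32_t*>(dst), vreinterpretq_s32_s8(v), 0);
}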
| @@ -25,27 +25,33 @@ struct SigmoidOpBase : UnaryOpBase<src_ctype, dst_ctype> { | |||
| template <typename src_ctype, typename dst_ctype = src_ctype> | |||
| struct SigmoidOp; | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ | |||
| template <> \ | |||
| struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \ | |||
| using SigmoidOpBase::SigmoidOpBase; \ | |||
| using SigmoidOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem.val[0]); \ | |||
| GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ | |||
| } \ | |||
| void operator()(const _simd_type& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| return {{operator()(src.val[0]), operator()(src.val[1])}}; \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| return GiSigmoidPs##_func_suffix(src); \ | |||
| } \ | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ | |||
| template <> \ | |||
| struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \ | |||
| using SigmoidOpBase::SigmoidOpBase; \ | |||
| using SigmoidOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ | |||
| GiStore##_func_suffix( \ | |||
| dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ | |||
| } \ | |||
| void operator()(const _simd_type& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| _simd_type2 ret; \ | |||
| GiSetSubVector##_func_suffix##V2( \ | |||
| ret, 0, operator()(GiGetSubVector##_func_suffix##V2(src, 0))); \ | |||
| GiSetSubVector##_func_suffix##V2( \ | |||
| ret, 1, operator()(GiGetSubVector##_func_suffix##V2(src, 1))); \ | |||
| return ret; \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| return GiSigmoidPs##_func_suffix(src); \ | |||
| } \ | |||
| }; | |||
| OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) | |||
| #undef OP | |||
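For checking the vectorized path, a scalar reference; this assumes GiSigmoidPsFloat32 implements the standard logistic function:

#include <cmath>

// Scalar reference for SigmoidOp (assumes GiSigmoidPsFloat32 computes the
// standard logistic function).
float sigmoid_ref(float x) {
    return 1.f / (1.f + std::exp(-x));
}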
| @@ -33,14 +33,22 @@ struct SubOp; | |||
| const _simd_type2& src0, const _simd_type2& src1, \ | |||
| dst_ctype* dst) const { \ | |||
| auto vitem = operator()(src0, src1); \ | |||
| GiStore##_func_suffix(dst, vitem.val[0]); \ | |||
| GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ | |||
| GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ | |||
| GiStore##_func_suffix( \ | |||
| dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ | |||
| } \ | |||
| _simd_type2 operator()( \ | |||
| const _simd_type2& src0, const _simd_type2& src1) const { \ | |||
| auto vitem0 = GiSubtract##_func_suffix(src0.val[0], src1.val[0]); \ | |||
| auto vitem1 = GiSubtract##_func_suffix(src0.val[1], src1.val[1]); \ | |||
| return {{vitem0, vitem1}}; \ | |||
| auto vitem0 = GiSubtract##_func_suffix( \ | |||
| GiGetSubVector##_func_suffix##V2(src0, 0), \ | |||
| GiGetSubVector##_func_suffix##V2(src1, 0)); \ | |||
| auto vitem1 = GiSubtract##_func_suffix( \ | |||
| GiGetSubVector##_func_suffix##V2(src0, 1), \ | |||
| GiGetSubVector##_func_suffix##V2(src1, 1)); \ | |||
| _simd_type2 ret; \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \ | |||
| return ret; \ | |||
| } \ | |||
| void operator()( \ | |||
| const _simd_type& src0, const _simd_type& src1, \ | |||
| @@ -82,12 +90,23 @@ struct SubOp<dt_qint8, dt_qint8> : SubOpBase<dt_qint8, dt_qint8> { | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc0, const GI_INT32_V2_t& vsrc1) const { | |||
| auto vitem0 = GiSubtractFloat32( | |||
| GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[0]), this->vscale0), | |||
| GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[0]), this->vscale1)); | |||
| GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale0)), | |||
| GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale1))); | |||
| auto vitem1 = GiSubtractFloat32( | |||
| GiMultiplyFloat32(GiCastToFloat32(vsrc0.val[1]), this->vscale0), | |||
| GiMultiplyFloat32(GiCastToFloat32(vsrc1.val[1]), this->vscale1)); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc0, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale0)), | |||
| GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc1, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale1))); | |||
| GI_FLOAT32_V2_t tmp; | |||
| GiSetSubVectorFloat32V2(tmp, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(tmp, 1, vitem1); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp); | |||
| } | |||
| }; | |||
| @@ -23,54 +23,58 @@ struct TanhOpBase : UnaryOpBase<src_ctype, dst_ctype> { | |||
| template <typename src_ctype, typename dst_type = src_ctype> | |||
| struct TanhOp; | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ | |||
| template <> \ | |||
| struct TanhOp<_ctype> : TanhOpBase<_ctype> { \ | |||
| using TanhOpBase::TanhOpBase; \ | |||
| using TanhOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, vitem.val[0]); \ | |||
| GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| auto one_val = GiBroadcast##_func_suffix(1.f); \ | |||
| auto two_val = GiBroadcast##_func_suffix(2.f); \ | |||
| auto val1 = src.val[0]; \ | |||
| auto val2 = src.val[1]; \ | |||
| val1 = GiMultiply##_func_suffix(two_val, val1); \ | |||
| val2 = GiMultiply##_func_suffix(two_val, val2); \ | |||
| val1 = GiExpPs##_func_suffix(val1); \ | |||
| val2 = GiExpPs##_func_suffix(val2); \ | |||
| val1 = GiAdd##_func_suffix(one_val, val1); \ | |||
| val2 = GiAdd##_func_suffix(one_val, val2); \ | |||
| auto rval1 = GiRecpe##_func_suffix(val1); \ | |||
| auto rval2 = GiRecpe##_func_suffix(val2); \ | |||
| rval1 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val1, rval1), rval1); \ | |||
| rval2 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val2, rval2), rval2); \ | |||
| val1 = GiMultiply##_func_suffix(two_val, rval1); \ | |||
| val2 = GiMultiply##_func_suffix(two_val, rval2); \ | |||
| val1 = GiSubtract##_func_suffix(one_val, val1); \ | |||
| val2 = GiSubtract##_func_suffix(one_val, val2); \ | |||
| return {{val1, val2}}; \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| auto one_val = GiBroadcast##_func_suffix(1.f); \ | |||
| auto two_val = GiBroadcast##_func_suffix(2.f); \ | |||
| auto val1 = src; \ | |||
| val1 = GiMultiply##_func_suffix(two_val, val1); \ | |||
| val1 = GiExpPs##_func_suffix(val1); \ | |||
| val1 = GiAdd##_func_suffix(one_val, val1); \ | |||
| auto rval1 = GiRecpe##_func_suffix(val1); \ | |||
| rval1 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val1, rval1), rval1); \ | |||
| val1 = GiMultiply##_func_suffix(two_val, rval1); \ | |||
| val1 = GiSubtract##_func_suffix(one_val, val1); \ | |||
| return val1; \ | |||
| } \ | |||
| #define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \ | |||
| template <> \ | |||
| struct TanhOp<_ctype> : TanhOpBase<_ctype> { \ | |||
| using TanhOpBase::TanhOpBase; \ | |||
| using TanhOpBase::operator(); \ | |||
| constexpr static size_t SIMD_WIDTH = _simd_width; \ | |||
| void operator()(const _simd_type2& src, _ctype* dst) const { \ | |||
| auto vitem = operator()(src); \ | |||
| GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ | |||
| GiStore##_func_suffix( \ | |||
| dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ | |||
| } \ | |||
| _simd_type2 operator()(const _simd_type2& src) const { \ | |||
| auto one_val = GiBroadcast##_func_suffix(1.f); \ | |||
| auto two_val = GiBroadcast##_func_suffix(2.f); \ | |||
| auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \ | |||
| auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \ | |||
| val1 = GiMultiply##_func_suffix(two_val, val1); \ | |||
| val2 = GiMultiply##_func_suffix(two_val, val2); \ | |||
| val1 = GiExpPs##_func_suffix(val1); \ | |||
| val2 = GiExpPs##_func_suffix(val2); \ | |||
| val1 = GiAdd##_func_suffix(one_val, val1); \ | |||
| val2 = GiAdd##_func_suffix(one_val, val2); \ | |||
| auto rval1 = GiRecpe##_func_suffix(val1); \ | |||
| auto rval2 = GiRecpe##_func_suffix(val2); \ | |||
| rval1 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val1, rval1), rval1); \ | |||
| rval2 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val2, rval2), rval2); \ | |||
| val1 = GiMultiply##_func_suffix(two_val, rval1); \ | |||
| val2 = GiMultiply##_func_suffix(two_val, rval2); \ | |||
| val1 = GiSubtract##_func_suffix(one_val, val1); \ | |||
| val2 = GiSubtract##_func_suffix(one_val, val2); \ | |||
| _simd_type2 ret; \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ | |||
| return ret; \ | |||
| } \ | |||
| _simd_type operator()(const _simd_type& src) const { \ | |||
| auto one_val = GiBroadcast##_func_suffix(1.f); \ | |||
| auto two_val = GiBroadcast##_func_suffix(2.f); \ | |||
| auto val1 = src; \ | |||
| val1 = GiMultiply##_func_suffix(two_val, val1); \ | |||
| val1 = GiExpPs##_func_suffix(val1); \ | |||
| val1 = GiAdd##_func_suffix(one_val, val1); \ | |||
| auto rval1 = GiRecpe##_func_suffix(val1); \ | |||
| rval1 = GiMultiply##_func_suffix( \ | |||
| GiRecpeS##_func_suffix(val1, rval1), rval1); \ | |||
| val1 = GiMultiply##_func_suffix(two_val, rval1); \ | |||
| val1 = GiSubtract##_func_suffix(one_val, val1); \ | |||
| return val1; \ | |||
| } \ | |||
| }; | |||
| OP(dt_float32, GI_FLOAT32_t, GI_FLOAT32_V2_t, Float32, GI_SIMD_LEN_BYTE / sizeof(float)) | |||
| #undef OP | |||
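The macro above evaluates tanh through the identity tanh(x) = 1 - 2 / (1 + e^(2x)), replacing the division with a reciprocal estimate refined by one Newton-Raphson step (the GiRecpe / GiRecpeS pair). A scalar rendering of the same computation, with an exact division standing in for the hardware estimate:

#include <cmath>

// Scalar rendering of the vectorized tanh: tanh(x) = 1 - 2 / (1 + e^(2x)),
// with the reciprocal obtained as estimate + one Newton-Raphson step.
float tanh_ref(float x) {
    float v = 1.f + std::exp(2.f * x);
    float r = 1.f / v;      // stands in for the GiRecpe estimate
    r = (2.f - v * r) * r;  // Newton-Raphson step: GiRecpeS(v, r) * r
    return 1.f - 2.f * r;
}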
| @@ -36,18 +36,22 @@ struct TrueDivOp; | |||
| const _simd_type2& src0, const _simd_type2& src1, \ | |||
| dst_ctype* dst) const { \ | |||
| auto vitem = operator()(src0, src1); \ | |||
| GiStore##_func_suffix(dst, vitem.val[0]); \ | |||
| GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \ | |||
| GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \ | |||
| GiStore##_func_suffix( \ | |||
| dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \ | |||
| } \ | |||
| _simd_type2 operator()( \ | |||
| const _simd_type2& src0, const _simd_type2& src1) const { \ | |||
| auto val1 = src0.val[0]; \ | |||
| auto val2 = src0.val[1]; \ | |||
| auto val3 = src1.val[0]; \ | |||
| auto val4 = src1.val[1]; \ | |||
| auto val1 = GiGetSubVector##_func_suffix##V2(src0, 0); \ | |||
| auto val2 = GiGetSubVector##_func_suffix##V2(src0, 1); \ | |||
| auto val3 = GiGetSubVector##_func_suffix##V2(src1, 0); \ | |||
| auto val4 = GiGetSubVector##_func_suffix##V2(src1, 1); \ | |||
| val1 = GiDivide##_func_suffix(val1, val3); \ | |||
| val2 = GiDivide##_func_suffix(val2, val4); \ | |||
| return {{val1, val2}}; \ | |||
| _simd_type2 ret; \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 0, val1); \ | |||
| GiSetSubVector##_func_suffix##V2(ret, 1, val2); \ | |||
| return ret; \ | |||
| } \ | |||
| void operator()( \ | |||
| const _simd_type& src0, const _simd_type& src1, \ | |||
| @@ -21,7 +21,8 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> { | |||
| } | |||
| void operator()(const GI_INT32_t& vsrc, dt_qint8* dst) const { | |||
| GiStoreLane0Int32( | |||
| reinterpret_cast<int32_t*>(dst), (GI_INT32_t)(operator()(vsrc))); | |||
| reinterpret_cast<int32_t*>(dst), | |||
| GiReinterpretInt8AsInt32(operator()(vsrc))); | |||
| } | |||
| void operator()(const src_ctype& src, dst_ctype* dst) const { | |||
| *dst = operator()(src); | |||
| @@ -32,17 +33,25 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> { | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_V2_t& vsrc) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[0]), this->vscale); | |||
| auto vitem1 = GiMultiplyFloat32(GiCastToFloat32(vsrc.val[1]), this->vscale); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 0)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| auto vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiGetSubVectorInt32V2(vsrc, 1)), | |||
| GiFixLenType2GiFloat32Type(this->vscale)); | |||
| GI_FLOAT32_V2_t tmp; | |||
| GiSetSubVectorFloat32V2(tmp, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(tmp, 1, vitem1); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(tmp); | |||
| } | |||
| GI_INT8_t operator()(const GI_INT32_t& src) const { | |||
| auto vitem0 = GiMultiplyFloat32(GiCastToFloat32(src), this->vscale); | |||
| auto vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(src), GiFixLenType2GiFloat32Type(this->vscale)); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0); | |||
| } | |||
| GI_INT8_t operator()(const GI_FLOAT32_t& src) const { | |||
| auto vitem0 = GiMultiplyFloat32(src, this->vscale); | |||
| auto vitem0 = GiMultiplyFloat32(src, GiFixLenType2GiFloat32Type(this->vscale)); | |||
| return QConverter::convert<GI_INT8_t, GI_FLOAT32_t>(vitem0); | |||
| } | |||
| }; | |||
| @@ -96,6 +96,82 @@ cb(dt_float32, float, GI_FLOAT32_t, Float32); | |||
| cb(dt_int32, int32_t, GI_INT32_t, Int32); | |||
| #undef cb | |||
| ///////////////////////////////// ParamElemVisitor v2 ////////////////////////// | |||
| template <typename ctype> | |||
| struct ParamElemVisitorV2; | |||
| //! visit a single element and duplicate it across the whole vector | |||
| template <typename ctype> | |||
| struct ParamElemVisitorDupV2; | |||
| template <typename ctype> | |||
| struct ParamElemVisitorBcast101x4V2; | |||
| #define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \ | |||
| template <> \ | |||
| struct ParamElemVisitorV2<_ctype> { \ | |||
| _simd_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \ | |||
| _simd_type_v2 ret; \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 0, GiLoad##_fun_suffix(src)); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 1, GiLoad##_fun_suffix(src_1)); \ | |||
| return ret; \ | |||
| } \ | |||
| }; \ | |||
| template <> \ | |||
| struct ParamElemVisitorDupV2<_ctype> { \ | |||
| _simd_type_v2 operator()(const _ctype* src) const { \ | |||
| _simd_type_v2 ret; \ | |||
| _simd_type tmp = GiBroadcast##_fun_suffix( \ | |||
| *reinterpret_cast<const _inner_ctype*>(src)); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ | |||
| return ret; \ | |||
| } \ | |||
| } | |||
| cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); | |||
| cb(dt_qint8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t); | |||
| cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t); | |||
| cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); | |||
| cb(dt_int8, int8_t, GI_INT8_t, Int8, GI_INT8_V2_t); | |||
| #undef cb | |||
| template <typename ctype> | |||
| struct ParamElemVisitorBcast101x4V2; | |||
| #define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, rel_suffix, _simd_type_v2) \ | |||
| template <> \ | |||
| struct ParamElemVisitorBcast101x4V2<_ctype> { \ | |||
| _simd_type_v2 operator()(const _ctype* src) const { \ | |||
| _simd_type_v2 ret; \ | |||
| _simd_type tmp = \ | |||
| GiReinter##rel_suffix##To##_fun_suffix(GiBroadcast##rel_suffix( \ | |||
| *reinterpret_cast<const _inner_ctype*>(src))); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ | |||
| return ret; \ | |||
| } \ | |||
| } | |||
| cb(dt_qint8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t); | |||
| cb(dt_int8, int32_t, GI_INT8_t, Int8, Int32, GI_INT8_V2_t); | |||
| #undef cb | |||
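For the 8-bit types, these visitors lean on NCHW44-style packing: a channel block of four int8 values occupies one 32-bit lane, so it can be loaded once and duplicated across the whole vector with a dup plus reinterpret. A standalone NEON sketch of the idea (assumes <arm_neon.h>; the GI calls above plausibly lower to this on the NEON backend):

#include <arm_neon.h>
#include <cstring>

// Broadcast a 4-byte channel block across an int8 vector, as the
// Bcast101x4 visitors do for dt_int8 / dt_qint8: load the four packed
// int8 values as one int32 and duplicate that lane.
static inline int8x16_t bcast101x4_s8(const int8_t* src) {
    int32_t block;
    std::memcpy(&block, src, sizeof(block));           // 4 packed int8 lanes
    return vreinterpretq_s8_s32(vdupq_n_s32(block));   // repeat the block x4
}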
| #define cb(_ctype, _inner_ctype, _simd_type, _fun_suffix, _simd_type_v2) \ | |||
| template <> \ | |||
| struct ParamElemVisitorBcast101x4V2<_ctype> { \ | |||
| _simd_type_v2 operator()(const _ctype* src) const { \ | |||
| _simd_type_v2 ret; \ | |||
| _simd_type tmp = GiLoad##_fun_suffix(src); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 0, tmp); \ | |||
| GiSetSubVector##_fun_suffix##V2(ret, 1, tmp); \ | |||
| return ret; \ | |||
| } \ | |||
| } | |||
| cb(dt_qint32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); | |||
| cb(dt_float32, float, GI_FLOAT32_t, Float32, GI_FLOAT32_V2_t); | |||
| cb(dt_int32, int32_t, GI_INT32_t, Int32, GI_INT32_V2_t); | |||
| #undef cb | |||
| ///////////////////////////////// OpCaller ///////////////////////////// | |||
| template <typename Op, BcastType bcast_type> | |||
| struct OpCallerUnary; | |||
| @@ -106,10 +182,10 @@ struct OpCallerUnary<Op, VEC> { | |||
| const typename Op::src_ctype* src, typename Op::dst_ctype* dst, | |||
| DType src_dtype, DType dst_dtype, size_t nr_elems) { | |||
| Op op(src_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis(src), vis(src + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis(src, src + Op::SIMD_WIDTH), dst); | |||
| src += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
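Every caller below shares this traversal: a main loop that consumes 2 * SIMD_WIDTH elements per iteration through the V2 visitors, presumably followed by a scalar tail for the remainder (not shown in these hunks). A generic sketch of the shape with placeholder types:

#include <cstddef>

// Shape of the OpCaller loops (placeholder types): vectorized main loop
// over 2 * SIMD_WIDTH elements per step, scalar loop for the tail.
template <size_t SIMD_WIDTH, typename T, typename VecOp, typename ScalarOp>
void caller_shape(const T* src, T* dst, size_t nr_elems, VecOp vop, ScalarOp sop) {
    size_t i = 0;
    for (; i + SIMD_WIDTH * 2 <= nr_elems; i += SIMD_WIDTH * 2) {
        vop(src, dst);  // one call covers two SIMD vectors
        src += SIMD_WIDTH * 2;
        dst += SIMD_WIDTH * 2;
    }
    for (; i < nr_elems; i++) {
        *dst++ = sop(*src++);  // scalar tail
    }
}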
| @@ -364,12 +440,12 @@ struct OpCallerBinary<Op, VEC_VEC> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), | |||
| dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -394,17 +470,16 @@ struct OpCallerBinary<Op, VEC_BCAST101> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis1; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| const typename Op::src_ctype* src1_ptr = src1; | |||
| for (size_t c = 0; c < channel; c++) { | |||
| size_t i = 0; | |||
| auto src1_simd = vis1(src1_ptr); | |||
| auto src1_simd_v2 = vis1(src1_ptr); | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{src1_simd, src1_simd}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -430,7 +505,7 @@ struct OpCallerBinary<Op, VEC_BCASTX0X> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| const typename Op::src_ctype* src1_ptr_base = src1 + b * channel_stride; | |||
| for (size_t c = 0; c < channel; c++) { | |||
| @@ -438,11 +513,9 @@ struct OpCallerBinary<Op, VEC_BCASTX0X> { | |||
| auto src1_ptr = src1_ptr_base; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| auto src0_simd0 = vis(src0); | |||
| auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH); | |||
| auto src1_simd0 = vis(src1_ptr); | |||
| auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH); | |||
| op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); | |||
| auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH); | |||
| auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH); | |||
| op(src0_simd01, src1_simd01, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1_ptr += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -469,19 +542,17 @@ struct OpCallerBinary<Op, VEC_BCAST111C> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| for (size_t c = 0; c < channel; c++) { | |||
| size_t rest = channel_stride; | |||
| const typename Op::src_ctype* src1_ptr = src1; | |||
| while (rest >= Op::SIMD_WIDTH * 2) { | |||
| auto src0_simd0 = vis(src0); | |||
| auto src0_simd1 = vis(src0 + Op::SIMD_WIDTH); | |||
| auto src1_simd0 = vis(src1_ptr); | |||
| auto src1_simd1 = vis(src1_ptr + Op::SIMD_WIDTH); | |||
| auto src0_simd01 = vis(src0, src0 + Op::SIMD_WIDTH); | |||
| auto src1_simd01 = vis(src1_ptr, src1_ptr + Op::SIMD_WIDTH); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1_ptr += Op::SIMD_WIDTH * 2; | |||
| op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); | |||
| op(src0_simd01, src1_simd01, dst); | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| rest -= Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -508,19 +579,17 @@ struct OpCallerBinary<Op, BCAST111C_VEC> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| for (size_t c = 0; c < channel; c++) { | |||
| size_t rest = channel_stride; | |||
| const typename Op::src_ctype* src0_ptr = src0; | |||
| while (rest >= Op::SIMD_WIDTH * 2) { | |||
| auto src0_simd0 = vis(src0_ptr); | |||
| auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd0 = vis(src1); | |||
| auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); | |||
| auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); | |||
| src0_ptr += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); | |||
| op(src0_simd01, src1_simd01, dst); | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| rest -= Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -599,13 +668,12 @@ struct OpCallerBinaryBcast101xDVec { | |||
| auto src0_ptr = src0; | |||
| for (size_t cb = 0; cb < nr_channel_blocks; cb++) { | |||
| auto src0_block_ptr = src0_ptr + cb * channel_block_dim; | |||
| auto channel_block_vec = vis0(src0_block_ptr); | |||
| auto channel_block_vec_v2 = vis0(src0_block_ptr); | |||
| size_t img_index = 0; | |||
| auto src1_offset = Op::SIMD_WIDTH / channel_block_dim; | |||
| for (; img_index + 2 * src1_offset <= channel_stride; | |||
| img_index += 2 * src1_offset) { | |||
| op({{channel_block_vec, channel_block_vec}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); | |||
| op(channel_block_vec_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -629,8 +697,8 @@ struct OpCallerBinaryBcast101xXVec<src_ctype, 4> { | |||
| const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst, | |||
| const Op& op, size_t batch, size_t nr_channel_blocks, | |||
| size_t channel_stride) { | |||
| ParamElemVisitorBcast101x4<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorBcast101x4V2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| OpCallerBinaryBcast101xDVec<src_ctype, 4>::run( | |||
| src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks, | |||
| channel_stride); | |||
| @@ -717,13 +785,12 @@ struct OpCallerBinaryVecBcast101xD { | |||
| auto src1_ptr = src1; | |||
| for (size_t cb = 0; cb < nr_channel_blocks; cb++) { | |||
| auto src1_block_ptr = src1_ptr + cb * channel_block_dim; | |||
| auto channel_block_vec = vis1(src1_block_ptr); | |||
| auto channel_block_vec_v2 = vis1(src1_block_ptr); | |||
| size_t img_index = 0; | |||
| auto src0_offset = Op::SIMD_WIDTH / channel_block_dim; | |||
| for (; img_index + 2 * src0_offset <= channel_stride; | |||
| img_index += 2 * src0_offset) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{channel_block_vec, channel_block_vec}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -747,8 +814,8 @@ struct OpCallerBinaryVecBcast101xX<src_ctype, 4> { | |||
| const src_ctype* src0, const src_ctype* src1, typename Op::dst_ctype* dst, | |||
| const Op& op, size_t batch, size_t nr_channel_blocks, | |||
| size_t channel_stride) { | |||
| ParamElemVisitor<src_ctype> vis0; | |||
| ParamElemVisitorBcast101x4<src_ctype> vis1; | |||
| ParamElemVisitorV2<src_ctype> vis0; | |||
| ParamElemVisitorBcast101x4V2<src_ctype> vis1; | |||
| OpCallerBinaryVecBcast101xD<src_ctype, 4>::run( | |||
| src0, src1, dst, op, vis0, vis1, batch, nr_channel_blocks, | |||
| channel_stride); | |||
| @@ -783,13 +850,12 @@ struct OpCallerBinary<Op, VEC_SCALAR> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis1; | |||
| auto vis1_simd = vis1(&src1); | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis1; | |||
| auto vis1_simd_v2 = vis1(&src1); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, | |||
| dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -813,13 +879,12 @@ struct OpCallerBinary<Op, SCALAR_VEC> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| auto vis0_simd = vis0(&src0); | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| auto vis0_simd_v2 = vis0(&src0); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0_simd, vis0_simd}}, {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, | |||
| dst); | |||
| op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -842,17 +907,16 @@ struct OpCallerBinary<Op, BCAST101_VEC> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| auto src0_ptr = src0; | |||
| for (size_t c = 0; c < channel; c++) { | |||
| auto vis0_simd = vis0(src0_ptr); | |||
| auto vis0_simd_v2 = vis0(src0_ptr); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0_simd, vis0_simd}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), dst); | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -878,7 +942,7 @@ struct OpCallerBinary<Op, BCASTX0X_VEC> { | |||
| typename Op::dst_ctype* dst, DType src0_dtype, DType src1_dtype, | |||
| DType dst_dtype, size_t batch, size_t channel, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| for (size_t b = 0; b < batch; b++) { | |||
| auto src0_ptr_base = src0 + b * channel_stride; | |||
| for (size_t c = 0; c < channel; c++) { | |||
| @@ -886,11 +950,9 @@ struct OpCallerBinary<Op, BCASTX0X_VEC> { | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| auto src0_simd0 = vis(src0_ptr); | |||
| auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd0 = vis(src1); | |||
| auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); | |||
| op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, dst); | |||
| auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); | |||
| op(src0_simd01, src1_simd01, dst); | |||
| src0_ptr += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -921,14 +983,13 @@ struct OpCallerTernary<Op, VEC_VEC_VEC> { | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, | |||
| size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitor<typename Op::src_ctype> vis2; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis2; | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, | |||
| {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), | |||
| vis2(src2, src2 + Op::SIMD_WIDTH), dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| src2 += Op::SIMD_WIDTH * 2; | |||
| @@ -957,15 +1018,14 @@ struct OpCallerTernary<Op, VEC_VEC_SCALAR> { | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, | |||
| size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis2; | |||
| auto vis2_simd = vis2(&src2); | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis2; | |||
| auto vis2_simd_v2 = vis2(&src2); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, {{vis2_simd, vis2_simd}}, | |||
| dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1(src1, src1 + Op::SIMD_WIDTH), | |||
| vis2_simd_v2, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -993,22 +1053,21 @@ struct OpCallerTernary<Op, BCAST101_VEC_BCAST101> { | |||
| size_t batch_size, size_t channel_size, size_t channel_stride, | |||
| size_t batch_offset) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis2; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis2; | |||
| for (size_t batch = 0; batch < batch_size; batch++) { | |||
| auto src0_ptr = src0; | |||
| auto src2_ptr = src2; | |||
| auto b_offset = batch_offset; | |||
| for (size_t channel = 0; channel < channel_size; channel++) { | |||
| size_t i = 0; | |||
| auto src0_simd = vis0(src0_ptr); | |||
| auto src2_simd = vis2(src2_ptr); | |||
| auto src0_simd_v2 = vis0(src0_ptr); | |||
| auto src2_simd_v2 = vis2(src2_ptr); | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| op({{src0_simd, src0_simd}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, | |||
| {{src2_simd, src2_simd}}, dst); | |||
| op(src0_simd_v2, vis1(src1, src1 + Op::SIMD_WIDTH), src2_simd_v2, | |||
| dst); | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| b_offset -= Op::SIMD_WIDTH * 2; | |||
| @@ -1042,7 +1101,7 @@ struct OpCallerTernary<Op, BCAST111C_VEC_BCAST111C> { | |||
| DType src2_dtype, DType dst_dtype, size_t batch_size, size_t channel_size, | |||
| size_t channel_stride, size_t batch_offset) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis; | |||
| for (size_t batch = 0; batch < batch_size; batch++) { | |||
| auto b_offset = batch_offset; | |||
| for (size_t channel = 0; channel < channel_size; channel++) { | |||
| @@ -1051,14 +1110,10 @@ struct OpCallerTernary<Op, BCAST111C_VEC_BCAST111C> { | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| auto src0_simd0 = vis(src0_ptr); | |||
| auto src0_simd1 = vis(src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd0 = vis(src1); | |||
| auto src1_simd1 = vis(src1 + Op::SIMD_WIDTH); | |||
| auto src2_simd0 = vis(src2_ptr); | |||
| auto src2_simd1 = vis(src2_ptr + Op::SIMD_WIDTH); | |||
| op({{src0_simd0, src0_simd1}}, {{src1_simd0, src1_simd1}}, | |||
| {{src2_simd0, src2_simd1}}, dst); | |||
| auto src0_simd01 = vis(src0_ptr, src0_ptr + Op::SIMD_WIDTH); | |||
| auto src1_simd01 = vis(src1, src1 + Op::SIMD_WIDTH); | |||
| auto src2_simd01 = vis(src2_ptr, src2_ptr + Op::SIMD_WIDTH); | |||
| op(src0_simd01, src1_simd01, src2_simd01, dst); | |||
| src0_ptr += Op::SIMD_WIDTH * 2; | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| src2_ptr += Op::SIMD_WIDTH * 2; | |||
| @@ -1125,15 +1180,14 @@ struct OpCallerTernaryBcast101xDVecBcast101xD { | |||
| for (size_t cb = 0; cb < nr_channel_blocks; cb++) { | |||
| auto src0_block_ptr = src0_ptr + cb * channel_block_dim; | |||
| auto src2_block_ptr = src2_ptr + cb * channel_block_dim; | |||
| auto channel_block_vec0 = vis0(src0_block_ptr); | |||
| auto channel_block_vec2 = vis2(src2_block_ptr); | |||
| auto channel_block_vec0_v2 = vis0(src0_block_ptr); | |||
| auto channel_block_vec2_v2 = vis2(src2_block_ptr); | |||
| size_t img_index = 0; | |||
| auto src1_offset = Op::SIMD_WIDTH / channel_block_dim; | |||
| for (; img_index + 2 * src1_offset <= channel_stride; | |||
| img_index += 2 * src1_offset) { | |||
| op({{channel_block_vec0, channel_block_vec0}}, | |||
| {{vis1(src1), vis1(src1 + Op::SIMD_WIDTH)}}, | |||
| {{channel_block_vec2, channel_block_vec2}}, dst); | |||
| op(channel_block_vec0_v2, vis1(src1, src1 + Op::SIMD_WIDTH), | |||
| channel_block_vec2_v2, dst); | |||
| src1 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
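A note on the bookkeeping in OpCallerTernaryBcast101xDVecBcast101xD: img_index counts pixels while src1 and dst advance in scalars, and with channel_block_dim scalars per pixel a SIMD_WIDTH-lane vector covers SIMD_WIDTH / channel_block_dim pixels. A worked-number sketch:

    #include <cstddef>
    // One 2x-unrolled iteration consumes 2 * (W / D) pixels of img_index
    // but 2 * W scalars of src1/dst.
    constexpr size_t pixels_per_iter(size_t simd_width, size_t block_dim) {
        return 2 * (simd_width / block_dim);
    }
    static_assert(pixels_per_iter(16, 4) == 8,
                  "int8 NCHW44: one unrolled iteration covers 8 pixels");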
| @@ -1159,9 +1213,9 @@ struct OpCallerTernaryBcast101xXVecBcast101xX<src_ctype, 4> { | |||
| const src_ctype* src0, const src_ctype* src1, const src_ctype* src2, | |||
| typename Op::dst_ctype* dst, const Op& op, size_t batch, | |||
| size_t nr_channel_blocks, size_t channel_stride) { | |||
| ParamElemVisitorBcast101x4<src_ctype> vis0; | |||
| ParamElemVisitor<src_ctype> vis1; | |||
| ParamElemVisitorBcast101x4<src_ctype> vis2; | |||
| ParamElemVisitorBcast101x4V2<src_ctype> vis0; | |||
| ParamElemVisitorV2<src_ctype> vis1; | |||
| ParamElemVisitorBcast101x4V2<src_ctype> vis2; | |||
| OpCallerTernaryBcast101xDVecBcast101xD<src_ctype, 4>::run( | |||
| src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks, | |||
| channel_stride); | |||
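In the x4 specialization above, the broadcast has block granularity: one "channel value" is a block of 4 scalars (an NCHW44-style layout), and the Bcast101x4 visitor repeats that block across the whole vector. A scalar sketch of the int8 case; the function name is illustrative:

    #include <cstdint>
    #include <cstring>
    inline void bcast101x4_ref(const int8_t* block, int8_t out[16]) {
        for (int lane = 0; lane < 16; lane += 4)
            std::memcpy(out + lane, block, 4);  // repeat the 4-channel block
    }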
| @@ -1201,19 +1255,18 @@ struct OpCallerTernary<Op, VEC_BCAST101_VEC> { | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, | |||
| size_t batch_size, size_t channel_size, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis1; | |||
| ParamElemVisitor<typename Op::src_ctype> vis2; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis2; | |||
| for (size_t batch = 0; batch < batch_size; batch++) { | |||
| auto src1_ptr = src1; | |||
| for (size_t channel = 0; channel < channel_size; channel++) { | |||
| size_t i = 0; | |||
| auto src1_simd = vis1(src1_ptr); | |||
| auto src1_simd_v2 = vis1(src1_ptr); | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{src1_simd, src1_simd}}, | |||
| {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), src1_simd_v2, | |||
| vis2(src2, src2 + Op::SIMD_WIDTH), dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src2 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -1244,18 +1297,18 @@ struct OpCallerTernary<Op, VEC_BCAST111C_VEC> { | |||
| DType src1_dtype, DType src2_dtype, DType dst_dtype, size_t batch_size, | |||
| size_t channel_size, size_t channel_stride) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitor<typename Op::src_ctype> vis1; | |||
| ParamElemVisitor<typename Op::src_ctype> vis2; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis2; | |||
| for (size_t batch = 0; batch < batch_size; batch++) { | |||
| for (size_t channel = 0; channel < channel_size; channel++) { | |||
| auto src1_ptr = src1; | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= channel_stride; | |||
| i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{vis1(src1_ptr), vis1(src1_ptr + Op::SIMD_WIDTH)}}, | |||
| {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), | |||
| vis1(src1_ptr, src1_ptr + Op::SIMD_WIDTH), | |||
| vis2(src2, src2 + Op::SIMD_WIDTH), dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src1_ptr += Op::SIMD_WIDTH * 2; | |||
| src2 += Op::SIMD_WIDTH * 2; | |||
| @@ -1316,14 +1369,13 @@ struct OpCallerTernaryVecBcast101xDVec { | |||
| auto src1_ptr = src1; | |||
| for (size_t cb = 0; cb < nr_channel_blocks; cb++) { | |||
| auto src1_block_ptr = src1_ptr + cb * channel_block_dim; | |||
| auto channel_block_vec = vis1(src1_block_ptr); | |||
| auto channel_block_vec_v2 = vis1(src1_block_ptr); | |||
| size_t img_index = 0; | |||
| auto offset = Op::SIMD_WIDTH / channel_block_dim; | |||
| for (; img_index + 2 * offset <= channel_stride; | |||
| img_index += 2 * offset) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, | |||
| {{channel_block_vec, channel_block_vec}}, | |||
| {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), channel_block_vec_v2, | |||
| vis2(src2, src2 + Op::SIMD_WIDTH), dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src2 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -1349,9 +1401,9 @@ struct OpCallerTernaryVecBcast101xXVec<src_ctype, 4> { | |||
| const src_ctype* src0, const src_ctype* src1, const src_ctype* src2, | |||
| typename Op::dst_ctype* dst, const Op& op, size_t batch, | |||
| size_t nr_channel_blocks, size_t channel_stride) { | |||
| ParamElemVisitor<src_ctype> vis0; | |||
| ParamElemVisitorBcast101x4<src_ctype> vis1; | |||
| ParamElemVisitor<src_ctype> vis2; | |||
| ParamElemVisitorV2<src_ctype> vis0; | |||
| ParamElemVisitorBcast101x4V2<src_ctype> vis1; | |||
| ParamElemVisitorV2<src_ctype> vis2; | |||
| OpCallerTernaryVecBcast101xDVec<src_ctype, 4>::run( | |||
| src0, src1, src2, dst, op, vis0, vis1, vis2, batch, nr_channel_blocks, | |||
| channel_stride); | |||
| @@ -1392,14 +1444,14 @@ struct OpCallerTernary<Op, VEC_SCALAR_VEC> { | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, | |||
| size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis1; | |||
| ParamElemVisitor<typename Op::src_ctype> vis2; | |||
| auto vis1_simd = vis1(&src1); | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis2; | |||
| auto vis1_simd_v2 = vis1(&src1); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, | |||
| {{vis2(src2), vis2(src2 + Op::SIMD_WIDTH)}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, | |||
| vis2(src2, src2 + Op::SIMD_WIDTH), dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| src2 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| @@ -1426,15 +1478,14 @@ struct OpCallerTernary<Op, VEC_SCALAR_SCALAR> { | |||
| DType src0_dtype, DType src1_dtype, DType src2_dtype, DType dst_dtype, | |||
| size_t nr_elems) { | |||
| Op op(src0_dtype, src1_dtype, src2_dtype, dst_dtype); | |||
| ParamElemVisitor<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDup<typename Op::src_ctype> vis2; | |||
| auto vis1_simd = vis1(&src1); | |||
| auto vis2_simd = vis2(&src2); | |||
| ParamElemVisitorV2<typename Op::src_ctype> vis0; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis1; | |||
| ParamElemVisitorDupV2<typename Op::src_ctype> vis2; | |||
| auto vis1_simd_v2 = vis1(&src1); | |||
| auto vis2_simd_v2 = vis2(&src2); | |||
| size_t i = 0; | |||
| for (; i + Op::SIMD_WIDTH * 2 <= nr_elems; i += Op::SIMD_WIDTH * 2) { | |||
| op({{vis0(src0), vis0(src0 + Op::SIMD_WIDTH)}}, {{vis1_simd, vis1_simd}}, | |||
| {{vis2_simd, vis2_simd}}, dst); | |||
| op(vis0(src0, src0 + Op::SIMD_WIDTH), vis1_simd_v2, vis2_simd_v2, dst); | |||
| src0 += Op::SIMD_WIDTH * 2; | |||
| dst += Op::SIMD_WIDTH * 2; | |||
| } | |||
| @@ -11,8 +11,9 @@ struct LoadHelper { | |||
| static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); | |||
| }; | |||
| #define WEIGHT_CB(step) \ | |||
| src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); | |||
| #define WEIGHT_CB(step) \ | |||
| src[step] = GiFloat32Type2FixLenType( \ | |||
| Func::impl(ptr + base_offset + step * ptr_step, args...)); | |||
| #define LOAD_HELPER(step) \ | |||
| template < \ | |||
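The WEIGHT_CB change follows the same fix-length discipline seen throughout this diff: cached values live in a plain array, and since GI_FLOAT32_t may map to a register type that is not directly storable on length-agnostic backends (an assumption about the GI fallback, not stated in this diff), persistent state uses the fixed-length mirror type and converts at the boundary. An illustrative sketch, assuming the GI intrinsic header is in scope; weight, load_weights, and weight_at are hypothetical names:

    constexpr int kVecLen = GI_SIMD_LEN_BYTE / sizeof(float);
    GI_FLOAT32_FIXLEN_t weight[8];                        // storable array
    void load_weights(const float* ptr) {
        for (int step = 0; step < 8; ++step)
            weight[step] = GiFloat32Type2FixLenType(      // register -> storable
                    GiLoadFloat32(ptr + step * kVecLen));
    }
    GI_FLOAT32_t weight_at(int step) {
        return GiFixLenType2GiFloat32Type(weight[step]);  // storable -> register
    }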
| @@ -38,7 +38,13 @@ template <> | |||
| inline GI_FLOAT32_V2_t QConverter::convert(const GI_INT16_t& vsrc) { | |||
| GI_INT32_t vhi = GiMoveHighLongInt16(vsrc); | |||
| GI_INT32_t vlo = GiMoveLowLongInt16(vsrc); | |||
| return {{GiCastToFloat32(vlo), GiCastToFloat32(vhi)}}; | |||
| GI_FLOAT32_t fhi = GiCastToFloat32(vhi); | |||
| GI_FLOAT32_t flo = GiCastToFloat32(vlo); | |||
| GI_FLOAT32_V2_t ret; | |||
| GiSetSubVectorFloat32V2(ret, 0, flo); | |||
| GiSetSubVectorFloat32V2(ret, 1, fhi); | |||
| return ret; | |||
| } | |||
| template <> | |||
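The replacement for the {{vlo, vhi}} brace initialization generalizes to a small helper (sketch, illustrative name): going through GiSetSubVectorFloat32V2 avoids assuming GI_FLOAT32_V2_t is an aggregate that supports brace initialization.

    inline GI_FLOAT32_V2_t make_float32_v2(GI_FLOAT32_t lo, GI_FLOAT32_t hi) {
        GI_FLOAT32_V2_t ret;
        GiSetSubVectorFloat32V2(ret, 0, lo);
        GiSetSubVectorFloat32V2(ret, 1, hi);
        return ret;
    }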
| @@ -36,14 +36,14 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
| using ctype = int8_t; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
| GI_INT32_t res[4]; | |||
| GI_INT32_FIXLEN_t res[4]; | |||
| int32_t remain; | |||
| int32_t cnt; | |||
| float coef; | |||
| GI_FLOAT32_t vcoef; | |||
| GI_FLOAT32_FIXLEN_t vcoef; | |||
| MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | |||
| memset(res, 0, sizeof(res)); | |||
| vcoef = GiBroadcastFloat32(coef); | |||
| vcoef = GiFloat32Type2FixLenType(GiBroadcastFloat32(coef)); | |||
| } | |||
| MeanReducer() = default; | |||
| void feed(const int8_t* val) { | |||
| @@ -56,19 +56,27 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
| const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); | |||
| const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); | |||
| res[0] = GiAddInt32(res[0], vval_low_low); | |||
| res[1] = GiAddInt32(res[1], vval_low_high); | |||
| res[2] = GiAddInt32(res[2], vval_high_low); | |||
| res[3] = GiAddInt32(res[3], vval_high_high); | |||
| res[0] = GiInt32Type2FixLenType( | |||
| GiAddInt32(GiFixLenType2GiInt32Type(res[0]), vval_low_low)); | |||
| res[1] = GiInt32Type2FixLenType( | |||
| GiAddInt32(GiFixLenType2GiInt32Type(res[1]), vval_low_high)); | |||
| res[2] = GiInt32Type2FixLenType( | |||
| GiAddInt32(GiFixLenType2GiInt32Type(res[2]), vval_high_low)); | |||
| res[3] = GiInt32Type2FixLenType( | |||
| GiAddInt32(GiFixLenType2GiInt32Type(res[3]), vval_high_high)); | |||
| } | |||
| void feed_remain(const int8_t* val) { remain += *val; } | |||
| void post(int8_t* dst) { | |||
| for (int i = 0; i < 4; i += 2) { | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||
| GiStoreLowInt8( | |||
| dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>( | |||
| {{vitem0, vitem1}}))); | |||
| auto tmp = GiFixLenType2GiFloat32Type(vcoef); | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiFixLenType2GiInt32Type(res[i])), tmp); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiFixLenType2GiInt32Type(res[i + 1])), tmp); | |||
| GI_FLOAT32_V2_t ret; | |||
| GiSetSubVectorFloat32V2(ret, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(ret, 1, vitem1); | |||
| GiStoreLowInt8(dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(ret))); | |||
| dst += 8; | |||
| } | |||
| } | |||
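Putting the int8 MeanReducer together, a scalar reference of what feed() and post() compute (a sketch; it assumes the QConverter conversion above rounds to nearest and saturates):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    inline int8_t mean_s8_ref(const int8_t* val, size_t cnt) {
        int32_t acc = 0;  // res[4] plays this role, spread over four lanes
        for (size_t i = 0; i < cnt; ++i)
            acc += val[i];
        long r = std::lround(acc * (1.0f / cnt));  // scale by coef on post()
        return static_cast<int8_t>(std::min(127L, std::max(-128L, r)));
    }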
| @@ -83,17 +91,20 @@ struct MeanReducer<dt_float32, float, float, true> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32_t res; | |||
| GI_FLOAT32_FIXLEN_t res; | |||
| float result; | |||
| float coef; | |||
| MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | |||
| res = GiBroadcastFloat32(0.0f); | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); | |||
| } | |||
| MeanReducer() = default; | |||
| void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } | |||
| void feed(const float* val) { | |||
| res = GiFloat32Type2FixLenType( | |||
| GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res))); | |||
| } | |||
| void feed_remain(const float* val) { result += *val; } | |||
| void post(float* dst) { | |||
| result += GiReduceAddFloat32(res); | |||
| result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res)); | |||
| *dst = result * coef; | |||
| } | |||
| }; | |||
| @@ -103,18 +114,22 @@ struct MeanReducer<dt_float32, float, float, false> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32_t res; | |||
| GI_FLOAT32_FIXLEN_t res; | |||
| float remain; | |||
| float coef; | |||
| MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | |||
| res = GiBroadcastFloat32(0.0f); | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); | |||
| } | |||
| MeanReducer() = default; | |||
| void feed(const float* val) { res = GiAddFloat32(GiLoadFloat32(val), res); } | |||
| void feed(const float* val) { | |||
| res = GiFloat32Type2FixLenType( | |||
| GiAddFloat32(GiLoadFloat32(val), GiFixLenType2GiFloat32Type(res))); | |||
| } | |||
| void feed_remain(const float* val) { remain += *val; } | |||
| void post(float* dst) { | |||
| res = GiMultiplyScalerFloat32(res, coef); | |||
| GiStoreFloat32(dst, res); | |||
| res = GiFloat32Type2FixLenType( | |||
| GiMultiplyScalerFloat32(GiFixLenType2GiFloat32Type(res), coef)); | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); | |||
| } | |||
| void post_remain(float* dst) { *dst = remain * coef; } | |||
| }; | |||
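The feed()/post() edits in these float reducers are mechanical: unwrap the fixed-length accumulator, apply the vector op, wrap it again. A sketch of the wrapper this amounts to (update_f32 is an illustrative name); on a fixed-width backend the two conversions should compile away.

    template <typename F>
    inline void update_f32(GI_FLOAT32_FIXLEN_t& acc, F body) {
        acc = GiFloat32Type2FixLenType(body(GiFixLenType2GiFloat32Type(acc)));
    }
    // feed() above is then:
    //   update_f32(res, [&](GI_FLOAT32_t r) {
    //       return GiAddFloat32(GiLoadFloat32(val), r); });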
| @@ -125,23 +140,29 @@ struct maxReducer; | |||
| template <typename dtype, typename ctype, typename comp_type, bool C1> | |||
| struct minReducer; | |||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_t res; \ | |||
| _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| auto vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| auto vval = GiBroadcastFloat32(*val); \ | |||
| res = Gi##_Mode##NanFloat32(vval, res); \ | |||
| } \ | |||
| void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ | |||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| auto vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| auto vval = GiBroadcastFloat32(*val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##NanFloat32(vval, GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| *dst = GiReduce##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| } | |||
| REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | |||
| @@ -151,28 +172,31 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| #define Max_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); | |||
| #define Min_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); | |||
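Read as plain code, these two macros make NaN win every comparison, so a NaN anywhere in the input propagates to the scalar remainder, matching the intent of the GiMaxNanFloat32 / GiMinNanFloat32 vector path. Equivalent function form (sketch):

    #include <cmath>
    inline float max_nan(float a, float b) {
        return (std::isnan(a) || a > b) ? a : b;  // NaN propagates
    }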
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| remain = _Mode##_NAN(*val, remain); \ | |||
| } \ | |||
| void post(float* dst) { GiStoreFloat32(dst, res); } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##NanFloat32(GiFixLenType2GiFloat32Type(res), vval)); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| remain = _Mode##_NAN(*val, remain); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| } | |||
| REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | |||
| @@ -181,51 +205,58 @@ REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| #undef Max_NAN | |||
| #undef Min_NAN | |||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8_t res; \ | |||
| _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = Gi##_Mode##imumInt8(vval, res); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiBroadcastInt8(*val); \ | |||
| res = Gi##_Mode##imumInt8(res, vval); \ | |||
| } \ | |||
| void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | |||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8_FIXLEN_t res; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = GiInt8Type2FixLenType( \ | |||
| Gi##_Mode##imumInt8(vval, GiFixLenType2GiInt8Type(res))); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiBroadcastInt8(*val); \ | |||
| res = GiInt8Type2FixLenType( \ | |||
| Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \ | |||
| } \ | |||
| void post(int8_t* dst) { \ | |||
| *dst = GiReduce##_Mode##Int8(GiFixLenType2GiInt8Type(res)); \ | |||
| } \ | |||
| } | |||
| REDUCER_MAX_MIN_C1(max, Max, -128); | |||
| REDUCER_MAX_MIN_C1(min, Min, 127); | |||
| #undef REDUCER_MAX_MIN_C1 | |||
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8_t res; \ | |||
| int8_t remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastInt8(_init); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = Gi##_Mode##imumInt8(res, vval); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| using namespace std; \ | |||
| remain = _mode(*val, remain); \ | |||
| } \ | |||
| void post(int8_t* dst) { GiStoreInt8(dst, res); } \ | |||
| void post_remain(int8_t* dst) { *dst = remain; } \ | |||
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8_FIXLEN_t res; \ | |||
| int8_t remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiInt8Type2FixLenType(GiBroadcastInt8(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = GiInt8Type2FixLenType( \ | |||
| Gi##_Mode##imumInt8(GiFixLenType2GiInt8Type(res), vval)); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| using namespace std; \ | |||
| remain = _mode(*val, remain); \ | |||
| } \ | |||
| void post(int8_t* dst) { GiStoreInt8(dst, GiFixLenType2GiInt8Type(res)); } \ | |||
| void post_remain(int8_t* dst) { *dst = remain; } \ | |||
| } | |||
| REDUCER_MAX_MIN_C(max, Max, -128); | |||
| @@ -238,61 +269,67 @@ struct SumReducer; | |||
| template <typename dtype, typename ctype, typename comp_type, bool C1> | |||
| struct ProductReducer; | |||
| #define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##Float32(vval, res); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, *val); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| *dst = op(remain, GiReduce##_Mode##Float32(res)); \ | |||
| } \ | |||
| #define REDUCER_SUM_PRODUCT_C1(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, *val); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| *dst = \ | |||
| op(remain, \ | |||
| GiReduce##_Mode##Float32(GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| } | |||
| REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f); | |||
| REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||
| #undef REDUCER_SUM_PRODUCT_C1 | |||
| #define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##Float32(vval, res); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, (*val)); \ | |||
| } \ | |||
| void post(float* dst) { GiStoreFloat32(dst, res); } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| #define REDUCER_SUM_PRODUCT_C(_mode, _Mode, _op, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32_FIXLEN_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(_init)); \ | |||
| remain = _init; \ | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = GiFloat32Type2FixLenType( \ | |||
| Gi##_Mode##Float32(vval, GiFixLenType2GiFloat32Type(res))); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| auto op = _op<float>(); \ | |||
| remain = op(remain, (*val)); \ | |||
| } \ | |||
| void post(float* dst) { \ | |||
| GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); \ | |||
| } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| } | |||
| REDUCER_SUM_PRODUCT_C(Sum, Add, plus, 0.0f); | |||
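For reference, the _op macro parameter names a standard functor: REDUCER_SUM_PRODUCT_C1(Sum, Add, plus, 0.0f) folds the remainder with std::plus<float>, and the Product variant with std::multiplies<float>. A sketch of the expanded remainder path:

    #include <functional>
    inline float fold_remain_sum(float remain, float val) {
        auto op = std::plus<float>();  // what _op<float>() expands to
        return op(remain, val);
    }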
| @@ -308,23 +345,24 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32_t res; | |||
| GI_FLOAT32_FIXLEN_t res; | |||
| float result; | |||
| SumSqrReducer(DType, size_t cnt) : result(0.0f) { | |||
| MEGDNN_MARK_USED_VAR(cnt); | |||
| res = GiBroadcastFloat32(0.0f); | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); | |||
| } | |||
| SumSqrReducer() = default; | |||
| void feed(const float* val) { | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | |||
| res = GiFloat32Type2FixLenType(GiAddFloat32( | |||
| GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res))); | |||
| } | |||
| void feed_remain(const float* val) { | |||
| float vval = *val; | |||
| result += vval * vval; | |||
| } | |||
| void post(float* dst) { | |||
| result += GiReduceAddFloat32(res); | |||
| result += GiReduceAddFloat32(GiFixLenType2GiFloat32Type(res)); | |||
| *dst = result; | |||
| } | |||
| }; | |||
| @@ -333,19 +371,20 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32_t res; | |||
| GI_FLOAT32_FIXLEN_t res; | |||
| float remain; | |||
| SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | |||
| MEGDNN_MARK_USED_VAR(cnt); | |||
| res = GiBroadcastFloat32(0.0f); | |||
| res = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.0f)); | |||
| } | |||
| SumSqrReducer() = default; | |||
| void feed(const float* val) { | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | |||
| res = GiFloat32Type2FixLenType(GiAddFloat32( | |||
| GiMultiplyFloat32(vval, vval), GiFixLenType2GiFloat32Type(res))); | |||
| } | |||
| void feed_remain(const float* val) { remain += (*val) * (*val); } | |||
| void post(float* dst) { GiStoreFloat32(dst, res); } | |||
| void post(float* dst) { GiStoreFloat32(dst, GiFixLenType2GiFloat32Type(res)); } | |||
| void post_remain(float* dst) { *dst = remain; } | |||
| }; | |||
| /**************************************do reduce*************************/ | |||
| @@ -18,22 +18,26 @@ struct QuantizedTypeCvter<int32_t, int8_t> { | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t) * 2; | |||
| static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(int32_t); | |||
| float scale; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| float src_scale = src_dtype.param<dtype::QuantizedS32>().scale; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale; | |||
| scale = src_scale / dst_scale; | |||
| vscale = GiBroadcastFloat32(scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); | |||
| } | |||
| void cvt(const int32_t* src, int8_t* dst) { | |||
| GI_FLOAT32_t vitem0 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32( | |||
| GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), vscale); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t); | |||
| GI_FLOAT32_t vitem1 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src + SIMD_STEP)), t); | |||
| GI_FLOAT32_V2_t v2; | |||
| GiSetSubVectorFloat32V2(v2, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(v2, 1, vitem1); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2); | |||
| GiStoreLowInt8(dst, vres); | |||
| } | |||
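The scale member captures the whole requantization in one factor: dequantize with src_scale, requantize with dst_scale. A scalar sketch of the int32-to-int8 path (it assumes the saturating round-to-nearest that QConverter::convert is expected to perform):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    inline int8_t requant_s32_to_s8(int32_t src, float src_scale, float dst_scale) {
        float scale = src_scale / dst_scale;  // hoisted into vscale above
        long r = std::lround(src * scale);
        return static_cast<int8_t>(std::min(127L, std::max(-128L, r)));
    }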
| @@ -48,27 +52,29 @@ struct QuantizedTypeCvter<int8_t, int32_t> { | |||
| using dst_type = int32_t; | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
| float scale; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| float src_scale = src_dtype.param<dtype::QuantizedS8>().scale; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale; | |||
| scale = src_scale / dst_scale; | |||
| vscale = GiBroadcastFloat32(scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); | |||
| } | |||
| void cvt(const int8_t* src, int32_t* dst) { | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_INT8_t data = GiLoadInt8(src); | |||
| GI_INT16_t vitem0 = GiMoveLowLongInt8(data); | |||
| GI_INT16_t vitem1 = GiMoveHighLongInt8(data); | |||
| auto vret0 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>( | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale)); | |||
| auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32( | |||
| GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale)); | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t)); | |||
| auto vret1 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>( | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t)); | |||
| auto vret2 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>( | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale)); | |||
| auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(GiMultiplyFloat32( | |||
| GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale)); | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t)); | |||
| auto vret3 = QConverter::round<GI_INT32_t, GI_FLOAT32_t>( | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t)); | |||
| constexpr size_t step = GI_SIMD_LEN_BYTE / sizeof(int32_t); | |||
| GiStoreInt32(dst, vret0); | |||
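On NEON, the GiMoveLowLongInt8 / GiMoveHighLongInt8 and GiMoveLowLongInt16 / GiMoveHighLongInt16 calls above presumably map to the standard widening ladder: 16 s8 lanes split into two s16 vectors, each split again into s32 quads. A sketch in plain intrinsics:

    #include <arm_neon.h>
    inline void widen_s8x16(int8x16_t v, int32x4_t out[4]) {
        int16x8_t lo = vmovl_s8(vget_low_s8(v));    // low-half widen, s8 -> s16
        int16x8_t hi = vmovl_s8(vget_high_s8(v));   // high-half widen
        out[0] = vmovl_s16(vget_low_s16(lo));       // s16 -> s32 quads
        out[1] = vmovl_s16(vget_high_s16(lo));
        out[2] = vmovl_s16(vget_low_s16(hi));
        out[3] = vmovl_s16(vget_high_s16(hi));
    }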
| @@ -90,21 +96,26 @@ struct QuantizedTypeCvter<float, int8_t> { | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float) * 2; | |||
| static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| float scale; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| MEGDNN_MARK_USED_VAR(src_dtype); | |||
| float src_scale = 1; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale; | |||
| scale = src_scale / dst_scale; | |||
| vscale = GiBroadcastFloat32(scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); | |||
| } | |||
| void cvt(const float* src, int8_t* dst) { | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), vscale); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), vscale); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>({{vitem0, vitem1}}); | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiLoadFloat32(src), t); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiLoadFloat32(src + SIMD_STEP), t); | |||
| GI_FLOAT32_V2_t v2; | |||
| GiSetSubVectorFloat32V2(v2, 0, vitem0); | |||
| GiSetSubVectorFloat32V2(v2, 1, vitem1); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>(v2); | |||
| GiStoreLowInt8(dst, vres); | |||
| } | |||
| @@ -119,18 +130,19 @@ struct QuantizedTypeCvter<int32_t, int32_t> { | |||
| using dst_type = int32_t; | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int32_t); | |||
| float scale; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| float src_scale = src_dtype.param<dtype::QuantizedS32>().scale; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS32>().scale; | |||
| scale = src_scale / dst_scale; | |||
| vscale = GiBroadcastFloat32(scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); | |||
| } | |||
| void cvt(const int32_t* src, int32_t* dst) { | |||
| GI_FLOAT32_t vitem = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), vscale); | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_FLOAT32_t vitem = GiMultiplyFloat32(GiCastToFloat32(GiLoadInt32(src)), t); | |||
| auto vres = QConverter::round<GI_INT32_t, GI_FLOAT32_t>(vitem); | |||
| GiStoreInt32(dst, vres); | |||
| @@ -148,30 +160,32 @@ struct QuantizedTypeCvter<int8_t, int8_t> { | |||
| using dst_type = int8_t; | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
| float scale; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| QuantizedTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| float src_scale = src_dtype.param<dtype::QuantizedS8>().scale; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale; | |||
| scale = src_scale / dst_scale; | |||
| vscale = GiBroadcastFloat32(scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(scale)); | |||
| } | |||
| void cvt(const int8_t* src, int8_t* dst) { | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_INT8_t data = GiLoadInt8(src); | |||
| GI_INT16_t vitem0 = GiMoveLowLongInt8(data); | |||
| GI_INT16_t vitem1 = GiMoveHighLongInt8(data); | |||
| auto vret0 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale); | |||
| auto vret1 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale); | |||
| auto vret2 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale); | |||
| auto vret3 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V4_t>( | |||
| {{vret0, vret1, vret2, vret3}}); | |||
| auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t); | |||
| auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t); | |||
| auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t); | |||
| auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t); | |||
| GI_FLOAT32_V4_t v4; | |||
| GiSetSubVectorFloat32V4(v4, 0, vret0); | |||
| GiSetSubVectorFloat32V4(v4, 1, vret1); | |||
| GiSetSubVectorFloat32V4(v4, 2, vret2); | |||
| GiSetSubVectorFloat32V4(v4, 3, vret3); | |||
| auto vres = QConverter::convert<GI_INT8_t, GI_FLOAT32_V4_t>(v4); | |||
| GiStoreInt8(dst, vres); | |||
| } | |||
| @@ -245,26 +259,24 @@ struct Quan2FloatTypeCvter<int8_t, float> { | |||
| static constexpr size_t SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
| static constexpr size_t SIMD_STEP = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| float _scale = 0.0f; | |||
| GI_FLOAT32_t vscale; | |||
| GI_FLOAT32_FIXLEN_t vscale; | |||
| Quan2FloatTypeCvter(DType src_dtype, DType dst_dtype) { | |||
| _scale = src_dtype.param<dtype::QuantizedS8>().scale; | |||
| vscale = GiBroadcastFloat32(_scale); | |||
| vscale = GiFloat32Type2FixLenType(GiBroadcastFloat32(_scale)); | |||
| MEGDNN_MARK_USED_VAR(dst_dtype); | |||
| } | |||
| void cvt(const int8_t* src, float* dst) { | |||
| GI_FLOAT32_t t; | |||
| t = GiFixLenType2GiFloat32Type(vscale); | |||
| GI_INT8_t data = GiLoadInt8(src); | |||
| GI_INT16_t vitem0 = GiMoveLowLongInt8(data); | |||
| GI_INT16_t vitem1 = GiMoveHighLongInt8(data); | |||
| auto vret0 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), vscale); | |||
| auto vret1 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), vscale); | |||
| auto vret2 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), vscale); | |||
| auto vret3 = | |||
| GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), vscale); | |||
| auto vret0 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem0)), t); | |||
| auto vret1 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem0)), t); | |||
| auto vret2 = GiMultiplyFloat32(GiCastToFloat32(GiMoveLowLongInt16(vitem1)), t); | |||
| auto vret3 = GiMultiplyFloat32(GiCastToFloat32(GiMoveHighLongInt16(vitem1)), t); | |||
| GiStoreFloat32(dst, vret0); | |||
| GiStoreFloat32(dst + SIMD_STEP, vret1); | |||