feat(dnn/opr): add megdnn fake quant opr

GitOrigin-RevId: 5a04b6da2f
5 years ago · c03249c059
--- a/dnn/include/megdnn/oprs/nn.h
+++ b/dnn/include/megdnn/oprs/nn.h
@@ -60,7 +60,7 @@ struct PreprocessedFilter {
    TensorNDArray tensors;
 };

 }  // namespace intl
 }  // namespace detail

 /**
 * \brief base class for convolution operation
@@ -1562,6 +1562,58 @@ protected:
 };
 using BatchConvBias = BatchConvBiasForward;

 //! Common base of the fake-quantization operators.
 //!
 //! Holds the FakeQuant param (integer bounds qmin/qmax) and the layout
 //! helpers that FakeQuantForward relies on.
 class FakeQuantBase : public OperatorBase {
    DEF_OPR_IMPL_CTOR(FakeQuantBase, OperatorBase);
    DEF_OPR_PARAM(FakeQuant);

 protected:
    //! Set \p output to a layout constructed from \p input and its dtype.
    void deduce_layout_fwd(const TensorLayout& input, TensorLayout& output);
    //! Assert that input, scale and zero_point are Float32 and that
    //! \p output equals the layout deduced from \p input.
    void check_layout_fwd(const TensorLayout& input, const TensorLayout& scale,
                          const TensorLayout& zero_point,
                          const TensorLayout& output);
 };

 //! Forward fake quantization: three inputs (input, scale, zero_point) and
 //! one output.
 //!
 //! The backends added together with this class compute, per element,
 //!     x = clamp(round(input / scale) + zero_point, qmin, qmax)
 //!     output = (x - zero_point) * scale
 //! with scale and zero_point broadcast to the shape of input.
 class FakeQuantForward : public FakeQuantBase {
    DEF_OPR_IMPL(FakeQuantForward, FakeQuantBase, 3, 1);

 public:
    //! Run the operator; \p workspace must provide at least
    //! get_workspace_in_bytes() bytes.
    virtual void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
                      _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
                      _megdnn_workspace workspace) = 0;
    //! Deduce \p output from \p input; scale and zero_point are not consulted.
    void deduce_layout(const TensorLayout& input, const TensorLayout& scale,
                       const TensorLayout& zero_point, TensorLayout& output);
    //! Number of workspace bytes exec() needs for the given layouts.
    virtual size_t get_workspace_in_bytes(const TensorLayout& input,
                                          const TensorLayout& scale,
                                          const TensorLayout& zero_point,
                                          const TensorLayout& output) = 0;

 protected:
    //! Validate layouts/dtypes and that \p workspace_in_bytes is sufficient.
    void check_exec(const TensorLayout& input, const TensorLayout& scale,
                    const TensorLayout& zero_point, const TensorLayout& output,
                    size_t workspace_in_bytes);
 };

 //! FakeQuant without suffix refers to the forward operator.
 using FakeQuant = FakeQuantForward;

 //! Backward of fake quantization: four inputs (diff, input, scale,
 //! zero_point) and one output (grad).
 //!
 //! The backends added together with this class pass diff through where
 //! round(input / scale) + zero_point lies inside [qmin, qmax] and write
 //! zero elsewhere.
 class FakeQuantBackward : public FakeQuantBase {
    DEF_OPR_IMPL(FakeQuantBackward, FakeQuantBase, 4, 1);

 public:
    //! Run the operator; \p workspace must provide at least
    //! get_workspace_in_bytes() bytes.
    virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
                      _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
                      _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0;
    //! Number of workspace bytes exec() needs for the given layouts.
    virtual size_t get_workspace_in_bytes(const TensorLayout& diff,
                                          const TensorLayout& input,
                                          const TensorLayout& scale,
                                          const TensorLayout& zero_point,
                                          const TensorLayout& grad) = 0;

 protected:
    //! Validate that diff and grad share the shape of input and that
    //! \p workspace_in_bytes is sufficient.
    void check_exec(const TensorLayout& diff, const TensorLayout& input,
                    const TensorLayout& scale, const TensorLayout& zero_point,
                    const TensorLayout& grad, size_t workspace_in_bytes);
 };

 }  // namespace megdnn
 #include "megdnn/internal/opr_header_epilogue.h"

--- a/dnn/scripts/opr_param_defs.py
+++ b/dnn/scripts/opr_param_defs.py
@@ -943,5 +943,9 @@ when the ``I`` suffix is present.
 add_enum_alias('Format', 'ConvolutionV0').
 add_enum_alias('ComputeMode', 'Convolution', name_field="compute_mode")
 )
 # FakeQuant: qmin/qmax are the bounds the quantized value is clamped to by
 # the FakeQuant kernels; the defaults are the int32 minimum and maximum.
 (pdef('FakeQuant').
 add_fields('int32','qmin','-2147483648').
 add_fields('int32','qmax','2147483647')
 )


--- a/dnn/src/common/fake_quant.cpp
+++ b/dnn/src/common/fake_quant.cpp
@@ -0,0 +1,66 @@
 /**
 * \file dnn/src/common/fake_quant.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "megdnn/oprs.h"
 #include "src/common/utils.h"

 namespace megdnn {

 //! Build \p output from \p input and the dtype of \p input.
 //! NOTE(review): presumably this selects the (shape, dtype) constructor and
 //! therefore yields a contiguous layout -- confirm against TensorLayout.
 void FakeQuantBase::deduce_layout_fwd(const TensorLayout& input,
                                       TensorLayout& output) {
    output = TensorLayout(input, input.dtype);
 }

 //! Validate the forward operands: all three inputs must be Float32 and
 //! \p output must equal the layout deduced from \p input.
 void FakeQuantBase::check_layout_fwd(const TensorLayout& input,
                                      const TensorLayout& scale,
                                      const TensorLayout& zero_point,
                                      const TensorLayout& output) {
    // only Float32 is dispatched by the kernels added with this operator
    megdnn_assert(input.dtype == dtype::Float32());
    megdnn_assert(scale.dtype == dtype::Float32());
    megdnn_assert(zero_point.dtype == dtype::Float32());
    // compare the caller-provided output against what we would deduce
    TensorLayout expected;
    deduce_layout_fwd(input, expected);
    megdnn_assert_eq_layout(expected, output);
 }

 //! Public layout deduction: the output depends on \p input only; the scale
 //! and zero_point layouts are accepted for interface symmetry and ignored.
 void FakeQuantForward::deduce_layout(const TensorLayout& input,
                                      const TensorLayout& /*scale*/,
                                      const TensorLayout& /*zero_point*/,
                                      TensorLayout& output) {
    deduce_layout_fwd(input, output);
 }

 //! Pre-exec validation for the forward operator: checks dtypes/layouts via
 //! check_layout_fwd() and that the caller supplied at least as much
 //! workspace as the backend reports it needs.
 void FakeQuantForward::check_exec(const TensorLayout& input,
                                   const TensorLayout& scale,
                                   const TensorLayout& zero_point,
                                   const TensorLayout& output,
                                   size_t workspace_in_bytes) {
    check_layout_fwd(input, scale, zero_point, output);
    auto required_workspace_space =
            get_workspace_in_bytes(input, scale, zero_point, output);
    megdnn_assert(workspace_in_bytes >= required_workspace_space);
 }

 void FakeQuantBackward::check_exec(const TensorLayout& diff,
                                   const TensorLayout& input,
                                   const TensorLayout& scale,
                                   const TensorLayout& zero_point,
                                   const TensorLayout& grad,
                                   size_t workspace_in_bytes) {
    megdnn_assert_eq_shape(input, diff);
    megdnn_assert_eq_shape(input, grad);
    auto required_worspace_space =
            get_workspace_in_bytes(diff, input, scale, zero_point, grad);
    megdnn_assert(workspace_in_bytes >= required_worspace_space);
 }

 }  // namespace megdnn
--- a/dnn/src/common/handle_impl.h
+++ b/dnn/src/common/handle_impl.h
@@ -201,7 +201,9 @@ private:
    cb(RemapBackwardMat) \
    cb(AdaptivePoolingForward) \
    cb(AdaptivePoolingBackward) \
    cb(DctChannelSelectForward) 
    cb(DctChannelSelectForward) \
    cb(FakeQuantForward) \
    cb(FakeQuantBackward)

 /*!
 * \brief specialize HandleImpl::create_operator for a single opr type;
--- a/dnn/src/cuda/elemwise_helper.cuh
+++ b/dnn/src/cuda/elemwise_helper.cuh
@@ -13,9 +13,9 @@
 #pragma once

 #include "src/common/elemwise_helper.cuh"
 #include "src/cuda/utils.cuh"
 #include "src/cuda/int_fastdiv.cuh"
 #include "src/cuda/query_blocksize.cuh"
 #include "src/cuda/utils.cuh"

 /*
 * please note that all arithmetics on GPU are 32-bit for best performance; this
@@ -649,6 +649,102 @@ struct OpCallerUniform<Op, 3, PVis> {
    }
 };

 //! specialization for arity == 4
 //!
 //! Holds the operator and four parameter visitors of the same type and
 //! forwards one value from each visitor to the operator per element.
 template <class Op, class PVis>
 struct OpCallerUniform<Op, 4, PVis> {
    Op op;
    PVis par[4];
    static const uint32_t packed_size = PVis::packed_size;

    //! position every visitor at the first element of packed group \p idx
    devfunc void thread_init(uint32_t idx) {
        idx = idx * packed_size;
        par[0].thread_init(idx);
        par[1].thread_init(idx);
        par[2].thread_init(idx);
        par[3].thread_init(idx);
    }

    //! process a full packed group
    devfunc void on(uint32_t idx) {
        idx = idx * packed_size;
        op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), par[3].at(idx));
    }

    //! process a packed group of which only \p remain elements are valid
    devfunc void on(uint32_t idx, uint32_t remain) {
        idx = idx * packed_size;
        if (remain >= packed_size) {
            op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
               par[3].at(idx));
        } else {
            // partial group: visit the leftover elements one by one through
            // the raw pointers and per-element offsets of each visitor
            auto ptr0 = par[0].ptr();
            auto ptr1 = par[1].ptr();
            auto ptr2 = par[2].ptr();
            auto ptr3 = par[3].ptr();
            // unsigned index: avoids a signed/unsigned comparison with remain
            for (uint32_t i = 0; i < remain; ++i) {
                op(idx + i, ptr0[par[0].offset(idx + i)],
                   ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)],
                   ptr3[par[3].offset(idx + i)]);
            }
        }
    }

    //! advance every visitor to its next position
    devfunc void next() {
        par[0].next();
        par[1].next();
        par[2].next();
        par[3].next();
    }
 };

 //! specialization for arity == 5
 //!
 //! Holds the operator and five parameter visitors of the same type and
 //! forwards one value from each visitor to the operator per element.
 template <class Op, class PVis>
 struct OpCallerUniform<Op, 5, PVis> {
    Op op;
    PVis par[5];
    static const uint32_t packed_size = PVis::packed_size;

    //! position every visitor at the first element of packed group \p idx
    devfunc void thread_init(uint32_t idx) {
        idx = idx * packed_size;
        par[0].thread_init(idx);
        par[1].thread_init(idx);
        par[2].thread_init(idx);
        par[3].thread_init(idx);
        par[4].thread_init(idx);
    }

    //! process a full packed group
    devfunc void on(uint32_t idx) {
        idx = idx * packed_size;
        op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), par[3].at(idx),
           par[4].at(idx));
    }

    //! process a packed group of which only \p remain elements are valid
    devfunc void on(uint32_t idx, uint32_t remain) {
        idx = idx * packed_size;
        if (remain >= packed_size) {
            op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
               par[3].at(idx), par[4].at(idx));
        } else {
            // partial group: visit the leftover elements one by one through
            // the raw pointers and per-element offsets of each visitor
            auto ptr0 = par[0].ptr();
            auto ptr1 = par[1].ptr();
            auto ptr2 = par[2].ptr();
            auto ptr3 = par[3].ptr();
            auto ptr4 = par[4].ptr();
            // unsigned index: avoids a signed/unsigned comparison with remain
            for (uint32_t i = 0; i < remain; ++i) {
                op(idx + i, ptr0[par[0].offset(idx + i)],
                   ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)],
                   ptr3[par[3].offset(idx + i)], ptr4[par[4].offset(idx + i)]);
            }
        }
    }

    //! advance every visitor to its next position
    devfunc void next() {
        par[0].next();
        par[1].next();
        par[2].next();
        par[3].next();
        par[4].next();
    }
 };

 /*!
 * \brief call binary (i.e. arity == 2) operator with different param
 *      visitors
--- a/dnn/src/cuda/fake_quant/kern.cu
+++ b/dnn/src/cuda/fake_quant/kern.cu
@@ -0,0 +1,30 @@
 /**
 * \file dnn/src/cuda/fake_quant/kern.cu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "./kern.cuh"

 namespace megdnn {
 namespace cuda {

 // Instantiation list for the FakeQuant kernel functors. The arities match
 // the ElemwiseOpParamN<> sizes used by the CUDA opr_impl: 2 for the
 // contiguous forward/backward functors, 4 and 5 for the non-contiguous ones.
 // NOTE(review): INST_RUN_ELEMWISE is declared outside this file; presumably
 // it explicitly instantiates run_elemwise -- confirm in elemwise_helper.cuh.
 #define cb(_dtype)                                                            \
    INST_RUN_ELEMWISE(FakeQuantKernOp<DTypeTrait<_dtype>::ctype>,             \
                      DTypeTrait<_dtype>::ctype, 2);                          \
    INST_RUN_ELEMWISE(FakeQuantBwdKernOp<DTypeTrait<_dtype>::ctype>,          \
                      DTypeTrait<_dtype>::ctype, 2);                          \
    INST_RUN_ELEMWISE(FakeQuantKernOpNonContig<DTypeTrait<_dtype>::ctype>,    \
                      DTypeTrait<_dtype>::ctype, 4);                          \
    INST_RUN_ELEMWISE(FakeQuantBwdKernOpNonContig<DTypeTrait<_dtype>::ctype>, \
                      DTypeTrait<_dtype>::ctype, 5);
 cb(megdnn::dtype::Float32)
 // keep the short helper name from leaking past its single use
 #undef cb

 }  // namespace cuda
 }  // namespace megdnn
--- a/dnn/src/cuda/fake_quant/kern.cuh
+++ b/dnn/src/cuda/fake_quant/kern.cuh
@@ -0,0 +1,106 @@
 /**
 * \file dnn/src/cuda/fake_quant/kern.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once

 #include "src/cuda/elemwise_helper.cuh"
 #include "src/cuda/utils.cuh"

 #if MEGDNN_CC_HOST
 #include "megdnn/oprs.h"
 #endif

 namespace megdnn {
 namespace cuda {

 //! Forward functor for the contiguous case.
 //!
 //! input/output are addressed directly with the linear element index, while
 //! scale and zero_point arrive as per-element values from the caller.
 template <typename ctype>
 struct FakeQuantKernOp {
    ctype* input;
    ctype* output;
    ctype qmin;
    ctype qmax;

    __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) {
        // quantize, clamp into [qmin, qmax], then map back to the real domain
        const ctype quantized = round(input[idx] / scale) + zero_point;
        const ctype clamped = fmaxf(fminf(quantized, qmax), qmin);
        output[idx] = (clamped - zero_point) * scale;
    }

 #if MEGDNN_CC_HOST
    //! host-side setup: grab the raw pointers and convert the integer bounds
    FakeQuantKernOp(const TensorND& input, const TensorND& output,
                    const FakeQuant::Param& param)
            : input{input.ptr<ctype>()},
              output{output.ptr<ctype>()},
              qmin(param.qmin),
              qmax(param.qmax) {}
 #endif
 };

 //! Backward functor for the contiguous case.
 //!
 //! diff/input/grad are addressed directly with the linear element index,
 //! while scale and zero_point arrive as per-element values from the caller.
 template <typename ctype>
 struct FakeQuantBwdKernOp {
    ctype* diff;
    ctype* input;
    ctype* grad;
    ctype qmin, qmax;

    __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) {
        ctype x = round(input[idx] / scale) + zero_point;
        // pass the gradient through only where the quantized value was not
        // clamped; ctype(0) keeps the conditional in ctype instead of
        // promoting it to double via a 0.0 literal
        grad[idx] = x <= qmax && x >= qmin ? diff[idx] : ctype(0);
    }

 #if MEGDNN_CC_HOST
    //! host-side setup: grab the raw pointers and convert the integer bounds
    FakeQuantBwdKernOp(const TensorND& diff, const TensorND& input,
                       const TensorND& grad, const FakeQuant::Param& param)
            : diff{diff.ptr<ctype>()},
              input{input.ptr<ctype>()},
              grad{grad.ptr<ctype>()},
              qmin(param.qmin),
              qmax(param.qmax) {}
 #endif
 };

 //! Forward functor for the non-contiguous case: every operand, including the
 //! output reference, is supplied per element by the caller, so the element
 //! index is unused.
 template <typename ctype>
 struct FakeQuantKernOpNonContig {
    ctype qmin;
    ctype qmax;

    __device__ void operator()(uint32_t, ctype& output, ctype input,
                               ctype scale, ctype zero_point) {
        // quantize, clamp into [qmin, qmax], then map back to the real domain
        const ctype quantized = round(input / scale) + zero_point;
        const ctype clamped = fmaxf(fminf(quantized, qmax), qmin);
        output = (clamped - zero_point) * scale;
    }

 #if MEGDNN_CC_HOST
    //! host-side setup: convert the integer bounds to ctype
    FakeQuantKernOpNonContig(const FakeQuant::Param& param)
            : qmin(param.qmin), qmax(param.qmax) {}
 #endif
 };

 //! Backward functor for the non-contiguous case: every operand, including
 //! the grad reference, is supplied per element by the caller, so the element
 //! index is unused.
 template <typename ctype>
 struct FakeQuantBwdKernOpNonContig {
    ctype qmin;
    ctype qmax;

    __device__ void operator()(uint32_t, ctype& grad, ctype diff, ctype input,
                               ctype scale, ctype zero_point) {
        ctype x = round(input / scale) + zero_point;
        // pass the gradient through only where the quantized value was not
        // clamped; ctype(0) keeps the conditional in ctype instead of
        // promoting it to double via a 0.0 literal
        grad = x <= qmax && x >= qmin ? diff : ctype(0);
    }

 #if MEGDNN_CC_HOST
    //! host-side setup: convert the integer bounds to ctype
    FakeQuantBwdKernOpNonContig(const FakeQuant::Param& param)
            : qmin(param.qmin), qmax(param.qmax) {}
 #endif
 };

 }  // namespace cuda
 }  // namespace megdnn
--- a/dnn/src/cuda/fake_quant/opr_impl.cpp
+++ b/dnn/src/cuda/fake_quant/opr_impl.cpp
@@ -0,0 +1,134 @@
 /**
 * \file dnn/src/cuda/fake_quant/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "./opr_impl.h"
 #include "./kern.cuh"
 #include "src/common/utils.h"
 namespace megdnn {
 namespace cuda {

 //! CUDA forward entry point.
 //!
 //! Contiguous input/output take the 2-ary path, where FakeQuantKernOp indexes
 //! the input/output buffers itself and only scale/zero_point (broadcast to
 //! the input layout) go through the elemwise param pack. Anything else is
 //! delegated to exec_noncontig().
 void FakeQuantForwardImpl::exec(_megdnn_tensor_in input,
                                 _megdnn_tensor_in scale,
                                 _megdnn_tensor_in zero_point,
                                 _megdnn_tensor_out output,
                                 _megdnn_workspace workspace) {
    check_exec(input.layout, scale.layout, zero_point.layout, output.layout,
               workspace.size);

    const bool all_contiguous =
            input.layout.is_contiguous() && output.layout.is_contiguous();
    if (!all_contiguous) {
        exec_noncontig(input, scale, zero_point, output);
        return;
    }

    ElemwiseOpParamN<2> bcast_param;
    bcast_param[0] = scale;
    bcast_param[0].layout = bcast_param[0].layout.broadcast(input.layout);
    bcast_param[1] = zero_point;
    bcast_param[1].layout = bcast_param[1].layout.broadcast(input.layout);
    bcast_param.init_from_given_tensor();
    auto stream = cuda_stream(handle());

 #define cb(DType)                                                         \
    if (input.layout.dtype == DType()) {                                  \
        using T = typename DTypeTrait<DType>::ctype;                      \
        run_elemwise<FakeQuantKernOp<T>, T, 2>(bcast_param, stream,       \
                                               {input, output, m_param}); \
        return;                                                           \
    }
    cb(megdnn::dtype::Float32)
 #undef cb
 }

 //! CUDA forward path for non-contiguous input/output.
 //!
 //! All four tensors are placed in the elemwise param pack (output first, as
 //! FakeQuantKernOpNonContig expects), with scale and zero_point broadcast to
 //! the input layout.
 void FakeQuantForwardImpl::exec_noncontig(_megdnn_tensor_in input,
                                           _megdnn_tensor_in scale,
                                           _megdnn_tensor_in zero_point,
                                           _megdnn_tensor_out output) {
    ElemwiseOpParamN<4> visit_param;
    visit_param[0] = output;
    visit_param[1] = input;
    visit_param[2] = scale;
    visit_param[2].layout = visit_param[2].layout.broadcast(input.layout);
    visit_param[3] = zero_point;
    visit_param[3].layout = visit_param[3].layout.broadcast(input.layout);
    visit_param.init_from_given_tensor();
    auto stream = cuda_stream(handle());

 #define cb(DType)                                                            \
    if (input.layout.dtype == DType()) {                                     \
        using T = typename DTypeTrait<DType>::ctype;                         \
        run_elemwise<FakeQuantKernOpNonContig<T>, T, 4>(visit_param, stream, \
                                                        {m_param});          \
        return;                                                              \
    }
    cb(megdnn::dtype::Float32)
 #undef cb
 }

 //! CUDA backward entry point.
 //!
 //! Contiguous diff/input/grad take the 2-ary path, where FakeQuantBwdKernOp
 //! indexes those buffers itself and only scale/zero_point (broadcast to the
 //! input layout) go through the elemwise param pack. Anything else is
 //! delegated to exec_noncontig().
 void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff,
                                  _megdnn_tensor_in input,
                                  _megdnn_tensor_in scale,
                                  _megdnn_tensor_in zero_point,
                                  _megdnn_tensor_out grad,
                                  _megdnn_workspace workspace) {
    check_exec(diff.layout, input.layout, scale.layout, zero_point.layout,
               grad.layout, workspace.size);

    if (!input.layout.is_contiguous() || !diff.layout.is_contiguous() ||
        !grad.layout.is_contiguous()) {
        return exec_noncontig(diff, input, scale, zero_point, grad);
    }
    ElemwiseOpParamN<2> ele_param;
    ele_param[0] = scale;
    ele_param[0].layout = ele_param[0].layout.broadcast(input.layout);
    ele_param[1] = zero_point;
    ele_param[1].layout = ele_param[1].layout.broadcast(input.layout);
    ele_param.init_from_given_tensor();
    auto stream = cuda_stream(handle());
    // param() is handed to the functor directly; a local named m_param would
    // shadow the inherited member of the same name
 #define cb(DType)                                                 \
    if (grad.layout.dtype == DType()) {                           \
        using T = typename DTypeTrait<DType>::ctype;              \
        run_elemwise<FakeQuantBwdKernOp<T>, T, 2>(                \
                ele_param, stream, {diff, input, grad, param()}); \
        return;                                                   \
    }
    cb(megdnn::dtype::Float32)
 #undef cb
 }

 //! CUDA backward path for non-contiguous diff/input/grad.
 //!
 //! All five tensors are placed in the elemwise param pack (grad first, as
 //! FakeQuantBwdKernOpNonContig expects), with scale and zero_point broadcast
 //! to the input layout.
 void FakeQuantBackwardImpl::exec_noncontig(_megdnn_tensor_in diff,
                                            _megdnn_tensor_in input,
                                            _megdnn_tensor_in scale,
                                            _megdnn_tensor_in zero_point,
                                            _megdnn_tensor_out grad) {
    ElemwiseOpParamN<5> ele_param;
    ele_param[0] = grad;
    ele_param[1] = diff;
    ele_param[2] = input;
    ele_param[3] = scale;
    ele_param[3].layout = ele_param[3].layout.broadcast(input.layout);
    ele_param[4] = zero_point;
    ele_param[4].layout = ele_param[4].layout.broadcast(input.layout);
    ele_param.init_from_given_tensor();
    auto stream = cuda_stream(handle());
    // param() is handed to the functor directly; a local named m_param would
    // shadow the inherited member of the same name
 #define cb(DType)                                                             \
    if (grad.layout.dtype == DType()) {                                       \
        using T = typename DTypeTrait<DType>::ctype;                          \
        run_elemwise<FakeQuantBwdKernOpNonContig<T>, T, 5>(ele_param, stream, \
                                                           {param()});        \
        return;                                                               \
    }
    cb(megdnn::dtype::Float32)
 #undef cb
 }

 }  // namespace cuda
 }  // namespace megdnn
--- a/dnn/src/cuda/fake_quant/opr_impl.h
+++ b/dnn/src/cuda/fake_quant/opr_impl.h
@@ -0,0 +1,55 @@
 /**
 * \file dnn/src/cuda/fake_quant/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once
 #include "megdnn/oprs.h"
 #include "src/cuda/utils.h"
 namespace megdnn {
 namespace cuda {
 //! CUDA implementation of FakeQuantForward.
 class FakeQuantForwardImpl : public FakeQuantForward {
 public:
    using FakeQuantForward::FakeQuantForward;

    void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
              _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
              _megdnn_workspace workspace) override;

    //! this implementation never needs workspace
    size_t get_workspace_in_bytes(const TensorLayout& /*input*/,
                                  const TensorLayout& /*scale*/,
                                  const TensorLayout& /*zero_point*/,
                                  const TensorLayout& /*output*/) override {
        return 0;
    }

 private:
    //! path taken by exec() when input or output is not contiguous
    void exec_noncontig(_megdnn_tensor_in input, _megdnn_tensor_in scale,
                        _megdnn_tensor_in zero_point,
                        _megdnn_tensor_out output);
 };

 //! CUDA implementation of FakeQuantBackward.
 class FakeQuantBackwardImpl : public FakeQuantBackward {
 public:
    using FakeQuantBackward::FakeQuantBackward;

    void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
              _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
              _megdnn_tensor_out grad, _megdnn_workspace workspace) override;

    //! this implementation never needs workspace
    size_t get_workspace_in_bytes(const TensorLayout& /*diff*/,
                                  const TensorLayout& /*input*/,
                                  const TensorLayout& /*scale*/,
                                  const TensorLayout& /*zero_point*/,
                                  const TensorLayout& /*grad*/) override {
        return 0;
    }

 private:
    //! path taken by exec() when diff, input or grad is not contiguous
    void exec_noncontig(_megdnn_tensor_in diff, _megdnn_tensor_in input,
                        _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
                        _megdnn_tensor_out grad);
 };

 }  // namespace cuda
 }  // namespace megdnn
--- a/dnn/src/cuda/handle_create.cpp
+++ b/dnn/src/cuda/handle_create.cpp
@@ -77,6 +77,7 @@
 #include "src/cuda/roi_align/opr_impl.h"
 #include "src/cuda/batch_conv_bias/opr_impl.h"
 #include "src/cuda/remap/opr_impl.h"
 #include "src/cuda/fake_quant/opr_impl.h"

 namespace megdnn {
 namespace cuda {
--- a/dnn/src/naive/fake_quant/opr_impl.cpp
+++ b/dnn/src/naive/fake_quant/opr_impl.cpp
@@ -0,0 +1,118 @@
 /**
 * \file dnn/src/naive/fake_quant/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/naive/fake_quant/opr_impl.h"
 #include <cmath>
 #include <iostream>
 #include "megdnn/tensor_iter.h"
 #include "src/common/elemwise_helper.cuh"
 #include "src/common/utils.h"
 #include "src/naive/handle.h"
 namespace {
 using namespace megdnn;

 //! Reference forward kernel.
 //!
 //! \param src  src[0]=input, src[1]=output, src[2]=scale, src[3]=zero_point;
 //!             the caller broadcasts scale/zero_point to the input layout
 //! \param qmin lower clamp bound of the quantized value
 //! \param qmax upper clamp bound of the quantized value
 //!
 //! The pack is taken by const reference so the tensors are not copied again
 //! on every invocation.
 template <typename T>
 void forward_impl(const ElemwiseOpParamN<4>& src, float qmin, float qmax) {
    auto inp = tensor_iter_valonly<T>(src[0]).begin();
    auto out = tensor_iter_valonly<T>(src[1]).begin();
    auto scale = tensor_iter_valonly<T>(src[2]).begin();
    auto zero_point = tensor_iter_valonly<T>(src[3]).begin();
    const size_t total = src[0].layout.total_nr_elems();
    for (size_t i = 0; i < total; ++i) {
        // std::round picks the overload matching T rather than going
        // through double
        T x = std::round(*inp / (*scale)) + *zero_point;
        x = x <= qmin ? qmin : x;
        x = x >= qmax ? qmax : x;
        *out = (x - *zero_point) * *scale;
        ++inp;
        ++out;
        ++scale;
        ++zero_point;
    }
 }

 //! Reference backward kernel.
 //!
 //! \param src  src[0]=diff, src[1]=input, src[2]=scale, src[3]=zero_point,
 //!             src[4]=grad; the caller broadcasts scale/zero_point to the
 //!             input layout
 //! \param qmin lower bound of the pass-through range
 //! \param qmax upper bound of the pass-through range
 //!
 //! The pack is taken by const reference so the tensors are not copied again
 //! on every invocation.
 template <typename T>
 void backward_impl(const ElemwiseOpParamN<5>& src, float qmin, float qmax) {
    auto diff = tensor_iter_valonly<T>(src[0]).begin();
    auto input = tensor_iter_valonly<T>(src[1]).begin();
    auto scale = tensor_iter_valonly<T>(src[2]).begin();
    auto zero_point = tensor_iter_valonly<T>(src[3]).begin();
    auto grad = tensor_iter_valonly<T>(src[4]).begin();
    const size_t total = src[0].layout.total_nr_elems();
    for (size_t i = 0; i < total; ++i) {
        // std::round picks the overload matching T rather than going
        // through double
        T x = std::round(*input / (*scale)) + *zero_point;
        // gradient flows only where the quantized value was not clamped;
        // T(0) keeps the conditional in T instead of promoting to double
        *grad = (x >= qmin && x <= qmax) ? *diff : T(0);
        ++diff;
        ++input;
        ++scale;
        ++zero_point;
        ++grad;
    }
 }

 }  // namespace
 namespace megdnn {
 namespace naive {

 //! Naive forward entry point: validates the operands, broadcasts
 //! scale/zero_point to the input layout and dispatches forward_impl.
 void FakeQuantForwardImpl::exec(_megdnn_tensor_in input,
                                 _megdnn_tensor_in scale,
                                 _megdnn_tensor_in zero_point,
                                 _megdnn_tensor_out output,
                                 _megdnn_workspace workspace) {
    check_exec(input.layout, scale.layout, zero_point.layout, output.layout,
               workspace.size);
    ElemwiseOpParamN<4> src;
    src[0] = input;
    src[1] = output;
    src[2] = scale;
    src[2].layout = src[2].layout.broadcast(input.layout);
    src[3] = zero_point;
    src[3].layout = src[3].layout.broadcast(input.layout);
    // Snapshot the bounds now instead of reading param() inside the
    // dispatched statement.
    // NOTE(review): MEGDNN_DISPATCH_CPU_KERN_OPR presumably defers the
    // statement to the handle's dispatcher; if so, reading param() lazily
    // would observe later param changes -- confirm against the macro.
    const float qmin = static_cast<float>(param().qmin);
    const float qmax = static_cast<float>(param().qmax);
 #define cb(DType)                                                       \
    if (input.layout.dtype == DType()) {                                \
        using T = typename DTypeTrait<DType>::ctype;                    \
        MEGDNN_DISPATCH_CPU_KERN_OPR(forward_impl<T>(src, qmin, qmax)); \
        return;                                                         \
    }
    cb(dtype::Float32)
 #undef cb
 }

 //! Naive backward entry point: validates the operands, broadcasts
 //! scale/zero_point to the input layout and dispatches backward_impl.
 void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff,
                                  _megdnn_tensor_in input,
                                  _megdnn_tensor_in scale,
                                  _megdnn_tensor_in zero_point,
                                  _megdnn_tensor_out grad,
                                  _megdnn_workspace workspace) {
    check_exec(diff.layout, input.layout, scale.layout, zero_point.layout,
               grad.layout, workspace.size);
    ElemwiseOpParamN<5> src;
    src[0] = diff;
    src[1] = input;
    src[2] = scale;
    src[2].layout = src[2].layout.broadcast(input.layout);
    src[3] = zero_point;
    src[3].layout = src[3].layout.broadcast(input.layout);
    src[4] = grad;
    // Snapshot the bounds now instead of reading param() inside the
    // dispatched statement.
    // NOTE(review): MEGDNN_DISPATCH_CPU_KERN_OPR presumably defers the
    // statement to the handle's dispatcher; if so, reading param() lazily
    // would observe later param changes -- confirm against the macro.
    const float qmin = static_cast<float>(param().qmin);
    const float qmax = static_cast<float>(param().qmax);
 #define cb(DType)                                                        \
    if (diff.layout.dtype == DType() && grad.layout.dtype == DType() &&  \
        input.layout.dtype == DType()) {                                 \
        using T = typename DTypeTrait<DType>::ctype;                     \
        MEGDNN_DISPATCH_CPU_KERN_OPR(backward_impl<T>(src, qmin, qmax)); \
        return;                                                          \
    }
    cb(dtype::Float32)
 #undef cb
 }
 }  // namespace naive
 }  // namespace megdnn
--- a/dnn/src/naive/fake_quant/opr_impl.h
+++ b/dnn/src/naive/fake_quant/opr_impl.h
@@ -0,0 +1,45 @@
 /**
 * \file dnn/src/naive/fake_quant/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once
 #include "megdnn/oprs.h"

 namespace megdnn {
 namespace naive {
 //! Naive (reference) implementation of FakeQuantForward.
 class FakeQuantForwardImpl : public FakeQuantForward {
 public:
    using FakeQuantForward::FakeQuantForward;

    void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
              _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
              _megdnn_workspace workspace) override;

    //! this implementation never needs workspace
    size_t get_workspace_in_bytes(const TensorLayout& /*input*/,
                                  const TensorLayout& /*scale*/,
                                  const TensorLayout& /*zero_point*/,
                                  const TensorLayout& /*output*/) override {
        return 0;
    }
 };

 //! Naive (reference) implementation of FakeQuantBackward.
 class FakeQuantBackwardImpl : public FakeQuantBackward {
 public:
    using FakeQuantBackward::FakeQuantBackward;

    void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
              _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
              _megdnn_tensor_out grad, _megdnn_workspace workspace) override;

    //! this implementation never needs workspace
    size_t get_workspace_in_bytes(const TensorLayout& /*diff*/,
                                  const TensorLayout& /*input*/,
                                  const TensorLayout& /*scale*/,
                                  const TensorLayout& /*zero_point*/,
                                  const TensorLayout& /*grad*/) override {
        return 0;
    }
 };

 }  // namespace naive
 }  // namespace megdnn
--- a/dnn/src/naive/handle.cpp
+++ b/dnn/src/naive/handle.cpp
@@ -79,6 +79,8 @@
 #include "src/naive/warp_affine/opr_impl.h"
 #include "src/naive/warp_perspective/opr_impl.h"
 #include "src/naive/winograd_filter_preprocess/opr_impl.h"
 #include "src/naive/remap/opr_impl.h"
 #include "src/naive/fake_quant/opr_impl.h"

 static size_t g_image2d_pitch_alignment = 1;

--- a/dnn/test/common/fake_quant.h
+++ b/dnn/test/common/fake_quant.h
@@ -0,0 +1,60 @@
 /**
 * \file dnn/test/common/fake_quant.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #pragma once
 #include "megdnn/basic_types.h"
 #include "megdnn/opr_param_defs.h"

 namespace megdnn {
 namespace test {
 namespace fake_quant {

 //! One FakeQuant test case: the operator param plus the shapes of the
 //! input, scale and zero_point tensors.
 struct TestArg {
    param::FakeQuant param;
    TensorShape ishape;
    TensorShape scale_shape;
    TensorShape zeropoint_shape;
    // the constructor parameters deliberately share the member names; inside
    // the initializer list the parenthesised name refers to the parameter
    TestArg(param::FakeQuant param, TensorShape ishape, TensorShape scale_shape,
            TensorShape zeropoint_shape)
            : param(param),
              ishape(ishape),
              scale_shape(scale_shape),
              zeropoint_shape(zeropoint_shape) {}
 };

 inline std::vector<TestArg> get_args() {
    std::vector<TestArg> args;
    param::FakeQuant cur_param;

    cur_param.qmin = -128;
    cur_param.qmax = 128;

    for (size_t i = 10; i < 40; i += 2) {
        args.emplace_back(cur_param, TensorShape{10, 64, i, i}, TensorShape{1},
                          TensorShape{1});
    }

    for (size_t m : {1, 10})
        for (size_t n : {1, 10})
            for (size_t j : {1, 10})
                for (size_t k : {1, 10}) {
                    args.emplace_back(cur_param, TensorShape{10, 64, 10, 10},
                                      TensorShape{10, 64, m, n},
                                      TensorShape{10, 64, j, k});
                }
    return args;
 }

 }  // namespace fake_quant
 }  // namespace test

 }  // namespace megdnn
--- a/dnn/test/common/opr_trait.h
+++ b/dnn/test/common/opr_trait.h
@@ -111,6 +111,8 @@ DEF(Remap, 3, true, true);
 DEF(RemapBackwardData, 3, true, false);
 DEF(RemapBackwardMat, 4, true, false);
 DEF(DctChannelSelectForward, 4, true, true);
 // FakeQuantForward takes input, scale, zero_point and output (4 tensors);
 // FakeQuantBackward takes diff, input, scale, zero_point and grad (5).
 // NOTE(review): the two boolean flags follow the pattern of the entries
 // above; their exact meaning is defined by DEF outside this chunk -- confirm.
 DEF(FakeQuantForward, 4, true, true);
 DEF(FakeQuantBackward, 5, true, false);
 }  // namespace test
 }  // namespace megdnn

--- a/dnn/test/cuda/fake_quant.cpp
+++ b/dnn/test/cuda/fake_quant.cpp
@@ -0,0 +1,99 @@
 /**
 * \file dnn/test/cuda/fake_quant.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "test/common/fake_quant.h"
 #include "megdnn/oprs.h"
 #include "test/common/checker.h"
 #include "test/cuda/fixture.h"

 namespace megdnn {
 namespace test {
 using namespace fake_quant;
 //! Runs FakeQuantForward on the CUDA handle through Checker for every shared
 //! test case, first with contiguous shapes and then with a strided layout.
 TEST_F(CUDA, FAKE_QUANT) {
    std::vector<TestArg> args = get_args();
    // named f32 rather than dtype so the local does not shadow the namespace
    const auto f32 = dtype::Float32();

    for (auto&& arg : args) {
        Checker<FakeQuantForward> checker(handle_cuda());
        checker.set_param(arg.param)
                .set_dtype(0, f32)
                .set_dtype(1, f32)
                .set_dtype(2, f32)
                .set_dtype(3, f32)
                .execs(TensorShapeArray{arg.ishape, arg.scale_shape,
                                        arg.zeropoint_shape, arg.ishape});
    }
    // test noncontiguous layout
    for (auto&& arg : args) {
        auto&& ishape = arg.ishape;
        Checker<FakeQuantForward> checker(handle_cuda());
        // the outermost stride is twice the packed size, which leaves a gap
        // between consecutive batches and makes the layout non-contiguous
        TensorLayout ilayout(
                ishape,
                {static_cast<long int>(ishape[1] * ishape[2] * ishape[3] * 2),
                 static_cast<long int>(ishape[2] * ishape[3]),
                 static_cast<long int>(ishape[3]), 1},
                dtype::Float32());
        checker.set_param(arg.param)
                .execl({ilayout,
                        {arg.scale_shape, dtype::Float32()},
                        {arg.zeropoint_shape, dtype::Float32()},
                        ilayout});
    }
 }

 //! Runs FakeQuantBackward on the CUDA handle through Checker for every
 //! shared test case, first with contiguous shapes and then with a strided
 //! layout for diff, input and grad.
 TEST_F(CUDA, FAKE_QUANT_BACKWARD) {
    std::vector<TestArg> args = get_args();
    const auto f32 = dtype::Float32();

    for (auto&& arg : args) {
        Checker<FakeQuantBackward> checker(handle_cuda());
        checker.set_param(arg.param)
                .set_dtype(0, f32)
                .set_dtype(1, f32)
                .set_dtype(2, f32)
                .set_dtype(3, f32)
                .set_dtype(4, f32)
                .execs(TensorShapeArray{arg.ishape, arg.ishape,
                                        arg.scale_shape, arg.zeropoint_shape,
                                        arg.ishape});
    }
    // test noncontiguous layout
    for (auto&& arg : args) {
        auto&& ishape = arg.ishape;
        Checker<FakeQuantBackward> checker(handle_cuda());
        // the outermost stride is twice the packed size, which leaves a gap
        // between consecutive batches and makes the layout non-contiguous
        TensorLayout ilayout(
                ishape,
                {(long int)(ishape[1] * ishape[2] * ishape[3] * 2),
                 (long int)(ishape[2] * ishape[3]), (long int)ishape[3], 1},
                dtype::Float32());
        checker.set_param(arg.param)
                .execl({ilayout,
                        ilayout,
                        {arg.scale_shape, dtype::Float32()},
                        {arg.zeropoint_shape, dtype::Float32()},
                        ilayout});
    }
 }

 }  // namespace test
 }  // namespace megdnn