| @@ -60,7 +60,7 @@ struct PreprocessedFilter { | |||
| TensorNDArray tensors; | |||
| }; | |||
| } // namespace intl | |||
| } // namespace detail | |||
| /** | |||
| * \brief base class for convolution operation | |||
| @@ -1562,6 +1562,58 @@ protected: | |||
| }; | |||
| using BatchConvBias = BatchConvBiasForward; | |||
| class FakeQuantBase : public OperatorBase { | |||
| DEF_OPR_IMPL_CTOR(FakeQuantBase, OperatorBase); | |||
| DEF_OPR_PARAM(FakeQuant); | |||
| protected: | |||
| void deduce_layout_fwd(const TensorLayout& input, TensorLayout& output); | |||
| void check_layout_fwd(const TensorLayout& input, const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& output); | |||
| }; | |||
| class FakeQuantForward : public FakeQuantBase { | |||
| DEF_OPR_IMPL(FakeQuantForward, FakeQuantBase, 3, 1); | |||
| public: | |||
| virtual void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, _megdnn_tensor_out output, | |||
| _megdnn_workspace workspace) = 0; | |||
| void deduce_layout(const TensorLayout& input, const TensorLayout& scale, | |||
| const TensorLayout& zero_point, TensorLayout& output); | |||
| virtual size_t get_workspace_in_bytes(const TensorLayout& input, | |||
| const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& output) = 0; | |||
| protected: | |||
| void check_exec(const TensorLayout& input, const TensorLayout& scale, | |||
| const TensorLayout& zero_point, const TensorLayout& output, | |||
| size_t workspace_in_bytes); | |||
| }; | |||
| using FakeQuant = FakeQuantForward; | |||
| class FakeQuantBackward : public FakeQuantBase { | |||
| DEF_OPR_IMPL(FakeQuantBackward, FakeQuantBase, 4, 1); | |||
| public: | |||
| virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) = 0; | |||
| virtual size_t get_workspace_in_bytes(const TensorLayout& diff, | |||
| const TensorLayout& input, | |||
| const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& grad) = 0; | |||
| protected: | |||
| void check_exec(const TensorLayout& diff, const TensorLayout& input, | |||
| const TensorLayout& scale, const TensorLayout& zero_point, | |||
| const TensorLayout& grad, size_t workspace_in_bytes); | |||
| }; | |||
| } // namespace megdnn | |||
| #include "megdnn/internal/opr_header_epilogue.h" | |||
| @@ -943,5 +943,9 @@ when the ``I`` suffix is present. | |||
| add_enum_alias('Format', 'ConvolutionV0'). | |||
| add_enum_alias('ComputeMode', 'Convolution', name_field="compute_mode") | |||
| ) | |||
# Param of the FakeQuant operator: two int32 fields, qmin and qmax, whose
# defaults are the smallest and the largest int32 value respectively.
(pdef('FakeQuant')
 .add_fields('int32', 'qmin', '-2147483648')
 .add_fields('int32', 'qmax', '2147483647'))
| @@ -0,0 +1,66 @@ | |||
| /** | |||
| * \file dnn/src/common/fakequant.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "megdnn/oprs.h" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| void FakeQuantBase::deduce_layout_fwd(const TensorLayout& input, | |||
| TensorLayout& output) { | |||
| output = TensorLayout(input, input.dtype); | |||
| } | |||
| void FakeQuantBase::check_layout_fwd(const TensorLayout& input, | |||
| const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& output) { | |||
| megdnn_assert(input.dtype == dtype::Float32()); | |||
| megdnn_assert(scale.dtype == dtype::Float32()); | |||
| megdnn_assert(zero_point.dtype == dtype::Float32()); | |||
| TensorLayout expected; | |||
| deduce_layout_fwd(input, expected); | |||
| megdnn_assert_eq_layout(expected, output); | |||
| } | |||
| void FakeQuantForward::deduce_layout(const TensorLayout& input, | |||
| const TensorLayout& /*scale*/, | |||
| const TensorLayout& /*zero_point*/, | |||
| TensorLayout& output) { | |||
| deduce_layout_fwd(input, output); | |||
| } | |||
| void FakeQuantForward::check_exec(const TensorLayout& input, | |||
| const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& output, | |||
| size_t workspace_in_bytes) { | |||
| check_layout_fwd(input, scale, zero_point, output); | |||
| auto required_workspace_space = | |||
| get_workspace_in_bytes(input, scale, zero_point, output); | |||
| megdnn_assert(workspace_in_bytes >= required_workspace_space); | |||
| } | |||
| void FakeQuantBackward::check_exec(const TensorLayout& diff, | |||
| const TensorLayout& input, | |||
| const TensorLayout& scale, | |||
| const TensorLayout& zero_point, | |||
| const TensorLayout& grad, | |||
| size_t workspace_in_bytes) { | |||
| megdnn_assert_eq_shape(input, diff); | |||
| megdnn_assert_eq_shape(input, grad); | |||
| auto required_worspace_space = | |||
| get_workspace_in_bytes(diff, input, scale, zero_point, grad); | |||
| megdnn_assert(workspace_in_bytes >= required_worspace_space); | |||
| } | |||
| } // namespace megdnn | |||
| @@ -201,7 +201,9 @@ private: | |||
| cb(RemapBackwardMat) \ | |||
| cb(AdaptivePoolingForward) \ | |||
| cb(AdaptivePoolingBackward) \ | |||
| cb(DctChannelSelectForward) | |||
| cb(DctChannelSelectForward) \ | |||
| cb(FakeQuantForward) \ | |||
| cb(FakeQuantBackward) | |||
| /*! | |||
| * \brief specialize HandleImpl::create_operator for a single opr type; | |||
| @@ -13,9 +13,9 @@ | |||
| #pragma once | |||
| #include "src/common/elemwise_helper.cuh" | |||
| #include "src/cuda/utils.cuh" | |||
| #include "src/cuda/int_fastdiv.cuh" | |||
| #include "src/cuda/query_blocksize.cuh" | |||
| #include "src/cuda/utils.cuh" | |||
| /* | |||
| * please note that all arithmetics on GPU are 32-bit for best performance; this | |||
| @@ -649,6 +649,102 @@ struct OpCallerUniform<Op, 3, PVis> { | |||
| } | |||
| }; | |||
| //! specialization for arity == 4 | |||
| template <class Op, class PVis> | |||
| struct OpCallerUniform<Op, 4, PVis> { | |||
| Op op; | |||
| PVis par[4]; | |||
| static const uint32_t packed_size = PVis::packed_size; | |||
| devfunc void thread_init(uint32_t idx) { | |||
| idx = idx * packed_size; | |||
| par[0].thread_init(idx); | |||
| par[1].thread_init(idx); | |||
| par[2].thread_init(idx); | |||
| par[3].thread_init(idx); | |||
| } | |||
| devfunc void on(uint32_t idx) { | |||
| idx = idx * packed_size; | |||
| op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), par[3].at(idx)); | |||
| } | |||
| devfunc void on(uint32_t idx, uint32_t remain) { | |||
| idx = idx * packed_size; | |||
| if (remain >= packed_size) { | |||
| op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), | |||
| par[3].at(idx)); | |||
| } else { | |||
| auto ptr0 = par[0].ptr(); | |||
| auto ptr1 = par[1].ptr(); | |||
| auto ptr2 = par[2].ptr(); | |||
| auto ptr3 = par[3].ptr(); | |||
| for (int i = 0; i < remain; i++) { | |||
| op(idx + i, ptr0[par[0].offset(idx + i)], | |||
| ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)], | |||
| ptr3[par[3].offset(idx + i)]); | |||
| } | |||
| } | |||
| } | |||
| devfunc void next() { | |||
| par[0].next(); | |||
| par[1].next(); | |||
| par[2].next(); | |||
| par[3].next(); | |||
| } | |||
| }; | |||
| //! specialization for arity == 5 | |||
| template <class Op, class PVis> | |||
| struct OpCallerUniform<Op, 5, PVis> { | |||
| Op op; | |||
| PVis par[5]; | |||
| static const uint32_t packed_size = PVis::packed_size; | |||
| devfunc void thread_init(uint32_t idx) { | |||
| idx = idx * packed_size; | |||
| par[0].thread_init(idx); | |||
| par[1].thread_init(idx); | |||
| par[2].thread_init(idx); | |||
| par[3].thread_init(idx); | |||
| par[4].thread_init(idx); | |||
| } | |||
| devfunc void on(uint32_t idx) { | |||
| idx = idx * packed_size; | |||
| op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), par[3].at(idx), | |||
| par[4].at(idx)); | |||
| } | |||
| devfunc void on(uint32_t idx, uint32_t remain) { | |||
| idx = idx * packed_size; | |||
| if (remain >= packed_size) { | |||
| op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx), | |||
| par[3].at(idx), par[4].at(idx)); | |||
| } else { | |||
| auto ptr0 = par[0].ptr(); | |||
| auto ptr1 = par[1].ptr(); | |||
| auto ptr2 = par[2].ptr(); | |||
| auto ptr3 = par[3].ptr(); | |||
| auto ptr4 = par[4].ptr(); | |||
| for (int i = 0; i < remain; i++) { | |||
| op(idx + i, ptr0[par[0].offset(idx + i)], | |||
| ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)], | |||
| ptr3[par[3].offset(idx + i)], ptr4[par[4].offset(idx + i)]); | |||
| } | |||
| } | |||
| } | |||
| devfunc void next() { | |||
| par[0].next(); | |||
| par[1].next(); | |||
| par[2].next(); | |||
| par[3].next(); | |||
| par[4].next(); | |||
| } | |||
| }; | |||
| /*! | |||
| * \brief call binary (i.e. arity == 2) operator with different param | |||
| * visitors | |||
| @@ -0,0 +1,30 @@ | |||
| /** | |||
| * \file dnn/src/cuda/fake_quant/kern.cu | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "./kern.cuh" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| #define cb(_dtype) \ | |||
| INST_RUN_ELEMWISE(FakeQuantKernOp<DTypeTrait<_dtype>::ctype>, \ | |||
| DTypeTrait<_dtype>::ctype, 2); \ | |||
| INST_RUN_ELEMWISE(FakeQuantBwdKernOp<DTypeTrait<_dtype>::ctype>, \ | |||
| DTypeTrait<_dtype>::ctype, 2); \ | |||
| INST_RUN_ELEMWISE(FakeQuantKernOpNonContig<DTypeTrait<_dtype>::ctype>, \ | |||
| DTypeTrait<_dtype>::ctype, 4); \ | |||
| INST_RUN_ELEMWISE(FakeQuantBwdKernOpNonContig<DTypeTrait<_dtype>::ctype>, \ | |||
| DTypeTrait<_dtype>::ctype, 5); | |||
| cb(megdnn::dtype::Float32) | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,106 @@ | |||
| /** | |||
| * \file dnn/src/cuda/fake_quant/kern.cuh | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/cuda/elemwise_helper.cuh" | |||
| #include "src/cuda/utils.cuh" | |||
| #if MEGDNN_CC_HOST | |||
| #include "megdnn/oprs.h" | |||
| #endif | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| template <typename ctype> | |||
| struct FakeQuantKernOp { | |||
| ctype* input; | |||
| ctype* output; | |||
| ctype qmin, qmax; | |||
| __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) { | |||
| ctype x = round(input[idx] / scale) + zero_point; | |||
| x = fmaxf(fminf(x, qmax), qmin); | |||
| output[idx] = (x - zero_point) * scale; | |||
| } | |||
| #if MEGDNN_CC_HOST | |||
| FakeQuantKernOp(const TensorND& input, const TensorND& output, | |||
| const FakeQuant::Param& param) | |||
| : input{input.ptr<ctype>()}, | |||
| output{output.ptr<ctype>()}, | |||
| qmin(param.qmin), | |||
| qmax(param.qmax) {} | |||
| #endif | |||
| }; | |||
| template <typename ctype> | |||
| struct FakeQuantBwdKernOp { | |||
| ctype* diff; | |||
| ctype* input; | |||
| ctype* grad; | |||
| ctype qmin, qmax; | |||
| __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) { | |||
| ctype x = round(input[idx] / scale) + zero_point; | |||
| grad[idx] = x <= qmax && x >= qmin ? diff[idx] : 0.0; | |||
| } | |||
| #if MEGDNN_CC_HOST | |||
| FakeQuantBwdKernOp(const TensorND& diff, const TensorND& input, | |||
| const TensorND& grad, const FakeQuant::Param& param) | |||
| : diff{diff.ptr<ctype>()}, | |||
| input{input.ptr<ctype>()}, | |||
| grad{grad.ptr<ctype>()}, | |||
| qmin(param.qmin), | |||
| qmax(param.qmax) {} | |||
| #endif | |||
| }; | |||
| template <typename ctype> | |||
| struct FakeQuantKernOpNonContig { | |||
| ctype qmin; | |||
| ctype qmax; | |||
| __device__ void operator()(uint32_t, ctype& output, ctype input, | |||
| ctype scale, ctype zero_point) { | |||
| ctype x = round(input / scale) + zero_point; | |||
| x = fmaxf(fminf(x, qmax), qmin); | |||
| output = (x - zero_point) * scale; | |||
| } | |||
| #if MEGDNN_CC_HOST | |||
| FakeQuantKernOpNonContig(const FakeQuant::Param& param) | |||
| : qmin(param.qmin), qmax(param.qmax) {} | |||
| #endif | |||
| }; | |||
| template <typename ctype> | |||
| struct FakeQuantBwdKernOpNonContig { | |||
| ctype qmin; | |||
| ctype qmax; | |||
| __device__ void operator()(uint32_t, ctype& grad, ctype diff, ctype input, | |||
| ctype scale, ctype zero_point) { | |||
| ctype x = round(input / scale) + zero_point; | |||
| grad = x <= qmax && x >= qmin ? diff : 0.0; | |||
| } | |||
| #if MEGDNN_CC_HOST | |||
| FakeQuantBwdKernOpNonContig(const FakeQuant::Param& param) | |||
| : qmin(param.qmin), qmax(param.qmax) {} | |||
| #endif | |||
| }; | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,134 @@ | |||
| /** | |||
| * \file dnn/src/cuda/fake_quant/opr_impl.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "./opr_impl.h" | |||
| #include "./kern.cuh" | |||
| #include "src/common/utils.h" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| void FakeQuantForwardImpl::exec(_megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out output, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(input.layout, scale.layout, zero_point.layout, output.layout, | |||
| workspace.size); | |||
| if (!input.layout.is_contiguous() || !output.layout.is_contiguous()) { | |||
| return exec_noncontig(input, scale, zero_point, output); | |||
| } | |||
| ElemwiseOpParamN<2> ele_param; | |||
| ele_param[0] = scale; | |||
| ele_param[0].layout = ele_param[0].layout.broadcast(input.layout); | |||
| ele_param[1] = zero_point; | |||
| ele_param[1].layout = ele_param[1].layout.broadcast(input.layout); | |||
| ele_param.init_from_given_tensor(); | |||
| auto stream = cuda_stream(handle()); | |||
| #define cb(DType) \ | |||
| if (input.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| run_elemwise<FakeQuantKernOp<T>, T, 2>(ele_param, stream, \ | |||
| {input, output, m_param}); \ | |||
| return; \ | |||
| } | |||
| cb(megdnn::dtype::Float32) | |||
| #undef cb | |||
| } | |||
| void FakeQuantForwardImpl::exec_noncontig(_megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out output) { | |||
| ElemwiseOpParamN<4> ele_param; | |||
| ele_param[0] = output; | |||
| ele_param[1] = input; | |||
| ele_param[2] = scale; | |||
| ele_param[2].layout = ele_param[2].layout.broadcast(input.layout); | |||
| ele_param[3] = zero_point; | |||
| ele_param[3].layout = ele_param[3].layout.broadcast(input.layout); | |||
| ele_param.init_from_given_tensor(); | |||
| auto stream = cuda_stream(handle()); | |||
| #define cb(DType) \ | |||
| if (input.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| run_elemwise<FakeQuantKernOpNonContig<T>, T, 4>(ele_param, stream, \ | |||
| {m_param}); \ | |||
| return; \ | |||
| } | |||
| cb(megdnn::dtype::Float32) | |||
| #undef cb | |||
| } | |||
| void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff, | |||
| _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(diff.layout, input.layout, scale.layout, zero_point.layout, | |||
| grad.layout, workspace.size); | |||
| if (!input.layout.is_contiguous() || !diff.layout.is_contiguous() || | |||
| !grad.layout.is_contiguous()) { | |||
| return exec_noncontig(diff, input, scale, zero_point, grad); | |||
| } | |||
| ElemwiseOpParamN<2> ele_param; | |||
| ele_param[0] = scale; | |||
| ele_param[0].layout = ele_param[0].layout.broadcast(input.layout); | |||
| ele_param[1] = zero_point; | |||
| ele_param[1].layout = ele_param[1].layout.broadcast(input.layout); | |||
| ele_param.init_from_given_tensor(); | |||
| auto m_param = param(); | |||
| auto stream = cuda_stream(handle()); | |||
| #define cb(DType) \ | |||
| if (grad.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| run_elemwise<FakeQuantBwdKernOp<T>, T, 2>( \ | |||
| ele_param, stream, {diff, input, grad, m_param}); \ | |||
| return; \ | |||
| } | |||
| cb(megdnn::dtype::Float32) | |||
| #undef cb | |||
| } | |||
| void FakeQuantBackwardImpl::exec_noncontig(_megdnn_tensor_in diff, | |||
| _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad) { | |||
| ElemwiseOpParamN<5> ele_param; | |||
| ele_param[0] = grad; | |||
| ele_param[1] = diff; | |||
| ele_param[2] = input; | |||
| ele_param[3] = scale; | |||
| ele_param[3].layout = ele_param[3].layout.broadcast(input.layout); | |||
| ele_param[4] = zero_point; | |||
| ele_param[4].layout = ele_param[4].layout.broadcast(input.layout); | |||
| ele_param.init_from_given_tensor(); | |||
| auto m_param = param(); | |||
| auto stream = cuda_stream(handle()); | |||
| #define cb(DType) \ | |||
| if (grad.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| run_elemwise<FakeQuantBwdKernOpNonContig<T>, T, 5>(ele_param, stream, \ | |||
| {m_param}); \ | |||
| return; \ | |||
| } | |||
| cb(megdnn::dtype::Float32) | |||
| #undef cb | |||
| } | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,55 @@ | |||
| /** | |||
| * \file dnn/src/cuda/fake_quant/opr_impl.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| #include "src/cuda/utils.h" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| class FakeQuantForwardImpl : public FakeQuantForward { | |||
| public: | |||
| using FakeQuantForward::FakeQuantForward; | |||
| void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, _megdnn_tensor_out output, | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| private: | |||
| void exec_noncontig(_megdnn_tensor_in input, _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out output); | |||
| }; | |||
| class FakeQuantBackwardImpl : public FakeQuantBackward { | |||
| public: | |||
| using FakeQuantBackward::FakeQuantBackward; | |||
| void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| private: | |||
| void exec_noncontig(_megdnn_tensor_in diff, _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad); | |||
| }; | |||
| } // namespace cuda | |||
| } // namespace megdnn | |||
| @@ -77,6 +77,7 @@ | |||
| #include "src/cuda/roi_align/opr_impl.h" | |||
| #include "src/cuda/batch_conv_bias/opr_impl.h" | |||
| #include "src/cuda/remap/opr_impl.h" | |||
| #include "src/cuda/fake_quant/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace cuda { | |||
| @@ -0,0 +1,118 @@ | |||
| /** | |||
| * \file dnn/src/naive/fake_quant/opr_impl.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "src/naive/fake_quant/opr_impl.h" | |||
| #include <cmath> | |||
| #include <iostream> | |||
| #include "megdnn/tensor_iter.h" | |||
| #include "src/common/elemwise_helper.cuh" | |||
| #include "src/common/utils.h" | |||
| #include "src/naive/handle.h" | |||
| namespace { | |||
| using namespace megdnn; | |||
| template <typename T> | |||
| void forward_impl(const ElemwiseOpParamN<4> src, float qmin, float qmax) { | |||
| auto inp = tensor_iter_valonly<T>(src[0]).begin(); | |||
| auto out = tensor_iter_valonly<T>(src[1]).begin(); | |||
| auto scale = tensor_iter_valonly<T>(src[2]).begin(); | |||
| auto zero_point = tensor_iter_valonly<T>(src[3]).begin(); | |||
| size_t total = src[0].layout.total_nr_elems(); | |||
| for (size_t i = 0; i < total; ++i) { | |||
| T x = round(*inp / (*scale)) + *zero_point; | |||
| x = x <= qmin ? qmin : x; | |||
| x = x >= qmax ? qmax : x; | |||
| *out = (x - *zero_point) * *scale; | |||
| ++inp; | |||
| ++out; | |||
| ++scale; | |||
| ++zero_point; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void backward_impl(const ElemwiseOpParamN<5> src, float qmin, float qmax) { | |||
| auto diff = tensor_iter_valonly<T>(src[0]).begin(); | |||
| auto input = tensor_iter_valonly<T>(src[1]).begin(); | |||
| auto scale = tensor_iter_valonly<T>(src[2]).begin(); | |||
| auto zero_point = tensor_iter_valonly<T>(src[3]).begin(); | |||
| auto grad = tensor_iter_valonly<T>(src[4]).begin(); | |||
| size_t total = src[0].layout.total_nr_elems(); | |||
| for (size_t i = 0; i < total; ++i) { | |||
| T x = round(*input / (*scale)) + *zero_point; | |||
| *grad = (x >= qmin && x <= qmax) ? *diff : 0.0; | |||
| ++diff; | |||
| ++input; | |||
| ++scale; | |||
| ++zero_point; | |||
| ++grad; | |||
| } | |||
| } | |||
| } // namespace | |||
| namespace megdnn { | |||
| namespace naive { | |||
| void FakeQuantForwardImpl::exec(_megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out output, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(input.layout, scale.layout, zero_point.layout, output.layout, | |||
| workspace.size); | |||
| ElemwiseOpParamN<4> src; | |||
| src[0] = input; | |||
| src[1] = output; | |||
| src[2] = scale; | |||
| src[2].layout = src[2].layout.broadcast(input.layout); | |||
| src[3] = zero_point; | |||
| src[3].layout = src[3].layout.broadcast(input.layout); | |||
| #define cb(DType) \ | |||
| if (input.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR( \ | |||
| forward_impl<T>(src, param().qmin, param().qmax)); \ | |||
| return; \ | |||
| } | |||
| cb(dtype::Float32) | |||
| #undef cb | |||
| } | |||
| void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff, | |||
| _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad, | |||
| _megdnn_workspace workspace) { | |||
| check_exec(diff.layout, input.layout, scale.layout, zero_point.layout, | |||
| grad.layout, workspace.size); | |||
| ElemwiseOpParamN<5> src; | |||
| src[0] = diff; | |||
| src[1] = input; | |||
| src[2] = scale; | |||
| src[2].layout = src[2].layout.broadcast(input.layout); | |||
| src[3] = zero_point; | |||
| src[3].layout = src[3].layout.broadcast(input.layout); | |||
| src[4] = grad; | |||
| #define cb(DType) \ | |||
| if (diff.layout.dtype == DType() && grad.layout.dtype == DType() && \ | |||
| input.layout.dtype == DType()) { \ | |||
| using T = typename DTypeTrait<DType>::ctype; \ | |||
| MEGDNN_DISPATCH_CPU_KERN_OPR( \ | |||
| backward_impl<T>(src, param().qmin, param().qmax)); \ | |||
| return; \ | |||
| } | |||
| cb(dtype::Float32) | |||
| #undef cb | |||
| } | |||
| } // namespace naive | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,45 @@ | |||
| /** | |||
| * \file dnn/src/naive/fake_quant/opr_impl.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/oprs.h" | |||
| namespace megdnn { | |||
| namespace naive { | |||
| class FakeQuantForwardImpl : public FakeQuantForward { | |||
| public: | |||
| using FakeQuantForward::FakeQuantForward; | |||
| void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale, | |||
| _megdnn_tensor_in zero_point, _megdnn_tensor_out output, | |||
| _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| }; | |||
| class FakeQuantBackwardImpl : public FakeQuantBackward { | |||
| public: | |||
| using FakeQuantBackward::FakeQuantBackward; | |||
| void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input, | |||
| _megdnn_tensor_in scale, _megdnn_tensor_in zero_point, | |||
| _megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&, const TensorLayout&, | |||
| const TensorLayout&) override { | |||
| return 0; | |||
| } | |||
| }; | |||
| } // namespace naive | |||
| } // namespace megdnn | |||
| @@ -79,6 +79,8 @@ | |||
| #include "src/naive/warp_affine/opr_impl.h" | |||
| #include "src/naive/warp_perspective/opr_impl.h" | |||
| #include "src/naive/winograd_filter_preprocess/opr_impl.h" | |||
| #include "src/naive/remap/opr_impl.h" | |||
| #include "src/naive/fake_quant/opr_impl.h" | |||
| static size_t g_image2d_pitch_alignment = 1; | |||
| @@ -0,0 +1,60 @@ | |||
| /** | |||
| * \file dnn/test/common/fake_quant.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "megdnn/basic_types.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| namespace fake_quant { | |||
| struct TestArg { | |||
| param::FakeQuant param; | |||
| TensorShape ishape; | |||
| TensorShape scale_shape; | |||
| TensorShape zeropoint_shape; | |||
| TestArg(param::FakeQuant param, TensorShape ishape, TensorShape scale_shape, | |||
| TensorShape zeropoint_shape) | |||
| : param(param), | |||
| ishape(ishape), | |||
| scale_shape(scale_shape), | |||
| zeropoint_shape(zeropoint_shape) {} | |||
| }; | |||
| inline std::vector<TestArg> get_args() { | |||
| std::vector<TestArg> args; | |||
| param::FakeQuant cur_param; | |||
| cur_param.qmin = -128; | |||
| cur_param.qmax = 128; | |||
| for (size_t i = 10; i < 40; i += 2) { | |||
| args.emplace_back(cur_param, TensorShape{10, 64, i, i}, TensorShape{1}, | |||
| TensorShape{1}); | |||
| } | |||
| for (size_t m : {1, 10}) | |||
| for (size_t n : {1, 10}) | |||
| for (size_t j : {1, 10}) | |||
| for (size_t k : {1, 10}) { | |||
| args.emplace_back(cur_param, TensorShape{10, 64, 10, 10}, | |||
| TensorShape{10, 64, m, n}, | |||
| TensorShape{10, 64, j, k}); | |||
| } | |||
| return args; | |||
| } | |||
| } // namespace fake_quant | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -111,6 +111,8 @@ DEF(Remap, 3, true, true); | |||
| DEF(RemapBackwardData, 3, true, false); | |||
| DEF(RemapBackwardMat, 4, true, false); | |||
| DEF(DctChannelSelectForward, 4, true, true); | |||
| DEF(FakeQuantForward, 4, true, true); | |||
| DEF(FakeQuantBackward, 5, true, false); | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,99 @@ | |||
| /** | |||
| * \file dnn/test/cuda/fake_quant.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "test/common/fake_quant.h" | |||
| #include "megdnn/oprs.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/cuda/fixture.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| using namespace fake_quant; | |||
| TEST_F(CUDA, FAKE_QUANT) { | |||
| std::vector<TestArg> args = get_args(); | |||
| auto dtype = dtype::Float32(); | |||
| std::unique_ptr<RNG> rng; | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto ishape = arg.ishape; | |||
| auto scale_shape = arg.scale_shape; | |||
| auto zeropoint_shape = arg.zeropoint_shape; | |||
| Checker<FakeQuantForward> checker(handle_cuda()); | |||
| checker.set_param(param) | |||
| .set_dtype(0, dtype) | |||
| .set_dtype(1, dtype) | |||
| .set_dtype(2, dtype) | |||
| .set_dtype(3, dtype) | |||
| .execs(TensorShapeArray{ishape, scale_shape, zeropoint_shape, | |||
| ishape}); | |||
| } | |||
| // test noncontiguous layout | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto ishape = arg.ishape; | |||
| auto scale_shape = arg.scale_shape; | |||
| auto zeropoint_shape = arg.zeropoint_shape; | |||
| Checker<FakeQuantForward> checker(handle_cuda()); | |||
| TensorLayout ilayout( | |||
| ishape, | |||
| {(long int)(ishape[1] * ishape[2] * ishape[3] * 2), | |||
| (long int)(ishape[2] * ishape[3]), (long int)ishape[3], 1}, | |||
| dtype::Float32()); | |||
| checker.set_param(param).execl({ilayout, | |||
| {scale_shape, dtype::Float32()}, | |||
| {zeropoint_shape, dtype::Float32()}, | |||
| ilayout}); | |||
| } | |||
| } | |||
| TEST_F(CUDA, FAKE_QUANT_BACKWARD) { | |||
| std::vector<TestArg> args = get_args(); | |||
| auto dtype = dtype::Float32(); | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto ishape = arg.ishape; | |||
| auto scale_shape = arg.scale_shape; | |||
| auto zeropoint_shape = arg.zeropoint_shape; | |||
| Checker<FakeQuantBackward> checker(handle_cuda()); | |||
| checker.set_param(param) | |||
| .set_dtype(0, dtype) | |||
| .set_dtype(1, dtype) | |||
| .set_dtype(2, dtype) | |||
| .set_dtype(3, dtype) | |||
| .set_dtype(4, dtype) | |||
| .execs(TensorShapeArray{ishape, ishape, scale_shape, | |||
| zeropoint_shape, ishape}); | |||
| } | |||
| // test noncontiguous layout | |||
| for (auto&& arg : args) { | |||
| auto param = arg.param; | |||
| auto ishape = arg.ishape; | |||
| auto scale_shape = arg.scale_shape; | |||
| auto zeropoint_shape = arg.zeropoint_shape; | |||
| Checker<FakeQuantBackward> checker(handle_cuda()); | |||
| TensorLayout ilayout( | |||
| ishape, | |||
| {(long int)(ishape[1] * ishape[2] * ishape[3] * 2), | |||
| (long int)(ishape[2] * ishape[3]), (long int)ishape[3], 1}, | |||
| dtype::Float32()); | |||
| checker.set_param(param).execl({ilayout, | |||
| ilayout, | |||
| {scale_shape, dtype::Float32()}, | |||
| {zeropoint_shape, dtype::Float32()}, | |||
| ilayout}); | |||
| } | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||