@@ -60,7 +60,7 @@ struct PreprocessedFilter {
     TensorNDArray tensors;
 };
-}  // namespace intl
+}  // namespace detail
 
 /**
  * \brief base class for convolution operation
@@ -1562,6 +1562,58 @@ protected:
 };
 using BatchConvBias = BatchConvBiasForward;
 
+class FakeQuantBase : public OperatorBase {
+    DEF_OPR_IMPL_CTOR(FakeQuantBase, OperatorBase);
+    DEF_OPR_PARAM(FakeQuant);
+
+protected:
+    void deduce_layout_fwd(const TensorLayout& input, TensorLayout& output);
+    void check_layout_fwd(const TensorLayout& input, const TensorLayout& scale,
+                          const TensorLayout& zero_point,
+                          const TensorLayout& output);
+};
+
+class FakeQuantForward : public FakeQuantBase {
+    DEF_OPR_IMPL(FakeQuantForward, FakeQuantBase, 3, 1);
+
+public:
+    virtual void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
+                      _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
+                      _megdnn_workspace workspace) = 0;
+    void deduce_layout(const TensorLayout& input, const TensorLayout& scale,
+                       const TensorLayout& zero_point, TensorLayout& output);
+    virtual size_t get_workspace_in_bytes(const TensorLayout& input,
+                                          const TensorLayout& scale,
+                                          const TensorLayout& zero_point,
+                                          const TensorLayout& output) = 0;
+
+protected:
+    void check_exec(const TensorLayout& input, const TensorLayout& scale,
+                    const TensorLayout& zero_point, const TensorLayout& output,
+                    size_t workspace_in_bytes);
+};
+using FakeQuant = FakeQuantForward;
+
+class FakeQuantBackward : public FakeQuantBase {
+    DEF_OPR_IMPL(FakeQuantBackward, FakeQuantBase, 4, 1);
+
+public:
+    virtual void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
+                      _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
+                      _megdnn_tensor_out grad,
+                      _megdnn_workspace workspace) = 0;
+    virtual size_t get_workspace_in_bytes(const TensorLayout& diff,
+                                          const TensorLayout& input,
+                                          const TensorLayout& scale,
+                                          const TensorLayout& zero_point,
+                                          const TensorLayout& grad) = 0;
+
+protected:
+    void check_exec(const TensorLayout& diff, const TensorLayout& input,
+                    const TensorLayout& scale, const TensorLayout& zero_point,
+                    const TensorLayout& grad, size_t workspace_in_bytes);
+};
+
 }  // namespace megdnn
 
 #include "megdnn/internal/opr_header_epilogue.h"
@@ -943,5 +943,9 @@ when the ``I`` suffix is present.
  add_enum_alias('Format', 'ConvolutionV0').
  add_enum_alias('ComputeMode', 'Convolution', name_field="compute_mode")
 )
 
+(pdef('FakeQuant').
+ add_fields('int32', 'qmin', '-2147483648').
+ add_fields('int32', 'qmax', '2147483647')
+ )
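Here `qmin`/`qmax` bound the integer grid that values are snapped to; the int32 defaults make the clamp effectively a no-op until the range is narrowed (e.g. to [-128, 127] for int8-style training). As a reference for the kernels added below, a minimal host-side sketch of the semantics these two fields control; plain floats with one scalar scale/zero_point, not the MegDNN API:

    #include <algorithm>
    #include <cmath>

    // Fake quantization: snap x onto the [qmin, qmax] integer grid defined by
    // (scale, zero_point), then immediately dequantize back to float.
    float fake_quant_fwd(float x, float scale, float zero_point, float qmin,
                         float qmax) {
        float q = std::round(x / scale) + zero_point;
        q = std::min(std::max(q, qmin), qmax);  // clamp to representable range
        return (q - zero_point) * scale;        // dequantize
    }

    // Straight-through estimator: the gradient passes where the value was
    // not clipped and is zero elsewhere.
    float fake_quant_bwd(float diff, float x, float scale, float zero_point,
                         float qmin, float qmax) {
        float q = std::round(x / scale) + zero_point;
        return (q >= qmin && q <= qmax) ? diff : 0.0f;
    }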
@@ -0,0 +1,66 @@
+/**
+ * \file dnn/src/common/fake_quant.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "megdnn/oprs.h"
+#include "src/common/utils.h"
+
+namespace megdnn {
+
+void FakeQuantBase::deduce_layout_fwd(const TensorLayout& input,
+                                      TensorLayout& output) {
+    output = TensorLayout(input, input.dtype);
+}
+
+void FakeQuantBase::check_layout_fwd(const TensorLayout& input,
+                                     const TensorLayout& scale,
+                                     const TensorLayout& zero_point,
+                                     const TensorLayout& output) {
+    megdnn_assert(input.dtype == dtype::Float32());
+    megdnn_assert(scale.dtype == dtype::Float32());
+    megdnn_assert(zero_point.dtype == dtype::Float32());
+    TensorLayout expected;
+    deduce_layout_fwd(input, expected);
+    megdnn_assert_eq_layout(expected, output);
+}
+
+void FakeQuantForward::deduce_layout(const TensorLayout& input,
+                                     const TensorLayout& /* scale */,
+                                     const TensorLayout& /* zero_point */,
+                                     TensorLayout& output) {
+    deduce_layout_fwd(input, output);
+}
+
+void FakeQuantForward::check_exec(const TensorLayout& input,
+                                  const TensorLayout& scale,
+                                  const TensorLayout& zero_point,
+                                  const TensorLayout& output,
+                                  size_t workspace_in_bytes) {
+    check_layout_fwd(input, scale, zero_point, output);
+    auto required_workspace_space =
+            get_workspace_in_bytes(input, scale, zero_point, output);
+    megdnn_assert(workspace_in_bytes >= required_workspace_space);
+}
+
+void FakeQuantBackward::check_exec(const TensorLayout& diff,
+                                   const TensorLayout& input,
+                                   const TensorLayout& scale,
+                                   const TensorLayout& zero_point,
+                                   const TensorLayout& grad,
+                                   size_t workspace_in_bytes) {
+    megdnn_assert_eq_shape(input, diff);
+    megdnn_assert_eq_shape(input, grad);
+    auto required_workspace_space =
+            get_workspace_in_bytes(diff, input, scale, zero_point, grad);
+    megdnn_assert(workspace_in_bytes >= required_workspace_space);
+}
+
+}  // namespace megdnn
@@ -201,7 +201,9 @@ private:
     cb(RemapBackwardMat) \
     cb(AdaptivePoolingForward) \
     cb(AdaptivePoolingBackward) \
-    cb(DctChannelSelectForward)
+    cb(DctChannelSelectForward) \
+    cb(FakeQuantForward) \
+    cb(FakeQuantBackward)
 
 /*!
  * \brief specialize HandleImpl::create_operator for a single opr type;
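With `FakeQuantForward`/`FakeQuantBackward` added to the `cb` list above, `HandleImpl::create_operator` is specialized for them and the operators become reachable from any handle. A hypothetical usage sketch; tensor allocation is elided, and `handle`, `input`, `scale`, `zero_point`, `output` and `ws_ptr` are assumed to exist:

    // Hedged sketch: create FakeQuantForward through a handle and run it.
    auto opr = handle->create_operator<megdnn::FakeQuantForward>();
    opr->param().qmin = -128;  // illustrative int8-like range
    opr->param().qmax = 127;
    megdnn::TensorLayout out_layout;
    opr->deduce_layout(input.layout, scale.layout, zero_point.layout,
                       out_layout);
    size_t ws_size = opr->get_workspace_in_bytes(
            input.layout, scale.layout, zero_point.layout, out_layout);
    // both backends in this diff report 0 workspace, but query it anyway
    opr->exec(input, scale, zero_point, output, {ws_ptr, ws_size});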
@@ -13,9 +13,9 @@
 #pragma once
 
 #include "src/common/elemwise_helper.cuh"
-#include "src/cuda/utils.cuh"
 #include "src/cuda/int_fastdiv.cuh"
 #include "src/cuda/query_blocksize.cuh"
+#include "src/cuda/utils.cuh"
 
 /*
  * please note that all arithmetics on GPU are 32-bit for best performance;
  * this
@@ -649,6 +649,102 @@ struct OpCallerUniform<Op, 3, PVis> {
     }
 };
 
+//! specialization for arity == 4
+template <class Op, class PVis>
+struct OpCallerUniform<Op, 4, PVis> {
+    Op op;
+    PVis par[4];
+    static const uint32_t packed_size = PVis::packed_size;
+
+    devfunc void thread_init(uint32_t idx) {
+        idx = idx * packed_size;
+        par[0].thread_init(idx);
+        par[1].thread_init(idx);
+        par[2].thread_init(idx);
+        par[3].thread_init(idx);
+    }
+
+    devfunc void on(uint32_t idx) {
+        idx = idx * packed_size;
+        op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
+           par[3].at(idx));
+    }
+
+    devfunc void on(uint32_t idx, uint32_t remain) {
+        idx = idx * packed_size;
+        if (remain >= packed_size) {
+            op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
+               par[3].at(idx));
+        } else {
+            auto ptr0 = par[0].ptr();
+            auto ptr1 = par[1].ptr();
+            auto ptr2 = par[2].ptr();
+            auto ptr3 = par[3].ptr();
+            for (uint32_t i = 0; i < remain; i++) {
+                op(idx + i, ptr0[par[0].offset(idx + i)],
+                   ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)],
+                   ptr3[par[3].offset(idx + i)]);
+            }
+        }
+    }
+
+    devfunc void next() {
+        par[0].next();
+        par[1].next();
+        par[2].next();
+        par[3].next();
+    }
+};
+
+//! specialization for arity == 5
+template <class Op, class PVis>
+struct OpCallerUniform<Op, 5, PVis> {
+    Op op;
+    PVis par[5];
+    static const uint32_t packed_size = PVis::packed_size;
+
+    devfunc void thread_init(uint32_t idx) {
+        idx = idx * packed_size;
+        par[0].thread_init(idx);
+        par[1].thread_init(idx);
+        par[2].thread_init(idx);
+        par[3].thread_init(idx);
+        par[4].thread_init(idx);
+    }
+
+    devfunc void on(uint32_t idx) {
+        idx = idx * packed_size;
+        op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
+           par[3].at(idx), par[4].at(idx));
+    }
+
+    devfunc void on(uint32_t idx, uint32_t remain) {
+        idx = idx * packed_size;
+        if (remain >= packed_size) {
+            op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx),
+               par[3].at(idx), par[4].at(idx));
+        } else {
+            auto ptr0 = par[0].ptr();
+            auto ptr1 = par[1].ptr();
+            auto ptr2 = par[2].ptr();
+            auto ptr3 = par[3].ptr();
+            auto ptr4 = par[4].ptr();
+            for (uint32_t i = 0; i < remain; i++) {
+                op(idx + i, ptr0[par[0].offset(idx + i)],
+                   ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)],
+                   ptr3[par[3].offset(idx + i)],
+                   ptr4[par[4].offset(idx + i)]);
+            }
+        }
+    }
+
+    devfunc void next() {
+        par[0].next();
+        par[1].next();
+        par[2].next();
+        par[3].next();
+        par[4].next();
+    }
+};
+
 /*!
  * \brief call binary (i.e. arity == 2) operator with different param
  * visitors
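The arity-4/5 callers follow the pattern of the existing arity-2/3 ones: indices are scaled by `PVis::packed_size`, a full packet goes through the vectorized `at()` accessors, and a trailing partial packet falls back to scalar loads through `ptr()`/`offset()`. A minimal CPU-side sketch of that remainder-handling idea; `Op` with `packet()`/`scalar()` members is hypothetical, not the CUDA visitor API:

    // Process elements in packets of P, with a scalar tail for the final
    // partial packet; mirrors OpCallerUniform::on(idx, remain) above.
    template <int P, class Op>
    void run_packed(const float* src, float* dst, int n, Op op) {
        int i = 0;
        for (; i + P <= n; i += P)
            op.packet(src + i, dst + i);  // fast path: one full packet
        for (; i < n; ++i)
            dst[i] = op.scalar(src[i]);   // tail: element-wise fallback
    }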
@@ -0,0 +1,30 @@
+/**
+ * \file dnn/src/cuda/fake_quant/kern.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "./kern.cuh"
+
+namespace megdnn {
+namespace cuda {
+
+#define cb(_dtype)                                                            \
+    INST_RUN_ELEMWISE(FakeQuantKernOp<DTypeTrait<_dtype>::ctype>,             \
+                      DTypeTrait<_dtype>::ctype, 2);                          \
+    INST_RUN_ELEMWISE(FakeQuantBwdKernOp<DTypeTrait<_dtype>::ctype>,          \
+                      DTypeTrait<_dtype>::ctype, 2);                          \
+    INST_RUN_ELEMWISE(FakeQuantKernOpNonContig<DTypeTrait<_dtype>::ctype>,    \
+                      DTypeTrait<_dtype>::ctype, 4);                          \
+    INST_RUN_ELEMWISE(FakeQuantBwdKernOpNonContig<DTypeTrait<_dtype>::ctype>, \
+                      DTypeTrait<_dtype>::ctype, 5);
+
+cb(megdnn::dtype::Float32)
+#undef cb
+
+}  // namespace cuda
+}  // namespace megdnn
@@ -0,0 +1,106 @@
+/**
+ * \file dnn/src/cuda/fake_quant/kern.cuh
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "src/cuda/elemwise_helper.cuh"
+#include "src/cuda/utils.cuh"
+
+#if MEGDNN_CC_HOST
+#include "megdnn/oprs.h"
+#endif
+
+namespace megdnn {
+namespace cuda {
+
+template <typename ctype>
+struct FakeQuantKernOp {
+    ctype* input;
+    ctype* output;
+    ctype qmin, qmax;
+
+    __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) {
+        ctype x = round(input[idx] / scale) + zero_point;
+        x = fmaxf(fminf(x, qmax), qmin);
+        output[idx] = (x - zero_point) * scale;
+    }
+
+#if MEGDNN_CC_HOST
+    FakeQuantKernOp(const TensorND& input, const TensorND& output,
+                    const FakeQuant::Param& param)
+            : input{input.ptr<ctype>()},
+              output{output.ptr<ctype>()},
+              qmin(param.qmin),
+              qmax(param.qmax) {}
+#endif
+};
+
+template <typename ctype>
+struct FakeQuantBwdKernOp {
+    ctype* diff;
+    ctype* input;
+    ctype* grad;
+    ctype qmin, qmax;
+
+    __device__ void operator()(uint32_t idx, ctype scale, ctype zero_point) {
+        ctype x = round(input[idx] / scale) + zero_point;
+        grad[idx] = x <= qmax && x >= qmin ? diff[idx] : 0.0;
+    }
+
+#if MEGDNN_CC_HOST
+    FakeQuantBwdKernOp(const TensorND& diff, const TensorND& input,
+                       const TensorND& grad, const FakeQuant::Param& param)
+            : diff{diff.ptr<ctype>()},
+              input{input.ptr<ctype>()},
+              grad{grad.ptr<ctype>()},
+              qmin(param.qmin),
+              qmax(param.qmax) {}
+#endif
+};
+
+template <typename ctype>
+struct FakeQuantKernOpNonContig {
+    ctype qmin;
+    ctype qmax;
+
+    __device__ void operator()(uint32_t, ctype& output, ctype input,
+                               ctype scale, ctype zero_point) {
+        ctype x = round(input / scale) + zero_point;
+        x = fmaxf(fminf(x, qmax), qmin);
+        output = (x - zero_point) * scale;
+    }
+
+#if MEGDNN_CC_HOST
+    FakeQuantKernOpNonContig(const FakeQuant::Param& param)
+            : qmin(param.qmin), qmax(param.qmax) {}
+#endif
+};
+
+template <typename ctype>
+struct FakeQuantBwdKernOpNonContig {
+    ctype qmin;
+    ctype qmax;
+
+    __device__ void operator()(uint32_t, ctype& grad, ctype diff, ctype input,
+                               ctype scale, ctype zero_point) {
+        ctype x = round(input / scale) + zero_point;
+        grad = x <= qmax && x >= qmin ? diff : 0.0;
+    }
+
+#if MEGDNN_CC_HOST
+    FakeQuantBwdKernOpNonContig(const FakeQuant::Param& param)
+            : qmin(param.qmin), qmax(param.qmax) {}
+#endif
+};
+
+}  // namespace cuda
+}  // namespace megdnn
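`FakeQuantBwdKernOp`/`FakeQuantBwdKernOpNonContig` implement the straight-through estimator: the incoming gradient is forwarded wherever the quantized value stayed inside [qmin, qmax] and zeroed where it was clipped. A quick hand-traced check of both kernels with scale = 0.5, zero_point = 0, qmin = -128, qmax = 127 (all values illustrative):

    // x = 3.14 : round(3.14 / 0.5) + 0 = 6, inside [-128, 127]
    //            -> output = (6 - 0) * 0.5 = 3.0; gradient passes through
    // x = 100.0: round(100.0 / 0.5) + 0 = 200, clipped to 127
    //            -> output = (127 - 0) * 0.5 = 63.5; gradient = 0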
@@ -0,0 +1,134 @@
+/**
+ * \file dnn/src/cuda/fake_quant/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "./opr_impl.h"
+#include "./kern.cuh"
+#include "src/common/utils.h"
+
+namespace megdnn {
+namespace cuda {
+
+void FakeQuantForwardImpl::exec(_megdnn_tensor_in input,
+                                _megdnn_tensor_in scale,
+                                _megdnn_tensor_in zero_point,
+                                _megdnn_tensor_out output,
+                                _megdnn_workspace workspace) {
+    check_exec(input.layout, scale.layout, zero_point.layout, output.layout,
+               workspace.size);
+    if (!input.layout.is_contiguous() || !output.layout.is_contiguous()) {
+        return exec_noncontig(input, scale, zero_point, output);
+    }
+    // contiguous fast path: only scale/zero_point, broadcast to the input
+    // layout, go through the param visitors
+    ElemwiseOpParamN<2> ele_param;
+    ele_param[0] = scale;
+    ele_param[0].layout = ele_param[0].layout.broadcast(input.layout);
+    ele_param[1] = zero_point;
+    ele_param[1].layout = ele_param[1].layout.broadcast(input.layout);
+    ele_param.init_from_given_tensor();
+    auto stream = cuda_stream(handle());
+#define cb(DType)                                                             \
+    if (input.layout.dtype == DType()) {                                      \
+        using T = typename DTypeTrait<DType>::ctype;                          \
+        run_elemwise<FakeQuantKernOp<T>, T, 2>(ele_param, stream,             \
+                                               {input, output, m_param});     \
+        return;                                                               \
+    }
+    cb(megdnn::dtype::Float32)
+#undef cb
+}
+
+void FakeQuantForwardImpl::exec_noncontig(_megdnn_tensor_in input,
+                                          _megdnn_tensor_in scale,
+                                          _megdnn_tensor_in zero_point,
+                                          _megdnn_tensor_out output) {
+    // stride-aware path: every tensor becomes a visitor operand
+    ElemwiseOpParamN<4> ele_param;
+    ele_param[0] = output;
+    ele_param[1] = input;
+    ele_param[2] = scale;
+    ele_param[2].layout = ele_param[2].layout.broadcast(input.layout);
+    ele_param[3] = zero_point;
+    ele_param[3].layout = ele_param[3].layout.broadcast(input.layout);
+    ele_param.init_from_given_tensor();
+    auto stream = cuda_stream(handle());
+#define cb(DType)                                                             \
+    if (input.layout.dtype == DType()) {                                      \
+        using T = typename DTypeTrait<DType>::ctype;                          \
+        run_elemwise<FakeQuantKernOpNonContig<T>, T, 4>(ele_param, stream,    \
+                                                        {m_param});           \
+        return;                                                               \
+    }
+    cb(megdnn::dtype::Float32)
+#undef cb
+}
+
+void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff,
+                                 _megdnn_tensor_in input,
+                                 _megdnn_tensor_in scale,
+                                 _megdnn_tensor_in zero_point,
+                                 _megdnn_tensor_out grad,
+                                 _megdnn_workspace workspace) {
+    check_exec(diff.layout, input.layout, scale.layout, zero_point.layout,
+               grad.layout, workspace.size);
+    if (!input.layout.is_contiguous() || !diff.layout.is_contiguous() ||
+        !grad.layout.is_contiguous()) {
+        return exec_noncontig(diff, input, scale, zero_point, grad);
+    }
+    ElemwiseOpParamN<2> ele_param;
+    ele_param[0] = scale;
+    ele_param[0].layout = ele_param[0].layout.broadcast(input.layout);
+    ele_param[1] = zero_point;
+    ele_param[1].layout = ele_param[1].layout.broadcast(input.layout);
+    ele_param.init_from_given_tensor();
+    auto m_param = param();
+    auto stream = cuda_stream(handle());
+#define cb(DType)                                                             \
+    if (grad.layout.dtype == DType()) {                                       \
+        using T = typename DTypeTrait<DType>::ctype;                          \
+        run_elemwise<FakeQuantBwdKernOp<T>, T, 2>(                            \
+                ele_param, stream, {diff, input, grad, m_param});             \
+        return;                                                               \
+    }
+    cb(megdnn::dtype::Float32)
+#undef cb
+}
+
+void FakeQuantBackwardImpl::exec_noncontig(_megdnn_tensor_in diff,
+                                           _megdnn_tensor_in input,
+                                           _megdnn_tensor_in scale,
+                                           _megdnn_tensor_in zero_point,
+                                           _megdnn_tensor_out grad) {
+    ElemwiseOpParamN<5> ele_param;
+    ele_param[0] = grad;
+    ele_param[1] = diff;
+    ele_param[2] = input;
+    ele_param[3] = scale;
+    ele_param[3].layout = ele_param[3].layout.broadcast(input.layout);
+    ele_param[4] = zero_point;
+    ele_param[4].layout = ele_param[4].layout.broadcast(input.layout);
+    ele_param.init_from_given_tensor();
+    auto m_param = param();
+    auto stream = cuda_stream(handle());
+#define cb(DType)                                                             \
+    if (grad.layout.dtype == DType()) {                                       \
+        using T = typename DTypeTrait<DType>::ctype;                          \
+        run_elemwise<FakeQuantBwdKernOpNonContig<T>, T, 5>(ele_param,         \
+                                                           stream, {m_param});\
+        return;                                                               \
+    }
+    cb(megdnn::dtype::Float32)
+#undef cb
+}
+
+}  // namespace cuda
+}  // namespace megdnn
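Two dispatch tiers are visible above: when the value tensors are contiguous, the kernel reads input/output through raw pointers and only scale/zero_point travel through the arity-2 param visitors; otherwise every tensor becomes a visitor operand (arity 4/5) so arbitrary strides are honored. In both cases scale and zero_point are first broadcast to the input layout. A hedged sketch of what that broadcast produces, with illustrative shapes and the stride-0 broadcast convention assumed:

    // Per-channel scale {1, 64, 1, 1} broadcast against input {10, 64, 20, 20}.
    TensorLayout scale_layout({1, 64, 1, 1}, dtype::Float32());
    TensorLayout input_layout({10, 64, 20, 20}, dtype::Float32());
    TensorLayout bcast = scale_layout.broadcast(input_layout);
    // bcast now has shape {10, 64, 20, 20}; the broadcast axes get stride 0,
    // so the visitor re-reads the same scale value across them.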
@@ -0,0 +1,55 @@
+/**
+ * \file dnn/src/cuda/fake_quant/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+#include "src/cuda/utils.h"
+
+namespace megdnn {
+namespace cuda {
+
+class FakeQuantForwardImpl : public FakeQuantForward {
+public:
+    using FakeQuantForward::FakeQuantForward;
+    void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
+              _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
+              _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&,
+                                  const TensorLayout&) override {
+        // the kernels are purely element-wise, so no workspace is needed
+        return 0;
+    }
+
+private:
+    void exec_noncontig(_megdnn_tensor_in input, _megdnn_tensor_in scale,
+                        _megdnn_tensor_in zero_point,
+                        _megdnn_tensor_out output);
+};
+
+class FakeQuantBackwardImpl : public FakeQuantBackward {
+public:
+    using FakeQuantBackward::FakeQuantBackward;
+    void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
+              _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
+              _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&) override {
+        return 0;
+    }
+
+private:
+    void exec_noncontig(_megdnn_tensor_in diff, _megdnn_tensor_in input,
+                        _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
+                        _megdnn_tensor_out grad);
+};
+
+}  // namespace cuda
+}  // namespace megdnn
@@ -77,6 +77,7 @@
 #include "src/cuda/roi_align/opr_impl.h"
 #include "src/cuda/batch_conv_bias/opr_impl.h"
 #include "src/cuda/remap/opr_impl.h"
+#include "src/cuda/fake_quant/opr_impl.h"
 
 namespace megdnn {
 namespace cuda {
@@ -0,0 +1,118 @@
+/**
+ * \file dnn/src/naive/fake_quant/opr_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "src/naive/fake_quant/opr_impl.h"
+
+#include <cmath>
+
+#include "megdnn/tensor_iter.h"
+#include "src/common/elemwise_helper.cuh"
+#include "src/common/utils.h"
+#include "src/naive/handle.h"
+
+namespace {
+
+using namespace megdnn;
+
+template <typename T>
+void forward_impl(const ElemwiseOpParamN<4> src, float qmin, float qmax) {
+    auto inp = tensor_iter_valonly<T>(src[0]).begin();
+    auto out = tensor_iter_valonly<T>(src[1]).begin();
+    auto scale = tensor_iter_valonly<T>(src[2]).begin();
+    auto zero_point = tensor_iter_valonly<T>(src[3]).begin();
+    size_t total = src[0].layout.total_nr_elems();
+    for (size_t i = 0; i < total; ++i) {
+        T x = round(*inp / (*scale)) + *zero_point;
+        x = x <= qmin ? qmin : x;
+        x = x >= qmax ? qmax : x;
+        *out = (x - *zero_point) * *scale;
+        ++inp;
+        ++out;
+        ++scale;
+        ++zero_point;
+    }
+}
+
+template <typename T>
+void backward_impl(const ElemwiseOpParamN<5> src, float qmin, float qmax) {
+    auto diff = tensor_iter_valonly<T>(src[0]).begin();
+    auto input = tensor_iter_valonly<T>(src[1]).begin();
+    auto scale = tensor_iter_valonly<T>(src[2]).begin();
+    auto zero_point = tensor_iter_valonly<T>(src[3]).begin();
+    auto grad = tensor_iter_valonly<T>(src[4]).begin();
+    size_t total = src[0].layout.total_nr_elems();
+    for (size_t i = 0; i < total; ++i) {
+        T x = round(*input / (*scale)) + *zero_point;
+        *grad = (x >= qmin && x <= qmax) ? *diff : 0.0;
+        ++diff;
+        ++input;
+        ++scale;
+        ++zero_point;
+        ++grad;
+    }
+}
+
+}  // namespace
+
+namespace megdnn {
+namespace naive {
+
+void FakeQuantForwardImpl::exec(_megdnn_tensor_in input,
+                                _megdnn_tensor_in scale,
+                                _megdnn_tensor_in zero_point,
+                                _megdnn_tensor_out output,
+                                _megdnn_workspace workspace) {
+    check_exec(input.layout, scale.layout, zero_point.layout, output.layout,
+               workspace.size);
+    ElemwiseOpParamN<4> src;
+    src[0] = input;
+    src[1] = output;
+    src[2] = scale;
+    src[2].layout = src[2].layout.broadcast(input.layout);
+    src[3] = zero_point;
+    src[3].layout = src[3].layout.broadcast(input.layout);
+#define cb(DType)                                                          \
+    if (input.layout.dtype == DType()) {                                   \
+        using T = typename DTypeTrait<DType>::ctype;                       \
+        MEGDNN_DISPATCH_CPU_KERN_OPR(                                      \
+                forward_impl<T>(src, param().qmin, param().qmax));         \
+        return;                                                            \
+    }
+    cb(dtype::Float32)
+#undef cb
+}
+
+void FakeQuantBackwardImpl::exec(_megdnn_tensor_in diff,
+                                 _megdnn_tensor_in input,
+                                 _megdnn_tensor_in scale,
+                                 _megdnn_tensor_in zero_point,
+                                 _megdnn_tensor_out grad,
+                                 _megdnn_workspace workspace) {
+    check_exec(diff.layout, input.layout, scale.layout, zero_point.layout,
+               grad.layout, workspace.size);
+    ElemwiseOpParamN<5> src;
+    src[0] = diff;
+    src[1] = input;
+    src[2] = scale;
+    src[2].layout = src[2].layout.broadcast(input.layout);
+    src[3] = zero_point;
+    src[3].layout = src[3].layout.broadcast(input.layout);
+    src[4] = grad;
+#define cb(DType)                                                          \
+    if (diff.layout.dtype == DType() && grad.layout.dtype == DType() &&    \
+        input.layout.dtype == DType()) {                                   \
+        using T = typename DTypeTrait<DType>::ctype;                       \
+        MEGDNN_DISPATCH_CPU_KERN_OPR(                                      \
+                backward_impl<T>(src, param().qmin, param().qmax));        \
+        return;                                                            \
+    }
+    cb(dtype::Float32)
+#undef cb
+}
+
+}  // namespace naive
+}  // namespace megdnn
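Because these loops are the semantic reference for every backend, a unit test can compare any implementation against a plain-buffer oracle. A self-contained sketch under the assumption of contiguous storage and a single scalar (scale, zero_point) pair; this is not the megdnn iterator API:

    #include <cmath>
    #include <cstddef>

    // Reference oracle mirroring forward_impl for the common scalar case.
    void fake_quant_reference(const float* in, float* out, size_t n,
                              float scale, float zero_point, float qmin,
                              float qmax) {
        for (size_t i = 0; i < n; ++i) {
            float x = std::round(in[i] / scale) + zero_point;
            if (x < qmin) x = qmin;
            if (x > qmax) x = qmax;
            out[i] = (x - zero_point) * scale;
        }
    }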
@@ -0,0 +1,45 @@
+/**
+ * \file dnn/src/naive/fake_quant/opr_impl.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace naive {
+
+class FakeQuantForwardImpl : public FakeQuantForward {
+public:
+    using FakeQuantForward::FakeQuantForward;
+    void exec(_megdnn_tensor_in input, _megdnn_tensor_in scale,
+              _megdnn_tensor_in zero_point, _megdnn_tensor_out output,
+              _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&,
+                                  const TensorLayout&) override {
+        return 0;
+    }
+};
+
+class FakeQuantBackwardImpl : public FakeQuantBackward {
+public:
+    using FakeQuantBackward::FakeQuantBackward;
+    void exec(_megdnn_tensor_in diff, _megdnn_tensor_in input,
+              _megdnn_tensor_in scale, _megdnn_tensor_in zero_point,
+              _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&, const TensorLayout&,
+                                  const TensorLayout&) override {
+        return 0;
+    }
+};
+
+}  // namespace naive
+}  // namespace megdnn
@@ -79,6 +79,8 @@
 #include "src/naive/warp_affine/opr_impl.h"
 #include "src/naive/warp_perspective/opr_impl.h"
 #include "src/naive/winograd_filter_preprocess/opr_impl.h"
+#include "src/naive/remap/opr_impl.h"
+#include "src/naive/fake_quant/opr_impl.h"
 
 static size_t g_image2d_pitch_alignment = 1;
@@ -0,0 +1,60 @@
+/**
+ * \file dnn/test/common/fake_quant.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "megdnn/basic_types.h"
+#include "megdnn/opr_param_defs.h"
+
+namespace megdnn {
+namespace test {
+namespace fake_quant {
+
+struct TestArg {
+    param::FakeQuant param;
+    TensorShape ishape;
+    TensorShape scale_shape;
+    TensorShape zeropoint_shape;
+    TestArg(param::FakeQuant param, TensorShape ishape,
+            TensorShape scale_shape, TensorShape zeropoint_shape)
+            : param(param),
+              ishape(ishape),
+              scale_shape(scale_shape),
+              zeropoint_shape(zeropoint_shape) {}
+};
+
+inline std::vector<TestArg> get_args() {
+    std::vector<TestArg> args;
+    param::FakeQuant cur_param;
+    cur_param.qmin = -128;
+    cur_param.qmax = 128;
+    // scalar scale/zero_point broadcast over the whole input
+    for (size_t i = 10; i < 40; i += 2) {
+        args.emplace_back(cur_param, TensorShape{10, 64, i, i},
+                          TensorShape{1}, TensorShape{1});
+    }
+    // shapes that exercise broadcasting on the last two axes
+    for (size_t m : {1, 10})
+        for (size_t n : {1, 10})
+            for (size_t j : {1, 10})
+                for (size_t k : {1, 10}) {
+                    args.emplace_back(cur_param, TensorShape{10, 64, 10, 10},
+                                      TensorShape{10, 64, m, n},
+                                      TensorShape{10, 64, j, k});
+                }
+    return args;
+}
+
+}  // namespace fake_quant
+}  // namespace test
+}  // namespace megdnn
@@ -111,6 +111,8 @@ DEF(Remap, 3, true, true);
 DEF(RemapBackwardData, 3, true, false);
 DEF(RemapBackwardMat, 4, true, false);
 DEF(DctChannelSelectForward, 4, true, true);
+DEF(FakeQuantForward, 4, true, true);
+DEF(FakeQuantBackward, 5, true, false);
 
 }  // namespace test
 }  // namespace megdnn
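For readers unfamiliar with this trait table, the assumed reading of the DEF columns is operator, tensor arity, has-workspace, can-deduce-layout; the column names are inferred from how the checker uses the traits, not confirmed by this diff:

    // Assumed meaning (hypothetical column names):
    //   DEF(FakeQuantForward,  4, true, true);
    //       4 tensors (input, scale, zero_point, output); workspace is
    //       queried; the output layout can be deduced from the inputs.
    //   DEF(FakeQuantBackward, 5, true, false);
    //       5 tensors (diff, input, scale, zero_point, grad); the grad
    //       layout must be supplied explicitly.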
@@ -0,0 +1,99 @@
+/**
+ * \file dnn/test/cuda/fake_quant.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied.
+ */
+
+#include "test/common/fake_quant.h"
+#include "megdnn/oprs.h"
+#include "test/common/checker.h"
+#include "test/cuda/fixture.h"
+
+namespace megdnn {
+namespace test {
+
+using namespace fake_quant;
+
+TEST_F(CUDA, FAKE_QUANT) {
+    std::vector<TestArg> args = get_args();
+    auto dtype = dtype::Float32();
+    for (auto&& arg : args) {
+        auto param = arg.param;
+        auto ishape = arg.ishape;
+        auto scale_shape = arg.scale_shape;
+        auto zeropoint_shape = arg.zeropoint_shape;
+        Checker<FakeQuantForward> checker(handle_cuda());
+        checker.set_param(param)
+                .set_dtype(0, dtype)
+                .set_dtype(1, dtype)
+                .set_dtype(2, dtype)
+                .set_dtype(3, dtype)
+                .execs(TensorShapeArray{ishape, scale_shape, zeropoint_shape,
+                                        ishape});
+    }
+    // test noncontiguous layout: double the batch stride so the input and
+    // output layouts are not contiguous
+    for (auto&& arg : args) {
+        auto param = arg.param;
+        auto ishape = arg.ishape;
+        auto scale_shape = arg.scale_shape;
+        auto zeropoint_shape = arg.zeropoint_shape;
+        Checker<FakeQuantForward> checker(handle_cuda());
+        TensorLayout ilayout(
+                ishape,
+                {(long int)(ishape[1] * ishape[2] * ishape[3] * 2),
+                 (long int)(ishape[2] * ishape[3]), (long int)ishape[3], 1},
+                dtype::Float32());
+        checker.set_param(param).execl({ilayout,
+                                        {scale_shape, dtype::Float32()},
+                                        {zeropoint_shape, dtype::Float32()},
+                                        ilayout});
+    }
+}
+
+TEST_F(CUDA, FAKE_QUANT_BACKWARD) {
+    std::vector<TestArg> args = get_args();
+    auto dtype = dtype::Float32();
+    for (auto&& arg : args) {
+        auto param = arg.param;
+        auto ishape = arg.ishape;
+        auto scale_shape = arg.scale_shape;
+        auto zeropoint_shape = arg.zeropoint_shape;
+        Checker<FakeQuantBackward> checker(handle_cuda());
+        checker.set_param(param)
+                .set_dtype(0, dtype)
+                .set_dtype(1, dtype)
+                .set_dtype(2, dtype)
+                .set_dtype(3, dtype)
+                .set_dtype(4, dtype)
+                .execs(TensorShapeArray{ishape, ishape, scale_shape,
+                                        zeropoint_shape, ishape});
+    }
+    // test noncontiguous layout, again via a doubled batch stride
+    for (auto&& arg : args) {
+        auto param = arg.param;
+        auto ishape = arg.ishape;
+        auto scale_shape = arg.scale_shape;
+        auto zeropoint_shape = arg.zeropoint_shape;
+        Checker<FakeQuantBackward> checker(handle_cuda());
+        TensorLayout ilayout(
+                ishape,
+                {(long int)(ishape[1] * ishape[2] * ishape[3] * 2),
+                 (long int)(ishape[2] * ishape[3]), (long int)ishape[3], 1},
+                dtype::Float32());
+        checker.set_param(param).execl({ilayout,
+                                        ilayout,
+                                        {scale_shape, dtype::Float32()},
+                                        {zeropoint_shape, dtype::Float32()},
+                                        ilayout});
+    }
+}
+
+}  // namespace test
+}  // namespace megdnn