|
- /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright notice, this list of
- * conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright notice, this list of
- * conditions and the following disclaimer in the documentation and/or other materials
- * provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- * to endorse or promote products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- /**
- * \file dnn/src/cuda/convolution_helper/kernel.cuh
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
- #pragma once
- #include "src/cuda/convolution_helper/bias_visitor.cuh"
- #include "src/cuda/convolution_helper/config.cuh"
- #include "src/cuda/convolution_helper/conv_trait/conv_trait.cuh"
- #include "src/cuda/convolution_helper/epilogue.cuh"
-
- namespace megdnn {
- namespace cuda {
- namespace convolution {
-
- template <typename ConvTrait, typename BiasVisitor, typename Epilogue>
- __global__ void convolution_kernel(
- const typename ConvTrait::src_dtype* __restrict__ src,
- const typename ConvTrait::filter_dtype* __restrict__ filter,
- BiasVisitor bias, Epilogue epilogue, typename ConvTrait::Param param,
- float alpha, float beta) {
- static bool constexpr check_bounds = ConvTrait::check_bounds;
- using BlockTileIterator = typename ConvTrait::BlockTileIterator;
- BlockTileIterator block_iterator;
- // determine batch, out_channel, out_height, out_width of current thread
- // block
- block_iterator.template init_with_param<ConvTrait::Param>(param);
-
- using DataTileCount = typename ConvTrait::DataTileCount;
- using FilterTileCount = typename ConvTrait::FilterTileCount;
-
- using DataGlobal2ShareMemVisitor =
- typename ConvTrait::DataGlobal2ShareMemVisitor;
- using FilterGlobal2ShareMemVisitor =
- typename ConvTrait::FilterGlobal2ShareMemVisitor;
-
- using smem_storage_dtype = typename ConvTrait::smem_storage_dtype;
- extern __shared__ smem_storage_dtype smem[];
- smem_storage_dtype* smem_src = smem;
- smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot;
- smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot;
-
- DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src};
- FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
- if (check_bounds) {
- block_iterator.set_remain(src_gl2sh_visitor,
- filter_gl2sh_visitor);
- }
-
- using BlockConsumer = typename ConvTrait::BlockConsumer;
- BlockConsumer block_consumer;
- block_consumer.init_accumulator();
-
- block_iterator.template iterate_with_param<typename ConvTrait::InputLayout,
- typename ConvTrait::KernLayout>(
- src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor,
- block_consumer);
-
- using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter;
- GlobalMemoryWriter global_memory_writer;
- global_memory_writer.init(smem_dst, alpha, beta);
- if (check_bounds) {
- block_iterator.set_remain(global_memory_writer);
- }
- bias.move(block_iterator.block_batch, block_iterator.block_out_channel,
- block_iterator.block_out_height, block_iterator.block_out_width);
- epilogue.move(block_iterator.block_batch, block_iterator.block_out_channel,
- block_iterator.block_out_height,
- block_iterator.block_out_width);
- global_memory_writer.template write<check_bounds>(bias, epilogue,
- block_consumer);
- }
-
- template <typename ConvTrait, typename BiasVisitor, typename Epilogue>
- __global__ void convolution_kernel_precomp_offset(
- const typename ConvTrait::src_dtype* __restrict__ src,
- const typename ConvTrait::filter_dtype* __restrict__ filter,
- const int* __restrict__ offset, BiasVisitor bias, Epilogue epilogue,
- typename ConvTrait::Param param, float alpha, float beta) {
- static bool constexpr check_bounds = ConvTrait::check_bounds;
- using BlockTileIterator = typename ConvTrait::BlockTileIterator;
- BlockTileIterator block_iterator;
- // determine batch, out_channel, out_height, out_width of current thread
- // block
- block_iterator.template init_with_param<ConvTrait::Param>(param);
-
- using DataTileCount = typename ConvTrait::DataTileCount;
- using FilterTileCount = typename ConvTrait::FilterTileCount;
-
- using DataGlobal2ShareMemVisitor =
- typename ConvTrait::DataGlobal2ShareMemVisitor;
- using FilterGlobal2ShareMemVisitor =
- typename ConvTrait::FilterGlobal2ShareMemVisitor;
-
- using smem_storage_dtype = typename ConvTrait::smem_storage_dtype;
- extern __shared__ smem_storage_dtype smem[];
- smem_storage_dtype* smem_src = smem;
- smem_storage_dtype* smem_filter = smem + DataTileCount::smem_tot;
- smem_storage_dtype* smem_dst = smem_filter + FilterTileCount::smem_tot;
-
- DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset};
- FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter};
- if (check_bounds) {
- block_iterator.set_remain(src_gl2sh_visitor,
- filter_gl2sh_visitor);
- }
-
- using BlockConsumer = typename ConvTrait::BlockConsumer;
- BlockConsumer block_consumer;
- block_consumer.init_accumulator();
-
- block_iterator.template iterate_with_param<typename ConvTrait::InputLayout,
- typename ConvTrait::KernLayout>(
- src, filter, param, src_gl2sh_visitor, filter_gl2sh_visitor,
- block_consumer);
-
- using GlobalMemoryWriter = typename ConvTrait::GlobalMemoryWriter;
- GlobalMemoryWriter global_memory_writer;
- global_memory_writer.init(smem_dst, alpha, beta);
- if (check_bounds) {
- block_iterator.set_remain(global_memory_writer);
- }
- bias.move(block_iterator.block_batch, block_iterator.block_out_channel,
- block_iterator.block_out_height, block_iterator.block_out_width);
- epilogue.move(block_iterator.block_batch, block_iterator.block_out_channel,
- block_iterator.block_out_height,
- block_iterator.block_out_width);
- global_memory_writer.template write<check_bounds>(bias, epilogue,
- block_consumer);
- }
-
- } // namespace convolution
- } // namespace cuda
- } // namespace megdnn
-
- // vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}}
|