Merge pull request !5349 from limingqi107/master
@@ -63,19 +63,21 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     if (!CheckParam(kernel_node)) {
       return false;
     }
     axis_ = GetAttr<int>(kernel_node, "axis");
     if (axis_ < 0) {
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
       axis_ += SizeToInt(input_shape.size());
     }
+    auto origin_data_format = AnfAlgo::GetOriginDataFormat(kernel_node);
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    axis_ = AxisTransform(origin_data_format, input_format, axis_);
     input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
     inputs_host_ = std::make_unique<T *[]>(input_num_);
     len_axis_ = std::make_unique<int[]>(input_num_);
     for (int i = 0; i < input_num_; i++) {
       size_t input_size = 1;
-      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
+      auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
         input_size *= input_shape[j];
       }
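For reference, the negative-axis handling above only shifts the attribute into the non-negative range (now against the device shape's rank) before AxisTransform remaps it. A minimal standalone sketch of that normalization, with illustrative names rather than the kernel's actual members, assuming a rank-4 device shape:

```cpp
#include <cassert>
#include <vector>

// Hypothetical standalone version of the axis normalization done in Init():
// a negative axis is counted from the end of the device shape.
int NormalizeAxis(int axis, const std::vector<size_t> &device_shape) {
  if (axis < 0) {
    axis += static_cast<int>(device_shape.size());
  }
  return axis;
}

int main() {
  std::vector<size_t> shape = {8, 32, 16, 16};  // rank-4 device shape
  assert(NormalizeAxis(-1, shape) == 3);        // last axis
  assert(NormalizeAxis(1, shape) == 1);         // non-negative axes are unchanged
  return 0;
}
```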
@@ -85,7 +87,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     workspace_size_list_.push_back(sizeof(T *) * input_num_);
     workspace_size_list_.push_back(sizeof(int) * input_num_);
-    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
     output_size_ = 1;
     for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
       output_size_ *= output_shape[i];
@@ -98,7 +100,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
       }
     }
     output_size_list_.push_back(output_size_ * sizeof(T));
     InitSizeLists();
     return true;
   }
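The remaining Init() changes only swap the inferred shapes for the device shapes; the size bookkeeping itself is unchanged. A simplified standalone sketch of that bookkeeping (hypothetical helper, not the kernel class itself): each input's element count is the product of its device shape, converted to bytes of T, and the workspace holds the device-side array of input pointers plus the per-input axis lengths.

```cpp
#include <cstddef>
#include <vector>

// Simplified sketch of how Init() fills the size lists from device shapes.
template <typename T>
void FillSizeLists(const std::vector<std::vector<size_t>> &input_shapes,
                   std::vector<size_t> *input_size_list,
                   std::vector<size_t> *workspace_size_list) {
  size_t input_num = input_shapes.size();
  for (const auto &shape : input_shapes) {
    size_t input_size = 1;
    for (size_t dim : shape) {
      input_size *= dim;
    }
    input_size_list->push_back(input_size * sizeof(T));
  }
  // Device-side array of input pointers and per-input lengths along the concat axis.
  workspace_size_list->push_back(sizeof(T *) * input_num);
  workspace_size_list->push_back(sizeof(int) * input_num);
}
```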
@@ -22,6 +22,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <map>
 #include "backend/kernel_compiler/kernel.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "runtime/device/gpu/gpu_device_manager.h"
@@ -31,6 +32,19 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
 namespace mindspore {
 namespace kernel {
+static std::map<int, int> kNCHWToNHWCAxisMap = {
+  {0, 0},
+  {1, 3},
+  {2, 1},
+  {3, 2},
+};
+static std::map<int, int> kNHWCToNCHWAxisMap = {
+  {0, 0},
+  {1, 2},
+  {2, 3},
+  {3, 1},
+};
 class GpuKernel : public KernelMod {
  public:
   virtual ~GpuKernel() = default;
@@ -74,6 +88,18 @@ class GpuKernel : public KernelMod {
     dst->push_back(src.size() == 0 ? 1 : SizeToInt(src[src.size() - 1]));
   }
+  int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) {
+    if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) &&
+        (cal_format == kOpFormat_NHWC)) {
+      return kNCHWToNHWCAxisMap[axis];
+    } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) &&
+               (origin_data_format == kOpFormat_NHWC)) {
+      return kNHWCToNCHWAxisMap[axis];
+    } else {
+      return axis;
+    }
+  }
   // transpose shape: NCHW To NHWC
   void ShapeNCHW2NHWC(std::vector<size_t> *shape) {
     std::swap((*shape)[1], (*shape)[3]);
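Together, the two static maps and AxisTransform let a kernel translate an axis attribute given in the origin data format into the format it actually computes in. A self-contained sketch of the same logic, where string literals stand in for the kOpFormat_* constants used in the real header:

```cpp
#include <cassert>
#include <map>
#include <string>

static std::map<int, int> kNCHWToNHWCAxisMap = {{0, 0}, {1, 3}, {2, 1}, {3, 2}};
static std::map<int, int> kNHWCToNCHWAxisMap = {{0, 0}, {1, 2}, {2, 3}, {3, 1}};

// Standalone copy of the AxisTransform logic: remap only when the origin
// format and the computation format disagree, otherwise keep the axis.
int AxisTransform(const std::string &origin, const std::string &cal, int axis) {
  if ((origin == "NCHW" || origin == "DefaultFormat") && cal == "NHWC") {
    return kNCHWToNHWCAxisMap[axis];
  } else if ((cal == "NCHW" || cal == "DefaultFormat") && origin == "NHWC") {
    return kNHWCToNCHWAxisMap[axis];
  }
  return axis;
}

int main() {
  assert(AxisTransform("NCHW", "NHWC", 1) == 3);  // channel axis moves to the end
  assert(AxisTransform("NCHW", "NCHW", 1) == 1);  // same format: no change
  return 0;
}
```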
@@ -82,7 +82,7 @@ class AddNGpuFwdKernel : public GpuKernel {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
       return false;
     }
-    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
     is_null_input_ = CHECK_NULL_INPUT(input_shape);
     if (is_null_input_) {
       MS_LOG(WARNING) << "AddNGpuFwdKernel input is null";
@@ -96,9 +96,16 @@ class AddNGpuFwdKernel : public GpuKernel {
     for (size_t i = 0; i < input_shape.size(); i++) {
       dimA[i] = SizeToInt(input_shape[i]);
     }
-    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
-                                                             SizeToInt(input_shape.size()), dimA),
-                                "cudnnSetTensorNdDescriptor failed");
+    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+    if (input_format == kOpFormat_NHWC) {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    } else {
+      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
+                                                               SizeToInt(input_shape.size()), dimA),
+                                  "cudnnSetTensorNdDescriptor failed");
+    }
     InitSizeLists();
     return true;
   }
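The AddN change selects the cuDNN tensor format from the kernel's input format instead of hard-coding NCHW. A reduced sketch of just that selection (assumes the cuDNN headers; a string literal replaces the kOpFormat_NHWC constant, and error handling is left to the caller rather than the CHECK_CUDNN_RET_WITH_EXCEPT macro):

```cpp
#include <cudnn.h>
#include <string>
#include <vector>

// Sketch: describe an input tensor to cuDNN using NHWC when the kernel's
// input format says so, and NCHW otherwise (mirrors the branch added above).
cudnnStatus_t DescribeInput(cudnnTensorDescriptor_t desc, cudnnDataType_t data_type,
                            const std::string &input_format, const std::vector<int> &dims) {
  cudnnTensorFormat_t format = (input_format == "NHWC") ? CUDNN_TENSOR_NHWC : CUDNN_TENSOR_NCHW;
  return cudnnSetTensorNdDescriptorEx(desc, format, data_type,
                                      static_cast<int>(dims.size()), dims.data());
}
```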
@@ -194,6 +194,12 @@ void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeId
   auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW;
   MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format;
   auto inputs_format_position = iter->second.first;
+  // If the recorded input positions are empty, insert every input position, because this op takes a variable number of inputs.
+  if (inputs_format_position.size() == 0) {
+    for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); input_index++) {
+      inputs_format_position.push_back(input_index);
+    }
+  }
   for (const auto &input_format_position : inputs_format_position) {
     if (input_format_position >= inputs_format->size()) {
       MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size ["
@@ -30,6 +30,7 @@ namespace mindspore {
 namespace device {
 namespace gpu {
 // map<opName, (inputFormatPosition, outputFormatPosition)>, used for getting the insert position of format transform.
+// If the input positions are empty, every input position is inserted, because such ops take a variable number of inputs.
 static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = {
   {prim::kPrimConv2D->name(), {{0, 1}, {0}}},
   {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}},
@@ -47,6 +48,8 @@ static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>
   {kFusedBatchNormGradEx, {{0, 1}, {0}}},
   {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}},
   {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}},
+  {prim::kPrimConcat->name(), {{}, {0}}},
+  {prim::kPrimAddN->name(), {{}, {0}}},
 };
 void SetKernelInfo(const CNodePtr &apply_kernel_ptr);
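The new Concat and AddN entries register an empty input-position vector, which UpdateKernelFormatInfo expands to every input because these ops take a variable number of inputs. A standalone sketch of that lookup-and-expand convention (generic names and a plain input count; the real code operates on the kernel node):

```cpp
#include <map>
#include <string>
#include <utility>
#include <vector>

using FormatPositions = std::pair<std::vector<size_t>, std::vector<size_t>>;

// Returns the input positions whose format should be transformed for op_name.
// An empty recorded vector means "all inputs", since ops like Concat and AddN
// have a variable number of inputs.
std::vector<size_t> InputFormatPositions(
    const std::map<std::string, FormatPositions> &format_position_map,
    const std::string &op_name, size_t input_num) {
  auto iter = format_position_map.find(op_name);
  if (iter == format_position_map.end()) {
    return {};  // op not registered: no format transform is inserted
  }
  std::vector<size_t> positions = iter->second.first;
  if (positions.empty()) {
    for (size_t i = 0; i < input_num; ++i) {
      positions.push_back(i);
    }
  }
  return positions;
}
```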