Merge pull request !5349 from limingqi107/mastertags/v1.0.0
| @@ -63,19 +63,21 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||||
| if (!CheckParam(kernel_node)) { | if (!CheckParam(kernel_node)) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| axis_ = GetAttr<int>(kernel_node, "axis"); | axis_ = GetAttr<int>(kernel_node, "axis"); | ||||
| if (axis_ < 0) { | if (axis_ < 0) { | ||||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||||
| axis_ += SizeToInt(input_shape.size()); | axis_ += SizeToInt(input_shape.size()); | ||||
| } | } | ||||
| auto origin_data_format = AnfAlgo::GetOriginDataFormat(kernel_node); | |||||
| auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0); | |||||
| axis_ = AxisTransform(origin_data_format, input_format, axis_); | |||||
| input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node)); | input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node)); | ||||
| inputs_host_ = std::make_unique<T *[]>(input_num_); | inputs_host_ = std::make_unique<T *[]>(input_num_); | ||||
| len_axis_ = std::make_unique<int[]>(input_num_); | len_axis_ = std::make_unique<int[]>(input_num_); | ||||
| for (int i = 0; i < input_num_; i++) { | for (int i = 0; i < input_num_; i++) { | ||||
| size_t input_size = 1; | size_t input_size = 1; | ||||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); | |||||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i); | |||||
| for (size_t j = 0; j < input_shape.size(); j++) { | for (size_t j = 0; j < input_shape.size(); j++) { | ||||
| input_size *= input_shape[j]; | input_size *= input_shape[j]; | ||||
| } | } | ||||
| @@ -85,7 +87,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||||
| workspace_size_list_.push_back(sizeof(T *) * input_num_); | workspace_size_list_.push_back(sizeof(T *) * input_num_); | ||||
| workspace_size_list_.push_back(sizeof(int) * input_num_); | workspace_size_list_.push_back(sizeof(int) * input_num_); | ||||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||||
| auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||||
| output_size_ = 1; | output_size_ = 1; | ||||
| for (int i = 0; i < SizeToInt(output_shape.size()); i++) { | for (int i = 0; i < SizeToInt(output_shape.size()); i++) { | ||||
| output_size_ *= output_shape[i]; | output_size_ *= output_shape[i]; | ||||
| @@ -98,7 +100,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||||
| } | } | ||||
| } | } | ||||
| output_size_list_.push_back(output_size_ * sizeof(T)); | output_size_list_.push_back(output_size_ * sizeof(T)); | ||||
| InitSizeLists(); | InitSizeLists(); | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -22,6 +22,7 @@ | |||||
| #include <string> | #include <string> | ||||
| #include <vector> | #include <vector> | ||||
| #include <utility> | #include <utility> | ||||
| #include <map> | |||||
| #include "backend/kernel_compiler/kernel.h" | #include "backend/kernel_compiler/kernel.h" | ||||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | #include "backend/kernel_compiler/gpu/kernel_constants.h" | ||||
| #include "runtime/device/gpu/gpu_device_manager.h" | #include "runtime/device/gpu/gpu_device_manager.h" | ||||
| @@ -31,6 +32,19 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm; | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| static std::map<int, int> kNCHWToNHWCAxisMap = { | |||||
| {0, 0}, | |||||
| {1, 3}, | |||||
| {2, 1}, | |||||
| {3, 2}, | |||||
| }; | |||||
| static std::map<int, int> kNHWCToNCHWAxisMap = { | |||||
| {0, 0}, | |||||
| {1, 2}, | |||||
| {2, 3}, | |||||
| {3, 1}, | |||||
| }; | |||||
| class GpuKernel : public KernelMod { | class GpuKernel : public KernelMod { | ||||
| public: | public: | ||||
| virtual ~GpuKernel() = default; | virtual ~GpuKernel() = default; | ||||
| @@ -74,6 +88,18 @@ class GpuKernel : public KernelMod { | |||||
| dst->push_back(src.size() == 0 ? 1 : SizeToInt(src[src.size() - 1])); | dst->push_back(src.size() == 0 ? 1 : SizeToInt(src[src.size() - 1])); | ||||
| } | } | ||||
| int AxisTransform(const std::string &origin_data_format, const std::string &cal_format, int axis) { | |||||
| if (((origin_data_format == kOpFormat_DEFAULT) || (origin_data_format == kOpFormat_NCHW)) && | |||||
| (cal_format == kOpFormat_NHWC)) { | |||||
| return kNCHWToNHWCAxisMap[axis]; | |||||
| } else if (((cal_format == kOpFormat_DEFAULT) || (cal_format == kOpFormat_NCHW)) && | |||||
| (origin_data_format == kOpFormat_NHWC)) { | |||||
| return kNHWCToNCHWAxisMap[axis]; | |||||
| } else { | |||||
| return axis; | |||||
| } | |||||
| } | |||||
| // transpose shape: NCHW To NHWC | // transpose shape: NCHW To NHWC | ||||
| void ShapeNCHW2NHWC(std::vector<size_t> *shape) { | void ShapeNCHW2NHWC(std::vector<size_t> *shape) { | ||||
| std::swap((*shape)[1], (*shape)[3]); | std::swap((*shape)[1], (*shape)[3]); | ||||
| @@ -82,7 +82,7 @@ class AddNGpuFwdKernel : public GpuKernel { | |||||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output."; | MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output."; | ||||
| return false; | return false; | ||||
| } | } | ||||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||||
| is_null_input_ = CHECK_NULL_INPUT(input_shape); | is_null_input_ = CHECK_NULL_INPUT(input_shape); | ||||
| if (is_null_input_) { | if (is_null_input_) { | ||||
| MS_LOG(WARNING) << "AddNGpuFwdKernel input is null"; | MS_LOG(WARNING) << "AddNGpuFwdKernel input is null"; | ||||
| @@ -96,9 +96,16 @@ class AddNGpuFwdKernel : public GpuKernel { | |||||
| for (size_t i = 0; i < input_shape.size(); i++) { | for (size_t i = 0; i < input_shape.size(); i++) { | ||||
| dimA[i] = SizeToInt(input_shape[i]); | dimA[i] = SizeToInt(input_shape[i]); | ||||
| } | } | ||||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||||
| SizeToInt(input_shape.size()), dimA), | |||||
| "cudnnSetTensorNdDescriptor failed"); | |||||
| auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0); | |||||
| if (input_format == kOpFormat_NHWC) { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_, | |||||
| SizeToInt(input_shape.size()), dimA), | |||||
| "cudnnSetTensorNdDescriptor failed"); | |||||
| } else { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, | |||||
| SizeToInt(input_shape.size()), dimA), | |||||
| "cudnnSetTensorNdDescriptor failed"); | |||||
| } | |||||
| InitSizeLists(); | InitSizeLists(); | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -194,6 +194,12 @@ void UpdateKernelFormatInfo(const CNodePtr &kernel_node, const std::vector<TypeI | |||||
| auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW; | auto cal_format = (inputs_type[0] == kNumberTypeFloat16) ? kOpFormat_NHWC : kOpFormat_NCHW; | ||||
| MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format; | MS_LOG(DEBUG) << "Kernel node: " << kernel_node->fullname_with_scope() << ", format: " << cal_format; | ||||
| auto inputs_format_position = iter->second.first; | auto inputs_format_position = iter->second.first; | ||||
| // If input position is empty, then insert all the input positions, because the input numbers of this op are variable. | |||||
| if (inputs_format_position.size() == 0) { | |||||
| for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(kernel_node); input_index++) { | |||||
| inputs_format_position.push_back(input_index); | |||||
| } | |||||
| } | |||||
| for (const auto &input_format_position : inputs_format_position) { | for (const auto &input_format_position : inputs_format_position) { | ||||
| if (input_format_position >= inputs_format->size()) { | if (input_format_position >= inputs_format->size()) { | ||||
| MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size [" | MS_LOG(EXCEPTION) << "The position [" << input_format_position << "] is out of range of the input size [" | ||||
| @@ -30,6 +30,7 @@ namespace mindspore { | |||||
| namespace device { | namespace device { | ||||
| namespace gpu { | namespace gpu { | ||||
| // map<opName, (inputFormatPosition, outputFormatPosition)>, used for getting the insert position of format transform. | // map<opName, (inputFormatPosition, outputFormatPosition)>, used for getting the insert position of format transform. | ||||
| // If input position is empty, then insert all the input positions, because the input numbers of this op are variable. | |||||
| static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = { | static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>>> kKernelFormatPositionMap = { | ||||
| {prim::kPrimConv2D->name(), {{0, 1}, {0}}}, | {prim::kPrimConv2D->name(), {{0, 1}, {0}}}, | ||||
| {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}}, | {prim::kPrimConv2DBackpropInput->name(), {{0, 1}, {0}}}, | ||||
| @@ -47,6 +48,8 @@ static std::map<std::string, std::pair<std::vector<size_t>, std::vector<size_t>> | |||||
| {kFusedBatchNormGradEx, {{0, 1}, {0}}}, | {kFusedBatchNormGradEx, {{0, 1}, {0}}}, | ||||
| {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}}, | {kFusedBatchNormGradExWithActivation, {{0, 1, 7}, {0}}}, | ||||
| {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}}, | {kFusedBatchNormGradExWithAddAndActivation, {{0, 1, 7}, {0, 3}}}, | ||||
| {prim::kPrimConcat->name(), {{}, {0}}}, | |||||
| {prim::kPrimAddN->name(), {{}, {0}}}, | |||||
| }; | }; | ||||
| void SetKernelInfo(const CNodePtr &apply_kernel_ptr); | void SetKernelInfo(const CNodePtr &apply_kernel_ptr); | ||||