@@ -76,7 +76,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
if (use_pad_) {
T *padded_addr = GetDeviceAddress<T>(workspace, 1);
if (data_format_ == kOpFormat_NHWC) {
CalPadNHWC(padded_size_ / sizeof(T), input_addr, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -133,23 +133,18 @@ class Conv2dGpuFwdKernel : public GpuKernel {
[](const int64_t &value) { return static_cast<int>(value); });
pad_height_ = pad_list[0];
pad_width_ = pad_list[2];
auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
SetStrideAndDilation(kernel_node);
cudnnTensorDescriptor_t input_descriptor_real = nullptr;
int padA[2];
int strideA[2] = {stride_[2], stride_[3]};
int dilaA[2] = {dilation_[2], dilation_[3]};
if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
if (use_pad_) {
pad_height_ = pad_list[0] + pad_list[1];
pad_width_ = pad_list[2] + pad_list[3];
pad_top_ = pad_list[0];
pad_left_ = pad_list[2];
// if use_pad_ == true, using zero padding in advance, else using the default cudnn pad.
if (pad_height_ % 2 == 0 && pad_width_ % 2 == 0) {
use_pad_ = false;
}
int dimA[4];
int strideApadded[4];
if (data_format_ == kOpFormat_NCHW || data_format_ == kOpFormat_DEFAULT) {
@@ -165,18 +160,12 @@ class Conv2dGpuFwdKernel : public GpuKernel {
}
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(padded_desc_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
if (use_pad_) {
padA[0] = 0;
padA[1] = 0;
} else {
padA[0] = pad_top_;
padA[1] = pad_left_;
}
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
input_descriptor_real = padded_desc_;
} else {
if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
pad_height_ = 0;
@@ -232,7 +221,7 @@ class Conv2dGpuFwdKernel : public GpuKernel {
input_size_list_.push_back(input_size_);
input_size_list_.push_back(filter_size_);
output_size_list_.push_back(output_size_);
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle_, padded_desc_, filter_desc_, conv_desc_, output_desc_,
conv_algorithm_, &workspace_size_),
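
The new `use_pad_` logic in the forward kernel above (mirrored in the two backward kernels below) boils down to one decision. The following is a minimal, self-contained sketch under my own naming (`PadPlan`, `MakePadPlan` are illustrative helpers, not MindSpore code): an asymmetric pad is materialized by zero-padding the input up front and handing cuDNN a pad of {0, 0}, while a symmetric pad can be left to cuDNN itself, as the kernel's unchanged else path presumably does.

```cpp
#include <array>
#include <cstdio>

// Hypothetical helper (not MindSpore code): given pad_list = {top, bottom, left, right},
// decide whether the input must be zero-padded in advance (asymmetric pad) or whether
// cuDNN's built-in symmetric padding can be used directly.
struct PadPlan {
  bool use_pad;  // true: pad the input manually and pass {0, 0} to cudnnSetConvolutionNdDescriptor
  int pad_a_h;   // value that would go into padA[0]
  int pad_a_w;   // value that would go into padA[1]
};

PadPlan MakePadPlan(const std::array<int, 4> &pad_list) {
  const int top = pad_list[0], bottom = pad_list[1];
  const int left = pad_list[2], right = pad_list[3];
  const bool symmetric = (top == bottom) && (left == right);
  if (symmetric) {
    return {false, top, left};  // let cuDNN apply the pad itself
  }
  return {true, 0, 0};  // pad explicitly; cuDNN then sees an already-padded tensor
}

int main() {
  // A "same" pad with an odd total vertical pad is asymmetric, so use_pad is true.
  const PadPlan plan = MakePadPlan({1, 2, 1, 1});
  std::printf("use_pad=%d padA={%d, %d}\n", plan.use_pad ? 1 : 0, plan.pad_a_h, plan.pad_a_w);
  return 0;
}
```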
@@ -78,7 +78,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
const float alpha = 1;
const float beta = 0;
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
if (use_pad_) {
T *padded = GetDeviceAddress<T>(workspace, 1);
if (data_format_ == kOpFormat_NHWC) {
CalPadNHWC(padded_size_ / sizeof(T), x, n_, old_height_, old_width_, c_, old_height_ + pad_height_,
@@ -136,14 +136,14 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
[](const int64_t &value) { return static_cast<int>(value); });
pad_height_ = pad_list[0];
pad_width_ = pad_list[2];
auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
SetStrideAndDilation(kernel_node);
cudnnTensorDescriptor_t x_desc_real = nullptr;
int padA[2];
int strideA[2] = {stride_[0], stride_[1]};
int dilaA[2] = {dilation_[2], dilation_[3]};
if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
if (use_pad_) {
pad_height_ = pad_list[0] + pad_list[1];
pad_width_ = pad_list[2] + pad_list[3];
pad_top_ = pad_list[0];
@@ -167,17 +167,12 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
if (use_pad_) {
padA[0] = 0;
padA[1] = 0;
} else {
padA[0] = pad_top_;
padA[1] = pad_left_;
}
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
x_desc_real = use_pad_ ? padded_descriptor_ : x_desc_;
x_desc_real = padded_descriptor_;
} else {
if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
pad_height_ = 0;
@@ -231,7 +226,7 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnGetTensorSizeInBytes(padded_descriptor_, reinterpret_cast<size_t *>(&padded_size_)),
"cudnnGetTensorSizeInBytes failed");
@@ -77,7 +77,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
}
const float alpha = 1;
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
if (use_pad_) {
T *padded = GetDeviceAddress<T>(workspace, 1);
CHECK_CUDNN_RET_WITH_EXCEPT(
@@ -139,14 +139,14 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
[](const int64_t &value) { return static_cast<int>(value); });
pad_height_ = pad_list[0];
pad_width_ = pad_list[2];
auto symmetry_pad = (pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]);
use_pad_ = !((pad_height_ == pad_list[1]) && (pad_width_ == pad_list[3]));
pad_mode_ = GetAttr<std::string>(kernel_node, "pad_mode");
SetStrideAndDilation(kernel_node);
cudnnTensorDescriptor_t dx_desc_real = nullptr;
int padA[2];
int strideA[2] = {stride_[0], stride_[1]};
int dilaA[2] = {dilation_[2], dilation_[3]};
if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase || !symmetry_pad) {
if (use_pad_) {
pad_height_ = pad_list[0] + pad_list[1];
pad_width_ = pad_list[2] + pad_list[3];
pad_top_ = pad_list[0];
@@ -170,17 +170,12 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetTensorNdDescriptor(padded_descriptor_, cudnn_data_type_, 4, dimA, strideApadded),
"cudnnSetTensor4dDescriptor failed");
if (use_pad_) {
padA[0] = 0;
padA[1] = 0;
} else {
padA[0] = pad_top_;
padA[1] = pad_left_;
}
padA[0] = 0;
padA[1] = 0;
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnSetConvolutionNdDescriptor(conv_desc_, 2, padA, strideA, dilaA, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT),
"cudnnSetConvolutionNdDescriptor failed");
dx_desc_real = use_pad_ ? padded_descriptor_ : dx_desc_;
dx_desc_real = padded_descriptor_;
} else {
if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
pad_height_ = 0;
@@ -233,7 +228,7 @@ class ConvGradInputGpuBkwKernel : public GpuKernel {
input_size_list_.push_back(w_size_);
output_size_list_.push_back(output_size_);
if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
if (use_pad_ && !is_null_input_) {
CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
"cudnnGetTensorSizeInBytes failed");
@@ -662,8 +662,9 @@ void Pipeline::Run() {
auto manager = func_graph->manager();
size_t graph_nums = manager->func_graphs().size();
if (graph_nums == 1) {
resource_->set_gpu_loopsink_flag(true);
MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true.";
int64_t sinksize = ConfigManager::GetInstance().iter_num();
resource_->set_gpu_loopsink(true, sinksize);
MS_LOG(INFO) << "Change gpu_loopsink_flag_ to true, set loopsink size to " << sinksize;
}
}
}
@@ -834,7 +835,7 @@ py::object ExecutorPy::Run(const py::tuple &args, const py::object &phase) {
}
// Set loopsink size for each phase.
bool is_loopsink = info_[phase_s]->resource->gpu_loopsink_flag();
int64_t sinksize = ConfigManager::GetInstance().iter_num();
int64_t sinksize = info_[phase_s]->resource->gpu_loopsink_size();
ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1);
// If target is not gpu or is loopsink, keep vmloop 1.
bool g = (MsContext::GetInstance()->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
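
To illustrate the loopsink change above, here is a rough sketch; only `gpu_loopsink_flag`/`gpu_loopsink_size` come from the diff, the `PhaseInfo`, `CompilePhase`, and `SinkSizeForRun` names are my own stand-ins. The point it demonstrates is that each compiled phase keeps the iteration count it was compiled with, and the run path reads that recorded value back instead of the global `iter_num` at run time, so running one phase no longer clobbers another's sink size.

```cpp
#include <cstdint>
#include <cstdio>

// Hedged sketch of the per-phase bookkeeping the diff adds.
struct PhaseInfo {
  bool gpu_loopsink_flag{false};
  int64_t gpu_loopsink_size{1};
};

void CompilePhase(PhaseInfo *info, int64_t iter_num_at_compile_time) {
  // analogous to resource_->set_gpu_loopsink(true, sinksize) in Pipeline::Run()
  info->gpu_loopsink_flag = true;
  info->gpu_loopsink_size = iter_num_at_compile_time;
}

int64_t SinkSizeForRun(const PhaseInfo &info) {
  // analogous to ConfigManager::GetInstance().set_gpu_loopsink_size(is_loopsink ? sinksize : 1)
  return info.gpu_loopsink_flag ? info.gpu_loopsink_size : 1;
}

int main() {
  PhaseInfo train_phase;
  PhaseInfo eval_phase;
  CompilePhase(&train_phase, 100);  // iter_num was 100 when the training phase was compiled
  CompilePhase(&eval_phase, 1);     // iter_num was 1 when the eval phase was compiled
  std::printf("train sink=%lld eval sink=%lld\n",
              static_cast<long long>(SinkSizeForRun(train_phase)),
              static_cast<long long>(SinkSizeForRun(eval_phase)));
  return 0;
}
```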
@@ -74,8 +74,12 @@ class Resource : public ResourceBase {
const abstract::AbstractBasePtrList &args_spec() const { return args_spec_; }
void set_args_spec(const abstract::AbstractBasePtrList &args_spec) { args_spec_ = args_spec; }
void set_gpu_loopsink_flag(const bool &flag) { gpu_loopsink_flag_ = flag; }
void set_gpu_loopsink(const bool &flag, const int64_t size) {
gpu_loopsink_flag_ = flag;
gpu_loopsink_size_ = size;
}
bool gpu_loopsink_flag() { return gpu_loopsink_flag_; }
int64_t gpu_loopsink_size() { return gpu_loopsink_size_; }
// Reclaim resource and clear the cache.
// ExecutorPy::Compile() can be called multiple times, so cache
@@ -89,6 +93,7 @@ class Resource : public ResourceBase {
py::object input_;
bool is_cleaned_;
bool gpu_loopsink_flag_{false};
int64_t gpu_loopsink_size_{1};
};
using ResourcePtr = std::shared_ptr<pipeline::Resource>;
@@ -129,6 +129,8 @@ class DatasetHelper:
Validator.check_is_int(sink_size)
if sink_size < -1 or sink_size == 0:
raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))
if sink_size == -1:
sink_size = dataset.get_dataset_size()
if dataset_sink_mode:
if context.get_context("enable_ge"):
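
The DatasetHelper change above normalizes `sink_size == -1` to one full pass over the dataset before the sink-mode branches run. Below is a minimal sketch of that normalization, kept in C++ for consistency with the other examples here; `NormalizeSinkSize` is an illustrative name, and `dataset_size` stands in for `dataset.get_dataset_size()`.

```cpp
#include <cstdint>
#include <stdexcept>

// Hedged sketch: validate sink_size and map -1 to a full pass over the dataset,
// mirroring the Python check and the new `if sink_size == -1` branch above.
int64_t NormalizeSinkSize(int64_t sink_size, int64_t dataset_size) {
  if (sink_size < -1 || sink_size == 0) {
    throw std::invalid_argument("The sink_size must be -1 or positive.");
  }
  return sink_size == -1 ? dataset_size : sink_size;
}
```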
@@ -14,15 +14,14 @@
# limitations under the License.
# ============================================================================
# a simple tutorial as follows, more parameters can be set
if [ $# != 3 ]
if [ $# != 2 ]
then
echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH] [DEVICE_ID]"
echo "Usage: sh run_standalone_train_gpu.sh [cifar10|imagenet] [DATA_PATH]"
exit 1
fi
export DATASET_NAME=$1
export DATA_PATH=$2
export DEVICE_ID=$3
python train.py --dataset_name=$DATASET_NAME --data_path=$DATA_PATH \
--device_id=$DEVICE_ID --device_target="GPU" > log 2>&1 &
--device_target="GPU" > log 2>&1 &
@@ -345,11 +345,11 @@ epoch: 5 step: 5004, loss is 3.3501816
```
# ========START RESNET50 GPU BENCHMARK========
step time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
step time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
step time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
step time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
step time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
Epoch time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182
Epoch time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993
Epoch time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594
Epoch time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937
Epoch time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013
...
```
## [Evaluation Process](#contents)
@@ -53,7 +53,7 @@ class MyTimeMonitor(Callback):
def step_end(self, run_context):
step_mseconds = (time.time() - self.step_time) * 1000
fps = self.batch_size / step_mseconds *1000 * self.size
print("step time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")
print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")
def pad(image):
zeros = np.zeros([224, 224, 1], dtype=np.uint8)