
!7591 Fix the stuck problem when using sink_size.

Merge pull request !7591 from anzhengqi/I1YI87-fix-sink_size-stuck
tags/v1.1.0
Merged by mindspore-ci-bot (commit 45913d0682)
5 changed files with 45 additions and 21 deletions
  1. mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc (+2, -0)
  2. mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc (+31, -19)
  3. mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h (+9, -2)
  4. mindspore/dataset/engine/datasets.py (+2, -0)
  5. mindspore/train/model.py (+1, -0)

mindspore/ccsrc/minddata/dataset/api/python/de_pipeline.cc (+2, -0)

@@ -1067,6 +1067,8 @@ Status DEPipeline::ParseDeviceQueueOp(const py::dict &args, std::shared_ptr<Data
       (void)builder->SetDeviceId(ToInt(value));
     } else if (key == "send_epoch_end") {
       (void)builder->SetSendEpochEnd(ToBool(value));
+    } else if (key == "total_batch") {
+      (void)builder->SetTotalBatch(ToInt(value));
     }
   }
 }


mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc (+31, -19)

@@ -33,14 +33,15 @@
 namespace mindspore {
 namespace dataset {
 DeviceQueueOp::DeviceQueueOp(std::string channel_name, DeviceType device_type, int32_t device_id, int32_t prefetch_size,
-                             bool send_epoch_end)
+                             bool send_epoch_end, int total_batch)
     : PipelineOp(1),
       channel_name_(channel_name),
       device_type_(device_type),
       device_id_(device_id),
       prefetch_size_(prefetch_size),
       send_epoch_end_(send_epoch_end),
-      stop_send_(false) {
+      stop_send_(false),
+      total_batch_(total_batch) {
 #ifdef ENABLE_TDTQUE
   ascend_keep_waiting_ = true;
 #endif
@@ -60,7 +61,8 @@ DeviceQueueOp::Builder::Builder(int32_t prefetch_size)
     : builder_prefetch_size_(prefetch_size),
       builder_device_id_(0),
       builder_device_type_(DeviceType::CPU),
-      builder_channel_name_("") {}
+      builder_channel_name_(""),
+      builder_total_batch_(0) {}

 Status DeviceQueueOp::EoeReceived(int32_t worker_id) {
   state_ = OpState::kDeOpIdle;
@@ -102,11 +104,13 @@ Status DeviceQueueOp::operator()() {
 #ifdef ENABLE_TDTQUE
 Status DeviceQueueOp::SendDataToAscend() {
   MS_LOG(INFO) << "Device queue, sending data to Ascend.";
-  int64_t total_batch = 0;
+  int64_t send_batch = 0;
   double batch_start_time, end_time;
   int32_t batch_cost, tdt_cost;
   int32_t connector_size = 0;
   int32_t connector_capacity;
+  bool is_break_loop = false;
+
   std::shared_ptr<DeviceQueueTracing> profiling_node;
   bool isProfilingEnable = tree_->GetProfilingManager()->IsProfilingEnable();
   if (isProfilingEnable) {
@@ -119,8 +123,8 @@ Status DeviceQueueOp::SendDataToAscend() {
   std::unique_ptr<DataBuffer> current_buffer;
   RETURN_IF_NOT_OK(GetNextInput(&current_buffer));

-  while (!current_buffer->eof()) {
-    while (!current_buffer->eoe()) {
+  while (!current_buffer->eof() && !is_break_loop) {
+    while (!current_buffer->eoe() && !is_break_loop) {
       RETURN_IF_NOT_OK(CheckExceptions(current_buffer));
       TensorRow currRow;
       for (int row_id = 0; row_id < current_buffer->NumRows(); row_id++) {
@@ -142,17 +146,21 @@ Status DeviceQueueOp::SendDataToAscend() {
         if (isProfilingEnable) {
           end_time = ProfilingTime::GetCurMilliSecond();
           // record push tdt time
-          profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch + 1, tdt_cost);
+          profiling_node->Record(TIME, TDT_PUSH_TIME, send_batch + 1, tdt_cost);
           batch_cost = (int32_t)(end_time - batch_start_time);
           // record batch time
-          profiling_node->Record(TIME, BATCH_TIME, total_batch + 1, batch_cost);
+          profiling_node->Record(TIME, BATCH_TIME, send_batch + 1, batch_cost);
           // record pipeline time
-          profiling_node->Record(TIME, PIPELINE_TIME, total_batch + 1, batch_cost - tdt_cost);
+          profiling_node->Record(TIME, PIPELINE_TIME, send_batch + 1, batch_cost - tdt_cost);
           batch_start_time = end_time;
           // record connector depth
-          profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch + 1, connector_size);
+          profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, send_batch + 1, connector_size);
+        }
+        send_batch++;
+        if (total_batch_ > 0 && send_batch >= total_batch_) {
+          is_break_loop = true;
+          break;
         }
-        total_batch++;
       }
       if (isProfilingEnable) {
         connector_size = ChildOpConnectorSize();
@@ -184,7 +192,7 @@ Status DeviceQueueOp::SendDataToAscend() {
   }

   tree_->SetFinished();
-  MS_LOG(INFO) << "Device queue total batch is " << total_batch;
+  MS_LOG(INFO) << "Device queue total batch is " << send_batch;

   return Status::OK();
 }
@@ -193,7 +201,7 @@ Status DeviceQueueOp::SendDataToAscend() {
 #ifdef ENABLE_GPUQUE
 Status DeviceQueueOp::SendDataToGPU() {
   MS_LOG(INFO) << "Device queue, sending data to GPU.";
-  int64_t total_batch = 0;
+  int64_t send_batch = 0;
   bool is_break_loop = false;
   bool is_open = false;
   uint32_t handle = INVALID_HANDLE;
@@ -235,19 +243,23 @@
         is_open = true;
       }
       RETURN_IF_NOT_OK(RetryPushGPUData(data_size, curr_row, handle, isProfilingEnable, &push_cost));
-      total_batch++;
+      send_batch++;
       if (isProfilingEnable) {
         end_time = ProfilingTime::GetCurMilliSecond();
         // record push data time
-        profiling_node->Record(TIME, TDT_PUSH_TIME, total_batch, push_cost);
+        profiling_node->Record(TIME, TDT_PUSH_TIME, send_batch, push_cost);
         batch_cost = (int32_t)(end_time - batch_start_time);
         // record batch time
-        profiling_node->Record(TIME, BATCH_TIME, total_batch, batch_cost);
+        profiling_node->Record(TIME, BATCH_TIME, send_batch, batch_cost);
         // record pipeline time
-        profiling_node->Record(TIME, PIPELINE_TIME, total_batch, batch_cost - push_cost);
+        profiling_node->Record(TIME, PIPELINE_TIME, send_batch, batch_cost - push_cost);
         batch_start_time = end_time;
         // record connector depth
-        profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, total_batch, connector_size);
+        profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, send_batch, connector_size);
+      }
+      if (total_batch_ > 0 && send_batch >= total_batch_) {
+        is_break_loop = true;
+        break;
       }
     }
     if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
@@ -272,7 +284,7 @@
   }

   tree_->SetFinished();
-  MS_LOG(INFO) << "Device queue total batch is " << total_batch << ".";
+  MS_LOG(INFO) << "Device queue total batch is " << send_batch << ".";

   GpuBufferMgr::GetInstance().Close(handle);
   GpuBufferMgr::GetInstance().CloseConfirm();
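
The heart of the device_queue_op.cc change is the new exit condition: count the batches actually sent (send_batch) and leave both send loops once a positive total_batch_ limit is reached. A minimal, runnable Python sketch of that control flow, with illustrative names only (this is not the MindSpore code):

def send_batches(epochs_of_batches, total_batch=0):
    # Mimics DeviceQueueOp's send loops: total_batch == 0 means "no limit",
    # matching the builder default; otherwise stop after that many batches.
    send_batch = 0
    is_break_loop = False
    for epoch_batches in epochs_of_batches:      # stands in for the outer !eof loop
        if is_break_loop:
            break
        for batch in epoch_batches:              # stands in for the inner !eoe loop
            push_to_device(batch)                # stands in for the TDT/GPU push
            send_batch += 1
            if total_batch > 0 and send_batch >= total_batch:
                is_break_loop = True
                break
    return send_batch

def push_to_device(batch):
    pass  # placeholder for the real device push

# Two "epochs" of 250 batches each: unlimited drains all 500, a limit of 300 stops early.
data = [range(250), range(250)]
assert send_batches(data) == 500
assert send_batches(data, total_batch=300) == 300

A limit of 0 falls through to the old drain-until-EOF behaviour, which is why the builder default stays at 0.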


mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h (+9, -2)

@@ -86,13 +86,18 @@ class DeviceQueueOp : public PipelineOp {
       return *this;
     }

+    Builder &SetTotalBatch(int total_batch) {
+      builder_total_batch_ = total_batch;
+      return *this;
+    }
+
     // Name: Build()
     // Description: The final step for building a DeviceQueueOp via the Builder is
     //              to call this Build() method. It will instantiate the DeviceQueueOp
     //              and return it to caller as a shared pointer.
     Status Build(std::shared_ptr<DeviceQueueOp> *ptr) {
       *ptr = std::make_shared<DeviceQueueOp>(builder_channel_name_, builder_device_type_, builder_device_id_,
-                                             builder_prefetch_size_, builder_send_epoch_end_);
+                                             builder_prefetch_size_, builder_send_epoch_end_, builder_total_batch_);
       return Status::OK();
     }


@@ -102,12 +107,13 @@ class DeviceQueueOp : public PipelineOp {
     DeviceType builder_device_type_;
     std::string builder_channel_name_;
     bool builder_send_epoch_end_;
+    int builder_total_batch_;
   };

   // Name: constructor
   // Description
   DeviceQueueOp(std::string channel_name, DeviceType device_type, int32_t device_id, int32_t prefetch_size,
-                bool send_epoch_end);
+                bool send_epoch_end, int total_batch);

   // Name: destructor
   // Description
@@ -183,6 +189,7 @@ class DeviceQueueOp : public PipelineOp {
   const int32_t prefetch_size_;
   const bool send_epoch_end_;
   bool stop_send_;
+  int total_batch_;

 #ifdef ENABLE_TDTQUE
   std::shared_ptr<TdtPlugin> tdtInstancePtr;


mindspore/dataset/engine/datasets.py (+2, -0)

@@ -2623,6 +2623,8 @@ class TransferDataset(DatasetOp):
         args["device_type"] = self._device_type
         args["device_id"] = self._device_id
         args["send_epoch_end"] = self._send_epoch_end
+        if hasattr(self.children[0], "__total_batch__"):
+            args["total_batch"] = self.children[0].__total_batch__
         return args

     def create_dict_iterator(self, num_epochs=-1, output_numpy=False):
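
The datasets.py change is deliberately defensive: TransferDataset forwards the limit only when the upstream dataset actually carries the __total_batch__ attribute, so pipelines that never set it are unaffected. A rough sketch of that hasattr-based forwarding, using hypothetical stand-in classes (not the real ones):

class ToyTransferDataset:
    # Hypothetical stand-in for TransferDataset.get_args().
    def __init__(self, child):
        self.children = [child]

    def get_args(self):
        args = {}
        # Forward the optional limit only if the training code attached it.
        if hasattr(self.children[0], "__total_batch__"):
            args["total_batch"] = self.children[0].__total_batch__
        return args

class ToyDataset:
    pass

child = ToyDataset()
assert "total_batch" not in ToyTransferDataset(child).get_args()
child.__total_batch__ = 300
assert ToyTransferDataset(child).get_args()["total_batch"] == 300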


mindspore/train/model.py (+1, -0)

@@ -403,6 +403,7 @@ class Model:
             epoch_num = epoch
         else:
             epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size())
+            train_dataset.__total_batch__ = epoch * sink_size

         dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                               is_train=True,
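
model.py adds a single line: next to the existing epoch_num computation it tags the dataset with __total_batch__ = epoch * sink_size, the number of batches DeviceQueueOp is now allowed to send. A worked example of the arithmetic with made-up numbers (a 234-batch dataset, 3 epochs, sink_size of 100):

import math

epoch, sink_size, dataset_size = 3, 100, 234               # illustrative values only

epoch_num = math.ceil(epoch * sink_size / dataset_size)    # 2, as in the existing code
total_batch = epoch * sink_size                            # 300, the new send limit

assert (epoch_num, total_batch) == (2, 300)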

