/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/engine/dataset_iterator.h"
// NOTE(review): the two system header names below, and every template argument list in this
// file (ExecutionTree, Tracing, DatasetIteratorTracing, TensorShape, DataType and the
// <std::string, int32_t> column map), were missing from the copy under review and have been
// reconstructed from how the values are used. Confirm them against dataset_iterator.h.
#include <unordered_map>
#include <utility>
#include "dataset/core/data_type.h"
#include "dataset/core/tensor.h"
#include "dataset/core/tensor_shape.h"
#include "dataset/engine/data_buffer.h"
#include "dataset/engine/execution_tree.h"
#include "dataset/util/status.h"
#include "dataset/engine/datasetops/dataset_op.h"

namespace mindspore {
namespace dataset {
// Constructor of the IteratorBase. Starts with no current buffer and eof not yet seen.
IteratorBase::IteratorBase() : curr_buffer_(nullptr), eof_handled_(false) {}

IteratorBase::~IteratorBase() = default;

// Fetches one row of data from the iterator as a column map.
// @param out_map - Output map from column name to the tensor of that column. It is always cleared
//                  first and is left empty when the fetched row is empty.
// @return Status - Error if out_map is null, if fetching the row fails, or if a column index in
//                  the column name map does not address an element of the fetched row.
Status IteratorBase::GetNextAsMap(TensorMap *out_map) {
  if (out_map == nullptr) {
    RETURN_STATUS_UNEXPECTED("Null output map in iterator!");
  }

  out_map->clear();

  TensorRow curr_row;
  RETURN_IF_NOT_OK(FetchNextTensorRow(&curr_row));

  // Return empty map if there's no data
  if (curr_row.empty()) {
    return Status::OK();
  }

  // The column name mapping is needed to be able to produce the tensor map output.
  // The column name mapping comes from the source operator that is producing the data into the iterator.
  // To avoid having to fetch this for every time, we'll take a local copy of the column name id mapping
  // and save in the iterator.  We only have to do this once.  All subsequent iterations use the same mapping.
  if (col_name_id_map_.empty()) {
    // Determine the column name map by calling the derived class method to retrieve the column
    // name map
    col_name_id_map_ = this->GetColumnNameMap();
  }

  // Populate the out map from the row and return it.
  // Iterate by const reference so the (name, index) pair is not copied for every column.
  for (const auto &col_entry : col_name_id_map_) {
    // A negative index wraps to a very large value here and is rejected by the same range check.
    const auto col_idx = static_cast<size_t>(col_entry.second);
    if (col_idx >= curr_row.size()) {
      // Do not hand back a partially populated map.
      out_map->clear();
      RETURN_STATUS_UNEXPECTED("Column index is out of range of the fetched row in iterator!");
    }
    (*out_map)[col_entry.first] = std::move(curr_row[col_idx]);
  }

  return Status::OK();
}

// Fetches one row of data from the iterator.
// The base class version simply performs error handling and returns empty row. Actual
// functionality exists in the derived versions of this function.
// @param out_row - Output row; cleared by this call.
// @return Status - Error if out_row is null, otherwise OK.
Status IteratorBase::FetchNextTensorRow(TensorRow *out_row) {
  if (out_row == nullptr) {
    RETURN_STATUS_UNEXPECTED("Null output row in iterator!");
  }

  // clear the old tensor row
  out_row->clear();

  return Status::OK();
}

// Constructor of the DatasetIterator
// Takes the root operator of the given execution tree as its data source and, when the tree's
// profiling manager can supply the dataset iterator tracing node, keeps it for recording.
DatasetIterator::DatasetIterator(std::shared_ptr<ExecutionTree> exe_tree)
    : IteratorBase(),
      root_(exe_tree->root()),
      tracing_(nullptr),
      cur_batch_num_(0),
      cur_connector_size_(0),
      cur_connector_capacity_(0) {
  std::shared_ptr<Tracing> node;
  Status s = exe_tree->GetProfilingManager()->GetTracingNode(kDatasetIteratorTracingName, &node);
  // A failed lookup is not an error for the iterator: tracing_ simply stays null.
  if (s.IsOk()) {
    tracing_ = std::dynamic_pointer_cast<DatasetIteratorTracing>(node);
  }
}

DatasetIterator::~DatasetIterator() = default;

// Fetches one row of data from the iterator.  Overrides the base class.  This one fetches
// from the tree root node directly.
// @param out_row - Output row. Left empty once the end of the data has been reached.
// @return Status - Error from the base class check, from fetching a buffer, from popping a row,
//                  or if an eoe buffer is not followed by an eof buffer.
Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
  // Common code init and error checking in the base class.
  RETURN_IF_NOT_OK(IteratorBase::FetchNextTensorRow(out_row));

  // Once eof is handled, always return empty row.  Class must be destroyed and recreated if you
  // want to iterate again.
  if (eof_handled_) {
    return Status::OK();
  }

  // Check if we need to get a new DataBuffer to iterate.
  if (curr_buffer_ == nullptr || curr_buffer_->NumRows() == 0) {
    if (tracing_ != nullptr) {
      // Sample the connector state before fetching; it is recorded with the next popped row.
      cur_connector_size_ = root_->ConnectorSize();
      cur_connector_capacity_ = root_->ConnectorCapacity();
    }
    RETURN_IF_NOT_OK(root_->GetNextBuffer(&curr_buffer_));

    // Since GetNextBuffer was used rather than GetNextInput(), it means we need to manually
    // handle eoe and eof messages here.
    //
    // An eoe buffer means we have iterated fully to the end of the tree.
    // An eoe buffer will be immediately followed by an eof buffer, which signals the shutdown of
    // all operators.
    if (curr_buffer_->eoe()) {
      MS_LOG(DEBUG) << "End of data iteration.  Fetch eof and then return empty row.";

      // Before returning the last empty vector, fetch the eof buffer which should be the last
      // buffer. It is freed by the eof handling below.
      RETURN_IF_NOT_OK(root_->GetNextBuffer(&curr_buffer_));
      if (!curr_buffer_->eof()) {
        RETURN_STATUS_UNEXPECTED("Non-eof after getting eoe in iterator!");
      }
    }

    // Reached either right after an eoe (handled above) or by an eof on its own. An eof by
    // itself, without being preceded by an eoe, is possible if a repeat operator exists below us
    // in the stack. Repeat operator eats eoe's but eventually allows the flow of an eof up the
    // pipeline by itself.
    if (curr_buffer_->eof()) {
      eof_handled_ = true;
      curr_buffer_.reset();  // explicitly free the eof buffer
      // Set tree to Finished state
      root_->Tree()->SetFinished();
      return Status::OK();
    }
  }

  // If we got this far, now it's time to pop that next row for return to caller
  RETURN_IF_NOT_OK(curr_buffer_->PopRow(out_row));
  if (tracing_ != nullptr) {
    cur_batch_num_++;
    tracing_->Record(CONNECTOR_DEPTH, cur_connector_capacity_, cur_batch_num_, cur_connector_size_);
  }
  return Status::OK();
}

// Appends the shape of every tensor in the cached row to out_shapes.
// If no row is cached yet in device_queue_row_, one row is fetched from the iterator and kept there.
// @param out_shapes - Output vector; shapes are appended, existing entries are kept.
// @return Status - Error if out_shapes is null or if fetching the row fails.
Status DatasetIterator::GetOutputShapes(std::vector<TensorShape> *out_shapes) {
  if (out_shapes == nullptr) {
    RETURN_STATUS_UNEXPECTED("Null output shape argument");
  }
  if (device_queue_row_.empty()) {
    RETURN_IF_NOT_OK(FetchNextTensorRow(&device_queue_row_));
  }
  // Bind by const reference: the row elements only need to be read, not copied.
  for (const auto &ts : device_queue_row_) {
    out_shapes->push_back(ts->shape());
  }

  return Status::OK();
}

// Appends the data type of every tensor in the cached row to out_types.
// If no row is cached yet in device_queue_row_, one row is fetched from the iterator and kept there.
// @param out_types - Output vector; types are appended, existing entries are kept.
// @return Status - Error if out_types is null or if fetching the row fails.
Status DatasetIterator::GetOutputTypes(std::vector<DataType> *out_types) {
  if (out_types == nullptr) {
    RETURN_STATUS_UNEXPECTED("Null output type argument");
  }
  if (device_queue_row_.empty()) {
    RETURN_IF_NOT_OK(FetchNextTensorRow(&device_queue_row_));
  }
  // Bind by const reference: the row elements only need to be read, not copied.
  for (const auto &ts : device_queue_row_) {
    out_types->push_back(ts->type());
  }
  return Status::OK();
}

// Getter
// Returns the column name to column id map of the root operator.
std::unordered_map<std::string, int32_t> DatasetIterator::GetColumnNameMap() const {
  return root_->column_name_id_map();
}

// Constructor of the ChildIterator
// @param current_op - The operator whose child input this iterator reads from.
// @param worker_id - The worker id passed on every fetch.
// @param child_idx - The index of the child to fetch from.
ChildIterator::ChildIterator(DatasetOp *current_op, int32_t worker_id, int32_t child_idx)
    : IteratorBase(), current_op_(current_op), child_idx_(child_idx), worker_id_(worker_id), end_epoch_(false) {}

// The operator pointer is only cleared here, not deleted.
ChildIterator::~ChildIterator() { current_op_ = nullptr; }

// Fetches one row of data from the iterator.  Overrides the base class.  This one fetches
// only from the child/worker id as given from the constructor.
// @param out_row - Output row. Left empty when an eoe or eof buffer is picked up.
// @return Status - Error from the base class check, from fetching input, or from popping a row.
Status ChildIterator::FetchNextTensorRow(TensorRow *out_row) {
  // Common code init and error checking in the base class.
  RETURN_IF_NOT_OK(IteratorBase::FetchNextTensorRow(out_row));

  // Once eof is handled, always return empty row.  Class must be destroyed and recreated if you
  // want to iterate again.
  if (eof_handled_) {
    return Status::OK();
  }

  // Check if we need to get a new DataBuffer to iterate.
  if (curr_buffer_ == nullptr || curr_buffer_->NumRows() == 0) {
    RETURN_IF_NOT_OK(current_op_->GetNextInput(&curr_buffer_, worker_id_, child_idx_));

    // Unlike the DatasetIterator, this child iterator does not quit after eoe.
    // Instead, if an eoe is picked up here, we simply return an empty vector and it's up to the
    // caller to decide what it wants to do next.
    if (curr_buffer_->eoe()) {
      MS_LOG(DEBUG) << "Child iterator picked up EOE.";
      end_epoch_ = true;
      return Status::OK();
    }

    if (curr_buffer_->eof()) {
      MS_LOG(DEBUG) << "Child iterator picked up EOF.";
      eof_handled_ = true;
      return Status::OK();
    }
  }

  // If we got this far, now it's time to pop that next row for return to caller
  RETURN_IF_NOT_OK(curr_buffer_->PopRow(out_row));

  return Status::OK();
}

// drain till the next eoe
// @return Status - Error from fetching input, or an unexpected error if an eof buffer is reached.
Status ChildIterator::Drain() {
  if (end_epoch_) {
    // Calling drain against a child that is already at its eoe state will not result in any action.
    // This allows you to do:
    // - fetch until empty row
    // - drain (will not actually drain because you are already at the end of the iteration)
    // However, the next time after that, it will perform its normal draining activities.
    end_epoch_ = false;
    MS_LOG(DEBUG) << "No operation drain, already at end of epoch.";
    return Status::OK();
  }
  MS_LOG(DEBUG) << "Child draining buffers until eoe.";

  // The iterator starts out with no buffer (see the IteratorBase constructor), so draining before
  // the first fetch must pull a buffer first rather than dereference a null pointer below.
  if (curr_buffer_ == nullptr) {
    RETURN_IF_NOT_OK(current_op_->GetNextInput(&curr_buffer_, worker_id_, child_idx_));
  }

  // else we drain until eoe or eof, eof here is for sanity check
  while (!curr_buffer_->eoe() && !curr_buffer_->eof()) {
    RETURN_IF_NOT_OK(current_op_->GetNextInput(&curr_buffer_, worker_id_, child_idx_));
  }
  if (curr_buffer_->eof()) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Child iterator picked up EOF in drain.");
  }
  return Status::OK();
}

// Getter
// Returns the column name to column id map of the child operator this iterator reads from.
std::unordered_map<std::string, int32_t> ChildIterator::GetColumnNameMap() const {
  return current_op_->child(child_idx_)->column_name_id_map();
}
}  // namespace dataset
}  // namespace mindspore