You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

batch_op.cc 19 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "dataset/engine/datasetops/batch_op.h"
  17. #include <utility>
  18. #include <iomanip>
  19. #include "common/utils.h"
  20. #include "dataset/core/pybind_support.h"
  21. #include "dataset/engine/data_buffer.h"
  22. #include "dataset/engine/db_connector.h"
  23. #include "dataset/engine/opt/pass.h"
  24. #include "dataset/kernels/data/data_utils.h"
  25. using float16 = Eigen::half;
  26. namespace mindspore {
  27. namespace dataset {
  28. BatchOp::Builder::Builder(int32_t batch_size) : builder_drop_(false), builder_pad_(false), builder_pad_map_({}) {
  29. builder_batch_size_ = batch_size;
  30. std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  31. builder_num_workers_ = cfg->num_parallel_workers();
  32. builder_op_connector_size_ = cfg->op_connector_size();
  33. }
  34. Status BatchOp::Builder::Build(std::shared_ptr<BatchOp> *ptr) {
  35. RETURN_IF_NOT_OK(SanityCheck());
  36. *ptr = std::make_shared<BatchOp>(builder_batch_size_, builder_drop_, builder_pad_, builder_op_connector_size_,
  37. builder_num_workers_, builder_cols_to_map_, builder_batch_size_func_,
  38. builder_batch_map_func_, builder_pad_map_);
  39. return Status::OK();
  40. }
  41. Status BatchOp::Builder::SanityCheck() {
  42. std::string err;
  43. err += builder_op_connector_size_ <= 0 ? "connector size <= 0\n" : "";
  44. err += builder_batch_size_ <= 0 ? "batch size <= 0\n" : "";
  45. err += builder_num_workers_ <= 0 ? "batch num_parallel_workers <= 0\n" : "";
  46. return err.empty() ? Status::OK() : Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, common::SafeCStr(err));
  47. }
// BatchOp constructor.
// @param batch_size - rows per batch; used directly unless a batch-size pyfunc is supplied
// @param drop - whether to drop the final partial batch of an epoch
// @param pad - whether to pad tensors within each batch (see PadColumns)
// @param op_queue_size - capacity of each worker input queue and the output connector
// @param num_workers - number of parallel worker threads
// @param cols_to_map - column names handed to the batch-map pyfunc (empty = no mapping)
// @param batch_size_func - optional python callback returning a per-batch size
// @param batch_map_func - optional python callback applied to each batch
// @param pad_map - per-column pad shape/value information
BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
                 const std::vector<std::string> &cols_to_map, py::function batch_size_func, py::function batch_map_func,
                 PadInfo pad_map)
    : ParallelOp(num_workers, op_queue_size),
      start_batch_size_(batch_size),
      drop_(drop),
      pad_(pad),
      pyfunc_column_names_(cols_to_map),
      batch_size_func_(batch_size_func),
      batch_map_func_(batch_map_func),
      pad_info_(pad_map) {
  // One input queue per worker; the main thread round-robins batches across them.
  worker_queues_.Init(num_workers, op_queue_size);
}
// Main-thread entry point: pulls rows from the child iterator, groups them into
// per-batch tables, and round-robins the tables (plus EOE/EOF/quit control
// records) into the worker queues. Workers do the actual batching.
// @return Status - The error code returned
Status BatchOp::operator()() {
  Status rc = LaunchThreadsAndInitOp();
  // Synchronize with TaskManager
  TaskManager::FindMe()->Post();
  RETURN_IF_NOT_OK(rc);
  // cnt counts every queue push (data + control) and selects the target worker queue.
  int64_t epoch_num = 0, batch_num = 0, cnt = 0;
  TensorRow new_row;
  std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
  child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
  RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
  int32_t cur_batch_size = 0;
  RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));
  while (child_iterator_->eof_handled() == false) {
    while (new_row.empty() == false) {
      table->emplace_back(new_row);
      // if # of rows is enough to make 1 batch (1 batch is buffer), send it to worker_queue
      if (table->size() == static_cast<size_t>(cur_batch_size)) {
        RETURN_IF_NOT_OK(worker_queues_[cnt++ % num_workers_]->EmplaceBack(
          std::make_pair(std::move(table), CBatchInfo(epoch_num, batch_num++, cnt - epoch_num))));
        table = std::make_unique<TensorQTable>();
        // Re-query the batch size: it may change per batch when a batch-size pyfunc is set.
        RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(epoch_num, batch_num, cnt - epoch_num)));
      }
      RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
    }
    // Reminder logic, execute only when there is a remainder (table is non empty) and don't drop
    if (drop_ == false && table->empty() == false) {
      RETURN_IF_NOT_OK(worker_queues_[cnt++ % num_workers_]->EmplaceBack(
        std::make_pair(std::move(table), CBatchInfo(epoch_num, batch_num++, cnt - epoch_num))));
    }
    table = std::make_unique<TensorQTable>();  // this drops when drop == true
    // end of the current epoch, batch_num should start from 0 again
    batch_num = 0;
    epoch_num++;
    // Hand one worker an EOE control record so it emits the end-of-epoch buffer downstream.
    RETURN_IF_NOT_OK(
      worker_queues_[cnt++ % num_workers_]->EmplaceBack(std::make_pair(nullptr, CBatchInfo(batchCtrl::kEOE))));
    RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(epoch_num, batch_num, cnt - epoch_num)));
    RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
  }  // end of eof_handled() == false
  RETURN_IF_NOT_OK(
    worker_queues_[cnt++ % num_workers_]->EmplaceBack(std::make_pair(nullptr, CBatchInfo(batchCtrl::kEOF))));
  // EOF received, send quit signal (an empty buffer) to all workers
  for (int32_t ind = 0; ind < num_workers_; ind++) {
    RETURN_IF_NOT_OK(
      worker_queues_[cnt++ % num_workers_]->EmplaceBack(std::make_pair(nullptr, CBatchInfo(batchCtrl::kQuit))));
  }
  return Status::OK();
}
  108. void BatchOp::Print(std::ostream &out, bool show_all) const {
  109. // Always show the id and name as first line regardless if this summary or detailed print
  110. out << "(" << std::setw(2) << operator_id_ << ") <BatchOp>:";
  111. if (!show_all) {
  112. // Call the super class for displaying any common 1-liner info
  113. ParallelOp::Print(out, show_all);
  114. // Then show any custom derived-internal 1-liner info for this op
  115. out << " [batch size: " << start_batch_size_ << "]\n";
  116. } else {
  117. // Call the super class for displaying any common detailed info
  118. ParallelOp::Print(out, show_all);
  119. // Then show any custom derived-internal stuff
  120. out << "\nStart batch size: " << start_batch_size_ << "\nDrop remainder: " << (drop_ ? "yes" : "no") << "\n\n";
  121. }
  122. }
  123. Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest,
  124. dsize_t batch_size) {
  125. if ((*src)->size() != batch_size) {
  126. RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Source table size does not match the batch_size");
  127. }
  128. if (batch_size == 1) {
  129. TensorRow row = std::move((*src)->front());
  130. (*src)->pop_front();
  131. (*dest)->push_back(row);
  132. for (const auto &tensor : (*dest)->front()) {
  133. RETURN_IF_NOT_OK(tensor->ExpandDim(0));
  134. }
  135. return Status::OK();
  136. }
  137. TensorRow batched_row;
  138. auto num_columns = (*src)->front().size();
  139. for (size_t i = 0; i < num_columns; i++) {
  140. std::shared_ptr<Tensor> first_tensor = (*src)->at(0).at(i); // first row, column i
  141. TensorShape first_shape = first_tensor->shape();
  142. DataType first_type = first_tensor->type();
  143. TensorShape new_shape = first_shape.PrependDim(static_cast<int64_t>(batch_size));
  144. std::shared_ptr<Tensor> new_tensor;
  145. if (first_type.IsNumeric()) { // numeric tensor
  146. RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, TensorImpl::kFlexible, new_shape, first_type));
  147. dsize_t j = 0;
  148. for (auto row : **src) {
  149. std::shared_ptr<Tensor> old_tensor = row.at(i); // row j, column i
  150. if (old_tensor->shape() == first_shape) { // check the newly popped rows have the same dim as the first
  151. RETURN_IF_NOT_OK(new_tensor->InsertTensor({j++}, old_tensor));
  152. } else {
  153. RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + std::to_string(i));
  154. }
  155. }
  156. } else { // handle string column differently
  157. std::vector<std::string> strings;
  158. for (dsize_t j = 0; j < batch_size; j++) {
  159. std::shared_ptr<Tensor> old_tensor = (*src)->at(j).at(i);
  160. for (auto itr = old_tensor->begin<std::string_view>(); itr != old_tensor->end<std::string_view>(); itr++) {
  161. strings.emplace_back(*itr);
  162. }
  163. }
  164. RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, strings, new_shape));
  165. }
  166. batched_row.emplace_back(new_tensor);
  167. }
  168. (*dest)->emplace_back(batched_row);
  169. return Status::OK();
  170. }
  171. Status BatchOp::WorkerEntry(int32_t workerId) {
  172. TaskManager::FindMe()->Post();
  173. std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair;
  174. RETURN_IF_NOT_OK(worker_queues_[workerId]->PopFront(&table_pair));
  175. while (table_pair.second.ctrl_ != batchCtrl::kQuit) {
  176. if (table_pair.second.ctrl_ == batchCtrl::kEOE) {
  177. RETURN_IF_NOT_OK(out_connector_->Add(workerId, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)));
  178. } else if (table_pair.second.ctrl_ == batchCtrl::kEOF) {
  179. RETURN_IF_NOT_OK(out_connector_->Add(workerId, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)));
  180. } else if (table_pair.second.ctrl_ == batchCtrl::kNoCtrl) {
  181. std::unique_ptr<DataBuffer> db = nullptr;
  182. RETURN_IF_NOT_OK(MakeBatchedBuffer(std::move(table_pair), &db));
  183. RETURN_IF_NOT_OK(out_connector_->Add(workerId, std::move(db)));
  184. }
  185. RETURN_IF_NOT_OK(worker_queues_[workerId]->PopFront(&table_pair));
  186. }
  187. return Status::OK();
  188. }
  189. Status BatchOp::MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair,
  190. std::unique_ptr<DataBuffer> *db) {
  191. RETURN_UNEXPECTED_IF_NULL(table_pair.first);
  192. if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc
  193. if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair.first, pad_info_, column_name_id_map_)); // do padding if needed
  194. (*db) = std::make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone);
  195. std::unique_ptr<TensorQTable> dest_table = std::make_unique<TensorQTable>();
  196. RETURN_IF_NOT_OK(BatchRows(&table_pair.first, &dest_table, table_pair.first->size()));
  197. (*db)->set_tensor_table(std::move(dest_table));
  198. return Status::OK();
  199. }
  200. Status BatchOp::LaunchThreadsAndInitOp() {
  201. RETURN_UNEXPECTED_IF_NULL(tree_);
  202. RETURN_IF_NOT_OK(worker_queues_.Register(tree_->AllTasks()));
  203. RETURN_IF_NOT_OK(tree_->LaunchWorkers(num_workers_, std::bind(&BatchOp::WorkerEntry, this, std::placeholders::_1)));
  204. return Status::OK();
  205. }
// No-op: the EOF buffer is forwarded by the workers themselves (see the kEOF
// handling in WorkerEntry), so there is nothing to do here.
Status BatchOp::EofReceived(int32_t) { return Status::OK(); }
// On end-of-epoch, mark this operator idle; the EOE buffer itself is forwarded
// by the workers (see the kEOE handling in WorkerEntry).
Status BatchOp::EoeReceived(int32_t) {
  state_ = OpState::kDeOpIdle;
  return Status::OK();
}
  211. Status BatchOp::MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
  212. TensorBatchTable input_table;
  213. input_table.reserve(pyfunc_column_names_.size());
  214. for (std::string col_name : pyfunc_column_names_) {
  215. if (column_name_id_map_.find(col_name) == column_name_id_map_.end()) {
  216. RETURN_STATUS_UNEXPECTED("column : '" + col_name + "' does not exist\n");
  217. }
  218. TensorBatch tensor_batch;
  219. tensor_batch.reserve(table_pair->first->size());
  220. size_t col_idx = static_cast<size_t>(column_name_id_map_[col_name]);
  221. for (size_t row_idx = 0; row_idx < table_pair->first->size(); row_idx++) {
  222. tensor_batch.push_back(std::move(table_pair->first->at(row_idx)[col_idx]));
  223. }
  224. input_table.push_back(std::move(tensor_batch));
  225. }
  226. // Perform batch map
  227. TensorBatchTable output_table;
  228. RETURN_IF_NOT_OK(InvokeBatchMapFunc(&input_table, &output_table, table_pair->second));
  229. // Write back to TensorQTable
  230. for (size_t input_idx = 0; input_idx < pyfunc_column_names_.size(); input_idx++) {
  231. size_t col_idx = static_cast<size_t>(column_name_id_map_[pyfunc_column_names_[input_idx]]);
  232. size_t row_id = 0;
  233. for (TensorRow &row : *(table_pair->first)) {
  234. row[col_idx] = std::move(output_table[input_idx][row_id++]);
  235. }
  236. }
  237. return Status::OK();
  238. }
  239. Status BatchOp::GetBatchSize(int32_t *batch_size, CBatchInfo info) {
  240. if (batch_size_func_ != nullptr) {
  241. RETURN_IF_NOT_OK(InvokeBatchSizeFunc(batch_size, info));
  242. } else {
  243. (*batch_size) = start_batch_size_;
  244. }
  245. return Status::OK();
  246. }
// Call the user batch-size python function with the current batch info.
// @param batch_size - out: the size returned by the callback (must be > 0)
// @param info - epoch/batch counters passed through to the callback
// @return Status - kPythonInterpreterFailure if the interpreter is gone;
//                  kPyFuncException if the callback raises, returns a value
//                  that cannot cast to int32, or returns a value <= 0
Status BatchOp::InvokeBatchSizeFunc(int32_t *batch_size, CBatchInfo info) {
  {
    // Acquire Python GIL
    py::gil_scoped_acquire gil_acquire;
    if (Py_IsInitialized() == 0) {
      return Status(StatusCode::kPythonInterpreterFailure, "Python Interpreter is finalized");
    }
    try {
      py::object size = batch_size_func_(info);
      *batch_size = size.cast<int32_t>();
      if (*batch_size <= 0) {
        return Status(StatusCode::kPyFuncException, "Batch size function should return an integer > 0");
      }
    } catch (const py::error_already_set &e) {
      // The callback itself raised a python exception.
      return Status(StatusCode::kPyFuncException, e.what());
    } catch (const py::cast_error &e) {
      // The callback returned something that cannot be cast to int32.
      return Status(StatusCode::kPyFuncException, "Batch size function should return an integer > 0");
    }
  }  // GIL released at end of this scope
  return Status(StatusCode::kOK, "Batch size func call succeed");
}
  268. Status BatchOp::InvokeBatchMapFunc(TensorBatchTable *input, TensorBatchTable *output, CBatchInfo info) {
  269. {
  270. // Acquire Python GIL
  271. py::gil_scoped_acquire gil_acquire;
  272. if (Py_IsInitialized() == 0) {
  273. return Status(StatusCode::kPythonInterpreterFailure, "Python Interpreter is finalized");
  274. }
  275. try {
  276. // Prepare batch map call back parameters
  277. py::tuple input_args(input->size() + 1);
  278. for (size_t i = 0; i < input->size(); i++) {
  279. std::vector<py::array> np_batch;
  280. for (std::shared_ptr<Tensor> t : input->at(i)) {
  281. py::array np_array;
  282. RETURN_IF_NOT_OK(t->GetDataAsNumpy(&np_array));
  283. np_batch.push_back(std::move(np_array));
  284. }
  285. input_args[i] = np_batch;
  286. }
  287. input_args[input->size()] = info;
  288. // Invoke batch map func
  289. py::object ret_py_obj = batch_map_func_(*input_args);
  290. // Parse batch map return value
  291. py::tuple ret_tuple = py::cast<py::tuple>(ret_py_obj);
  292. if (ret_tuple.size() != pyfunc_column_names_.size() || !py::isinstance<py::tuple>(ret_tuple)) {
  293. return Status(StatusCode::kPyFuncException, "Batch map function should return a tuple");
  294. }
  295. for (size_t i = 0; i < ret_tuple.size(); i++) {
  296. TensorBatch output_batch;
  297. py::list output_list = py::cast<py::list>(ret_tuple[i]);
  298. for (size_t j = 0; j < output_list.size(); j++) {
  299. std::shared_ptr<Tensor> out;
  300. RETURN_IF_NOT_OK(Tensor::CreateTensor(&out, py::cast<py::array>(output_list[j])));
  301. output_batch.push_back(std::move(out));
  302. }
  303. output->push_back(std::move(output_batch));
  304. }
  305. } catch (const py::error_already_set &e) {
  306. return Status(StatusCode::kPyFuncException, e.what());
  307. } catch (const py::cast_error &e) {
  308. return Status(StatusCode::kPyFuncException, "Batch map function should return an tuple of list of numpy array");
  309. }
  310. }
  311. return Status(StatusCode::kOK);
  312. }
// Pad the configured columns of a batch so that, within each column, every
// tensor ends up with the same shape (required before stacking in BatchRows).
// @param table - rows of the current batch, padded in place
// @param pad_info - user-specified per-column pad shapes/values (empty = pad all columns)
// @param column_name_id_map - maps column names to their index within each row
// @return Status - error on unknown column, rank mismatch, or pad failure
Status BatchOp::PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
                           const std::unordered_map<std::string, int32_t> &column_name_id_map) {
  RETURN_UNEXPECTED_IF_NULL(table);  // placeholder for now, might need this in the future
  CHECK_FAIL_RETURN_UNEXPECTED((*table)->front().size() == column_name_id_map.size(), "col_name_map mismatch");
  std::vector<std::shared_ptr<Tensor>> pad_vals(column_name_id_map.size(),
                                                0);  // value to pad each column's tensor with, default 0
  std::set<int32_t> pad_cols;
  // padded_shape provided by user, maximum shapes of current batch of tensors
  std::vector<std::vector<dsize_t>> pad_shapes(column_name_id_map.size()), max_shapes(column_name_id_map.size());
  RETURN_IF_NOT_OK(UnpackPadInfo(pad_info, column_name_id_map, &pad_cols, &pad_vals, &pad_shapes));
  // init each shape in max_shape to {-1,-1...} init each unspecified shape in pad_shape to -1 as well
  for (size_t col_id : pad_cols) {
    max_shapes[col_id] = std::vector<dsize_t>((*table)->front()[col_id]->Rank(), -1);
    if (pad_shapes[col_id].empty()) pad_shapes[col_id] = max_shapes[col_id];  // fill pad shape with -1
    CHECK_FAIL_RETURN_UNEXPECTED(pad_shapes[col_id].size() == max_shapes[col_id].size(), "wrong rank in pad_shape");
  }
  // calculate maximum shape for each column that needs to be padded
  for (const TensorRow &row : **table) {  // iterator each row in a batch
    for (size_t col_id : pad_cols) {      // iterator each tensor in a row
      CHECK_FAIL_RETURN_UNEXPECTED(row[col_id]->Rank() == max_shapes[col_id].size(),
                                   "Tensor to be padded together need to have the same rank");
      for (size_t dim = 0; dim < row[col_id]->Rank(); dim++) {  // pick the largest number in each dimension
        max_shapes[col_id][dim] = std::max(max_shapes[col_id][dim], row[col_id]->shape()[dim]);
      }
    }
  }
  // if user sets a dimension to -1 (None in python), use the max value for current dimension
  for (size_t col_id : pad_cols) {
    for (size_t dim = 0; dim < pad_shapes[col_id].size(); dim++) {
      if (pad_shapes[col_id][dim] < 0) pad_shapes[col_id][dim] = max_shapes[col_id][dim];
    }
  }
  // call pad on each tensor that needs to be padded
  for (TensorRow &row : **table) {
    for (size_t col_id : pad_cols) {
      std::shared_ptr<Tensor> pad_tensor;
      RETURN_IF_NOT_OK(PadEnd(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id]));
      row[col_id] = pad_tensor;
    }
  }
  return Status::OK();
}
  355. Status BatchOp::UnpackPadInfo(const PadInfo &pad_info,
  356. const std::unordered_map<std::string, int32_t> &column_name_id_map,
  357. std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
  358. std::vector<std::vector<dsize_t>> *pad_shapes) {
  359. if (pad_info.empty()) { // if pad_info empty, pad every columns automatically
  360. for (dsize_t col_id = 0; col_id < column_name_id_map.size(); col_id++) {
  361. pad_cols->insert(col_id);
  362. }
  363. } else {
  364. for (const auto &p : pad_info) {
  365. auto location = column_name_id_map.find(p.first);
  366. CHECK_FAIL_RETURN_UNEXPECTED(location != column_name_id_map.end(), "no column exists with name:" + p.first);
  367. auto col_id = static_cast<dsize_t>(location->second);
  368. CHECK_FAIL_RETURN_UNEXPECTED(col_id < pad_vals->size() && col_id < pad_shapes->size(), "col_id out of bound");
  369. pad_cols->insert(col_id);
  370. (*pad_vals)[col_id] = p.second.second; // set pad values
  371. (*pad_shapes)[col_id] = p.second.first.AsVector(); // empty vector if shape is unknown
  372. }
  373. }
  374. return Status::OK();
  375. }
  376. // Visitor accept method for NodePass
  377. Status BatchOp::Accept(NodePass *p, bool *modified) {
  378. // Downcast shared pointer then call visitor
  379. return p->RunOnNode(shared_from_base<BatchOp>(), modified);
  380. }
  381. } // namespace dataset
  382. } // namespace mindspore