You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

map_op.h 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef DATASET_ENGINE_DATASETOPS_MAP_OP_H_
  17. #define DATASET_ENGINE_DATASETOPS_MAP_OP_H_
  18. #include <memory>
  19. #include <string>
  20. #include <unordered_map>
  21. #include <utility>
  22. #include <vector>
  23. #include "dataset/engine/datasetops/parallel_op.h"
  24. #include "dataset/kernels/tensor_op.h"
  25. #include "dataset/util/queue.h"
  26. namespace mindspore {
  27. namespace dataset {
  28. // Forward declarations. DataBuffer appears below only as std::unique_ptr<DataBuffer> and DataBuffer * parameters.
  29. class DataBuffer;
  30. class ExecutionTree;  // Mentioned only in comments within this header.
  31. // MapOp class implements the Map operator. It will apply a list of operations to each record specified by column names.
  32. // The column order behavior after MapOp is as follows.
  33. // [Case 1] If the number of Input Columns == the number of Output Columns, the column ordering after MapOp
  34. // is the same as the original column order: the Remainder Columns stay in the same positions,
  35. // and the Output Columns are placed in the same positions as the Input Columns.
  36. // For example, initially if the dataset has column order |A, B, C, D, E|,
  37. // and we apply MapOp() with Input Columns {B, C} and Output Columns {X, Y}.
  38. // The column order after applying MapOp will be |A, X, Y, D, E|.
  39. // Note that in this case, |X, Y| are the Output Columns and |A, D, E|, the Remainder Columns, stay in
  40. // their original positions; column B is replaced by column X and column C is replaced by column Y.
  41. // [Case 2] If the number of Input Columns != the number of Output Columns, the column ordering after MapOp
  42. // is Output Columns followed by Remainder Columns.
  43. // For example, initially if the dataset has column order |A, B, C, D, E|,
  44. // and we apply MapOp() with Input Columns {B, C, A} and Output Columns {X, Y}.
  45. // The column order after applying MapOp will be |X, Y, D, E|.
  46. // Note that in this case, |X, Y| is the Output Columns and |D, E| is the Remainder Columns,
  47. // and the Input Columns are gone and replaced by the Output Columns.
  48. // Keywords:
  49. // Input Columns : a vector of column names (string) passed to MapOp specifying the column names from which
  50. // Tensors are taken and passed to the TensorOp Compute().
  51. // Output Columns : a vector of column names (string) passed to MapOp specifying what are the column names
  52. // for the Tensors produced by TensorOp Compute().
  53. // Remainder Columns : columns that exist in the dataset but are not mentioned in Input Columns.
  54. // These columns are not passed to TensorOp Compute(); they keep their positions in Case 1 and follow the Output Columns in Case 2.
  55. class MapOp : public ParallelOp {
  56. public:
  57. // The nested builder class inside of the MapOp is used to help manage all of
  58. // the arguments for constructing it. Use the builder by setting each argument
  59. // with the provided set methods, and then finally call the build method to execute
  60. // the actual construction.
  61. class Builder {
  62. public:
  63. // Builder constructor. Creates the builder object.
  64. // @note No default args
  65. // @return This is a constructor.
  66. Builder();
  67. // Default destructor
  68. ~Builder() = default;
  69. // Sets the input column names (the vector is copied into the builder).
  70. // @return Builder setter method returns reference to the builder.
  71. Builder &SetInColNames(const std::vector<std::string> &in_col_names) {
  72. build_in_col_names_ = in_col_names;
  73. return *this;
  74. }
  75. // Sets the output column names (the vector is copied into the builder).
  76. // @return Builder setter method returns reference to the builder.
  77. Builder &SetOutColNames(const std::vector<std::string> &out_col_names) {
  78. build_out_col_names_ = out_col_names;
  79. return *this;
  80. }
  81. // Sets the TensorOps to apply; the vector is taken by value and moved into the builder.
  82. // @return Builder setter method returns reference to the builder.
  83. Builder &SetTensorFuncs(std::vector<std::shared_ptr<TensorOp>> funcs) {
  84. build_tensor_funcs_ = std::move(funcs);
  85. return *this;
  86. }
  87. // Sets the number of workers.
  88. // @return Builder setter method returns reference to the builder.
  89. Builder &SetNumWorkers(int32_t num_workers) {
  90. build_num_workers_ = num_workers;
  91. return *this;
  92. }
  93. // Sets the op connector size.
  94. // @return Builder setter method returns reference to the builder.
  95. Builder &SetOpConnectorSize(int32_t connector_size) {
  96. build_op_connector_size_ = connector_size;
  97. return *this;
  98. }
  99. // Sets the performance-mode flag.
  100. // @return Builder setter method returns reference to the builder.
  101. Builder &SetPerformanceMode(bool perf_mode) {
  102. build_perf_mode_ = perf_mode;
  103. return *this;
  104. }
  105. // The builder "build" method creates the final object.
  106. // @param ptr The shared_ptr to the new MapOp object
  107. // @return Status
  108. Status Build(std::shared_ptr<MapOp> *ptr);
  109. private:
  110. std::vector<std::string> build_in_col_names_;
  111. std::vector<std::string> build_out_col_names_;
  112. std::vector<std::shared_ptr<TensorOp>> build_tensor_funcs_;
  113. int32_t build_num_workers_{0};  // Value-initialized so it is never indeterminate.
  114. int32_t build_op_connector_size_{0};  // Value-initialized so it is never indeterminate.
  115. bool build_perf_mode_{true}; // Default true.
  116. // Check if the required parameters are set by the builder.
  117. // @return Status The error code return
  118. Status sanityCheck() const;
  119. };
  120. // Constructor of MapOp
  121. // @note The builder class should be used to call it.
  122. // @param in_col_names A list of input column names (should match the input/output \p tensorFuncs).
  123. // @param out_col_names A list of output column names (should match the input/output \p tensorFuncs).
  124. // @param tensor_funcs A list of TensorOp pointers for MapOp to apply to each data.
  125. // @param num_workers The number of worker threads.
  126. // @param op_connector_size The size of each queue in the connector.
  127. MapOp(const std::vector<std::string> &in_col_names, const std::vector<std::string> &out_col_names,
  128. std::vector<std::shared_ptr<TensorOp>> tensor_funcs, int32_t num_workers, int32_t op_connector_size,
  129. bool perf_mode);  // perf_mode: whether workers pull from local queues instead of directly from the Connector.
  130. // Destructor
  131. ~MapOp() override = default;
  132. // A print method typically used for debugging
  133. // @param out The output stream to write output to
  134. // @param show_all A bool to control if you want to show all info or just a summary
  135. void Print(std::ostream &out, bool show_all) const override;
  136. // << Stream output operator overload
  137. // @notes This allows you to write the debug print info using stream operators; it calls Print() with show_all = false
  138. // @param out reference to the output stream being overloaded
  139. // @param mo reference to the MapOp to display
  140. // @return the output stream must be returned
  141. friend std::ostream &operator<<(std::ostream &out, const MapOp &mo) {
  142. mo.Print(out, false);
  143. return out;
  144. }
  145. // Class functor operator () override.
  146. // All dataset ops operate by launching a thread (see ExecutionTree). This class functor will
  147. // provide the master loop that drives the logic for performing the work
  148. // @return Status The error code return
  149. Status operator()() override;
  150. // Getter
  151. // @return the number of threads consuming data from previous op's output Connector.
  152. int32_t num_consumers() const override;
  153. // Base-class override for NodePass visitor acceptor.
  154. // @param p - Pointer to the NodePass to be accepted.
  155. // @param modified - Whether this node visit modified the pipeline.
  156. // @return - Status of the node visit.
  157. Status Accept(NodePass *p, bool *modified) override;
  158. // Op name getter
  159. // @return Name of the current Op
  160. std::string Name() const override { return "MapOp"; }
  161. private:
  162. // Local queues where worker threads can pop from.
  163. // Popping directly from the Connector can block if the previous designated threads haven't popped yet.
  164. // Setting the size of these queues to 0 is essentially the same as pulling directly from Connector.
  165. QueueList<std::unique_ptr<DataBuffer>> local_queues_;
  166. // TensorOps to be read by worker threads; declared const, so never modified after construction.
  167. const std::vector<std::shared_ptr<TensorOp>> tfuncs_;
  168. // Variable to store the column name that the tensorOps are consuming
  169. std::vector<std::string> in_columns_;
  170. // Variable to store the column name that the tensorOps are producing
  171. std::vector<std::string> out_columns_;
  172. // Boolean mapping, true means to keep the column.
  173. std::vector<bool> keep_input_columns_;
  174. // Indices of the columns to process.
  175. std::vector<size_t> to_process_indices_;
  176. // Performance mode is when the main thread creates local queues, pulls databuffers from the previous
  177. // op's Connector and distributes them to the local queues. Workers pull from the local queues.
  178. // If this flag is false, each worker pulls directly from the Connector. This uses fewer resources
  179. // (thread and memory), but when the computation cost is heavy (e.g. DecodeOp) and fluctuating, it can
  180. // cause additional blocking because pop calls to Connector from the threads are synchronized to enforce the order.
  181. bool perf_mode_;
  182. // Private function for worker/thread to loop continuously. It comprises the main
  183. // logic of MapOp: getting the data from previous Op, validating user specified column names,
  184. // applying a list of TensorOps to each of the data, process the results and then
  185. // pushing them back to MapOp's output Connector to be fetched by the next Op.
  186. // @param worker_id The id assigned to this thread/worker upon creation.
  187. // @return Status The error code return
  188. Status WorkerEntry(int32_t worker_id) override; // In: workerId assigned by tree_
  189. // Private helper function for getting the next buffer
  190. // When PerformanceMode is enabled, workers pop from the local queue.
  191. // Otherwise, workers pop from the first child output Connector.
  192. // @param p_buffer - the buffer to return
  193. // @return Status return code; a failure from the underlying pop/get call is returned to the caller unchanged
  194. Status FetchNextBuffer(std::unique_ptr<DataBuffer> *p_buffer, int32_t worker_id) {  // worker_id picks the queue
  195. if (perf_mode_) {
  196. RETURN_IF_NOT_OK(local_queues_[worker_id]->PopFront(p_buffer));
  197. } else {
  198. RETURN_IF_NOT_OK(child_[0]->GetNextBuffer(p_buffer, worker_id));
  199. }
  200. return Status::OK();
  201. }
  202. // Private function for worker thread to perform TensorOp's compute function and get the result.
  203. // @param in_buffer A raw pointer to the DataBuffer. A raw pointer is fine because this function doesn't manage memory
  204. // and is not shared with other threads.
  205. // @param[out] new_tensor_table A new Tensor Table to be populated in this function.
  206. Status WorkerCompute(DataBuffer *in_buffer, TensorQTable *new_tensor_table);
  207. // Private function that create the final column name to index mapping and
  208. // get indices of the columns this mapop does not use.
  209. // @param col_name_id_map The column name to index mapping obtained from child operator
  210. void CreateFinalColMap(std::unordered_map<std::string, int32_t> *col_name_id_map);
  211. // Validating if each of the input_columns exists in the DataBuffer.
  212. // @param - the column map to check
  213. // @return - status return code
  214. Status ValidateInColumns(const std::unordered_map<std::string, int32_t> &col_name_id_map);
  215. // Private function for computing the assignment of the column name map.
  216. // @return - Status
  217. Status ComputeColMap() override;
  218. // Private function for initializing private variables such as in_columns_, out_columns_.
  219. // @return - Status
  220. Status InitPrivateVariable(std::unordered_map<std::string, int32_t> *col_name_id_map);
  221. };
  222. } // namespace dataset
  223. } // namespace mindspore
  224. #endif // DATASET_ENGINE_DATASETOPS_MAP_OP_H_