You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

map_op.h 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef DATASET_ENGINE_DATASETOPS_MAP_OP_H_
  17. #define DATASET_ENGINE_DATASETOPS_MAP_OP_H_
  18. #include <memory>
  19. #include <string>
  20. #include <unordered_map>
  21. #include <utility>
  22. #include <vector>
  23. #include "dataset/engine/datasetops/parallel_op.h"
  24. #include "dataset/kernels/tensor_op.h"
  25. #include "dataset/util/queue.h"
  26. namespace mindspore {
  27. namespace dataset {
  28. // Forward declare
  29. class DataBuffer;
  30. class ExecutionTree;
  31. // MapOp class implements the Map operator. It will apply a list of operations to each record specified by column names.
  32. // The column order behavior after MapOp is as follows.
  33. // [Case 1] If the number of Input Columns == the number of Output Column, column ordering after MapOp
  34. // is the same as the original column order where the Remainder Columns stay in the same position,
  35. // and the Output Columns are placed the same position of the Input Columns.
  36. // For example, initially if the dataset has column order |A, B, C, D, E|,
  37. // and we apply MapOp() with Input Columns {B, C} and Output Columns {X, Y}.
  38. // The column order after applying MapOp will be |A, X, Y, D, E|.
  39. // Note that in this case, |X, Y| is the Output Columns, and |A, D, E|, which are the Remainder Columns, stay in
  40. // their original position, and column B is replaced by column X and column C is replaced by column Y.
  41. // [Case 2] If the number of Input Columns != the number of Output Column, column ordering after MapOp
  42. // is Output Columns followed by Remainder Columns.
  43. // For example, initially if the dataset has column order |A, B, C, D, E|,
  44. // and we apply MapOp() with Input Columns {B, C, A} and Output Columns {X, Y}.
  45. // The column order after applying MapOp will be |X, Y, D, E|.
  46. // Note that in this case, |X, Y| is the Output Columns and |D, E| is the Remainder Columns,
  47. // and the Input Columns are gone and replaced by the Output Columns.
  48. // Keywords:
  49. // Input Columns : a vector of column names (string) passed to MapOp specifying the column names from which
  50. // Tensors are taken and passed to the TensorOp Compute().
  51. // Output Columns : a vector of column names (string) passed to MapOp specifying what are the column names
  52. // for the Tensors produced by TensorOp Compute().
  53. // Remainder Columns : columns that exist in the dataset but are not mentioned in Input Columns.
  54. // These columns will not be passed to TensorOp Compute(), but will be appended to the end of the Output Columns.
class MapOp : public ParallelOp {
 public:
  // The nested builder class inside of the MapOp is used to help manage all of
  // the arguments for constructing it. Use the builder by setting each argument
  // with the provided set methods, and then finally call the build method to execute
  // the actual construction.
  class Builder {
   public:
    // Builder constructor. Creates the builder object.
    // @note No default args
    // @return This is a constructor.
    Builder();

    // Default destructor
    ~Builder() = default;

    // Setter method.
    // @param in_col_names The input column names; Tensors from these columns are passed to the TensorOps.
    // @return Builder setter method returns reference to the builder.
    Builder &SetInColNames(const std::vector<std::string> &in_col_names) {
      build_in_col_names_ = in_col_names;
      return *this;
    }

    // Setter method.
    // @param out_col_names The output column names for the Tensors produced by the TensorOps.
    // @return Builder setter method returns reference to the builder.
    Builder &SetOutColNames(const std::vector<std::string> &out_col_names) {
      build_out_col_names_ = out_col_names;
      return *this;
    }

    // Setter method.
    // @param funcs The list of TensorOps for MapOp to apply (taken by value and moved into the builder).
    // @return Builder setter method returns reference to the builder.
    Builder &SetTensorFuncs(std::vector<std::shared_ptr<TensorOp>> funcs) {
      build_tensor_funcs_ = std::move(funcs);
      return *this;
    }

    // Setter method.
    // @param num_workers The number of worker threads for the MapOp.
    // @return Builder setter method returns reference to the builder.
    Builder &SetNumWorkers(int32_t num_workers) {
      build_num_workers_ = num_workers;
      return *this;
    }

    // Setter method.
    // @param connector_size The size of each queue in the output connector.
    // @return Builder setter method returns reference to the builder.
    Builder &SetOpConnectorSize(int32_t connector_size) {
      build_op_connector_size_ = connector_size;
      return *this;
    }

    // Setter method.
    // @param perf_mode If true, the master thread distributes buffers to per-worker local queues
    //                  (see perf_mode_ below for the trade-offs).
    // @return Builder setter method returns reference to the builder.
    Builder &SetPerformanceMode(bool perf_mode) {
      build_perf_mode_ = perf_mode;
      return *this;
    }

    // The builder "build" method creates the final object.
    // @param ptr The shared_ptr to the new MapOp object
    // @return Status
    Status Build(std::shared_ptr<MapOp> *ptr);

   private:
    std::vector<std::string> build_in_col_names_;
    std::vector<std::string> build_out_col_names_;
    std::vector<std::shared_ptr<TensorOp>> build_tensor_funcs_;
    int32_t build_num_workers_;
    int32_t build_op_connector_size_;
    bool build_perf_mode_;  // Default true.

    // Check if the required parameters are set by the builder.
    // @return Status The error code return
    Status sanityCheck() const;
  };

  // Constructor of MapOp
  // @note The builder class should be used to call it.
  // @param in_col_names A list of input column names (should match the input/output \p tensorFuncs).
  // @param out_col_names A list of output column names (should match the input/output \p tensorFuncs).
  // @param tensor_funcs A list of TensorOp pointers for MapOp to apply to each data.
  // @param num_workers The number of worker threads.
  // @param op_connector_size The size of each queue in the connector.
  // @param perf_mode If true, run with master-managed local worker queues (see perf_mode_ below).
  MapOp(const std::vector<std::string> &in_col_names, const std::vector<std::string> &out_col_names,
        std::vector<std::shared_ptr<TensorOp>> tensor_funcs, int32_t num_workers, int32_t op_connector_size,
        bool perf_mode);

  // Destructor
  ~MapOp() = default;

  // A print method typically used for debugging
  // @param out The output stream to write output to
  // @param show_all A bool to control if you want to show all info or just a summary
  void Print(std::ostream &out, bool show_all) const override;

  // << Stream output operator overload
  // @notes This allows you to write the debug print info using stream operators
  // @param out reference to the output stream being overloaded
  // @param mo reference to the MapOp to display
  // @return the output stream must be returned
  friend std::ostream &operator<<(std::ostream &out, const MapOp &mo) {
    mo.Print(out, false);
    return out;
  }

  // Class functor operator () override.
  // All dataset ops operate by launching a thread (see ExecutionTree). This class functor will
  // provide the master loop that drives the logic for performing the work
  // @return Status The error code return
  Status operator()() override;

  // Getter
  // @return the number of threads consuming data from previous op's output Connector.
  int32_t num_consumers() const override;

 private:
  // Local queues where worker threads can pop from.
  // Popping directly from the Connector can block if the previously designated threads haven't popped.
  // Setting the size of these queues to 0 is essentially the same as pulling directly from Connector.
  QueueList<std::unique_ptr<DataBuffer>> local_queues_;

  // Static variables to be read by worker threads, no modification and readonly
  const std::vector<std::shared_ptr<TensorOp>> tfuncs_;

  // Variable to store the column names that the tensorOps are consuming
  std::vector<std::string> in_columns_;

  // Variable to store the column names that the tensorOps are producing
  std::vector<std::string> out_columns_;

  // Performance mode is when the main thread creates local queues, pulls databuffers from the previous
  // op's Connector and distributes them to the local queues. Workers pull from the local queues.
  // If this flag is false, each worker pulls directly from the Connector. This uses fewer resources
  // (thread and memory), but when the computation cost is heavy (e.g. DecodeOp) and fluctuating, it can
  // cause additional blocking because pop calls to Connector from the threads are synchronized to enforce the order.
  bool perf_mode_;

  // Private function for worker/thread to loop continuously. It comprises the main
  // logic of MapOp: getting the data from previous Op, validating user specified column names,
  // applying a list of TensorOps to each of the data, process the results and then
  // pushing them back to MapOp's output Connector to be fetched by the next Op.
  // @param worker_id The id assigned to this thread/worker upon creation.
  // @return Status The error code return
  Status WorkerEntry(int32_t worker_id) override;  // In: workerId assigned by tree_

  // Private function for worker thread to perform TensorOp's compute function and get the result.
  // @param in_buffer A raw pointer to the DataBuffer. A raw pointer is fine because this function doesn't manage memory
  //     and is not shared with other threads.
  // @param to_process_indices Indices of columns to be processed by the TensorOp.
  // @param[out] new_tensor_table A new Tensor Table to be populated in this function.
  // @param keep_input_columns Keeping track of which columns to keep (not used by TensorOp).
  // @param input_columns The vector of input column names used in the current thread.
  // @param output_columns The vector of output column names used in the current thread.
  // @return Status The error code return
  Status WorkerCompute(DataBuffer *in_buffer, const std::vector<size_t> &to_process_indices,
                       TensorQTable *new_tensor_table, const std::vector<bool> &keep_input_columns,
                       std::vector<std::string> *input_columns, std::vector<std::string> *output_columns);

  // Private function for validating if each of the user specified input column names
  // exist in the DataBuffer.
  // @param col_name_id_map The column name to index mapping obtained from DataBuffer.
  // @param input_columns The vector of input column names used in the current thread.
  // @return Status The error code return
  Status ValidateInColumns(const std::unordered_map<std::string, int32_t> &col_name_id_map,
                           std::vector<std::string> *input_columns);

  // Private function that creates the final column name to index mapping and
  // gets the indices of the columns this mapop does not use.
  // @param col_name_id_map The column name to index mapping obtained from DataBuffer.
  // @param keep_input_columns To mark which columns are to be kept (not used in mapOp).
  // @param input_columns The vector of input column names used in the current thread.
  // @param output_columns The vector of output column names used in the current thread.
  // @return finalColNameIdMap The final column name to index mapping.
  std::unordered_map<std::string, int32_t> CreateFinalColMap(std::unordered_map<std::string, int32_t> *col_name_id_map,
                                                             const std::vector<bool> &keep_input_columns,
                                                             std::vector<std::string> *input_columns,
                                                             std::vector<std::string> *output_columns);

  // Private function that initializes some internal data structures used by WorkerEntry()
  // @param in_buf A raw pointer to the DataBuffer. A raw pointer is fine because this function does not manage memory
  //     and is not shared with other threads.
  // @param[out] keep_input_columns Keeping track of which columns to keep (not used by TensorOp)
  // @param[out] to_process_indices Indices of columns to be processed by the TensorOp
  // @param[out] final_col_name_id_map Create the final column name id map. This final mapping will replace the old one
  //     if the TensorOp Compute() is successful.
  // @param input_columns The vector of input column names used in the current thread.
  // @param output_columns The vector of output column names used in the current thread.
  // @return Status The error code return
  Status WorkerEntryInit(const DataBuffer *in_buf, std::vector<bool> *keep_input_columns,
                         std::vector<size_t> *to_process_indices,
                         std::unordered_map<std::string, int32_t> *final_col_name_id_map,
                         std::vector<std::string> *input_columns, std::vector<std::string> *output_columns);
};
  220. } // namespace dataset
  221. } // namespace mindspore
  222. #endif // DATASET_ENGINE_DATASETOPS_MAP_OP_H_