/** * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef DATASET_ENGINE_DATASETOPS_MAP_OP_H_ #define DATASET_ENGINE_DATASETOPS_MAP_OP_H_ #include #include #include #include #include #include "dataset/engine/datasetops/parallel_op.h" #include "dataset/kernels/tensor_op.h" #include "dataset/util/queue.h" namespace mindspore { namespace dataset { // Forward declare class DataBuffer; class ExecutionTree; // MapOp class implements the Map operator. It will apply a list of operations to each record specified by column names. // The column order behavior after MapOp is as follows. // [Case 1] If the number of Input Columns == the number of Output Column, column ordering after MapOp // is the same as the original column order where the Remainder Columns stay in the same position, // and the Output Columns are placed the same position of the Input Columns. // For example, initially if the dataset has column order |A, B, C, D, E|, // and we apply MapOp() with Input Columns {B, C} and Output Columns {X, Y}. // The column order after applying MapOp will be |A, X, Y, D, E|. // Note that in this case, |X, Y| is the Output Columns and |A, D, E| which is the Remainder Columns stay in // their original position, and column B is replaced by column X and column C is replace by column Y. // [Case 2] If the number of Input Columns != the number of Output Column, column ordering after MapOp // is Output Columns followed by Remainder Columns. // For example, initially if the dataset has column order |A, B, C, D, E|, // and we apply MapOp() with Input Columns {B, C, A} and Output Columns {X, Y}. // The column order after applying MapOp will be |X, Y, D, E|. // Note that in this case, |X, Y| is the Output Columns and |D, E| is the Remainder Columns, // and the Input Columns are gone and replaced by the Output Columns. // Keywords: // Input Columns : a vector of column names (string) passed to MapOp specifying the column names from which // Tensors are taken and passed to the TensorOp Compute(). // Output Columns : a vector of column names (string) passed to MapOp specifying what are the column names // for the Tensors produced by TensorOp Compute(). // Remainder Columns : columns that exist in the dataset but are not mentioned in Input Columns. // These columns will not be passed to TensorOp Compute(), but will be appended to the end of the Output Columns. class MapOp : public ParallelOp { public: // The nested builder class inside of the MapOp is used to help manage all of // the arguments for constructing it. Use the builder by setting each argument // with the provided set methods, and then finally call the build method to execute // the actual construction. class Builder { public: // Builder constructor. Creates the builder object. // @note No default args // @return This is a constructor. Builder(); // Default destructor ~Builder() = default; // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetInColNames(const std::vector &in_col_names) { build_in_col_names_ = in_col_names; return *this; } // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetOutColNames(const std::vector &out_col_names) { build_out_col_names_ = out_col_names; return *this; } // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetTensorFuncs(std::vector> funcs) { build_tensor_funcs_ = std::move(funcs); return *this; } // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetNumWorkers(int32_t num_workers) { build_num_workers_ = num_workers; return *this; } // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetOpConnectorSize(int32_t connector_size) { build_op_connector_size_ = connector_size; return *this; } // Setter method. // @return Builder setter method returns reference to the builder. Builder &SetPerformanceMode(bool perf_mode) { build_perf_mode_ = perf_mode; return *this; } // The builder "build" method creates the final object. // @param ptr The shared_ptr to the new MapOp object // @return Status Status Build(std::shared_ptr *ptr); private: std::vector build_in_col_names_; std::vector build_out_col_names_; std::vector> build_tensor_funcs_; int32_t build_num_workers_; int32_t build_op_connector_size_; bool build_perf_mode_; // Default true. // Check if the required parameters are set by the builder. // @return Status The error code return Status sanityCheck() const; }; // Constructor of MapOp // @note The builder class should be used to call it. // @param in_col_names A list of input column names (should match the input/output \p tensorFuncs). // @param out_col_names A list of output column names (should match the input/output \p tensorFuncs). // @param tensor_funcs A list of TensorOp pointers for MapOp to apply to each data. // @param num_workers The number of worker threads. // @param op_connector_size The size of each queue in the connector. MapOp(const std::vector &in_col_names, const std::vector &out_col_names, std::vector> tensor_funcs, int32_t num_workers, int32_t op_connector_size, bool perf_mode); // Destructor ~MapOp() = default; // A print method typically used for debugging // @param out The output stream to write output to // @param show_all A bool to control if you want to show all info or just a summary void Print(std::ostream &out, bool show_all) const override; // << Stream output operator overload // @notes This allows you to write the debug print info using stream operators // @param out reference to the output stream being overloaded // @param mo reference to the MapOp to display // @return the output stream must be returned friend std::ostream &operator<<(std::ostream &out, const MapOp &mo) { mo.Print(out, false); return out; } // Class functor operator () override. // All dataset ops operate by launching a thread (see ExecutionTree). This class functor will // provide the master loop that drives the logic for performing the work // @return Status The error code return Status operator()() override; // Getter // @return the number of threads consuming data from previous op's output Connector. int32_t num_consumers() const override; private: // Local queues where worker threads can pop from. // Popping directly from the Connector can block if the previous designated threads haven't pop. // Setting the size of these queues to 0 is essentially the same as pulling directly from Connector. QueueList> local_queues_; // Static variables to be ready by worker threads, no modification and readonly const std::vector> tfuncs_; // Variable to store the column name that the tensorOps are consuming std::vector in_columns_; // Variable to store the column name that the tensorOps are producing std::vector out_columns_; // Performance mode is when the main thread creates local queues, pulls databuffers from the previous // op's Connector and distributes them to the local queues. Workers pull from the local queues. // If this flag is false, each worker pulls directly from the Connector. This use less resources // (thread and memory), but when the computation cost is heavy (e.g. DecodeOp) and fluctuating, it can // cause additional blocking because pop calls to Connector from the threads are synchronized to enforce the order. bool perf_mode_; // Private function for worker/thread to loop continuously. It comprises the main // logic of MapOp: getting the data from previous Op, validating user specified column names, // applying a list of TensorOps to each of the data, process the results and then // pushing them back to MapOp's output Connector to be fetched by the next Op. // @param worker_id The id assigned to this thread/worker upon creation. // @return Status The error code return Status WorkerEntry(int32_t worker_id) override; // In: workerId assigned by tree_ // Private function for worker thread to perform TensorOp's compute function and get the result. // @param in_buffer A raw pointer to the DataBuffer. A raw pointer is fine because this function doesn't manage memory // and is not shared with other threads. // @param to_process_indices Indices of columns to be processed by the TensorOp. // @param[out] new_tensor_table A new Tensor Table to be populated in this function. // @param keep_input_columns Keeping track of which columns to keep (not used by TensorOp). // @param input_columns The vector of input column names used in the current thread. // @param output_columns The vector of output column names used in the current thread. Status WorkerCompute(DataBuffer *in_buffer, const std::vector &to_process_indices, TensorQTable *new_tensor_table, const std::vector &keep_input_columns, std::vector *input_columns, std::vector *output_columns); // Private function for validating if each of the user specified input column names // exist in the DataBuffer. // @param col_name_id_map The column name to index mapping obtained from DataBuffer. // @param input_columns The vector of input column names used in the current thread. // @return Status The error code return Status ValidateInColumns(const std::unordered_map &col_name_id_map, std::vector *input_columns); // Private function that create the final column name to index mapping and // get indices of the columns this mapop does not use. // @param col_name_id_map The column name to index mapping obtained from DataBuffer. // @param keep_input_columns To mark which columns are to be kept (not used in mapOp). // @param input_columns The vector of input column names used in the current thread. // @param output_columns The vector of output column names used in the current thread. // @return finalColNameIdMap The final column name to index mapping. std::unordered_map CreateFinalColMap(std::unordered_map *col_name_id_map, const std::vector &keep_input_columns, std::vector *input_columns, std::vector *output_columns); // Private function that initialize some internal data structure used by WorkerEntry() // @param in_buf A raw pointer to the DataBuffer. A raw pointer is fine because this function does not manage memory // and is not shared with other threads. // @param[out] keep_input_columns Keeping track of which columns to keep (not used by TensorOp) // @param[out] to_process_indices Indices of columns to be processed by the TensorOp // @param[out] final_col_name_id_map Create the final column name id map. This final mapping will replace the old one // if the TensorOp Compute() is successful. // @param input_columns The vector of input column names used in the current thread. // @param output_columns The vector of output column names used in the current thread. Status WorkerEntryInit(const DataBuffer *in_buf, std::vector *keep_input_columns, std::vector *to_process_indices, std::unordered_map *final_col_name_id_map, std::vector *input_columns, std::vector *output_columns); }; } // namespace dataset } // namespace mindspore #endif // DATASET_ENGINE_DATASETOPS_MAP_OP_H_