@@ -18,56 +18,56 @@ LITE_API inline LiteAlgoSelectStrategy operator|(
}
/*!
 * @brief the inference options which can optimize the network forwarding
 * performance
 *
 * @param weight_preprocess the option which optimizes inference performance by
 * processing the const weights of the network ahead of time
 *
 * @param fuse_preprocess fuse preprocess patterns, like astype + pad_channel +
 * dimshuffle
 *
 * @param fake_next_exec whether only to perform non-computing tasks (like
 * memory allocation and queue initialization) for the next exec. This will be
 * reset to false when the graph is executed.
 *
 * @param var_sanity_check_first_run Disable var sanity check on the first run.
 * Var sanity check is enabled on the first-time execution by default, and can
 * be used to find some potential memory access errors in the operator
 * implementation.
 *
 * @param const_shape used to reduce memory usage and improve performance, since some
 * static inference data structures can be omitted and some operators can be
 * computed before forwarding
 *
 * @param force_dynamic_alloc force dynamic memory allocation for all vars
 *
 * @param force_output_dynamic_alloc force dynamic memory allocation for output tensors
 * which are used as the input of the CallbackCaller operator
 *
 * @param no_profiling_on_shape_change do not re-profile to select the best
 * implementation algo when the input shape changes (use the previous algo)
 *
 * @param jit_level execute supported operators with JIT (supports MLIR and
 * NVRTC). Can only be used on Nvidia GPUs and x86 CPUs; this value indicates
 * the JIT level:
 * level 1: JIT execution with basic elemwise operators
 * level 2: JIT execution with elemwise and reduce operators
 *
 * @param record_level flag to optimize inference performance by recording the
 * kernel tasks in the first run; afterwards all the inference needs to do is
 * execute the recorded tasks.
 * level = 0 means normal inference,
 * level = 1 means use record inference,
 * level = 2 means record inference and free the extra memory
 *
 * @param graph_opt_level network optimization level:
 * 0: disable
 * 1: level-1: inplace arith transformations during graph
 * construction
 * 2: level-2: level-1, plus global optimization before graph
 * compiling
 * 3: also enable JIT
 * <0: corresponding level, with result check for debug
 *
 * @param async_exec_level level of dispatch on separate threads for different
 * comp_node.
 * 0: do not perform async dispatch
 * 1: dispatch async if there are more than one comp node with limited queue
@@ -99,14 +99,21 @@ struct LITE_API Options {
    bool enable_nchw64 = false;
};
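/*
 * A tuning sketch for Options (hedged: only fields documented above are used, and
 * Options is reached through Config::options when loading):
 *
 *     lite::Options options;
 *     options.weight_preprocess = true;            // process const weights ahead of time
 *     options.var_sanity_check_first_run = false;  // skip the first-run sanity check
 *     options.const_shape = true;                  // input shapes never change
 *     options.graph_opt_level = 2;                 // level-1 plus global optimization
 *     options.record_level = 1;                    // record kernel tasks in the first run
 */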
/**
 * @brief Configuration when loading and compiling a network
 *
 * @param has_compression flag of whether the model is compressed; the compression
 * method is stored in the model
 *
 * @param device_id configure the device id of a network
 * @param device_type configure the device type of a network
 * @param backend configure the inference backend of a network, currently only
 * megengine is supported
 *
 * @param bare_model_cryption_name the bare model encryption method name; a bare
 * model has no json information packed inside
 *
 * @param options configuration of Options
 */
struct LITE_API Config {
    bool has_compression = false;
@@ -118,9 +125,9 @@ struct LITE_API Config {
};
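/*
 * A Config sketch (hedged: the LiteDeviceType value name LITE_CUDA is assumed from
 * the C API; the field names follow the parameter list above):
 *
 *     lite::Config config;
 *     config.device_type = LiteDeviceType::LITE_CUDA;  // forward on a CUDA device
 *     config.device_id = 0;
 *     config.options.fuse_preprocess = true;           // fuse preprocess patterns
 *     lite::Network network(config);
 */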
/*!
 * @brief Extra Configuration for a network
 *
 * @param disable_configure_by_model_info disable the configuration dumped with the
 * model; if set true, none of the configuration in the model will apply, and users
 * should configure the network themselves.
 */
@@ -128,90 +135,136 @@ struct LITE_API ExtraConfig {
    bool disable_configure_by_model_info = false;
};
/**
 * @brief config the network input and output items; the input and output tensor
 * information is described here
 *
 * @param name the input/output tensor name
 *
 * @param is_host used to mark where the input tensor comes from and where the output
 * tensor will be copied to; if is_host is true, the input is from the host and the
 * output is copied to the host, otherwise the device. Sometimes the input is from
 * the device and the output does not need to be copied to the host; default is true.
 *
 * @param io_type the IO type, it can be SHAPE or VALUE; when SHAPE is set, the input
 * or output tensor value is invalid, only the shape will be set; default is VALUE
 *
 * @param config_layout the layout of the input or output tensor
 *
 * \verbatim embed:rst:leading-asterisk
 *
 * .. note::
 *
 *    * if another layout is set to the input tensor before forwarding, this layout
 *      will not work
 *    * if no layout is set before forwarding, the model will forward with its origin
 *      layout
 *    * if a layout is set in the output tensor, it will be used to check whether the
 *      layout computed from the network is correct
 *
 * \endverbatim
 */
struct LITE_API IO {
    //! the tensor name in the graph corresponding to the IO
    std::string name;

    //! Used to mark where the input tensor comes from and where the output will be
    //! copied to; if is_host is true, the input is from the host and the output is
    //! copied to the host, otherwise the device. Sometimes the input is from the
    //! device and the output does not need to be copied to the host; default is true.
    bool is_host = true;

    //! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the input or
    //! output tensor value is invalid, only the shape will be set; default is VALUE
    LiteIOType io_type = LiteIOType::LITE_IO_VALUE;

    //! The layout configured by the user; if another layout is set before forward,
    //! or got after forward by input tensor reset, this layout will be bypassed;
    //! if no other layout is set before forward, this layout will work;
    //! if this layout is not set, the model will forward with its origin layout;
    //! for an output tensor, it will be used to check the computed layout.
    Layout config_layout = {};
};
/**
 * @brief the input and output information when loading the network;
 * the NetworkIO will remain in the network until the network is destroyed.
 *
 * @param inputs the information of all input tensors that will be configured to the network
 * @param outputs the information of all output tensors that will be configured to the network
 */
struct LITE_API NetworkIO {
    std::vector<IO> inputs = {};
    std::vector<IO> outputs = {};
};
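/*
 * An IO/NetworkIO sketch (hedged: "data" and "prob" are placeholders for the tensor
 * names of a real model):
 *
 *     lite::IO input;
 *     input.name = "data";
 *     input.is_host = false;  // the input tensor already lives on the device
 *     lite::IO output;
 *     output.name = "prob";
 *     lite::NetworkIO network_io;
 *     network_io.inputs.push_back(input);
 *     network_io.outputs.push_back(output);
 *     lite::Network network(network_io);
 */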
/**
 * @brief A user-implemented allocator interface; users can register an allocator
 * to megengine, then all the runtime memory will be allocated by this allocator
 */
class LITE_API Allocator {
public:
    virtual ~Allocator() = default;

    /** @brief allocate memory of the given size on the given device with the given
     * alignment
     *
     * @param device_type the device type the memory will be allocated on
     * @param device_id the device id the memory will be allocated on
     * @param size the byte size of the memory to be allocated
     * @param align the alignment required when allocating the memory
     */
    virtual void* allocate(
            LiteDeviceType device_type, int device_id, size_t size, size_t align) = 0;

    /** @brief free the memory pointed to by ptr on the given device
     *
     * @param device_type the device type the memory was allocated on
     * @param device_id the device id the memory was allocated on
     * @param ptr the memory pointer to be freed
     */
    virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0;
};
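/*
 * A host-only Allocator sketch (hedged: a real implementation must also handle
 * non-CPU device types, and aligned_alloc requires size to be a multiple of align):
 *
 *     class HostAllocator : public lite::Allocator {
 *     public:
 *         void* allocate(LiteDeviceType device_type, int device_id, size_t size,
 *                        size_t align) override {
 *             return ::aligned_alloc(align, size);  // host memory only
 *         }
 *         void free(LiteDeviceType device_type, int device_id, void* ptr) override {
 *             ::free(ptr);
 *         }
 *     };
 */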
/**
 * @brief the thread affinity callback function type
 *
 * @param thread_id the id of the current thread; the id is a number from 0 to
 * (nr_threads - 1), and the thread with id (nr_threads - 1) is the main worker
 * thread.
 */
using ThreadAffinityCallback = std::function<void(int thread_id)>;
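/*
 * A ThreadAffinityCallback sketch (hedged: bind_to_core is a hypothetical helper;
 * the real binding call is platform specific, e.g. sched_setaffinity on Linux):
 *
 *     lite::ThreadAffinityCallback callback = [](int thread_id) {
 *         bind_to_core(thread_id);  // pin worker thread_id to a fixed core
 *     };
 */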
/**
 * @brief the network async callback function type
 */
using AsyncCallback = std::function<void(void)>;
/**
 * @brief the start/finish callback function type
 *
 * @param unordered_map map from the io tensor name to the pair of the
 * user configuration information and the actual input or output tensor.
 */
//@{
using StartCallback =
        std::function<void(const std::unordered_map<
                           std::string, std::pair<IO, std::shared_ptr<Tensor>>>&)>;
using FinishCallback =
        std::function<void(const std::unordered_map<
                           std::string, std::pair<IO, std::shared_ptr<Tensor>>>&)>;
//@}
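/*
 * A StartCallback sketch that logs input tensor names before forwarding (a
 * FinishCallback can be written the same way for outputs):
 *
 *     lite::StartCallback start = [](const std::unordered_map<
 *             std::string, std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& io) {
 *         for (auto&& item : io)
 *             printf("input tensor: %s\n", item.first.c_str());
 *     };
 */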
/**
 * @brief The network is the main class to perform forwarding; it is constructed
 * from a model, implements model load, init, and forward, and displays some model
 * information
 */
class LITE_API Network {
public:
    class NetworkImplBase;
    friend class NetworkHelper;

    ~Network();

    /*! @brief Construct a network with the given configuration and IO information
     *
     * @name Constructor
     *
     * @param config the configuration to create the network
     * @param networkio the NetworkIO describing the input and output tensors of
     * the network
     */
    //@{
    Network(const Config& config = {}, const NetworkIO& networkio = {});
    Network(const NetworkIO& networkio, const Config& config = {});
    //@}
    //! load the model from memory
    void load_model(void* model_mem, size_t size);
@@ -219,32 +272,37 @@ public:
    //! load the model from a model path
    void load_model(std::string model_path);
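    /*
     * A minimal load sketch (hedged: "./model.mge" is a placeholder path, and the
     * default Config/NetworkIO are used):
     *
     *     std::shared_ptr<lite::Network> network = std::make_shared<lite::Network>();
     *     network->load_model("./model.mge");
     */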
    //! only compute the output tensors configured by the IO information
    void compute_only_configured_output();
    /** @brief get the network input or output tensor, the layout of which is
     * synced from the megengine tensor; when the names of an input and an output
     * tensor are the same, use LiteTensorPhase to separate them
     *
     * @param io_name the name of the tensor
     * @param phase indicates whether the tensor is an input or an output tensor,
     * since an input tensor name may be the same as an output tensor name
     */
    std::shared_ptr<Tensor> get_io_tensor(
            std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO);
    //! get the network input tensor by index
    std::shared_ptr<Tensor> get_input_tensor(size_t index);

    //! get the network output tensor by index
    std::shared_ptr<Tensor> get_output_tensor(size_t index);
    //! set the network forwarding in async mode and set the AsyncCallback callback
    //! function
    Network& set_async_callback(const AsyncCallback& async_callback);
    //! set the start forwarding callback function of type StartCallback, which will
    //! be executed before forward; this can be used to check network inputs or dump
    //! model inputs for debugging
    Network& set_start_callback(const StartCallback& start_callback);

    //! set the finish forwarding callback function of type FinishCallback, which
    //! will be executed after forward; this can be used to dump model outputs for
    //! debugging
    Network& set_finish_callback(const FinishCallback& finish_callback);
    //! forward the network with filled input data and fill the output data
@@ -254,33 +312,37 @@ public:
    //! wait until forward finishes in sync mode
    void wait();
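    /*
     * A synchronous forward sketch (hedged: "data" and "prob" are placeholder
     * tensor names, and get_memory_ptr() is assumed from the lite::Tensor API):
     *
     *     auto input = network->get_io_tensor("data");
     *     // fill input->get_memory_ptr() with prepared input data here
     *     network->forward();
     *     network->wait();
     *     auto output = network->get_io_tensor("prob");
     *     float* result = static_cast<float*>(output->get_memory_ptr());
     */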
    //! get the input tensor name by index, in the order returned by load
    std::string get_input_name(size_t index) const;

    //! get the output tensor name by index, in the order returned by load
    std::string get_output_name(size_t index) const;

    //! get all the input tensor names
    std::vector<std::string> get_all_input_name() const;

    //! get all the output tensor names
    std::vector<std::string> get_all_output_name() const;
    //! set the network forwarding device id, default device id = 0
    Network& set_device_id(int device_id);

    //! get the network forwarding device id
    int get_device_id() const;

    //! set the network stream id, default stream id = 0
    Network& set_stream_id(int stream_id);

    //! get the network stream id
    int get_stream_id() const;
    //! enable profiling of the network; a file will be generated at the given path
    void enable_profile_performance(std::string profile_file_path);

    //! get the model extra info; the extra information is packed into the model by
    //! the user
    const std::string& get_model_extra_info();

    //! get the network device type
    LiteDeviceType get_device_type() const;

    //! get static peak memory info shown by Graph visualization
@@ -312,80 +374,163 @@ private:
};
/*********************** MGE special network function ***************/

/*!
 * @brief All the runtime configuration functions are defined in the Runtime class,
 * as static member functions
 */
class LITE_API Runtime {
public:
    /** @brief the multithread number setter and getter interface;
     * when the device is CPU, this interface will set the network to run
     * in multithread mode with the given thread number.
     *
     * @param dst_network the target network to set/get the thread number
     * @param nr_threads the thread number to set to the target network
     */
    //@{
    static void set_cpu_threads_number(
            std::shared_ptr<Network> dst_network, size_t nr_threads);
    static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network);
    //@}
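    /*
     * A usage sketch (hedged: assumes the thread number is configured before
     * load_model, as the setter applies to the network to be loaded):
     *
     *     std::shared_ptr<lite::Network> network = std::make_shared<lite::Network>();
     *     lite::Runtime::set_cpu_threads_number(network, 4);  // forward with 4 threads
     *     network->load_model("./model.mge");
     */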
    /** @brief set the thread affinity callback
     *
     * @param network the target network to set the thread affinity callback
     * @param thread_affinity_callback the ThreadAffinityCallback callback used to
     * set the thread affinity
     */
    static void set_runtime_thread_affinity(
            std::shared_ptr<Network> network,
            const ThreadAffinityCallback& thread_affinity_callback);
    /** @brief set cpu default mode when the device is CPU; on some low-computation
     * or single-core devices, this mode will get good performance
     *
     * @param dst_network the target network to set/get cpu inplace mode
     */
    //@{
    static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
    static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
    //@}
    //! set the network forwarding to use tensorrt
    static void use_tensorrt(std::shared_ptr<Network> dst_network);
    /** @brief set the operator algorithm selection strategy in the target network
     *
     * @param dst_network the target network to set the algorithm strategy
     * @param strategy the algorithm strategy to set to the network; if multiple
     * strategies should be set, the | operator can pack them together
     * @param shared_batch_size the batch size used by fast-run; a non-zero value
     * means that fast-run uses this batch size regardless of the batch size of the
     * model, and zero means fast-run uses the batch size of the model
     *
     * @param binary_equal_between_batch if set true, when the content of each input
     * batch is binary equal, the content of each output batch is promised to be
     * equal, otherwise not
     */
    static void set_network_algo_policy(
            std::shared_ptr<Network> dst_network, LiteAlgoSelectStrategy strategy,
            uint32_t shared_batch_size = 0, bool binary_equal_between_batch = false);
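    /*
     * A sketch of packing strategies with the | operator declared at the top of
     * this header (hedged: the enum value names LITE_ALGO_PROFILE and
     * LITE_ALGO_REPRODUCIBLE are assumed from the C API):
     *
     *     lite::Runtime::set_network_algo_policy(
     *             network,
     *             LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
     *                     LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);
     */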
    /** @brief set the operator workspace limitation in the target network; some
     * operators may use a large workspace to get good performance; setting a
     * workspace limitation can save memory but may influence the performance
     *
     * @param dst_network the target network to set/get the workspace limitation
     * @param workspace_limit the byte size of the workspace limitation
     */
    static void set_network_algo_workspace_limit(
            std::shared_ptr<Network> dst_network, size_t workspace_limit);
    /** @brief set the network runtime memory Allocator; the Allocator is defined by
     * the user, and through this method the user can implement a memory pool for
     * network forwarding
     *
     * @param dst_network the target network
     * @param user_allocator the user-defined Allocator
     */
    static void set_memory_allocator(
            std::shared_ptr<Network> dst_network,
            std::shared_ptr<Allocator> user_allocator);
    /** @brief share the runtime memory with another network; the weights are not
     * shared
     *
     * \verbatim embed:rst:leading-asterisk
     *
     * .. warning::
     *
     *    the src network and the dst network can not execute simultaneously
     *
     * \endverbatim
     *
     * @param dst_network the target network that shares the runtime memory of
     * src_network
     * @param src_network the source network that shares its runtime memory with
     * dst_network
     */
    static void share_runtime_memory_with(
            std::shared_ptr<Network> dst_network, std::shared_ptr<Network> src_network);
    /** @brief dump the input/output tensors of all operators to the output file, in
     * txt format; users can use this function to debug compute errors
     *
     * @param dst_network the target network to dump its tensors
     * @param io_txt_out_file the txt file
     */
    static void enable_io_txt_dump(
            std::shared_ptr<Network> dst_network, std::string io_txt_out_file);
    /** @brief dump the input/output tensors of all operators to the output
     * directory, in binary format; users can use this function to debug compute
     * errors
     *
     * @param dst_network the target network to dump its tensors
     * @param io_bin_out_dir the binary file directory
     */
    static void enable_io_bin_dump(
            std::shared_ptr<Network> dst_network, std::string io_bin_out_dir);
    /** @brief load a new network which will share weights with the src network;
     * this can reduce memory usage when the user wants to load the same model
     * multiple times
     *
     * @param dst_network the target network that shares the weights of src_network
     * @param src_network the source network that shares its weights with dst_network
     */
    static void shared_weight_with_network(
            std::shared_ptr<Network> dst_network,
            const std::shared_ptr<Network> src_network);
    /** @brief set global layout transform optimization for the network; global
     * layout optimization can automatically determine the layout of every operator
     * in the network by profiling, thus it can improve the performance of network
     * forwarding
     */
    static void enable_global_layout_transform(std::shared_ptr<Network> network);
    /** @brief dump the network after global layout transform optimization to the
     * specified path
     */
    static void dump_layout_transform_model(
            std::shared_ptr<Network> network, std::string optimized_model_path);
    /** @brief get the model io information before the model is loaded, by model path
     *
     * @param model_path the model path to get the model IO information from
     * @param config the model configuration
     *
     * @return the model NetworkIO information
     */
    static NetworkIO get_model_io_info(
            const std::string& model_path, const Config& config = {});
    /** @brief get the model io information before the model is loaded, by model
     * memory
     *
     * @param model_mem the model memory to get the model IO information from
     * @param size the model memory size in bytes
     * @param config the model configuration
     *
     * @return the model NetworkIO information
     */
    static NetworkIO get_model_io_info(
            const void* model_mem, size_t size, const Config& config = {});
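    /*
     * An inspection sketch (hedged: "./model.mge" is a placeholder path):
     *
     *     lite::NetworkIO io = lite::Runtime::get_model_io_info("./model.mge");
     *     for (auto&& in : io.inputs)
     *         printf("input: %s\n", in.name.c_str());
     */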
};