@@ -143,6 +143,15 @@ if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
 endif()
+option(MGE_WITH_JIT "Build MegEngine with JIT." ON)
+option(MGE_WITH_HALIDE "Build MegEngine with Halide JIT" ON)
+option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
+option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
+option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
+option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
+option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
+option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 if(NOT MGE_WITH_JIT)
     if(MGE_WITH_HALIDE)
         message(WARNING "MGE_WITH_HALIDE is set to OFF with MGE_WITH_JIT disabled")
@@ -84,6 +84,7 @@ megcoreStatus_t megcoreGetDeviceFlags(
         unsigned int *flags);
 megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle);
+megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle);
 megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
         void **devPtr, size_t sizeInBytes);
 megcoreStatus_t megcoreFree(megcoreDeviceHandle_t handle,
@@ -86,6 +86,7 @@ if (BUILD_SHARED_LIBS)
 else()
     target_link_libraries(megdnn PRIVATE ${MGE_BLAS_LIBS})
 endif()
 if(CMAKE_THREAD_LIBS_INIT)
     target_link_libraries(megdnn PRIVATE Threads::Threads)
 endif()
@@ -38,6 +38,7 @@ class DeviceContext {
         virtual size_t mem_alignment_in_bytes() const noexcept = 0;
         virtual void activate() = 0;
+        virtual void deactivate() {}
         virtual void *malloc(size_t size_in_bytes) = 0;
         virtual void free(void *ptr) = 0;
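The default no-op body keeps existing backends source-compatible: only a backend that binds per-thread device state in activate() needs to override the new hook. A toy, self-contained sketch of that pattern (stand-in types, not the real DeviceContext interface, which has more pure-virtual members):

    #include <cstdio>

    // Stand-in mirroring the hook shown above: activate() is mandatory,
    // deactivate() defaults to doing nothing.
    struct DeviceContextSketch {
        virtual ~DeviceContextSketch() = default;
        virtual void activate() = 0;
        virtual void deactivate() {}
    };

    struct ToyBackend final : DeviceContextSketch {
        void activate() override { std::puts("bind device state to this thread"); }
        void deactivate() override { std::puts("release per-thread device state"); }
    };

    int main() {
        ToyBackend ctx;
        ctx.activate();
        ctx.deactivate();
    }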
@@ -74,6 +74,13 @@ megcoreStatus_t megcoreActivate(megcoreDeviceHandle_t handle)
     return megcoreSuccess;
 }
+megcoreStatus_t megcoreDeactivate(megcoreDeviceHandle_t handle)
+{
+    megdnn_assert(handle);
+    handle->content->deactivate();
+    return megcoreSuccess;
+}
 megcoreStatus_t megcoreMalloc(megcoreDeviceHandle_t handle,
         void **devPtr, size_t sizeInBytes)
 {
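A minimal usage sketch of the new C API entry point, assuming a CPU device handle; the handle creation/destruction calls and the megcorePlatformCPU constant are assumed from the rest of the megcore header and are not part of this diff:

    #include "megcore.h"

    int main() {
        megcoreDeviceHandle_t dev;
        megcoreCreateDeviceHandle(&dev, megcorePlatformCPU, -1, 0);
        megcoreActivate(dev);            // bind the device for the calling thread
        void* ptr = nullptr;
        megcoreMalloc(dev, &ptr, 256);   // 256 bytes of device-visible memory
        megcoreFree(dev, ptr);
        megcoreDeactivate(dev);          // new: undo activate(); no-op for most backends
        megcoreDestroyDeviceHandle(dev);
        return 0;
    }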
@@ -27,7 +27,6 @@ endif()
 add_executable(megdnn_test ${SOURCES})
 target_link_libraries(megdnn_test gtest)
 target_link_libraries(megdnn_test megdnn ${MGE_BLAS_LIBS})
@@ -246,6 +246,7 @@ SymbolVarArray _Opr::tensor_rt_runtime(const SymbolVarArray& inputs,
 }
 #endif
 SymbolVar _Opr::timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
                           const OperatorNodeConfig& config) {
     auto tensor = std::make_shared<HostTensorND>(
@@ -118,6 +118,8 @@ static SymbolVarArray tensor_rt_runtime(const SymbolVarArray& inputs,
                                         PyObject* data_bytes,
                                         const OperatorNodeConfig& config);
 static SymbolVar timestamp(SymbolVar input, PyObject* dest, size_t dest_off,
                            const OperatorNodeConfig& config);
@@ -18,7 +18,6 @@
 #if MGB_ENABLE_OPR_MM
 #include "megbrain/opr/collective_comm.h"
 #endif
 using AxisIndexer = mgb::opr::indexing::AxisIndexer;
 /*!
@@ -88,7 +88,7 @@ if (MGB_WITH_FLATBUFFERS)
             ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs
         COMMAND
             ${PYTHON_EXECUTABLE} ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY} ${CMAKE_CURRENT_BINARY_DIR}/serialization/impl/opr_param_defs.fbs
-        DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY}
+        DEPENDS ${GEN_FLATBUFFERS_SCHEMA_PY} ${OPR_PARAM_DEFS_PY}
         VERBATIM
     )
     add_custom_command(
@@ -124,7 +124,6 @@ if (MGB_WITH_FLATBUFFERS)
     target_include_directories(megbrain PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/serialization/include)
     target_compile_definitions(megbrain PUBLIC MGB_ENABLE_FBS_SERIALIZATION=1)
     target_link_libraries(megbrain PUBLIC flatbuffers)
     set (GENERATED_FLATBUFFERS_CONVERTER_PATH ${CMAKE_CURRENT_BINARY_DIR}/genfiles)
     set (GEN_FLATBUFFERS_CONVERTER_PY ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_flatbuffers_converter.py)
     file (MAKE_DIRECTORY ${GENERATED_FLATBUFFERS_CONVERTER_PATH})
@@ -96,7 +96,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info(
             cn.free_device(ptr);
         }
     };
-    megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0,0,0,0}};
+    megcore::AsyncErrorInfo zero_info{0, nullptr, "", {0, 0, 0, 0}};
     auto ptr = static_cast<megcore::AsyncErrorInfo*>(
             env.comp_node().alloc_device(sizeof(zero_info)));
     cn.copy_to_device(ptr, &zero_info, sizeof(zero_info));
@@ -106,7 +106,7 @@ megcore::AsyncErrorInfo* MegDNNHandle::make_async_error_info(
 }
 #endif
-/* =================== misc =================== */
+/* =================== misc =================== */
 #if MGB_CUDA
@@ -123,9 +123,9 @@ StaticDeviceMemoryManager::make_default_impl() {
 }
 #endif // MGB_THREAD_SAFE
-/* ==================== CUDAAsyncVarReleaser ==================== */
-#if MGB_CUDA
-class VarNodeMemManager::CUDAAsyncVarReleaser {
+/* ==================== AsyncVarReleaser ==================== */
+#if MGB_CUDA
+class VarNodeMemManager::AsyncVarReleaser {
     struct WaiterParam {
         CompNode cn;
         CompNode::Event *event;
@@ -133,10 +133,10 @@ class VarNodeMemManager::CUDAAsyncVarReleaser {
     };
     class Waiter final: public AsyncQueueSC<WaiterParam, Waiter> {
-        CUDAAsyncVarReleaser *m_par_releaser;
+        AsyncVarReleaser *m_par_releaser;
     public:
-        Waiter(CUDAAsyncVarReleaser *releaser):
+        Waiter(AsyncVarReleaser *releaser):
             m_par_releaser(releaser)
         {
         }
@@ -159,7 +159,7 @@ class VarNodeMemManager::CUDAAsyncVarReleaser {
     Spinlock m_event_pool_lock;
 public:
-    ~CUDAAsyncVarReleaser() {
+    ~AsyncVarReleaser() {
         wait_release_finish();
     }
@@ -247,15 +247,16 @@ bool VarNodeMemManager::ImpureMemPlanManager::check_need_realloc() {
 VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
     m_owner_graph(graph),
     m_seq_mem_opt(graph)
-#if MGB_CUDA
-    ,m_cuda_asyn_var_releaser(new CUDAAsyncVarReleaser)
+#if MGB_CUDA
+    ,m_asyn_var_releaser(new AsyncVarReleaser)
 #endif
 {
     auto on_comp_seq_finish = [this](const event::CompSeqExecFinished& ev) {
+        MGB_MARK_USED_VAR(ev);
         // async release is only used for sync between multiple comp nodes, and
         // does not wait for device to finish
-#if MGB_CUDA
-        m_cuda_asyn_var_releaser->wait_release_finish();
+#if MGB_CUDA
+        m_asyn_var_releaser->wait_release_finish();
 #endif
         m_cpu_async_release_barrier.wait_zero();
     };
@@ -295,9 +296,10 @@ VarNodeMemManager::VarNodeMemManager(ComputingGraphImpl *graph):
     graph->event().register_receiver_permanent<event::CompSeqExecError>(
             on_comp_seq_error);
-#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER
+#if MGB_ENABLE_VAR_DEV_MEM_DEFRAGMENTER && (MGB_CUDA \
+        )
     auto on_mem_defrag_start = [this](const event::BeforeMemDefrag&) {
-        m_cuda_asyn_var_releaser->wait_release_finish();
+        m_asyn_var_releaser->wait_release_finish();
     };
     graph->event().register_receiver_permanent<event::BeforeMemDefrag>(
             on_mem_defrag_start);
@@ -1341,7 +1343,7 @@ void VarNodeMemManager::decr_var_mem_refcnt(
         }
 #if MGB_CUDA
         case DT::CUDA:
-            m_cuda_asyn_var_releaser->add(dispatch_cn, var);
+            m_asyn_var_releaser->add(dispatch_cn, var);
             break;
 #endif
         default:
@@ -431,10 +431,10 @@ class VarNodeMemManager {
     SyncableCounter m_cpu_async_release_barrier;
-#if MGB_CUDA
-    //! release dynamic var on after cuda event finishes
-    class CUDAAsyncVarReleaser;
-    std::unique_ptr<CUDAAsyncVarReleaser> m_cuda_asyn_var_releaser;
+#if MGB_CUDA
+    //! release dynamic var on after compnode event finishes
+    class AsyncVarReleaser;
+    std::unique_ptr<AsyncVarReleaser> m_asyn_var_releaser;
 #endif
     VarDevMemDefragmenter m_var_dev_mem_defragmenter{this};
@@ -41,9 +41,9 @@
     } \
     } while (0)
-#endif // MGB_ENABLE_LOGGING
+#endif //MGB_ENABLE_LOGGING
-#endif //MGB_CUDA
+#endif
 //! whether to enable asynchronous initialization for CompNode and CompNodeEnv
 #define MGB_ENABLE_COMP_NODE_ASYNC_INIT (MGB_CUDA)
@@ -136,7 +136,6 @@ public:
      * error message
      */
     static std::string get_cuda_extra_info();
     CudaError(const std::string& msg);
 };
@@ -59,9 +59,6 @@ TEST(TestCompNode, Parse) {
     ASSERT_THROW(L::parse("cpu0:"), MegBrainError);
     ASSERT_THROW(L::parse("cpu0:x"), MegBrainError);
     ASSERT_THROW(L::parse("cpu2:23x"), MegBrainError);
-    ASSERT_THROW(L::parse("heaxgon0"), MegBrainError);
-    ASSERT_THROW(L::parse("rcom0"), MegBrainError);
-    ASSERT_THROW(L::parse("cmabricon0"), MegBrainError);
     ASSERT_THROW(L::parse("multithread"), MegBrainError);
     ASSERT_THROW(L::parse("multithread1:"), MegBrainError);
     ASSERT_THROW(L::parse("multithread1:default"), MegBrainError);
@@ -53,6 +53,7 @@
 #cmakedefine01 MEGDNN_THREADS_512
 #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS
 // whether cuda is available
 #ifndef MGB_CUDA
 #define MGB_CUDA 1
@@ -15,6 +15,7 @@ if (MGE_WITH_CUDA AND MGE_WITH_TRT)
     list(APPEND SOURCES ${SOURCES_})
 endif()
 add_executable(megbrain_test ${SOURCES})
 target_link_libraries(megbrain_test gtest)
 target_link_libraries(megbrain_test megengine)
@@ -98,22 +98,48 @@ dtype, RandomDistribution::UNIFORM>::operator ()(
     return ret;
 }
+template<typename dtype>
+std::shared_ptr<HostTensorND> HostTensorGenerator<
+        dtype, RandomDistribution::CONSTANT>::operator ()(
+        const TensorShape &shape, CompNode cn) {
+    if (!cn.valid())
+        cn = CompNode::load("xpu0");
+    std::shared_ptr<HostTensorND> ret =
+            std::make_shared<HostTensorND>(cn, shape, dtype());
+    auto ptr = ret->ptr<ctype>();
+    for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++ i) {
+        ptr[i] = m_default_val;
+    }
+    return ret;
+}
 // explicit instantialization of HostTensorGenerator
 namespace mgb {
 template class HostTensorGenerator<
         dtype::Float32, RandomDistribution::GAUSSIAN>;
 template class HostTensorGenerator<
         dtype::Float32, RandomDistribution::UNIFORM>;
+template class HostTensorGenerator<
+        dtype::Float32, RandomDistribution::CONSTANT>;
 template class HostTensorGenerator<
         dtype::Float16, RandomDistribution::GAUSSIAN>;
 template class HostTensorGenerator<
         dtype::Int8, RandomDistribution::UNIFORM>;
+template class HostTensorGenerator<
+        dtype::Int8, RandomDistribution::CONSTANT>;
 template class HostTensorGenerator<
         dtype::Uint8, RandomDistribution::UNIFORM>;
+template class HostTensorGenerator<
+        dtype::Uint8, RandomDistribution::CONSTANT>;
 template class HostTensorGenerator<
         dtype::Int16, RandomDistribution::UNIFORM>;
+template class HostTensorGenerator<
+        dtype::Int16, RandomDistribution::CONSTANT>;
 template class HostTensorGenerator<
         dtype::Int32, RandomDistribution::UNIFORM>;
+template class HostTensorGenerator<
+        dtype::Int32, RandomDistribution::CONSTANT>;
 std::shared_ptr<HostTensorND>
 HostTensorGenerator<dtype::QuantizedS8, RandomDistribution::UNIFORM>::
         operator()(const TensorShape& shape, CompNode cn) {
@@ -175,7 +175,7 @@ class RNGxorshf {
 };
 enum class RandomDistribution {
-    GAUSSIAN, UNIFORM
+    GAUSSIAN, UNIFORM, CONSTANT
 };
 template<class dtype>
@@ -322,6 +322,26 @@ class HostTensorGenerator<dtype, RandomDistribution::UNIFORM> final:
         ctype m_lo, m_hi;
 };
+//! const value
+template<class dtype>
+class HostTensorGenerator<dtype, RandomDistribution::CONSTANT> final:
+        public HostTensorGeneratorBase {
+    public:
+        using ctype = typename DTypeTrait<dtype>::ctype;
+        HostTensorGenerator(ctype default_val)
+                : HostTensorGeneratorBase{next_rand_seed()},
+                  m_default_val{default_val} {}
+        std::shared_ptr<HostTensorND> operator ()(
+                const TensorShape &shape, CompNode cn = {}) override;
+        using HostTensorGeneratorBase::operator();
+    private:
+        ctype m_default_val;
+};
 template <>
 class HostTensorGenerator<dtype::QuantizedS8, RandomDistribution::UNIFORM> final
         : public HostTensorGeneratorBase {
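A short usage sketch of the new CONSTANT generator, grounded in the declarations above (only the include path is assumed): construct it with the fill value, then call it with a shape to get a host tensor whose elements all equal that value.

    #include "megbrain/test/helper.h"  // header declaring HostTensorGenerator; path assumed

    using namespace mgb;

    void constant_tensor_example() {
        // every element of the 2x3 Float32 host tensor is initialized to 1.f
        HostTensorGenerator<dtype::Float32, RandomDistribution::CONSTANT> gen{1.f};
        std::shared_ptr<HostTensorND> host_x = gen({2, 3});
    }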
@@ -21,8 +21,8 @@ pdef('PersistentOutputStorage').add_fields(
 (pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator').
  add_enum('Strategy',
           Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
-          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
-              'and the chosen algorithm is reproducible'),
+          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
+              'and the chosen algorithm is reproducible'),
           Doc('PROFILE',
               'run possible algorithms on real device to find the best'),
           Doc('PROFILE_REPRODUCIBLE',
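For reference, a hedged sketch of how the Strategy enum generated from this pdef is typically consumed on the C++ side; the header path and the param::ExecutionPolicy field name are assumed from how MegDNN generates param structs and are not part of this diff.

    #include "megdnn/opr_param_defs.h"  // generated from opr_param_defs.py; path assumed

    megdnn::param::ExecutionPolicy make_reproducible_policy() {
        megdnn::param::ExecutionPolicy policy;
        // prefer a deterministic algorithm pick over raw speed
        policy.strategy =
                megdnn::param::ExecutionPolicy::Strategy::HEURISTIC_REPRODUCIBLE;
        return policy;
    }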