| @@ -76,6 +76,7 @@ from .core._imperative_rt.core2 import full_sync as _full_sync | |||||
| from .core._imperative_rt.core2 import sync as _sync | from .core._imperative_rt.core2 import sync as _sync | ||||
| from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | ||||
| from .device import * | from .device import * | ||||
| from .dtr import * | |||||
| from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | ||||
| from .serialization import load, save | from .serialization import load, save | ||||
| from .tensor import Parameter, Tensor, tensor | from .tensor import Parameter, Tensor, tensor | ||||
| @@ -0,0 +1,131 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| import re | |||||
| from typing import Union | |||||
| from mprop import mproperty | |||||
| from .core._imperative_rt.core2 import set_option | |||||
| from .core._imperative_rt.utils import _set_defrag | |||||
| _eviction_threshold = 0 | |||||
| _evictee_minimum_size = 1024 ** 2 | |||||
| def str2bytes(text: str) -> int: | |||||
| regex = re.compile(r"(\d+(?:\.\d+)?)\s*([kmg]?b)", re.IGNORECASE) | |||||
| order = ["b", "kb", "mb", "gb"] | |||||
| result = regex.findall(text) | |||||
| if len(result) != 1: | |||||
| raise ValueError( | |||||
| "Formatting of `value` only supports bytes(B), kilobyte(KB), megabyte(MB) and gigabyte(GB) units" | |||||
| ) | |||||
| return int(float(result[0][0]) * 1024 ** order.index(result[0][1].lower())) | |||||
| @mproperty | |||||
| def eviction_threshold(mod): | |||||
| r""" | |||||
| Returns the eviction threshold in bytes. | |||||
| .. note:: | |||||
| When GPU memory usage exceeds this value, DTR will heuristically select | |||||
| and evict resident tensors until the amount of used memory falls below | |||||
| this threshold. | |||||
| """ | |||||
| return mod._eviction_threshold | |||||
| @eviction_threshold.setter | |||||
| def eviction_threshold(mod, value: Union[int, str]): | |||||
| r""" | |||||
| Change the eviction threshold. If `value` is an int, it represents the | |||||
| number of bytes. If `value` is a string, its formatting supports bytes(B), | |||||
| kilobyte(KB), megabyte(MB) and gigabyte(GB) units. | |||||
| Examples: | |||||
| .. code-block:: | |||||
| import megengine as mge | |||||
| mge.dtr.eviction_threshold = 2 * 1024 ** 3 | |||||
| mge.dtr.eviction_threshold = "2GB" | |||||
| mge.dtr.eviction_threshold = "2048MB" | |||||
| """ | |||||
| if isinstance(value, str): | |||||
| mod._eviction_threshold = mod.str2bytes(value) | |||||
| elif isinstance(value, int): | |||||
| mod._eviction_threshold = value | |||||
| else: | |||||
| raise TypeError("`value` should be a str or an int") | |||||
| set_option("dtr_eviction_threshold", mod._eviction_threshold) | |||||
| @mproperty | |||||
| def evictee_minimum_size(mod): | |||||
| r""" | |||||
| Returns the memory threshold of tensors in bytes. | |||||
| .. note:: | |||||
| Only tensors whose size exceeds this threshold will be added to the | |||||
| candidate set. A tensor that is not added to the candidate set will | |||||
| never be evicted during its lifetime. | |||||
| """ | |||||
| return mod._evictee_minimum_size | |||||
| @evictee_minimum_size.setter | |||||
| def evictee_minimum_size(mod, value: Union[int, str]): | |||||
| r""" | |||||
| Change the memory threshold of tensors. If `value` is an int, it represents | |||||
| the number of bytes. If `value` is a string, its formatting supports bytes(B), | |||||
| kilobyte(KB), megabyte(MB) and gigabyte(GB) units. | |||||
| Examples: | |||||
| .. code-block:: | |||||
| import megengine as mge | |||||
| mge.dtr.evictee_minimum_size = 2 * 1024 ** 2 | |||||
| mge.dtr.evictee_minimum_size = "2MB" | |||||
| mge.dtr.evictee_minimum_size = "2048KB" | |||||
| """ | |||||
| if isinstance(value, str): | |||||
| mod._evictee_minimum_size = mod.str2bytes(value) | |||||
| elif isinstance(value, int): | |||||
| mod._evictee_minimum_size = value | |||||
| else: | |||||
| raise TypeError("`value` should be a str or an int") | |||||
| set_option("dtr_evictee_minimum_size", mod._evictee_minimum_size) | |||||
| def enable(): | |||||
| r""" | |||||
| Enable to record computing path of tensors and to perform DTR policy. | |||||
| """ | |||||
| _set_defrag(True) | |||||
| set_option("enable_dtr_auto_drop", 1) | |||||
| set_option("enable_drop", 1) | |||||
| set_option("buffer_length", 0) | |||||
| set_option("record_computing_path", 1) | |||||
| def disable(): | |||||
| r""" | |||||
| Stop recording computing path of tensors and performing DTR policy. | |||||
| """ | |||||
| set_option("enable_dtr_auto_drop", 0) | |||||
| set_option("enable_drop", 0) | |||||
| set_option("record_computing_path", 0) | |||||
| @@ -1,44 +0,0 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| from ..core._imperative_rt.core2 import set_option | |||||
| from ..core._imperative_rt.utils import _set_defrag | |||||
| class DTR: | |||||
| r""" | |||||
| DTR implements `Dynamic Tensor Rematerialization <https://arxiv.org/abs/2006.09616>`_ in MegEngine. | |||||
| It is basically an online algorithm for checkpointing driven by certain eviction policies. | |||||
| .. code-block:: | |||||
| from megengine.utils.dtr import DTR | |||||
| ds = DTR(memory_budget=5*1024**3) | |||||
| # your training code | |||||
| """ | |||||
| def __init__(self, memory_budget=0, tensor_lowerbound=1048576): | |||||
| r""" | |||||
| :param memory_budget: int. The threshold of memory usage. When memory | |||||
| usage exceeds this value, auto evict will be triggered. | |||||
| :param tensor_lowerbound: int. The minimum memory limit of the tensor | |||||
| that can be evicted. Default: 1MB. | |||||
| """ | |||||
| if memory_budget > 0: | |||||
| set_option("enable_auto_drop", 1) | |||||
| set_option("enable_drop", 1) | |||||
| set_option("buffer_length", 0) | |||||
| set_option("memory_budget", memory_budget) | |||||
| set_option("tensor_lowerbound", tensor_lowerbound) | |||||
| set_option("record_computing_path", 1) | |||||
| _set_defrag(True) | |||||
| @@ -6,3 +6,4 @@ tabulate | |||||
| tqdm | tqdm | ||||
| redispy | redispy | ||||
| deprecated | deprecated | ||||
| mprop | |||||
| @@ -422,7 +422,7 @@ void ChannelImpl::do_drop(TensorInfo* ptr, bool user=false) { | |||||
| } | } | ||||
| void ChannelImpl::free(TensorInfo* ptr) { | void ChannelImpl::free(TensorInfo* ptr) { | ||||
| if (m_worker_state.options.enable_auto_drop) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop) { | |||||
| // Evicting a tensor, rather than freeing it, can avoid pinning | // Evicting a tensor, rather than freeing it, can avoid pinning | ||||
| // potentially exploding amounts of memory and allow us to save | // potentially exploding amounts of memory and allow us to save | ||||
| // more memory. | // more memory. | ||||
| @@ -459,7 +459,7 @@ void ChannelImpl::real_free(TensorInfo* ptr) { | |||||
| if (m_channel_state.profiler->is_profiling()) { | if (m_channel_state.profiler->is_profiling()) { | ||||
| m_channel_state.profiler->record_host<TensorEraseEvent>(ptr->id); | m_channel_state.profiler->record_host<TensorEraseEvent>(ptr->id); | ||||
| } | } | ||||
| if (ptr->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) { | |||||
| if (ptr->size_exceeds_thd(m_worker_state.options.dtr_evictee_minimum_size)) { | |||||
| m_dtr.erase_candidate(ptr); | m_dtr.erase_candidate(ptr); | ||||
| } | } | ||||
| detach_users(ptr); | detach_users(ptr); | ||||
| @@ -487,7 +487,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr, bool notice=tr | |||||
| dest->memory = ptr->blob()->size(); | dest->memory = ptr->blob()->size(); | ||||
| dest->ptr = std::move(ptr); | dest->ptr = std::move(ptr); | ||||
| dest->evict_type = EvictType::NONE; | dest->evict_type = EvictType::NONE; | ||||
| if (notice && dest->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) { | |||||
| if (notice && dest->size_exceeds_thd(m_worker_state.options.dtr_evictee_minimum_size)) { | |||||
| m_dtr.insert_candidate(dest); | m_dtr.insert_candidate(dest); | ||||
| } | } | ||||
| if (notice && m_waitee == dest) { | if (notice && m_waitee == dest) { | ||||
| @@ -519,7 +519,7 @@ void ChannelImpl::recompute(TensorInfo::ComputePath* path) { | |||||
| inputs.push_back(i->ptr); | inputs.push_back(i->ptr); | ||||
| m_dtr.update_used_time(i); | m_dtr.update_used_time(i); | ||||
| } | } | ||||
| if (m_worker_state.options.enable_auto_drop && m_worker_state.options.memory_budget > 0) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop && m_worker_state.options.dtr_eviction_threshold > 0) { | |||||
| auto_evict(); | auto_evict(); | ||||
| } | } | ||||
| auto outputs = OpDef::apply_on_physical_tensor(*path->op, inputs); | auto outputs = OpDef::apply_on_physical_tensor(*path->op, inputs); | ||||
| @@ -531,7 +531,7 @@ void ChannelImpl::recompute(TensorInfo::ComputePath* path) { | |||||
| o->recompute_times ++; | o->recompute_times ++; | ||||
| if (!o->ptr) { | if (!o->ptr) { | ||||
| produce_tensor(o, std::move(outputs[i]), false); | produce_tensor(o, std::move(outputs[i]), false); | ||||
| if (m_worker_state.options.enable_auto_drop) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop) { | |||||
| m_dtr.update_dsu_after_recompute(o); | m_dtr.update_dsu_after_recompute(o); | ||||
| } | } | ||||
| } | } | ||||
| @@ -544,7 +544,7 @@ void ChannelImpl::auto_evict() { | |||||
| return; | return; | ||||
| } | } | ||||
| size_t current_memory = m_dtr.comp_node.get_used_memory(); | size_t current_memory = m_dtr.comp_node.get_used_memory(); | ||||
| while (current_memory > m_worker_state.options.memory_budget) { | |||||
| while (current_memory > m_worker_state.options.dtr_eviction_threshold) { | |||||
| auto best = m_dtr.find_best_tensor(); | auto best = m_dtr.find_best_tensor(); | ||||
| if (!best) { | if (!best) { | ||||
| if (!m_dtr.warn_printed) { | if (!m_dtr.warn_printed) { | ||||
| @@ -642,7 +642,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| uint64_t apply_id = ++m_last_id; | uint64_t apply_id = ++m_last_id; | ||||
| SmallVector<TensorPtr> tensor_inputs; | SmallVector<TensorPtr> tensor_inputs; | ||||
| SmallVector<CompNode> devices; | SmallVector<CompNode> devices; | ||||
| if (m_worker_state.options.enable_auto_drop) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop) { | |||||
| m_dtr.pin(cmd.inputs); | m_dtr.pin(cmd.inputs); | ||||
| } | } | ||||
| for (auto i : cmd.inputs) { | for (auto i : cmd.inputs) { | ||||
| @@ -696,7 +696,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| m_worker_state.profiler->record_device<DeviceOpExecuteEvent>(device, event_data); | m_worker_state.profiler->record_device<DeviceOpExecuteEvent>(device, event_data); | ||||
| } | } | ||||
| } | } | ||||
| if (m_worker_state.options.enable_auto_drop && m_worker_state.options.memory_budget > 0) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop && m_worker_state.options.dtr_eviction_threshold > 0) { | |||||
| auto_evict(); | auto_evict(); | ||||
| } | } | ||||
| // Apply op | // Apply op | ||||
| @@ -712,7 +712,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| } | } | ||||
| // End profiling operator | // End profiling operator | ||||
| double estimate_compute_time = 0; | double estimate_compute_time = 0; | ||||
| if (m_worker_state.options.enable_auto_drop) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop) { | |||||
| for (auto i : cmd.inputs) { | for (auto i : cmd.inputs) { | ||||
| estimate_compute_time += i->memory; | estimate_compute_time += i->memory; | ||||
| } | } | ||||
| @@ -735,7 +735,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| continue; | continue; | ||||
| } | } | ||||
| produce_tensor(cmd.outputs[i], std::move(tensor_outputs[i])); | produce_tensor(cmd.outputs[i], std::move(tensor_outputs[i])); | ||||
| if (m_worker_state.options.enable_auto_drop) { | |||||
| if (m_worker_state.options.enable_dtr_auto_drop) { | |||||
| cmd.outputs[i]->dsu_ptr = std::make_shared<DsuNode>(estimate_compute_time); | cmd.outputs[i]->dsu_ptr = std::make_shared<DsuNode>(estimate_compute_time); | ||||
| } | } | ||||
| } | } | ||||
| @@ -774,7 +774,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
| TensorInfo::ComputePath::make(cmd.op, cmd.inputs, cmd.outputs); | TensorInfo::ComputePath::make(cmd.op, cmd.inputs, cmd.outputs); | ||||
| size_t detach_cnt = 0; | size_t detach_cnt = 0; | ||||
| for (auto output : cmd.outputs) { | for (auto output : cmd.outputs) { | ||||
| if (!output->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) { | |||||
| if (!output->size_exceeds_thd(m_worker_state.options.dtr_evictee_minimum_size)) { | |||||
| output->detach_producer(); | output->detach_producer(); | ||||
| detach_cnt ++; | detach_cnt ++; | ||||
| } | } | ||||
| @@ -39,10 +39,10 @@ public: | |||||
| "set command buffer length."); | "set command buffer length."); | ||||
| DEF_OPTION(enable_host_compute, "MEGENGINE_HOST_COMPUTE", 1, | DEF_OPTION(enable_host_compute, "MEGENGINE_HOST_COMPUTE", 1, | ||||
| "enable host compute, thus computation may be done in host event if it's device is gpu."); | "enable host compute, thus computation may be done in host event if it's device is gpu."); | ||||
| DEF_OPTION(enable_auto_drop, "MEGENGINE_AUTO_DROP", 0, ""); | |||||
| DEF_OPTION(memory_budget, "MEGENGINE_MEMORY_BUDGET", 0, | |||||
| DEF_OPTION(enable_dtr_auto_drop, "MEGENGINE_DTR_AUTO_DROP", 0, ""); | |||||
| DEF_OPTION(dtr_eviction_threshold, "MEGENGINE_DTR_EVICTION_THRESHOLD", 0, | |||||
| "auto drop will start whenever gpu memory usage exceeds this value."); | "auto drop will start whenever gpu memory usage exceeds this value."); | ||||
| DEF_OPTION(tensor_lowerbound, "MEGENGINE_TENSOR_LOWERBOUND", 1048576, | |||||
| DEF_OPTION(dtr_evictee_minimum_size, "MEGENGINE_DTR_EVICTEE_MINIMUM_SIZE", 1048576, | |||||
| "the minimum memory value of a tensor added to the candidate set"); | "the minimum memory value of a tensor added to the candidate set"); | ||||
| DEF_OPTION(record_computing_path, "MEGENGINE_RECORD_COMPUTING_PATH", 0, ""); | DEF_OPTION(record_computing_path, "MEGENGINE_RECORD_COMPUTING_PATH", 0, ""); | ||||