diff --git a/ge/hybrid/common/tensor_value.h b/ge/hybrid/common/tensor_value.h
index 19e1ba27..348e4e6d 100644
--- a/ge/hybrid/common/tensor_value.h
+++ b/ge/hybrid/common/tensor_value.h
@@ -40,6 +40,14 @@ class TensorBuffer {
   TensorBuffer &operator = (const TensorBuffer &) = delete;
   ~TensorBuffer();
 
+  // Transfers ownership of the underlying buffer to the caller. After this
+  // call the TensorBuffer no longer owns (and will not free) the memory.
+  void *Release() {
+    auto ret = buffer_;
+    buffer_ = nullptr;
+    return ret;
+  }
+
   void *GetData() {
     return buffer_;
   }
@@ -48,5 +56,10 @@ class TensorBuffer {
     return size_;
   }
 
+  // Storage class (e.g. HBM / RDMA_HBM / HOST_DDR) of the underlying buffer.
+  MemStorageType GetMemType() const {
+    return mem_type_;
+  }
+
 private:
   TensorBuffer(NpuMemoryAllocator *allocator, void *buffer, size_t size, MemStorageType mem_type = HBM);
@@ -69,5 +82,11 @@ class TensorValue {
   void Destroy();
 
+  // Transfers ownership of the owned buffer to the caller; returns nullptr
+  // when this value wraps external memory (ref_buffer_) and owns no buffer.
+  void *Release() {
+    return buffer_ == nullptr ? nullptr : buffer_->Release();
+  }
+
   bool IsEmpty() {
     return ref_buffer_ == nullptr && buffer_ == nullptr;
   }
@@ -80,5 +99,10 @@ class TensorValue {
   void SetName(const std::string &name) {
     name_ = name;
   }
+
+  // Storage class of the owned buffer; defaults to HBM when nothing is owned.
+  MemStorageType GetMemType() const {
+    return buffer_ == nullptr ? HBM : buffer_->GetMemType();
+  }
 
   void *MutableData();
diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc
index af06e27b..68786822 100644
--- a/ge/hybrid/executor/hybrid_model_async_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_async_executor.cc
@@ -19,6 +19,12 @@
 #include "graph/utils/tensor_utils.h"
 #include "graph/utils/type_utils.h"
 #include "graph/ge_context.h"
+#include "graph/types.h"
+#include "graph/debug/ge_attr_define.h"
+#include "graph/manager/graph_caching_allocator.h"
+#include "graph/manager/graph_mem_allocator.h"
+#include "graph/manager/rdma_pool_allocator.h"
+#include "graph/manager/host_mem_allocator.h"
 
 namespace ge {
 namespace hybrid {
@@ -27,6 +33,7 @@ const int kDataOutputIndex = 0;
 const size_t kMinimumPiplineStages = 2;
 const int kDefaultLoopCount = 10;
 const size_t kAlignment = 64;
+const char *const kLazyRecompile = "lazy_recompile";
 }
 
 HybridModelAsyncExecutor::HybridModelAsyncExecutor(HybridModel *model) :
@@ -442,17 +449,48 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a
     ge_tensor_desc.SetShape(ge_shape);
     GeTensor ge_tensor(ge_tensor_desc);
     if (output_size > 0) {
-      auto aligned_ptr = MakeShared<AlignedPtr>(output_size, kAlignment);
-      GE_CHECK_NOTNULL(aligned_ptr);
-      auto data_buf = aligned_ptr->MutableGet();
-      GE_CHECK_NOTNULL(data_buf);
-      GE_CHK_RT_RET(rtMemcpy(data_buf, output_size, output_tensor.GetData(), output_size, RT_MEMCPY_DEVICE_TO_HOST));
-      ge_tensor.SetData(aligned_ptr, output_size);
-      output_data->blobs.emplace_back(data_buf, static_cast<uint32_t>(output_size), false);
-    } else {
-      GELOGW("Output[%zu] is empty. shape = [%s]", i, tensor_desc->GetShape().ToString().c_str());
-      ge_tensor.SetData(nullptr, 0U);
-      output_data->blobs.emplace_back(nullptr, 0U, false);
+      if (execute_mode != kLazyRecompile) {
+        // Default path: copy the device output into a host-side aligned buffer.
+        auto aligned_ptr = MakeShared<AlignedPtr>(output_size, kAlignment);
+        GE_CHECK_NOTNULL(aligned_ptr);
+        auto data_buf = aligned_ptr->MutableGet();
+        GE_CHECK_NOTNULL(data_buf);
+        GE_CHK_RT_RET(rtMemcpy(data_buf, output_size, output_tensor.GetData(), output_size, RT_MEMCPY_DEVICE_TO_HOST));
+        ge_tensor.SetData(aligned_ptr, output_size);
+        output_data->blobs.emplace_back(data_buf, static_cast<uint32_t>(output_size), false);
+      } else {
+        // Lazy-recompile path: hand the device buffer itself to the output tensor
+        // (zero copy) and return it to the allocator it came from on destruction.
+        auto mem_type = output_tensor.GetMemType();
+        // Capture device_id_ by value: a [=] capture would capture `this`, which
+        // may be destroyed before the tensor (and hence the deleter) is released.
+        auto device_id = device_id_;
+        auto deleter = [mem_type, device_id](uint8_t *device_data) {
+          if (device_data != nullptr) {
+            GELOGI("Deallocating buffer successfully. addr = %p", device_data);
+            if (mem_type == RDMA_HBM) {
+              MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(device_data, device_id);
+            } else if (mem_type == HOST_DDR) {
+              MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Free(device_data);
+            } else {
+              MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Free(device_data, device_id);
+            }
+          }
+        };
+        auto tensor = TensorAdapter::AsTensor(ge_tensor);
+        tensor.GetTensorDesc().SetPlacement(kPlacementDevice);
+        // Release ownership exactly once and reuse the pointer for both the tensor
+        // and the blob; calling Release() twice would make the blob point to null.
+        auto device_data = reinterpret_cast<uint8_t *>(output_tensor.Release());
+        tensor.SetData(device_data, static_cast<size_t>(output_size), deleter);
+        output_data->blobs.emplace_back(device_data, static_cast<uint32_t>(output_size), false, 1);
+        outputs.emplace_back(std::move(tensor));
+        continue;
+      }
+    } else {
+      GELOGW("Output [%zu] is empty. shape = [%s]", i, tensor_desc->GetShape().ToString().c_str());
+      ge_tensor.SetData(nullptr, 0U);
+      output_data->blobs.emplace_back(nullptr, 0U, false);
     }
     auto tensor = TensorAdapter::AsTensor(ge_tensor);
     outputs.emplace_back(std::move(tensor));