|
|
|
@@ -19,6 +19,12 @@ |
|
|
|
#include "graph/utils/tensor_utils.h" |
|
|
|
#include "graph/utils/type_utils.h" |
|
|
|
#include "graph/ge_context.h" |
|
|
|
#include "graph/types.h" |
|
|
|
#include "graph/debug/ge_attr_define.h" |
|
|
|
#include "graph/manager/graph_caching_allocator.h" |
|
|
|
#include "graph/manager/graph_mem_allocator.h" |
|
|
|
#include "graph/manager/rdma_pool_allocator.h" |
|
|
|
#include "graph/manager/host_mem_allocator.h" |
|
|
|
|
|
|
|
namespace ge { |
|
|
|
namespace hybrid { |
|
|
|
@@ -27,6 +33,7 @@ const int kDataOutputIndex = 0; |
|
|
|
// Minimum number of stages required for pipelined execution — presumably a
// lower bound checked elsewhere; usage not visible in this chunk (TODO confirm).
// NOTE(review): identifier has a typo ("Pipline" -> "Pipeline"); renaming would
// break references outside this chunk, so it is left as-is here.
const size_t kMinimumPiplineStages = 2; |
|
|
|
// Default iteration count — consumer not visible in this chunk (TODO confirm).
const int kDefaultLoopCount = 10; |
|
|
|
// Byte alignment for host-side output buffers (used as the AlignedPtr
// alignment when copying device outputs back to host).
const size_t kAlignment = 64; |
|
|
|
// Execute-mode sentinel: when the run is in "lazy_recompile" mode the output
// copy path hands device memory over to the caller instead of memcpy'ing to host.
const char *const kLazyRecompile = "lazy_recompile"; |
|
|
|
} |
|
|
|
HybridModelAsyncExecutor::HybridModelAsyncExecutor(HybridModel *model) |
|
|
|
: model_(model), run_flag_(false), data_dumper_(nullptr) { |
|
|
|
@@ -442,17 +449,39 @@ Status HybridModelAsyncExecutor::CopyOutputs(HybridModelExecutor::ExecuteArgs &a |
|
|
|
// Build the output GeTensor from the runtime shape of this output.
ge_tensor_desc.SetShape(ge_shape); |
|
|
|
GeTensor ge_tensor(ge_tensor_desc); |
|
|
|
// NOTE(review): this SOURCE is a diff chunk with +/- markers stripped; the
// branch below appears to be the REMOVED (pre-change) copy path — the same
// logic reappears further down guarded by `execute_mode != kLazyRecompile`.
// Confirm against the original patch before treating both copies as live code.
if (output_size > 0) { |
|
|
|
// Allocate a 64-byte-aligned host buffer to receive the device output.
auto aligned_ptr = MakeShared<AlignedPtr>(output_size, kAlignment); |
|
|
|
GE_CHECK_NOTNULL(aligned_ptr); |
|
|
|
auto data_buf = aligned_ptr->MutableGet(); |
|
|
|
GE_CHECK_NOTNULL(data_buf); |
|
|
|
// Copy device -> host; GE_CHK_RT_RET propagates any runtime failure.
GE_CHK_RT_RET(rtMemcpy(data_buf, output_size, output_tensor.GetData(), output_size, RT_MEMCPY_DEVICE_TO_HOST)); |
|
|
|
// Tensor shares ownership of the aligned buffer; the blob (`false`) holds a
// non-owning view of the same memory — presumably the bool flags ownership
// (TODO confirm blob constructor semantics).
ge_tensor.SetData(aligned_ptr, output_size); |
|
|
|
output_data->blobs.emplace_back(data_buf, static_cast<uint32_t>(output_size), false); |
|
|
|
} else { |
|
|
|
// Empty output: warn and emit an empty tensor/blob so output indices stay aligned.
GELOGW("Output[%zu] is empty. shape = [%s]", i, tensor_desc->GetShape().ToString().c_str()); |
|
|
|
ge_tensor.SetData(nullptr, 0U); |
|
|
|
output_data->blobs.emplace_back(nullptr, 0U, false); |
|
|
|
// Normal (non-lazy-recompile) path: materialize the device output on the host.
if (execute_mode != kLazyRecompile) { |
|
|
|
// 64-byte-aligned host destination buffer for the device->host copy.
auto aligned_ptr = MakeShared<AlignedPtr>(output_size, kAlignment); |
|
|
|
GE_CHECK_NOTNULL(aligned_ptr); |
|
|
|
auto data_buf = aligned_ptr->MutableGet(); |
|
|
|
GE_CHECK_NOTNULL(data_buf); |
|
|
|
// Synchronous device->host copy; returns on runtime error via GE_CHK_RT_RET.
GE_CHK_RT_RET(rtMemcpy(data_buf, output_size, output_tensor.GetData(), output_size, RT_MEMCPY_DEVICE_TO_HOST)); |
|
|
|
// ge_tensor co-owns the aligned buffer; the blob exposes the same bytes
// non-owning (`false`) — presumably an "is_owned" flag, TODO confirm.
ge_tensor.SetData(aligned_ptr, output_size); |
|
|
|
output_data->blobs.emplace_back(data_buf, static_cast<uint32_t>(output_size), false); |
|
|
|
} else { |
|
|
|
// Lazy-recompile path: instead of copying to host, transfer ownership of the
// device buffer to the output Tensor and free it later via a custom deleter.
auto mem_type = output_tensor.GetMemType(); |
|
|
|
// NOTE(review): `[=]` copies the local `mem_type` but captures `this` by
// pointer to reach the member `device_id_`. If this deleter can outlive the
// executor, `this` dangles — confirm the returned Tensor's lifetime, or
// capture `device_id_` by value explicitly.
auto deleter = [=] (uint8_t *device_data) { |
|
|
|
if (device_data != nullptr) { |
|
|
|
// NOTE(review): logged BEFORE the Free below actually runs — "successfully"
// is premature if Free can fail; consider logging after the call.
GELOGI("Deallocating buffer successfully. addr = %p", device_data); |
|
|
|
// Route the free to the allocator that produced the buffer, keyed on the
// memory type recorded at capture time.
if (mem_type == RDMA_HBM) { |
|
|
|
MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(device_data, device_id_); |
|
|
|
} else if (mem_type == HOST_DDR) { |
|
|
|
MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Free(device_data); |
|
|
|
} else { |
|
|
|
// Default: caching allocator (plain HBM allocations).
MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Free(device_data, device_id_); |
|
|
|
} |
|
|
|
} |
|
|
|
}; |
|
|
|
auto tensor = TensorAdapter::AsTensor(ge_tensor); |
|
|
|
// Mark the tensor as holding device memory so consumers don't treat it as host data.
tensor.GetTensorDesc().SetPlacement(kPlacementDevice); |
|
|
|
// NOTE(review): BUG SUSPECT — `output_tensor.Release()` is called here AND
// again in the emplace_back below. If Release() transfers ownership (the usual
// contract), the second call most likely returns nullptr or a stale pointer,
// so the blob would not reference the real device buffer. Hoist a single
// `auto *device_data = output_tensor.Release();` and use it in both places.
// Also: missing space after the comma before `static_cast` (style), and the
// size is cast to size_t here but uint32_t below — confirm no truncation.
tensor.SetData(reinterpret_cast<uint8_t *>(output_tensor.Release()),static_cast<size_t>(output_size), deleter); |
|
|
|
// Trailing `1` argument differs from the other blob emplacements — presumably
// a placement/device flag; TODO confirm against the blob constructor.
output_data->blobs.emplace_back(output_tensor.Release(), static_cast<uint32_t>(output_size), false, 1); |
|
|
|
outputs.emplace_back(std::move(tensor)); |
|
|
|
// Device path is fully handled; skip the common host-tensor exit below.
continue; |
|
|
|
} |
|
|
|
} else { |
|
|
|
// Empty output: warn and emit an empty tensor/blob so output indices stay aligned.
// NOTE(review): message spacing ("Output [%zu]") differs from the sibling
// empty-output log ("Output[%zu]") earlier in this chunk — unify if both are live.
GELOGW("Output [%zu] is empty. shape = [%s]", i, tensor_desc->GetShape().ToString().c_str()); |
|
|
|
ge_tensor.SetData(nullptr, 0U); |
|
|
|
output_data->blobs.emplace_back(nullptr, 0U, false); |
|
|
|
} |
|
|
|
// Common exit for host-copied and empty outputs (the device/lazy path
// `continue`s before reaching here): wrap the GeTensor and hand it out.
auto tensor = TensorAdapter::AsTensor(ge_tensor); |
|
|
|
outputs.emplace_back(std::move(tensor)); |
|
|
|
|