GitOrigin-RevId: d56f4ebf1f
tags/v0.4.0
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 {
     auto type = cn_impl->env().property().type;
     mgb_throw_if(type != CompNode::DeviceType::CPU
+            && type != CompNode::DeviceType::CUDA
             ,
             MegBrainError,
-            "currently CPU can only wait for CPU"
+            "currently CPU can only wait for CPU, CUDA"
             );
 }
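The hunk above relaxes the check in do_device_wait_by so a CPU comp node may also wait on a CUDA event. For orientation only, a minimal sketch of how a host thread blocks on work queued to a CUDA stream, using the plain CUDA runtime API (the function name host_wait_for_stream is illustrative, not MegBrain's EventImpl):

    #include <cuda_runtime.h>

    // Record an event at the current tail of `stream`, then block the calling
    // host thread until everything queued before the event has finished.
    void host_wait_for_stream(cudaStream_t stream) {
        cudaEvent_t done;
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaEventRecord(done, stream);
        cudaEventSynchronize(done);   // host-side blocking wait
        cudaEventDestroy(done);
    }

In MegBrain the event object and the wait are routed through the comp node abstraction, so the CPU side can end up doing a blocking wait along these lines.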
@@ -40,6 +40,16 @@ namespace {
         return std::max<size_t>(300 * 1024 * 1024, available / 20);
     }
 }
+using CudaHostFunc = megdnn::thin_function<void()>;
+void CUDART_CB cuda_host_func_caller(void* ud) {
+    mgb_assert(ud);
+    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+    MGB_TRY {
+        (*func_ptr)();
+    } MGB_FINALLY(
+        delete func_ptr;
+    );
+}
 } // anonymous namespace
 namespace mgb {
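The helper added above is the usual trampoline for cudaLaunchHostFunc: the functor is heap-allocated at enqueue time, handed to the runtime as a void*, and freed inside the callback. A simplified standalone sketch of the same ownership pattern with plain std::function (names here are illustrative; error handling is reduced to the minimum):

    #include <cstdio>
    #include <functional>
    #include <cuda_runtime.h>

    using HostFunc = std::function<void()>;

    // Trampoline matching cudaHostFn_t: run the functor, then free it so the
    // allocation made at enqueue time does not leak.
    void CUDART_CB host_func_caller(void* user_data) {
        auto* func = static_cast<HostFunc*>(user_data);
        (*func)();
        delete func;
    }

    void enqueue_host_callback(cudaStream_t stream, HostFunc cb) {
        auto* func = new HostFunc(std::move(cb));
        // Requires CUDA >= 10.0; the callback runs on a runtime-managed thread
        // once all work previously queued on `stream` has completed.
        if (cudaLaunchHostFunc(stream, host_func_caller, func) != cudaSuccess)
            delete func;   // enqueue failed: reclaim ownership
    }

    int main() {
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        enqueue_host_callback(stream, [] { std::printf("stream drained\n"); });
        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
    }

MGB_FINALLY in the patch plays the role of the unconditional delete, so the functor is released even if the callback throws.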
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                            cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
@@ -28,15 +28,32 @@ namespace {
 //! implement non-contiguous d2d copy
 void noncont_tensor_copy(
-        const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-    auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        const DeviceTensorND &dest, const DeviceTensorND &src,
+        bool contig_dest, bool contig_src) {
+    auto src_cn = src.comp_node();
     auto dst_cn = dest.comp_node();
-    auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-            dst_cn);
-    dst_cn.activate();
-    relayout->exec(
-            const_cast<DeviceTensorND&>(src).as_megdnn(),
-            dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    if (src_cn.device_type() == dst_cn.device_type()) {
+        // perform relayout op for better performance when src and dst are
+        // placed on comp nodes with the same device type
+        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                dst_cn);
+        dst_cn.activate();
+        relayout->exec(
+                const_cast<DeviceTensorND&>(src).as_megdnn(),
+                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+    } else {
+        if (contig_src) {
+            mgb_assert(!contig_dest);
+            DeviceTensorND tmp{dst_cn};
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+            return;
+        }
+        DeviceTensorND tmp;
+        tmp.copy_from(src);
+        dest.copy_from_fixlayout(tmp);
+    }
 }
 //! implement non-contiguous h2h copy
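The new else branch handles copies between comp nodes of different device types by staging through a contiguous temporary: make one side contiguous locally, do a flat cross-device copy, then relayout locally at the destination. A rough standalone analogue with the raw CUDA runtime for a strided host source and a contiguous device destination (the helper name and float dtype are assumptions for illustration, not the MegBrain code path):

    #include <cstddef>
    #include <vector>
    #include <cuda_runtime.h>

    // Copy every `stride`-th float of a host buffer into a contiguous device
    // buffer: pack into a contiguous host temporary first (the local
    // "relayout"), then issue one flat host-to-device memcpy.
    void strided_h2d_copy(float* dev_dst, const float* host_src,
                          size_t n_elems, size_t stride) {
        std::vector<float> tmp(n_elems);          // contiguous staging buffer
        for (size_t i = 0; i < n_elems; ++i)
            tmp[i] = host_src[i * stride];
        cudaMemcpy(dev_dst, tmp.data(), n_elems * sizeof(float),
                   cudaMemcpyHostToDevice);
    }

In the patch, tmp.copy_from(src) produces the contiguous intermediate and dest.copy_from_fixlayout(tmp) writes it into the strided destination; when the source is already contiguous, the temporary is placed directly on the destination comp node.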
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait for
+        // all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread responsible for copying the pageable memory in \p src
+        // to a pinned buffer, refer to
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda with
+        // asynchronous dispatching (see graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread of
+        // src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so that no synchronization is required and
+        // the copy is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }
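The explicit sync exists because a copy from pageable host memory is staged by the CUDA runtime through an internal pinned buffer, so the source must be quiescent when the copy is issued (see the linked api-sync-behavior page). Below is a sketch of the alternative mentioned in the TODO, where the host-side buffer is page-locked so the staging step, and hence this particular race window, disappears; names are illustrative:

    #include <cuda_runtime.h>

    // Page-locked host allocation: the DMA engine can read it directly, so a
    // later cudaMemcpyAsync from this buffer needs no host-side staging copy.
    float* alloc_pinned_buffer(size_t n_floats) {
        void* p = nullptr;
        cudaMallocHost(&p, n_floats * sizeof(float));
        return static_cast<float*>(p);            // release with cudaFreeHost
    }

    void h2d_copy_pinned(float* dev_dst, const float* pinned_src, size_t bytes,
                         cudaStream_t stream) {
        // Asynchronous with respect to the calling thread; ordering with a
        // CPU-side producer still has to be established (e.g. via an event)
        // before the stream reaches this copy.
        cudaMemcpyAsync(dev_dst, pinned_src, bytes,
                        cudaMemcpyHostToDevice, stream);
    }

The patch takes the simpler route of src.comp_node().sync() followed by copy_to_device, at the cost of blocking the calling thread until the CPU worker has drained, which is what the note about async_exec_level warns about.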
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                    host_z.ptr<float>()[i]);
+        }
     }
 }
 TEST(TestGraph, In2OutOpStreamPropagate) {
@@ -11,6 +11,7 @@
 #include "megbrain/test/helper.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
+#include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test<HostTensorND, DeviceTensorND>();
 }
+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}