GitOrigin-RevId: d56f4ebf1f
tags/v0.4.0
@@ -836,9 +836,10 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 {
     auto type = cn_impl->env().property().type;
     mgb_throw_if(type != CompNode::DeviceType::CPU
+                    && type != CompNode::DeviceType::CUDA
             ,
             MegBrainError,
-            "currently CPU can only wait for CPU"
+            "currently CPU can only wait for CPU, CUDA"
             );
 }
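
The relaxed check lets an event recorded on a CPU comp node be device-waited by a CUDA comp node, which the new CpuCudaD2DCopy test at the end of this diff relies on. A minimal sketch of the pattern this enables (comp node names follow the tests below; illustration only):

    auto cn_cpu = CompNode::load("cpu0"), cn_gpu = CompNode::load("gpu0");
    auto event = cn_cpu.create_event();   // event owned by the CPU comp node
    event->record();                      // marks completion of queued CPU work
    cn_gpu.device_wait_event(*event);     // CUDA comp node now accepts the CPU event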
@@ -40,6 +40,16 @@ namespace {
         return std::max<size_t>(300 * 1024 * 1024, available / 20);
     }
 }
+using CudaHostFunc = megdnn::thin_function<void()>;
+void CUDART_CB cuda_host_func_caller(void* ud) {
+    mgb_assert(ud);
+    CudaHostFunc* func_ptr = reinterpret_cast<CudaHostFunc*>(ud);
+    MGB_TRY {
+        (*func_ptr)();
+    } MGB_FINALLY(
+        delete func_ptr;
+    );
+}
 } // anonymous namespace

 namespace mgb {
@@ -223,6 +233,18 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {
         Locator locator_logical() override {
             return m_locator_logical;
         }
+
+        void add_callback(CudaHostFunc&& cb) override {
+            activate();
+            CudaHostFunc* func_ptr = new CudaHostFunc(std::move(cb));
+            MGB_TRY {
+                MGB_CUDA_CHECK(cudaLaunchHostFunc(m_env.cuda_env().stream,
+                        cuda_host_func_caller, static_cast<void*>(func_ptr)));
+            } MGB_CATCH(..., {
+                delete func_ptr;
+                throw;
+            });
+        }
 };

 MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl);
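
add_callback() enqueues an arbitrary host function onto the comp node's CUDA stream: the closure is moved into a heap-allocated CudaHostFunc so it survives the trip through the C callback boundary, cuda_host_func_caller invokes and then deletes it, and the error path in add_callback deletes it if the launch itself fails. A self-contained sketch of the same pattern against the plain CUDA runtime (requires CUDA 10.0+, where cudaLaunchHostFunc exists; the names here are illustrative, not MegBrain's):

    #include <cuda_runtime.h>
    #include <functional>

    using HostFunc = std::function<void()>;

    static void CUDART_CB host_func_caller(void* ud) {
        auto* fn = static_cast<HostFunc*>(ud);
        (*fn)();    // runs on a CUDA-internal thread after prior stream work completes
        delete fn;  // the callback owns the closure, mirroring MGB_FINALLY above
    }

    void enqueue_host_callback(cudaStream_t stream, HostFunc fn) {
        auto* ptr = new HostFunc(std::move(fn));
        if (cudaLaunchHostFunc(stream, host_func_caller, ptr) != cudaSuccess) {
            delete ptr;  // launch failed: the runtime will never invoke the caller
        }
    }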
@@ -28,15 +28,32 @@ namespace {
     //! implement non-contiguous d2d copy
     void noncont_tensor_copy(
-            const DeviceTensorND &dest, const DeviceTensorND &src, bool, bool) {
-        auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            const DeviceTensorND &dest, const DeviceTensorND &src,
+            bool contig_dest, bool contig_src) {
+        auto src_cn = src.comp_node();
         auto dst_cn = dest.comp_node();
-        auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
-                dst_cn);
-        dst_cn.activate();
-        relayout->exec(
-                const_cast<DeviceTensorND&>(src).as_megdnn(),
-                dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        if (src_cn.device_type() == dst_cn.device_type()) {
+            // perform relayout op for better performance when src and dst are
+            // placed on comp nodes with the same device type
+            auto &&src_env = CompNodeEnv::from_comp_node(src.comp_node());
+            auto relayout = opr::intl::get_megdnn_global_opr<megdnn::Relayout>(
+                    dst_cn);
+            dst_cn.activate();
+            relayout->exec(
+                    const_cast<DeviceTensorND&>(src).as_megdnn(),
+                    dest.as_megdnn(), MegDNNHandle::get(src_env).handle());
+        } else {
+            if (contig_src) {
+                mgb_assert(!contig_dest);
+                DeviceTensorND tmp{dst_cn};
+                tmp.copy_from(src);
+                dest.copy_from_fixlayout(tmp);
+                return;
+            }
+            DeviceTensorND tmp;
+            tmp.copy_from(src);
+            dest.copy_from_fixlayout(tmp);
+        }
     }

     //! implement non-contiguous h2h copy
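
With this change, a non-contiguous copy between comp nodes of different device types no longer goes through a single Relayout (which cannot read from one device and write to another): it is staged through a contiguous temporary, so the actual cross-device transfer is a plain contiguous copy and every relayout runs within a single device. A usage-level sketch that exercises this path, loosely mirroring the CpuCudaD2DCopy test at the end of this diff (shapes and names are illustrative only):

    auto cn_cpu = CompNode::load("cpu0"), cn_gpu = CompNode::load("gpu0");
    DeviceTensorND dev_a{cn_cpu, {300}, dtype::Float32()};
    DeviceTensorND dev_b{cn_gpu, {300}, dtype::Float32()};
    auto spec = Slice(0, 300, 3).apply(dev_a.layout(), 0);
    auto view_a = dev_a.sub(spec);       // non-contiguous CPU view
    auto view_b = dev_b.sub(spec);       // non-contiguous CUDA view
    view_b.copy_from_fixlayout(view_a);  // staged: contiguous tmp, then plain d2d copy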
@@ -346,7 +363,28 @@ template<> template<>
 void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<DeviceTensorStorageTrait> &src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    if (src.comp_node().device_type() == CompNode::DeviceType::CPU &&
+            comp_node().device_type() == CompNode::DeviceType::CUDA) {
+        // the current thread (i.e. the cuda dispatcher thread) should wait for
+        // all operations on src's comp_node to finish; otherwise a race
+        // condition might occur between the worker thread of src's comp_node
+        // and the thread responsible for copying pageable memory in \p src to
+        // a pinned buffer, refer to
+        // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+        //
+        // Note: it is highly recommended to copy tensors from cpu to cuda with
+        // asynchronous dispatching (see graph option async_exec_level);
+        // otherwise the main thread might be blocked by the worker thread
+        // corresponding to src's comp_node, resulting in bad performance
+        //
+        // TODO: consider using cudaMallocHost or cudaHostRegister to pin the
+        // memory of the src tensor, so that no synchronization is required and
+        // the copy is more efficient
+        src.comp_node().sync();
+        comp_node().copy_to_device(ptr(), src.ptr(), size);
+    } else {
+        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+    }
 }
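
The TODO in this hunk points at how the sync() could eventually be dropped: if the host buffer is page-locked, an async H2D copy no longer goes through the runtime's internal pageable-memory staging, which is the source of the race described in the comment. A sketch of that idea with the plain CUDA runtime, purely as an illustration of the TODO and not part of this change (error handling omitted, function name is hypothetical):

    #include <cuda_runtime.h>

    void pinned_h2d_copy(void* dst_dev, void* src_host, size_t size,
                         cudaStream_t stream) {
        // page-lock the existing host allocation so the copy below is truly async
        cudaHostRegister(src_host, size, cudaHostRegisterDefault);
        cudaMemcpyAsync(dst_dev, src_host, size, cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);  // only needed because we unregister right away
        cudaHostUnregister(src_host);
    }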
@@ -1733,22 +1733,25 @@ TEST(TestGraph, UpdateStaticAllocPlan) {
 TEST(TestGraph, CPUGPUHybrid) {
     REQUIRE_GPU(1);
-    auto cn_cpu = CompNode::load("cpu:default"),
-         cn_gpu = CompNode::load("gpu0");
-    auto graph = ComputingGraph::make();
-    HostTensorGenerator<> gen;
-    auto host_x = gen({42});
-    auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
-         y = x * 2,
-         z = opr::Copy::make(y, cn_gpu) + 1;
-    HostTensorND host_z;
-    auto func = graph->compile({make_callback_copy(z, host_z)});
-    func->execute();
-    for (size_t i = 0; i < 42; ++ i) {
-        MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2 + 1,
-                host_z.ptr<float>()[i]);
+    auto cn_gpu = CompNode::load("gpu0");
+    for (auto&& cn_cpu : {CompNode::load("cpu0"), CompNode::default_cpu()}) {
+        auto graph = ComputingGraph::make();
+        HostTensorGenerator<> gen;
+        constexpr size_t length = 23333;
+        auto host_x = gen({length});
+        graph->options().var_sanity_check_first_run = false;
+        auto x = opr::Host2DeviceCopy::make(*graph, host_x, {cn_cpu}),
+             y = opr::Sleep::make(x, 0.5) * 2,
+             z_gpu = opr::Copy::make(y, cn_gpu) + 1,
+             z = opr::Copy::make(z_gpu, cn_cpu) * 2;
+        HostTensorND host_z;
+        auto func = graph->compile({make_callback_copy(z, host_z)});
+        func->execute();
+        for (size_t i = 0; i < length; ++ i) {
+            MGB_ASSERT_FLOAT_EQ((host_x->ptr<float>()[i] * 2 + 1) * 2,
+                    host_z.ptr<float>()[i]);
+        }
     }
 }

 TEST(TestGraph, In2OutOpStreamPropagate) {
@@ -11,6 +11,7 @@
 #include "megbrain/test/helper.h"

+#include "megbrain/comp_node_env.h"
 #include "megbrain/tensor.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/utils/timer.h"
@@ -382,4 +383,39 @@ TEST(TestTensor, NegativeIndex) {
     run_negative_index_test<HostTensorND, DeviceTensorND>();
 }

+TEST(TestTensor, CpuCudaD2DCopy) {
+    REQUIRE_GPU(1);
+    auto cn_cpu = CompNode::load("cpu0"),
+         cn_gpu = CompNode::load("gpu0");
+    HostTensorGenerator<> gen;
+    constexpr size_t length = 233333;
+    auto a = gen({length});
+    for (auto config: {true, false}) {
+        DeviceTensorND dev_a{cn_cpu}, dev_b{cn_gpu, a->shape(), a->dtype()};
+        dev_a.copy_from(*a).sync();
+        if (!config) {
+            auto subspec = Slice(0, length, 3).apply(a->layout(), 0);
+            dev_a = dev_a.sub(subspec);
+            dev_b = dev_b.sub(subspec);
+        }
+        auto iadd = [ptr = dev_a.ptr<float>(), length = dev_a.shape()[0],
+                     stride = dev_a.layout().stride[0]]() {
+            for (size_t i = 0; i < length; ++ i) {
+                ptr[i * stride] += 1;
+            }
+        };
+        CompNodeEnv::from_comp_node(cn_cpu).cpu_env().dispatch(iadd);
+        auto event = cn_cpu.create_event();
+        event->record();
+        cn_gpu.device_wait_event(*event);
+        dev_b.copy_from_fixlayout(dev_a);
+        HostTensorND res;
+        res.copy_from(dev_b).sync();
+        MGB_ASSERT_TENSOR_EQ(HostTensorND::make_proxy(dev_a), res);
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}