diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 5bcadee1e..7ea854363 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -409,20 +409,20 @@ void mobilenet_yolo_run(const ncnn::Net& net) ncnn::Mat in(416, 416, 3); ex.input("data", in); -#if NCNN_VULKAN - // TODO chain conv22 conv23 - if (g_use_vulkan_compute) - { - ncnn::Mat conv22; - ex.extract("conv22", conv22); - - ncnn::Mat conv23; - ex.extract("conv23", conv23); - - // use cpu for detection_out - ex.set_vulkan_compute(false); - } -#endif // NCNN_VULKAN +// #if NCNN_VULKAN +// // TODO chain conv22 conv23 +// if (g_use_vulkan_compute) +// { +// ncnn::Mat conv22; +// ex.extract("conv22", conv22); +// +// ncnn::Mat conv23; +// ex.extract("conv23", conv23); +// +// // use cpu for detection_out +// ex.set_vulkan_compute(false); +// } +// #endif // NCNN_VULKAN ncnn::Mat out; ex.extract("detection_out", out); @@ -440,20 +440,20 @@ void mobilenet_yolov3_run(const ncnn::Net& net) ncnn::Mat in(416, 416, 3); ex.input("data", in); -#if NCNN_VULKAN - // TODO chain conv19 conv20 - if (g_use_vulkan_compute) - { - ncnn::Mat conv19; - ex.extract("conv19", conv19); - - ncnn::Mat conv20; - ex.extract("conv20", conv20); - - // use cpu for detection_out - ex.set_vulkan_compute(false); - } -#endif // NCNN_VULKAN +// #if NCNN_VULKAN +// // TODO chain conv19 conv20 +// if (g_use_vulkan_compute) +// { +// ncnn::Mat conv19; +// ex.extract("conv19", conv19); +// +// ncnn::Mat conv20; +// ex.extract("conv20", conv20); +// +// // use cpu for detection_out +// ex.set_vulkan_compute(false); +// } +// #endif // NCNN_VULKAN ncnn::Mat out; ex.extract("detection_out", out); diff --git a/src/layer/permute.cpp b/src/layer/permute.cpp index 33948d39c..f6e00f8df 100644 --- a/src/layer/permute.cpp +++ b/src/layer/permute.cpp @@ -317,8 +317,6 @@ int Permute::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, } } - fprintf(stderr, "%d %d %d\n", top_blob.w, top_blob.h, top_blob.c); - // fprintf(stderr, "Permute::forward %p %p\n", bottom_blob.buffer(), top_blob.buffer()); std::vector bindings(2); diff --git a/src/net.cpp b/src/net.cpp index 9b63cf97a..25d202572 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1145,201 +1145,425 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, Option& opt } #if NCNN_VULKAN -int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& wait_barrier_counts, VkCompute& cmd, Option& opt) const +int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, std::vector& wait_barrier_counts, VkCompute& cmd, Option& opt) const { const Layer* layer = layers[layer_index]; -// fprintf(stderr, "forward_layer %d %s\n", layer_index, layer->name.c_str()); +// fprintf(stderr, "forward_layer %d %d %s\n", layer->support_vulkan, layer_index, layer->name.c_str()); - if (layer->one_blob_only) + if (layer->support_vulkan) { - // load bottom blob - int bottom_blob_index = layer->bottoms[0]; - int top_blob_index = layer->tops[0]; - - if (blob_mats[bottom_blob_index].dims == 0) - { - int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt); - if (ret != 0) - return ret; - } - else if (blob_mats[bottom_blob_index].staging_data) + if (layer->one_blob_only) { - // upload - const VkMat& bottom_blob = blob_mats[bottom_blob_index]; - cmd.record_prepare_transfer_barrier(bottom_blob); - cmd.record_upload(bottom_blob); - } + // load bottom blob + int bottom_blob_index = layer->bottoms[0]; + int top_blob_index = layer->tops[0]; - VkMat bottom_blob = blob_mats[bottom_blob_index]; + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt); + if (ret != 0) + return ret; + } - if (opt.lightmode) - { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + // upload + VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; + bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator); + + if (!bottom_blob.allocator->mappable) + { + bottom_blob.prepare_staging_buffer(); + } - wait_barrier_counts[bottom_blob_index] += layer->tops.size(); + bottom_blob.upload(blob_mats[bottom_blob_index]); - // deep copy for inplace forward if data is shared - if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1) + cmd.record_prepare_transfer_barrier(bottom_blob); + cmd.record_upload(bottom_blob); + + // TODO convert packing +// fprintf(stderr, "upload %d %d %d %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing); + } + } + + VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; + + if (opt.lightmode) { - VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); -// fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer()); + wait_barrier_counts[bottom_blob_index] += layer->tops.size(); + + // deep copy for inplace forward if data is shared + if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1) + { + VkMat bottom_blob_copy; + bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); + +// fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer()); - cmd.record_prepare_transfer_barrier(bottom_blob); - cmd.record_clone(bottom_blob, bottom_blob_copy); - bottom_blob = bottom_blob_copy; + cmd.record_prepare_transfer_barrier(bottom_blob); + cmd.record_clone(bottom_blob, bottom_blob_copy); + bottom_blob = bottom_blob_copy; - wait_barrier_counts[bottom_blob_index]--; + wait_barrier_counts[bottom_blob_index]--; + } } - } - // forward - if (opt.lightmode && layer->support_inplace) - { - VkMat& bottom_top_blob = bottom_blob; - int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); - if (ret != 0) - return ret; + // forward + if (opt.lightmode && layer->support_inplace) + { + VkMat& bottom_top_blob = bottom_blob; + int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); + if (ret != 0) + return ret; - // store top blob - blob_mats[top_blob_index] = bottom_top_blob; + // store top blob + blob_mats_gpu[top_blob_index] = bottom_top_blob; + } + else + { + VkMat top_blob; + int ret = layer->forward(bottom_blob, top_blob, cmd, opt); + if (ret != 0) + return ret; + + // store top blob + blob_mats_gpu[top_blob_index] = top_blob; + } + + if (opt.lightmode) + { + // reclaim producer bottom_blob as free when consuming bottom_blob + const Layer* producer = layers[ blobs[bottom_blob_index].producer ]; + for (size_t i=0; ibottoms.size(); i++) + { + int producer_bottom_blob_index = producer->bottoms[i]; + + wait_barrier_counts[producer_bottom_blob_index]--; + if (wait_barrier_counts[producer_bottom_blob_index] == 0) + { +// fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer()); + + blob_mats_gpu[producer_bottom_blob_index].release(); + } + } + } } else { - VkMat top_blob; - int ret = layer->forward(bottom_blob, top_blob, cmd, opt); - if (ret != 0) - return ret; + // load bottom blobs + std::vector bottom_blobs; + bottom_blobs.resize(layer->bottoms.size()); + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; - // store top blob - blob_mats[top_blob_index] = top_blob; - } + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + if (blob_mats[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt); + if (ret != 0) + return ret; + } - if (opt.lightmode) - { - // reclaim producer bottom_blob as free when consuming bottom_blob - const Layer* producer = layers[ blobs[bottom_blob_index].producer ]; - for (size_t i=0; ibottoms.size(); i++) - { - int producer_bottom_blob_index = producer->bottoms[i]; + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + // upload + VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; + bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator); + + if (!bottom_blob.allocator->mappable) + { + bottom_blob.prepare_staging_buffer(); + } + + bottom_blob.upload(blob_mats[bottom_blob_index]); + + cmd.record_prepare_transfer_barrier(bottom_blob); + cmd.record_upload(bottom_blob); + + // TODO convert packing +// fprintf(stderr, "upload %d %d %d %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing); + } + } - wait_barrier_counts[producer_bottom_blob_index]--; - if (wait_barrier_counts[producer_bottom_blob_index] == 0) + bottom_blobs[i] = blob_mats_gpu[bottom_blob_index]; + + if (opt.lightmode) { -// fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer()); - blob_mats[producer_bottom_blob_index].release(); + wait_barrier_counts[bottom_blob_index] = layer->tops.size(); + + // deep copy for inplace forward if data is shared + if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1) + { + VkMat bottom_blob_copy; + bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); + +// fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer()); + + cmd.record_prepare_transfer_barrier(bottom_blobs[i]); + cmd.record_clone(bottom_blobs[i], bottom_blob_copy); + bottom_blobs[i] = bottom_blob_copy; + + wait_barrier_counts[bottom_blob_index]--; + } } } - } - } - else - { - // load bottom blobs - std::vector bottom_blobs; - bottom_blobs.resize(layer->bottoms.size()); - for (size_t i=0; ibottoms.size(); i++) - { - int bottom_blob_index = layer->bottoms[i]; - if (blob_mats[bottom_blob_index].dims == 0) + // forward + if (opt.lightmode && layer->support_inplace) { - int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt); + std::vector& bottom_top_blobs = bottom_blobs; + int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); if (ret != 0) return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; + } } - else if (blob_mats[bottom_blob_index].staging_data) + else { - // upload - const VkMat& bottom_blob = blob_mats[bottom_blob_index]; - cmd.record_prepare_transfer_barrier(bottom_blob); - cmd.record_upload(bottom_blob); - } + std::vector top_blobs; + top_blobs.resize(layer->tops.size()); + int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); + if (ret != 0) + return ret; - bottom_blobs[i] = blob_mats[bottom_blob_index]; + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats_gpu[top_blob_index] = top_blobs[i]; + } + } if (opt.lightmode) { - - wait_barrier_counts[bottom_blob_index] = layer->tops.size(); - - // deep copy for inplace forward if data is shared - if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1) + for (size_t i=0; ibottoms.size(); i++) { - VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); + int bottom_blob_index = layer->bottoms[i]; -// fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer()); + // reclaim producer bottom_blob as free when consuming bottom_blob + const Layer* producer = layers[ blobs[bottom_blob_index].producer ]; + for (size_t i=0; ibottoms.size(); i++) + { + int producer_bottom_blob_index = producer->bottoms[i]; - cmd.record_prepare_transfer_barrier(bottom_blobs[i]); - cmd.record_clone(bottom_blobs[i], bottom_blob_copy); - bottom_blobs[i] = bottom_blob_copy; + wait_barrier_counts[producer_bottom_blob_index]--; + if (wait_barrier_counts[producer_bottom_blob_index] == 0) + { +// fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer()); - wait_barrier_counts[bottom_blob_index]--; + blob_mats_gpu[producer_bottom_blob_index].release(); + } + } } } + } - // forward - if (opt.lightmode && layer->support_inplace) + } + else + { + if (layer->one_blob_only) { - std::vector& bottom_top_blobs = bottom_blobs; - int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); - if (ret != 0) - return ret; + // load bottom blob + int bottom_blob_index = layer->bottoms[0]; + int top_blob_index = layer->tops[0]; - // store top blobs - for (size_t i=0; itops.size(); i++) + if (blob_mats[bottom_blob_index].dims == 0) { - int top_blob_index = layer->tops[i]; + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt); + if (ret != 0) + return ret; + } - blob_mats[top_blob_index] = bottom_top_blobs[i]; + if (blob_mats[bottom_blob_index].dims == 0) + { + // download + VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; + + if (!bottom_blob.allocator->mappable) + { + cmd.record_prepare_transfer_barrier(bottom_blob); + bottom_blob.prepare_staging_buffer(); + cmd.record_download(bottom_blob); + } + + cmd.submit(); + + cmd.wait(); + + cmd.reset(); + + blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator); + bottom_blob.download(blob_mats[bottom_blob_index]); + + if (!bottom_blob.allocator->mappable) + { + bottom_blob.discard_staging_buffer(); + } + + // TODO convert packing +// fprintf(stderr, "download %d %d %d %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing); + } } + + Mat bottom_blob = blob_mats[bottom_blob_index]; + + if (opt.lightmode) + { + // delete after taken in light mode + blob_mats[bottom_blob_index].release(); + // deep copy for inplace forward if data is shared + if (layer->support_inplace && *bottom_blob.refcount != 1) + { + bottom_blob = bottom_blob.clone(); + } + } + + // forward + if (opt.lightmode && layer->support_inplace) + { + Mat& bottom_top_blob = bottom_blob; + + int ret = layer->forward_inplace(bottom_top_blob, opt); + if (ret != 0) + return ret; + + // store top blob + blob_mats[top_blob_index] = bottom_top_blob; + } + else + { + Mat top_blob; + + int ret = layer->forward(bottom_blob, top_blob, opt); + if (ret != 0) + return ret; + + // store top blob + blob_mats[top_blob_index] = top_blob; + } + } else { - std::vector top_blobs; - top_blobs.resize(layer->tops.size()); - int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); - if (ret != 0) - return ret; + // load bottom blobs + std::vector bottom_blobs; + bottom_blobs.resize(layer->bottoms.size()); + for (size_t i=0; ibottoms.size(); i++) + { + int bottom_blob_index = layer->bottoms[i]; + + if (blob_mats[bottom_blob_index].dims == 0) + { + if (blob_mats_gpu[bottom_blob_index].dims == 0) + { + int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt); + if (ret != 0) + return ret; + } + + if (blob_mats[bottom_blob_index].dims == 0) + { + // download + VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; + + if (!bottom_blob.allocator->mappable) + { + cmd.record_prepare_transfer_barrier(bottom_blob); + bottom_blob.prepare_staging_buffer(); + cmd.record_download(bottom_blob); + } + } + } + } - // store top blobs - for (size_t i=0; itops.size(); i++) { - int top_blob_index = layer->tops[i]; + cmd.submit(); - blob_mats[top_blob_index] = top_blobs[i]; + cmd.wait(); + + cmd.reset(); } - } - if (opt.lightmode) - { for (size_t i=0; ibottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; - // reclaim producer bottom_blob as free when consuming bottom_blob - const Layer* producer = layers[ blobs[bottom_blob_index].producer ]; - for (size_t i=0; ibottoms.size(); i++) + if (blob_mats[bottom_blob_index].dims == 0) { - int producer_bottom_blob_index = producer->bottoms[i]; + // download + VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index]; - wait_barrier_counts[producer_bottom_blob_index]--; - if (wait_barrier_counts[producer_bottom_blob_index] == 0) - { -// fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer()); + blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator); + bottom_blob.download(blob_mats[bottom_blob_index]); - blob_mats[producer_bottom_blob_index].release(); + if (!bottom_blob.allocator->mappable) + { + bottom_blob.discard_staging_buffer(); } + + // TODO convert packing +// fprintf(stderr, "download %d %d %d %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing); } + + bottom_blobs[i] = blob_mats[bottom_blob_index]; } - } + // forward + if (opt.lightmode && layer->support_inplace) + { + std::vector& bottom_top_blobs = bottom_blobs; + + int ret = layer->forward_inplace(bottom_top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats[top_blob_index] = bottom_top_blobs[i]; + } + } + else + { + std::vector top_blobs; + top_blobs.resize(layer->tops.size()); + + int ret = layer->forward(bottom_blobs, top_blobs, opt); + if (ret != 0) + return ret; + + // store top blobs + for (size_t i=0; itops.size(); i++) + { + int top_blob_index = layer->tops[i]; + + blob_mats[top_blob_index] = top_blobs[i]; + } + } + + } } -// fprintf(stderr, "forward_layer %d %s done\n", layer_index, layer->name.c_str()); +// fprintf(stderr, "forward_layer %d %d %s done\n", layer->support_vulkan, layer_index, layer->name.c_str()); return 0; } @@ -1440,22 +1664,6 @@ int Extractor::input(int blob_index, const Mat& in) blob_mats[blob_index] = in; -#if NCNN_VULKAN - if (opt.vulkan_compute) - { - VkMat& in_gpu = blob_mats_gpu[blob_index]; - - in_gpu.create_like(in, opt.blob_vkallocator, opt.staging_vkallocator); - - if (!in_gpu.allocator->mappable) - { - in_gpu.prepare_staging_buffer(); - } - - in_gpu.upload(in); - } -#endif // NCNN_VULKAN - return 0; } @@ -1473,33 +1681,35 @@ int Extractor::extract(int blob_index, Mat& feat) #if NCNN_VULKAN if (opt.vulkan_compute) { - VkMat feat_gpu; - ncnn::VkCompute cmd(net->vkdev); + VkMat feat_gpu; ret = extract(blob_index, feat_gpu, cmd); - if (!feat_gpu.allocator->mappable) + if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0) { // download - cmd.record_prepare_transfer_barrier(feat_gpu); + if (!feat_gpu.allocator->mappable) + { + cmd.record_prepare_transfer_barrier(feat_gpu); - feat_gpu.prepare_staging_buffer(); + feat_gpu.prepare_staging_buffer(); - cmd.record_download(feat_gpu); - } + cmd.record_download(feat_gpu); + } - cmd.submit(); + cmd.submit(); - cmd.wait(); + cmd.wait(); - blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator); + blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator); - feat_gpu.download(blob_mats[blob_index]); + feat_gpu.download(blob_mats[blob_index]); - if (!feat_gpu.allocator->mappable) - { - feat_gpu.discard_staging_buffer(); + if (!feat_gpu.allocator->mappable) + { + feat_gpu.discard_staging_buffer(); + } } } else @@ -1558,7 +1768,7 @@ int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd) if (blob_mats_gpu[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; - ret = net->forward_layer(layer_index, blob_mats_gpu, wait_barrier_counts, cmd, opt); + ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt); } feat = blob_mats_gpu[blob_index]; diff --git a/src/net.h b/src/net.h index 25b68b34a..c4b9d1966 100644 --- a/src/net.h +++ b/src/net.h @@ -122,7 +122,7 @@ protected: int forward_layer(int layer_index, std::vector& blob_mats, Option& opt) const; #if NCNN_VULKAN - int forward_layer(int layer_index, std::vector& blob_mats, std::vector& wait_barrier_counts, VkCompute& cmd, Option& opt) const; + int forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, std::vector& wait_barrier_counts, VkCompute& cmd, Option& opt) const; #endif // NCNN_VULKAN protected: