// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "net.h" #include "convolution.h" #include "convolutiondepthwise.h" #include "cpu.h" #include "datareader.h" #include "layer_type.h" #include "modelbin.h" #include "paramdict.h" #include "relu.h" #include #include #include #if NCNN_BENCHMARK #include "benchmark.h" #endif // NCNN_BENCHMARK #if NCNN_VULKAN #include "command.h" #include "pipelinecache.h" #endif // NCNN_VULKAN namespace ncnn { Net::Net() { #if NCNN_VULKAN vkdev = 0; weight_vkallocator = 0; weight_staging_vkallocator = 0; pipeline_cache = 0; #endif // NCNN_VULKAN } Net::~Net() { clear(); #if NCNN_VULKAN #endif // NCNN_VULKAN } #if NCNN_STRING int Net::register_custom_layer(const char* type, layer_creator_func creator) { int typeindex = layer_to_index(type); if (typeindex != -1) { NCNN_LOGE("can not register build-in layer type %s", type); return -1; } int custom_index = custom_layer_to_index(type); if (custom_index == -1) { struct layer_registry_entry entry = {type, creator}; custom_layer_registry.push_back(entry); } else { NCNN_LOGE("overwrite existing custom layer type %s", type); custom_layer_registry[custom_index].name = type; custom_layer_registry[custom_index].creator = creator; } return 0; } #endif // NCNN_STRING int Net::register_custom_layer(int index, layer_creator_func creator) { int custom_index = index & ~LayerType::CustomBit; if (index == custom_index) { NCNN_LOGE("can not register build-in layer index %d", custom_index); return -1; } if ((int)custom_layer_registry.size() <= custom_index) { #if NCNN_STRING struct layer_registry_entry dummy = {"", 0}; #else struct layer_registry_entry dummy = {0}; #endif // NCNN_STRING custom_layer_registry.resize(custom_index + 1, dummy); } if (custom_layer_registry[custom_index].creator) { NCNN_LOGE("overwrite existing custom layer index %d", custom_index); } custom_layer_registry[custom_index].creator = creator; return 0; } #if NCNN_STRING int Net::load_param(const DataReader& dr) { #define SCAN_VALUE(fmt, v) \ if (dr.scan(fmt, &v) != 1) \ { \ NCNN_LOGE("parse " #v " failed"); \ return -1; \ } int magic = 0; SCAN_VALUE("%d", magic) if (magic != 7767517) { NCNN_LOGE("param is too old, please regenerate"); return -1; } // parse int layer_count = 0; int blob_count = 0; SCAN_VALUE("%d", layer_count) SCAN_VALUE("%d", blob_count) if (layer_count <= 0 || blob_count <= 0) { NCNN_LOGE("invalid layer_count or blob_count"); return -1; } layers.resize((size_t)layer_count); blobs.resize((size_t)blob_count); #if NCNN_VULKAN // TODO enable gpu when bf16 conversion implemented if (opt.use_bf16_storage) opt.use_vulkan_compute = false; if (opt.use_vulkan_compute) { if (!vkdev) vkdev = get_gpu_device(); if (!vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu } if (opt.use_vulkan_compute) { // sanitize use options if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; if (!vkdev->info.support_int8_storage) opt.use_int8_storage = false; if (!vkdev->info.support_int8_arithmetic) opt.use_int8_arithmetic = false; // TODO give user a choice if (vkdev->info.bug_storage_buffer_no_l1) opt.use_image_storage = true; if (vkdev->info.bug_layout_binding_id_alias) opt.use_image_storage = false; } #endif // NCNN_VULKAN ParamDict pd; int blob_index = 0; for (int i = 0; i < layer_count; i++) { char layer_type[256]; char layer_name[256]; int bottom_count = 0; int top_count = 0; SCAN_VALUE("%255s", layer_type) SCAN_VALUE("%255s", layer_name) SCAN_VALUE("%d", bottom_count) SCAN_VALUE("%d", top_count) Layer* layer = create_layer(layer_type); if (!layer) { layer = create_custom_layer(layer_type); } if (!layer) { NCNN_LOGE("layer %s not exists or registered", layer_type); clear(); return -1; } if (layer->use_int8_inference) { // no int8 gpu or packing layout support yet opt.use_vulkan_compute = false; opt.use_packing_layout = false; opt.use_fp16_storage = false; opt.use_bf16_storage = false; } #if NCNN_VULKAN if (opt.use_vulkan_compute) layer->vkdev = vkdev; #endif // NCNN_VULKAN layer->type = std::string(layer_type); layer->name = std::string(layer_name); // NCNN_LOGE("new layer %d %s", i, layer_name); layer->bottoms.resize(bottom_count); for (int j = 0; j < bottom_count; j++) { char bottom_name[256]; SCAN_VALUE("%255s", bottom_name) int bottom_blob_index = find_blob_index_by_name(bottom_name); if (bottom_blob_index == -1) { Blob& blob = blobs[blob_index]; bottom_blob_index = blob_index; blob.name = std::string(bottom_name); // NCNN_LOGE("new blob %s", bottom_name); blob_index++; } Blob& blob = blobs[bottom_blob_index]; blob.consumers.push_back(i); layer->bottoms[j] = bottom_blob_index; } layer->tops.resize(top_count); for (int j = 0; j < top_count; j++) { Blob& blob = blobs[blob_index]; char blob_name[256]; SCAN_VALUE("%255s", blob_name) blob.name = std::string(blob_name); // NCNN_LOGE("new blob %s", blob_name); blob.producer = i; layer->tops[j] = blob_index; blob_index++; } // layer specific params int pdlr = pd.load_param(dr); if (pdlr != 0) { NCNN_LOGE("ParamDict load_param failed"); continue; } // pull out top shape hints Mat shape_hints = pd.get(30, Mat()); if (!shape_hints.empty()) { const int* psh = shape_hints; for (int j = 0; j < top_count; j++) { Blob& blob = blobs[layer->tops[j]]; int dims = psh[0]; if (dims == 1) { blob.shape = Mat(psh[1], (void*)0, 4u, 1); } if (dims == 2) { blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1); } if (dims == 3) { blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1); } psh += 4; } } // set bottom and top shape hints layer->bottom_shapes.resize(bottom_count); for (int j = 0; j < bottom_count; j++) { layer->bottom_shapes[j] = blobs[layer->bottoms[j]].shape; } layer->top_shapes.resize(top_count); for (int j = 0; j < top_count; j++) { layer->top_shapes[j] = blobs[layer->tops[j]].shape; } int lr = layer->load_param(pd); if (lr != 0) { NCNN_LOGE("layer load_param failed"); continue; } layers[i] = layer; } #undef SCAN_VALUE return 0; } #endif // NCNN_STRING int Net::load_param_bin(const DataReader& dr) { #define READ_VALUE(buf) \ if (dr.read(&buf, sizeof(buf)) != sizeof(buf)) \ { \ NCNN_LOGE("read " #buf " failed"); \ return -1; \ } int magic = 0; READ_VALUE(magic) if (magic != 7767517) { NCNN_LOGE("param is too old, please regenerate"); return -1; } int layer_count = 0; int blob_count = 0; READ_VALUE(layer_count) READ_VALUE(blob_count) if (layer_count <= 0 || blob_count <= 0) { NCNN_LOGE("invalid layer_count or blob_count"); return -1; } layers.resize(layer_count); blobs.resize(blob_count); #if NCNN_VULKAN // TODO enable gpu when bf16 conversion implemented if (opt.use_bf16_storage) opt.use_vulkan_compute = false; if (opt.use_vulkan_compute) { if (!vkdev) vkdev = get_gpu_device(); if (!vkdev) opt.use_vulkan_compute = false; // no vulkan device, fallback to cpu } if (opt.use_vulkan_compute) { // sanitize use options if (!vkdev->info.support_fp16_packed) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage) opt.use_fp16_storage = false; if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; if (!vkdev->info.support_int8_storage) opt.use_int8_storage = false; if (!vkdev->info.support_int8_arithmetic) opt.use_int8_arithmetic = false; // TODO give user a choice if (vkdev->info.bug_storage_buffer_no_l1) opt.use_image_storage = true; if (vkdev->info.bug_layout_binding_id_alias) opt.use_image_storage = false; } #endif // NCNN_VULKAN ParamDict pd; for (int i = 0; i < layer_count; i++) { int typeindex; int bottom_count; int top_count; READ_VALUE(typeindex) READ_VALUE(bottom_count) READ_VALUE(top_count) Layer* layer = create_layer(typeindex); if (!layer) { int custom_index = typeindex & ~LayerType::CustomBit; layer = create_custom_layer(custom_index); } if (!layer) { NCNN_LOGE("layer %d not exists or registered", typeindex); clear(); return -1; } if (layer->use_int8_inference) { // no int8 gpu or packing layout support yet opt.use_vulkan_compute = false; opt.use_packing_layout = false; opt.use_fp16_storage = false; opt.use_bf16_storage = false; } #if NCNN_VULKAN if (opt.use_vulkan_compute) layer->vkdev = vkdev; #endif // NCNN_VULKAN // layer->type = std::string(layer_type); // layer->name = std::string(layer_name); // NCNN_LOGE("new layer %d", typeindex); layer->bottoms.resize(bottom_count); for (int j = 0; j < bottom_count; j++) { int bottom_blob_index; READ_VALUE(bottom_blob_index) Blob& blob = blobs[bottom_blob_index]; blob.consumers.push_back(i); layer->bottoms[j] = bottom_blob_index; } layer->tops.resize(top_count); for (int j = 0; j < top_count; j++) { int top_blob_index; READ_VALUE(top_blob_index) Blob& blob = blobs[top_blob_index]; // blob.name = std::string(blob_name); // NCNN_LOGE("new blob %s", blob_name); blob.producer = i; layer->tops[j] = top_blob_index; } // layer specific params int pdlr = pd.load_param_bin(dr); if (pdlr != 0) { NCNN_LOGE("ParamDict load_param failed"); continue; } // pull out top blob shape hints Mat shape_hints = pd.get(30, Mat()); if (!shape_hints.empty()) { const int* psh = shape_hints; for (int j = 0; j < top_count; j++) { Blob& blob = blobs[layer->tops[j]]; int dims = psh[0]; if (dims == 1) { blob.shape = Mat(psh[1], (void*)0, 4u, 1); } if (dims == 2) { blob.shape = Mat(psh[1], psh[2], (void*)0, 4u, 1); } if (dims == 3) { blob.shape = Mat(psh[1], psh[2], psh[3], (void*)0, 4u, 1); } psh += 4; } } // set bottom and top shape hints layer->bottom_shapes.resize(bottom_count); for (int j = 0; j < bottom_count; j++) { layer->bottom_shapes[j] = blobs[layer->bottoms[j]].shape; } layer->top_shapes.resize(top_count); for (int j = 0; j < top_count; j++) { layer->top_shapes[j] = blobs[layer->tops[j]].shape; } int lr = layer->load_param(pd); if (lr != 0) { NCNN_LOGE("layer load_param failed"); continue; } layers[i] = layer; } #undef READ_VALUE return 0; } int Net::load_model(const DataReader& dr) { if (layers.empty()) { NCNN_LOGE("network graph not ready"); return -1; } // load file int ret = 0; ModelBinFromDataReader mb(dr); for (size_t i = 0; i < layers.size(); i++) { Layer* layer = layers[i]; //Here we found inconsistent content in the parameter file. if (!layer) { NCNN_LOGE("load_model error at layer %d, parameter file has inconsistent content.", (int)i); ret = -1; break; } int lret = layer->load_model(mb); if (lret != 0) { NCNN_LOGE("layer load_model %d failed", (int)i); ret = -1; break; } if (layer->use_int8_inference) { // no int8 gpu or packing layout support yet opt.use_vulkan_compute = false; opt.use_packing_layout = false; opt.use_fp16_storage = false; opt.use_bf16_storage = false; } } fuse_network(); #if NCNN_VULKAN if (opt.use_vulkan_compute) { if (!opt.pipeline_cache) { if (!pipeline_cache) pipeline_cache = new PipelineCache(vkdev); opt.pipeline_cache = pipeline_cache; } if (vkdev->info.bug_layout_binding_id_alias) opt.use_image_storage = false; } #endif // NCNN_VULKAN for (size_t i = 0; i < layers.size(); i++) { Layer* layer = layers[i]; //Here we found inconsistent content in the parameter file. if (!layer) { NCNN_LOGE("load_model error at layer %d, parameter file has inconsistent content.", (int)i); ret = -1; break; } Option opt1 = opt; #if NCNN_VULKAN if (opt.use_vulkan_compute) { if (!layer->support_image_storage) opt1.use_image_storage = false; } #endif // NCNN_VULKAN int cret = layer->create_pipeline(opt1); if (cret != 0) { NCNN_LOGE("layer create_pipeline %d failed", (int)i); ret = -1; break; } } #if NCNN_VULKAN if (opt.use_vulkan_compute) { create_pipeline(); upload_model(); } #endif // NCNN_VULKAN return ret; } #if NCNN_STDIO #if NCNN_STRING int Net::load_param(FILE* fp) { DataReaderFromStdio dr(fp); return load_param(dr); } int Net::load_param_mem(const char* _mem) { const unsigned char* mem = (const unsigned char*)_mem; DataReaderFromMemory dr(mem); return load_param(dr); } int Net::load_param(const char* protopath) { FILE* fp = fopen(protopath, "rb"); if (!fp) { NCNN_LOGE("fopen %s failed", protopath); return -1; } int ret = load_param(fp); fclose(fp); return ret; } #endif // NCNN_STRING int Net::load_param_bin(FILE* fp) { DataReaderFromStdio dr(fp); return load_param_bin(dr); } int Net::load_param_bin(const char* protopath) { FILE* fp = fopen(protopath, "rb"); if (!fp) { NCNN_LOGE("fopen %s failed", protopath); return -1; } int ret = load_param_bin(fp); fclose(fp); return ret; } int Net::load_model(FILE* fp) { DataReaderFromStdio dr(fp); return load_model(dr); } int Net::load_model(const char* modelpath) { FILE* fp = fopen(modelpath, "rb"); if (!fp) { NCNN_LOGE("fopen %s failed", modelpath); return -1; } int ret = load_model(fp); fclose(fp); return ret; } #endif // NCNN_STDIO int Net::load_param(const unsigned char* _mem) { const unsigned char* mem = _mem; DataReaderFromMemory dr(mem); load_param_bin(dr); return static_cast(mem - _mem); } int Net::load_model(const unsigned char* _mem) { const unsigned char* mem = _mem; DataReaderFromMemory dr(mem); load_model(dr); return static_cast(mem - _mem); } #if __ANDROID_API__ >= 9 #if NCNN_STRING int Net::load_param(AAsset* asset) { DataReaderFromAndroidAsset dr(asset); return load_param(dr); } int Net::load_param(AAssetManager* mgr, const char* assetpath) { AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER); if (!asset) { NCNN_LOGE("AAssetManager_open %s failed", assetpath); return -1; } int ret = load_param(asset); AAsset_close(asset); return ret; } #endif // NCNN_STRING int Net::load_param_bin(AAsset* asset) { DataReaderFromAndroidAsset dr(asset); return load_param_bin(dr); } int Net::load_param_bin(AAssetManager* mgr, const char* assetpath) { AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_BUFFER); if (!asset) { NCNN_LOGE("AAssetManager_open %s failed", assetpath); return -1; } int ret = load_param_bin(asset); AAsset_close(asset); return ret; } int Net::load_model(AAsset* asset) { DataReaderFromAndroidAsset dr(asset); return load_model(dr); } int Net::load_model(AAssetManager* mgr, const char* assetpath) { AAsset* asset = AAssetManager_open(mgr, assetpath, AASSET_MODE_STREAMING); if (!asset) { NCNN_LOGE("AAssetManager_open %s failed", assetpath); return -1; } int ret = load_model(asset); AAsset_close(asset); return ret; } #endif // __ANDROID_API__ >= 9 int Net::fuse_network() { // set the int8 op fusion:requantize #if NCNN_STRING && NCNN_REQUANT // NCNN_LOGE("Test op fusion to int8 implement:"); // parse the network whether is a quantization model bool net_quantized = false; for (size_t i = 0; i < layers.size(); i++) { Layer* layer = layers[i]; if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise") { if (layer->type == "Convolution" && (((Convolution*)layer)->weight_data.elemsize != 1u)) continue; if (layer->type == "ConvolutionDepthWise" && (((ConvolutionDepthWise*)layer)->weight_data.elemsize != 1u)) continue; net_quantized = true; } } if (net_quantized == false) return 0; for (size_t i = 0; i < layers.size(); i++) { Layer* layer = layers[i]; if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise") { if (layer->type == "Convolution" && (((Convolution*)layer)->weight_data.elemsize != 1u)) continue; if (layer->type == "ConvolutionDepthWise" && (((ConvolutionDepthWise*)layer)->weight_data.elemsize != 1u)) continue; for (size_t n = 0; n < blobs[layer->tops[0]].consumers.size(); n++) { int layer_next_index = blobs[layer->tops[0]].consumers[n]; Layer* layer_next = layers[layer_next_index]; if (layer_next->type == "Convolution" || layer_next->type == "ConvolutionDepthWise") { if (layer_next->type == "Convolution" && ((Convolution*)layer_next)->weight_data.elemsize != 1u) continue; if (layer_next->type == "ConvolutionDepthWise" && ((ConvolutionDepthWise*)layer_next)->weight_data.elemsize != 1u) continue; // NCNN_LOGE("%s, %s", layer->name.c_str(), layer_next->name.c_str()); if (layer->type == "Convolution" && layer_next->type == "Convolution") { ((Convolution*)layer)->use_int8_requantize = true; ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next)->bottom_blob_int8_scale; } else if (layer->type == "ConvolutionDepthWise" && layer_next->type == "Convolution") { ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((Convolution*)layer_next)->bottom_blob_int8_scale; } else if (layer->type == "Convolution" && layer_next->type == "ConvolutionDepthWise") { ((Convolution*)layer)->use_int8_requantize = true; ((Convolution*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next)->bottom_blob_int8_scales[0]; } else { ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next)->bottom_blob_int8_scales[0]; } } else if (layer_next->type == "ReLU") { int layer_next_2_index = blobs[layer_next->tops[0]].consumers[0]; Layer* layer_next_2 = layers[layer_next_2_index]; if (layer_next_2->type == "Convolution" || layer_next_2->type == "ConvolutionDepthWise") { if (layer_next_2->type == "Convolution" && ((Convolution*)layer_next_2)->weight_data.elemsize != 1u) continue; if (layer_next_2->type == "ConvolutionDepthWise" && ((ConvolutionDepthWise*)layer_next_2)->weight_data.elemsize != 1u) continue; // NCNN_LOGE("%s, %s, %s", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); if (layer->type == "Convolution" && layer_next_2->type == "Convolution") { ((Convolution*)layer)->use_int8_requantize = true; ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; } else if (layer->type == "ConvolutionDepthWise" && layer_next_2->type == "Convolution") { ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; } else if (layer->type == "Convolution" && layer_next_2->type == "ConvolutionDepthWise") { ((Convolution*)layer)->use_int8_requantize = true; ((Convolution*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; } else { ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; } } else if (layer_next_2->type == "Split") { bool all_conv = true; for (size_t i = 0; i < layer_next_2->tops.size(); i++) { int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; if (layers[layer_next_3_index]->type != "Convolution" && layers[layer_next_3_index]->type != "ConvolutionDepthWise" && layers[layer_next_3_index]->type != "PriorBox") { // NCNN_LOGE("%s, %s, %s, %s", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str(), layers[layer_next_3_index]->name.c_str()); all_conv = false; } } if (all_conv == true && layer_next_2->tops.size() >= size_t(2)) { // NCNN_LOGE("%s, %s, %s, ", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); for (size_t i = 0; i < layer_next_2->tops.size(); i++) { int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; Layer* layer_next_3 = layers[layer_next_3_index]; // NCNN_LOGE("%s, ", layer_next_3->name.c_str()); if (layer_next_3->type == "Convolution") { ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_3)->bottom_blob_int8_scale; } } ((Convolution*)layer)->use_int8_requantize = true; // NCNN_LOGE(""); } } else { // NCNN_LOGE("%s, %s", layer->name.c_str(), layer_next->name.c_str()); } } else if (layer_next->type == "Pooling") { // ToDo } else { // NCNN_LOGE("%s", layer->name.c_str()); } } } } #endif return 0; } void Net::clear() { #if NCNN_VULKAN destroy_pipeline(); #endif // NCNN_VULKAN blobs.clear(); for (size_t i = 0; i < layers.size(); i++) { Layer* layer = layers[i]; Option opt1 = opt; if (!layer->support_image_storage) { opt1.use_image_storage = false; } int dret = layer->destroy_pipeline(opt1); if (dret != 0) { NCNN_LOGE("layer destroy_pipeline failed"); // ignore anyway } delete layer; } layers.clear(); #if NCNN_VULKAN if (weight_vkallocator) { delete weight_vkallocator; weight_vkallocator = 0; } if (weight_staging_vkallocator) { delete weight_staging_vkallocator; weight_staging_vkallocator = 0; } if (pipeline_cache) { delete pipeline_cache; pipeline_cache = 0; opt.pipeline_cache = 0; } #endif // NCNN_VULKAN } Extractor Net::create_extractor() const { return Extractor(this, blobs.size()); } #if NCNN_VULKAN void Net::set_vulkan_device(int device_index) { vkdev = get_gpu_device(device_index); } void Net::set_vulkan_device(const VulkanDevice* _vkdev) { vkdev = _vkdev; } const VulkanDevice* Net::vulkan_device() const { return vkdev; } int Net::upload_model() { ncnn::VkTransfer cmd(vkdev); // create gpu device allocator if null if (!weight_vkallocator) { weight_vkallocator = new VkWeightAllocator(vkdev); } if (!weight_staging_vkallocator) { weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); } Option opt_upload = opt; opt_upload.blob_vkallocator = weight_vkallocator; opt_upload.workspace_vkallocator = weight_vkallocator; opt_upload.staging_vkallocator = weight_staging_vkallocator; for (size_t i = 0; i < layers.size(); i++) { if (layers[i]->support_vulkan) { int uret = layers[i]->upload_model(cmd, opt_upload); if (uret != 0) { NCNN_LOGE("layer upload_model %d failed", (int)i); return -1; } } } cmd.submit_and_wait(); return 0; } int Net::create_pipeline() { return 0; } int Net::destroy_pipeline() { return 0; } #endif // NCNN_VULKAN #if NCNN_STRING int Net::find_blob_index_by_name(const char* name) const { for (size_t i = 0; i < blobs.size(); i++) { const Blob& blob = blobs[i]; if (blob.name == name) { return static_cast(i); } } NCNN_LOGE("find_blob_index_by_name %s failed", name); return -1; } int Net::find_layer_index_by_name(const char* name) const { for (size_t i = 0; i < layers.size(); i++) { const Layer* layer = layers[i]; if (layer->name == name) { return static_cast(i); } } NCNN_LOGE("find_layer_index_by_name %s failed", name); return -1; } int Net::custom_layer_to_index(const char* type) { const size_t custom_layer_registry_entry_count = custom_layer_registry.size(); for (size_t i = 0; i < custom_layer_registry_entry_count; i++) { if (strcmp(type, custom_layer_registry[i].name) == 0) return static_cast(i); } return -1; } Layer* Net::create_custom_layer(const char* type) { int index = custom_layer_to_index(type); if (index == -1) return 0; return create_custom_layer(index); } #endif // NCNN_STRING Layer* Net::create_custom_layer(int index) { const size_t custom_layer_registry_entry_count = custom_layer_registry.size(); if (index < 0 || static_cast(index) >= custom_layer_registry_entry_count) return 0; layer_creator_func layer_creator = custom_layer_registry[index].creator; if (!layer_creator) return 0; return layer_creator(); } int Net::forward_layer(int layer_index, std::vector& blob_mats, const Option& opt) const { const Layer* layer = layers[layer_index]; // NCNN_LOGE("forward_layer %d %s", layer_index, layer->name.c_str()); if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt); if (ret != 0) return ret; } Mat bottom_blob = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { bottom_blob = bottom_blob.clone(); } } // clang-format off // *INDENT-OFF* #if NCNN_ARM82 if (opt.use_fp16_storage && cpu_support_arm_asimdhp()) { if (bottom_blob.elemsize / bottom_blob.elempack == 4u && layer->support_fp16_storage) { Mat bottom_blob_fp16; cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt); bottom_blob = bottom_blob_fp16; } if (bottom_blob.elemsize / bottom_blob.elempack == 2u && !layer->support_fp16_storage) { Mat bottom_blob_fp32; cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt); bottom_blob = bottom_blob_fp32; } } else #endif // NCNN_ARM82 if (opt.use_bf16_storage) { if (bottom_blob.elemsize / bottom_blob.elempack == 4u && layer->support_bf16_storage) { Mat bottom_blob_bf16; cast_float32_to_bfloat16(bottom_blob, bottom_blob_bf16, opt); bottom_blob = bottom_blob_bf16; } if (bottom_blob.elemsize / bottom_blob.elempack == 2u && !layer->support_bf16_storage) { Mat bottom_blob_fp32; cast_bfloat16_to_float32(bottom_blob, bottom_blob_fp32, opt); bottom_blob = bottom_blob_fp32; } } // *INDENT-ON* // clang-format on if (opt.use_packing_layout) { // resolve dst_elempack int dims = bottom_blob.dims; int elemcount = 0; if (dims == 1) elemcount = bottom_blob.elempack * bottom_blob.w; if (dims == 2) elemcount = bottom_blob.elempack * bottom_blob.h; if (dims == 3) elemcount = bottom_blob.elempack * bottom_blob.c; int dst_elempack = 1; if (layer->support_packing) { #if NCNN_AVX2 if (elemcount % 8 == 0) dst_elempack = 8; #elif NCNN_ARM82 if (elemcount % 8 == 0 && opt.use_fp16_arithmetic && layer->support_fp16_storage) dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; #else if (elemcount % 4 == 0) dst_elempack = 4; #endif } Mat bottom_blob_packed; convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt); bottom_blob = bottom_blob_packed; } // forward if (opt.lightmode && layer->support_inplace) { Mat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_top_blob, bottom_top_blob, start, end); #else int ret = layer->forward_inplace(bottom_top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = bottom_top_blob; } else { Mat top_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blob, top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_blob, top_blob, start, end); #else int ret = layer->forward(bottom_blob, top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = top_blob; } } else { // load bottom blobs std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, opt); if (ret != 0) return ret; } bottom_blobs[i] = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { bottom_blobs[i] = bottom_blobs[i].clone(); } } // clang-format off // *INDENT-OFF* #if NCNN_ARM82 if (opt.use_fp16_storage && cpu_support_arm_asimdhp()) { if (bottom_blobs[i].elemsize / bottom_blobs[i].elempack == 4u && layer->support_fp16_storage) { Mat bottom_blob_fp16; cast_float32_to_float16(bottom_blobs[i], bottom_blob_fp16, opt); bottom_blobs[i] = bottom_blob_fp16; } if (bottom_blobs[i].elemsize / bottom_blobs[i].elempack == 2u && !layer->support_fp16_storage) { Mat bottom_blob_fp32; cast_float16_to_float32(bottom_blobs[i], bottom_blob_fp32, opt); bottom_blobs[i] = bottom_blob_fp32; } } else #endif // NCNN_ARM82 if (opt.use_bf16_storage) { if (bottom_blobs[i].elemsize / bottom_blobs[i].elempack == 4u && layer->support_bf16_storage) { Mat bottom_blob_bf16; cast_float32_to_bfloat16(bottom_blobs[i], bottom_blob_bf16, opt); bottom_blobs[i] = bottom_blob_bf16; } if (bottom_blobs[i].elemsize / bottom_blobs[i].elempack == 2u && !layer->support_bf16_storage) { Mat bottom_blob_fp32; cast_bfloat16_to_float32(bottom_blobs[i], bottom_blob_fp32, opt); bottom_blobs[i] = bottom_blob_fp32; } } // *INDENT-ON* // clang-format on if (opt.use_packing_layout) { // resolve dst_elempack int dims = bottom_blobs[i].dims; int elemcount = 0; if (dims == 1) elemcount = bottom_blobs[i].elempack * bottom_blobs[i].w; if (dims == 2) elemcount = bottom_blobs[i].elempack * bottom_blobs[i].h; if (dims == 3) elemcount = bottom_blobs[i].elempack * bottom_blobs[i].c; int dst_elempack = 1; if (layer->support_packing) { #if NCNN_AVX2 if (elemcount % 8 == 0) dst_elempack = 8; #elif NCNN_ARM82 if (elemcount % 8 == 0 && opt.use_fp16_arithmetic && layer->support_fp16_storage) dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; #else if (elemcount % 4 == 0) dst_elempack = 4; #endif } Mat bottom_blob_packed; convert_packing(bottom_blobs[i], bottom_blob_packed, dst_elempack, opt); bottom_blobs[i] = bottom_blob_packed; } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward_inplace(bottom_top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blobs, top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward(bottom_blobs, top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = top_blobs[i]; } } } // NCNN_LOGE("forward_layer %d %s done", layer_index, layer->name.c_str()); // const Mat& blob = blob_mats[layer->tops[0]]; // NCNN_LOGE("[%-2d %-16s %-16s] %d blobs count = %-3d size = %-3d x %-3d", layer_index, layer->type.c_str(), layer->name.c_str(), layer->tops[0], blob.c, blob.h, blob.w); return 0; } #if NCNN_VULKAN int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, VkCompute& cmd, const Option& opt) const { const Layer* layer = layers[layer_index]; // NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str()); if (layer->support_vulkan) { if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu[bottom_blob_index].dims == 0) { // host to buffer cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } } VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { VkMat bottom_blob_copy; cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blob = bottom_blob_copy; } } // forward if (opt.lightmode && layer->support_inplace) { VkMat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats_gpu[top_blob_index] = bottom_top_blob; } else { VkMat top_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blob, top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blob, top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats_gpu[top_blob_index] = top_blob; } } else { // load bottom blobs std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu[bottom_blob_index].dims == 0) { // host to buffer cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } } bottom_blobs[i] = blob_mats_gpu[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { VkMat bottom_blob_copy; cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blobs[i] = bottom_blob_copy; } } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu[top_blob_index] = top_blobs[i]; } } } } else { if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt); if (ret != 0) return ret; } if (blob_mats[bottom_blob_index].dims == 0) { Option opt_download = opt; opt_download.use_packing_layout = layer->support_packing; // buffer to host cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt_download); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(layer_index * 2); cmd.get_query_pool_results(0, layer_index * 2, results); for (int i = 0; i < layer_index; i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK cmd.reset(); } } Mat bottom_blob = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { bottom_blob = bottom_blob.clone(); } } // forward if (opt.lightmode && layer->support_inplace) { Mat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_top_blob, bottom_top_blob, start, end); #else int ret = layer->forward_inplace(bottom_top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = bottom_top_blob; } else { Mat top_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blob, top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_blob, top_blob, start, end); #else int ret = layer->forward(bottom_blob, top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = top_blob; } } else { // load bottom blobs for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, cmd, opt); if (ret != 0) return ret; } if (blob_mats[bottom_blob_index].dims == 0) { Option opt_download = opt; opt_download.use_packing_layout = layer->support_packing; // buffer to host cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt_download); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } } } { cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(layer_index * 2); cmd.get_query_pool_results(0, layer_index * 2, results); for (int i = 0; i < layer_index; i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK cmd.reset(); } std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; bottom_blobs[i] = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { bottom_blobs[i] = bottom_blobs[i].clone(); } } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward_inplace(bottom_top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blobs, top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward(bottom_blobs, top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = top_blobs[i]; } } } } // NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str()); return 0; } int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector& blob_mats_gpu, std::vector& blob_mats_gpu_image, VkCompute& cmd, const Option& opt) const { const Layer* layer = layers[layer_index]; // NCNN_LOGE("forward_layer %d %d %s", layer->support_vulkan, layer_index, layer->name.c_str()); if (layer->support_vulkan) { if (layer->support_image_storage) { if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { // host to image cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); } else { // buffer to image cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } } } else { // buffer to image cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } } VkImageMat bottom_blob = blob_mats_gpu_image[bottom_blob_index]; if (bottom_blob.empty()) { goto IMAGE_ALLOCATION_FAILED; } if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { VkImageMat bottom_blob_copy; cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blob = bottom_blob_copy; } } // forward if (opt.lightmode && layer->support_inplace) { VkImageMat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret == -100) { goto IMAGE_ALLOCATION_FAILED; } if (ret != 0) return ret; // store top blob blob_mats_gpu_image[top_blob_index] = bottom_top_blob; } else { VkImageMat top_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blob, top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blob, top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret == -100) { goto IMAGE_ALLOCATION_FAILED; } if (ret != 0) return ret; // store top blob blob_mats_gpu_image[top_blob_index] = top_blob; } } else { // load bottom blobs std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { // host to image cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); } else { // buffer to image cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } } } else { // buffer to image cmd.record_buffer_to_image(blob_mats_gpu[bottom_blob_index], blob_mats_gpu_image[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } } bottom_blobs[i] = blob_mats_gpu_image[bottom_blob_index]; if (bottom_blobs[i].empty()) { goto IMAGE_ALLOCATION_FAILED; } if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { VkImageMat bottom_blob_copy; cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blobs[i] = bottom_blob_copy; } } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret == -100) { goto IMAGE_ALLOCATION_FAILED; } if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu_image[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret == -100) { goto IMAGE_ALLOCATION_FAILED; } if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu_image[top_blob_index] = top_blobs[i]; } } } } else { if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { // host to buffer cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } else { // image to buffer cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } } else { // image to buffer cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } VkMat bottom_blob = blob_mats_gpu[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { VkMat bottom_blob_copy; cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blob = bottom_blob_copy; } } // forward if (opt.lightmode && layer->support_inplace) { VkMat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats_gpu[top_blob_index] = bottom_top_blob; } else { VkMat top_blob; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blob, top_blob, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blob, top_blob, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats_gpu[top_blob_index] = top_blob; } } else { // load bottom blobs std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats_gpu[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { // host to buffer cmd.record_upload(blob_mats[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); } else { // image to buffer cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } } else { // image to buffer cmd.record_image_to_buffer(blob_mats_gpu_image[bottom_blob_index], blob_mats_gpu[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } bottom_blobs[i] = blob_mats_gpu[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { VkMat bottom_blob_copy; cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); // NCNN_LOGE("clone %p[+%lu] %p[+%lu]", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); bottom_blobs[i] = bottom_blob_copy; } } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); cmd.record_write_timestamp(layer_index * 2 + 1); #else int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats_gpu[top_blob_index] = top_blobs[i]; } } } } } else { IMAGE_ALLOCATION_FAILED: if (layer->one_blob_only) { // load bottom blob int bottom_blob_index = layer->bottoms[0]; int top_blob_index = layer->tops[0]; if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { // buffer to host cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } else { // image to host cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } } else { // image to host cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(layer_index * 2); cmd.get_query_pool_results(0, layer_index * 2, results); for (int i = 0; i < layer_index; i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK cmd.reset(); } Mat bottom_blob = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blob.refcount != 1) { bottom_blob = bottom_blob.clone(); } } // forward if (opt.lightmode && layer->support_inplace) { Mat& bottom_top_blob = bottom_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_top_blob, bottom_top_blob, start, end); #else int ret = layer->forward_inplace(bottom_top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = bottom_top_blob; } else { Mat top_blob; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blob, top_blob, opt); double end = get_current_time(); benchmark(layer, bottom_blob, top_blob, start, end); #else int ret = layer->forward(bottom_blob, top_blob, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blob blob_mats[top_blob_index] = top_blob; } } else { // load bottom blobs for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { if (blob_mats_gpu[bottom_blob_index].dims == 0) { int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); if (ret != 0) return ret; } if (blob_mats[bottom_blob_index].dims == 0) { if (blob_mats_gpu_image[bottom_blob_index].dims == 0) { // buffer to host cmd.record_download(blob_mats_gpu[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu[bottom_blob_index].release(); } } else { // image to host cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } } else { // image to host cmd.record_download(blob_mats_gpu_image[bottom_blob_index], blob_mats[bottom_blob_index], opt); if (opt.lightmode) { // delete after taken in light mode blob_mats_gpu_image[bottom_blob_index].release(); } } } } { cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(layer_index * 2); cmd.get_query_pool_results(0, layer_index * 2, results); for (int i = 0; i < layer_index; i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", layers[i]->type.c_str(), layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK cmd.reset(); } std::vector bottom_blobs(layer->bottoms.size()); for (size_t i = 0; i < layer->bottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; bottom_blobs[i] = blob_mats[bottom_blob_index]; if (opt.lightmode) { // delete after taken in light mode blob_mats[bottom_blob_index].release(); // deep copy for inplace forward if data is shared if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { bottom_blobs[i] = bottom_blobs[i].clone(); } } } // forward if (opt.lightmode && layer->support_inplace) { std::vector& bottom_top_blobs = bottom_blobs; #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward_inplace(bottom_top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward_inplace(bottom_top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = bottom_top_blobs[i]; } } else { std::vector top_blobs(layer->tops.size()); #if NCNN_BENCHMARK double start = get_current_time(); int ret = layer->forward(bottom_blobs, top_blobs, opt); double end = get_current_time(); benchmark(layer, start, end); #else int ret = layer->forward(bottom_blobs, top_blobs, opt); #endif // NCNN_BENCHMARK if (ret != 0) return ret; // store top blobs for (size_t i = 0; i < layer->tops.size(); i++) { int top_blob_index = layer->tops[i]; blob_mats[top_blob_index] = top_blobs[i]; } } } } // NCNN_LOGE("forward_layer %d %d %s done", layer->support_vulkan, layer_index, layer->name.c_str()); return 0; } #endif // NCNN_VULKAN Extractor::Extractor(const Net* _net, size_t blob_count) : net(_net) { blob_mats.resize(blob_count); opt = net->opt; #if NCNN_VULKAN if (net->opt.use_vulkan_compute) { local_blob_vkallocator = 0; local_staging_vkallocator = 0; blob_mats_gpu.resize(blob_count); blob_mats_gpu_image.resize(blob_count); } #endif // NCNN_VULKAN } Extractor::~Extractor() { blob_mats.clear(); #if NCNN_VULKAN if (net->opt.use_vulkan_compute) { blob_mats_gpu.clear(); blob_mats_gpu_image.clear(); if (local_blob_vkallocator) { net->vkdev->reclaim_blob_allocator(local_blob_vkallocator); } if (local_staging_vkallocator) { net->vkdev->reclaim_staging_allocator(local_staging_vkallocator); } } #endif // NCNN_VULKAN } void Extractor::set_light_mode(bool enable) { opt.lightmode = enable; } void Extractor::set_num_threads(int num_threads) { opt.num_threads = num_threads; } void Extractor::set_blob_allocator(Allocator* allocator) { opt.blob_allocator = allocator; } void Extractor::set_workspace_allocator(Allocator* allocator) { opt.workspace_allocator = allocator; } #if NCNN_VULKAN void Extractor::set_vulkan_compute(bool enable) { if (net->opt.use_vulkan_compute) { opt.use_vulkan_compute = enable; } else { NCNN_LOGE("set_vulkan_compute failed, network use_vulkan_compute disabled"); } } void Extractor::set_blob_vkallocator(VkAllocator* allocator) { opt.blob_vkallocator = allocator; } void Extractor::set_workspace_vkallocator(VkAllocator* allocator) { opt.workspace_vkallocator = allocator; } void Extractor::set_staging_vkallocator(VkAllocator* allocator) { opt.staging_vkallocator = allocator; } #endif // NCNN_VULKAN #if NCNN_STRING int Extractor::input(const char* blob_name, const Mat& in) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return input(blob_index, in); } int Extractor::extract(const char* blob_name, Mat& feat) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return extract(blob_index, feat); } #endif // NCNN_STRING int Extractor::input(int blob_index, const Mat& in) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; blob_mats[blob_index] = in; return 0; } int Extractor::extract(int blob_index, Mat& feat) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; int old_blocktime = get_kmp_blocktime(); set_kmp_blocktime(opt.openmp_blocktime); int ret = 0; if (blob_mats[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; #if NCNN_VULKAN if (opt.use_vulkan_compute) { // use local allocator if (!opt.blob_vkallocator) { local_blob_vkallocator = net->vkdev->acquire_blob_allocator(); opt.blob_vkallocator = local_blob_vkallocator; } if (!opt.workspace_vkallocator) { opt.workspace_vkallocator = opt.blob_vkallocator; } if (!opt.staging_vkallocator) { local_staging_vkallocator = net->vkdev->acquire_staging_allocator(); opt.staging_vkallocator = local_staging_vkallocator; } ncnn::VkCompute cmd(net->vkdev); #if NCNN_BENCHMARK cmd.create_query_pool(net->layers.size() * 2); #endif // NCNN_BENCHMARK // TODO vkimagemat for adreno if (opt.use_image_storage) { VkImageMat feat_gpu; ret = extract(blob_index, feat_gpu, cmd); if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0) { cmd.record_download(feat_gpu, blob_mats[blob_index], opt); cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(net->layers.size() * 2); cmd.get_query_pool_results(0, net->layers.size() * 2, results); for (size_t i = 0; i < net->layers.size(); i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * net->vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", net->layers[i]->type.c_str(), net->layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK } } else { VkMat feat_gpu; ret = extract(blob_index, feat_gpu, cmd); if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0) { cmd.record_download(feat_gpu, blob_mats[blob_index], opt); cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(net->layers.size() * 2); cmd.get_query_pool_results(0, net->layers.size() * 2, results); for (size_t i = 0; i < net->layers.size(); i++) { uint64_t start = results[i * 2]; uint64_t end = results[i * 2 + 1]; if (start == 0 || end == 0) continue; double duration_us = (end - start) * net->vkdev->info.timestamp_period / 1000; NCNN_LOGE("%-24s %-30s %8.2lfus |", net->layers[i]->type.c_str(), net->layers[i]->name.c_str(), duration_us); } #endif // NCNN_BENCHMARK } } } else { ret = net->forward_layer(layer_index, blob_mats, opt); } #else ret = net->forward_layer(layer_index, blob_mats, opt); #endif // NCNN_VULKAN } feat = blob_mats[blob_index]; if (opt.use_packing_layout) { Mat bottom_blob_unpacked; convert_packing(feat, bottom_blob_unpacked, 1, opt); feat = bottom_blob_unpacked; } // clang-format off // *INDENT-OFF* #if NCNN_ARM82 if (opt.use_fp16_storage && cpu_support_arm_asimdhp()) { if (feat.elemsize / feat.elempack == 2u) { Mat feat_fp32; cast_float16_to_float32(feat, feat_fp32, opt); feat = feat_fp32; } } else #endif // NCNN_ARM82 if (opt.use_bf16_storage) { if (feat.elemsize / feat.elempack == 2u) { Mat feat_fp32; cast_bfloat16_to_float32(feat, feat_fp32, opt); feat = feat_fp32; } } // *INDENT-ON* // clang-format on set_kmp_blocktime(old_blocktime); return ret; } #if NCNN_VULKAN #if NCNN_STRING int Extractor::input(const char* blob_name, const VkMat& in) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return input(blob_index, in); } int Extractor::extract(const char* blob_name, VkMat& feat, VkCompute& cmd) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return extract(blob_index, feat, cmd); } int Extractor::input(const char* blob_name, const VkImageMat& in) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return input(blob_index, in); } int Extractor::extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd) { int blob_index = net->find_blob_index_by_name(blob_name); if (blob_index == -1) return -1; return extract(blob_index, feat, cmd); } #endif // NCNN_STRING int Extractor::input(int blob_index, const VkMat& in) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; blob_mats_gpu[blob_index] = in; return 0; } int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; int ret = 0; if (blob_mats_gpu[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, cmd, opt); } if (blob_mats_gpu[blob_index].dims == 0 && blob_mats_gpu_image[blob_index].dims != 0) { // image to buffer cmd.record_image_to_buffer(blob_mats_gpu_image[blob_index], blob_mats_gpu[blob_index], opt); } feat = blob_mats_gpu[blob_index]; return ret; } int Extractor::input(int blob_index, const VkImageMat& in) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; blob_mats_gpu_image[blob_index] = in; return 0; } int Extractor::extract(int blob_index, VkImageMat& feat, VkCompute& cmd) { if (blob_index < 0 || blob_index >= (int)blob_mats.size()) return -1; int old_blocktime = get_kmp_blocktime(); set_kmp_blocktime(opt.openmp_blocktime); int ret = 0; if (blob_mats_gpu_image[blob_index].dims == 0) { int layer_index = net->blobs[blob_index].producer; ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, blob_mats_gpu_image, cmd, opt); } if (blob_mats_gpu_image[blob_index].dims == 0 && blob_mats_gpu[blob_index].dims != 0) { // buffer to image cmd.record_buffer_to_image(blob_mats_gpu[blob_index], blob_mats_gpu_image[blob_index], opt); } feat = blob_mats_gpu_image[blob_index]; set_kmp_blocktime(old_blocktime); return ret; } #endif // NCNN_VULKAN } // namespace ncnn