You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 18 kB

[WIP] vulkan compute (#618) * vulkan infrastructure * vkallocator and vkmat * layer interface for vulkan compute * wip... * default vulkan device, command wrapper, upload model weight in load_model to simplify layer interface * simplify command api, vkmat holds staging buffer, relu works * initialize specialization constant, simplify command dispatch, fix staging buffer copy with different shape, convolution works * init extension functions * dynamic local size and group count * group count=1 is invalid * regard device max workgroup size limit * fix relu oooops * decouple command record and staging allocation * create result blob * add pooling shader * buffer is faster than image :) * fix pooling shader * add innerproduct shader * readonly writeonly decoration * simplify buffer creation * decouple command and layer, VK_KHR_descriptor_update_template extension makes descriptor binding update easy :D * fix vulkan building issues in visual studio (#1) * fix building issues on visual studio * ignore benchmark * cancel changes * ... ... * decouple paramdict and vulkandevice * fix staging buffer destroy in model loading * remove vkdev member in option * add padding shader * simplify vulkan layer creation, simplify convolution and pooling shader for no padding, less debug output * add convolutiondepthwise and softmax shader * specialization float type, add leakyrelu * add dropout shader * add batchnorm shader * split vulkan forward * add scale shader * push constant type can be int or float * set_optimal_local_size_xyz * add eltwise shader * concat vulkan forward * fix convolution without bias * add dummy shader for concat and split, more fix ... * optional VK_KHR_descriptor_update_template and VK_KHR_push_descriptor * check VK_KHR_push_descriptor for vkCmdPushDescriptorSetWithTemplateKHR * binaryop and unaryop shader * hide raw command buffer * simple vkbenchncnn benchmark * create device with transfer queue * rename command to vkcompute, add vktransfer and layer upload_model interface * external VkMat, copy and map wrt buffer offset * command copy respect offset and size * decouple weight upload and load, simplify upload weight api, use one big staging buffer for uploading weights * fix build on android * binding count can not vary :( * barrier check state, fix sub-op destruction * declare local_size_xyz constant, fix crash on radv * fix local_size_xyz, second try * more barrier and state fix * fix softmax * reconstruct buffer memory allocator, reuse blob buffer, less verbose output * find unified memory type index * weight staging buffer allocator and weight buffer allocator, respect descriptor buffer offset alignment * use VK_KHR_descriptor_update_template for faster descriptor update if available, multithread pipeline creation * find more useful vulkan extensions and enable them * fix msvc build * respect VK_KHR_dedicated_allocation for weight buffer allocation * fix android build * fix bias name conflicts with metal * decouple pipeline and layer, building shader sources into shader module, dedicated create_pipeline api, simplify pipeline recording * drop dummy shader, inplace softmax, multiple shader module works * fix unique queue family index error * flatten support vulkan * mnasnet run * find shader module by name, each entry point per shader module, fix attribute/id conflict on moltenvk * some minor changes * add some high level api * use dedicated transfer queue to upload weight model * prefer mappable buffer on unified memory * global pooling and convolution fc, reuse staging buffer * implement ring-buffer style blob allocator, add VkBufferMemory capacity * use blob allocator for workspace blob, it works fine :) * vulkan option off * Update layer.cpp * fix build with vulkan off * less verbose output, fix crash on vulkan_compute off * merge benchncnn tool * allocator clear api, use new weight buffer allocator per net * add default locked allocator * mapped mat ptr api, persistent mapped memory works generally :) * travis ci linux vulkan * travis ci vulkan wip ... * more gpu wip ... * more gpu wip ... * wip... * wip... * wip... ... * wip... ios vulkan build... * find glslangValidator on ios build * use dynamic moltenvk library * travis ci wip ... * ios simulator does not support metal at all * fix cpu only extractor * optimize workgroup size, first try * optimize workgroup size, second try * conv1x1s1d1 vec4 * revert build system * fix ncnn2mem build * fix ncnn2mem build
7 years ago
7 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution.h"
  15. #include <algorithm>
  16. #include "layer_type.h"
  17. namespace ncnn {
  18. DEFINE_LAYER_CREATOR(Convolution)
  19. Convolution::Convolution()
  20. {
  21. one_blob_only = true;
  22. support_inplace = false;
  23. use_int8_requantize = false;
  24. quantize = 0;
  25. }
  26. int Convolution::load_param(const ParamDict& pd)
  27. {
  28. num_output = pd.get(0, 0);
  29. kernel_w = pd.get(1, 0);
  30. kernel_h = pd.get(11, kernel_w);
  31. dilation_w = pd.get(2, 1);
  32. dilation_h = pd.get(12, dilation_w);
  33. stride_w = pd.get(3, 1);
  34. stride_h = pd.get(13, stride_w);
  35. pad_left = pd.get(4, 0);
  36. pad_right = pd.get(15, pad_left);
  37. pad_top = pd.get(14, pad_left);
  38. pad_bottom = pd.get(16, pad_top);
  39. pad_value = pd.get(18, 0.f);
  40. bias_term = pd.get(5, 0);
  41. weight_data_size = pd.get(6, 0);
  42. int8_scale_term = pd.get(8, 0);
  43. activation_type = pd.get(9, 0);
  44. activation_params = pd.get(10, Mat());
  45. impl_type = pd.get(17, 0);
  46. return 0;
  47. }
  48. int Convolution::load_model(const ModelBin& mb)
  49. {
  50. weight_data = mb.load(weight_data_size, 0);
  51. if (weight_data.empty())
  52. return -100;
  53. if (bias_term)
  54. {
  55. bias_data = mb.load(num_output, 1);
  56. if (bias_data.empty())
  57. return -100;
  58. }
  59. if (int8_scale_term)
  60. {
  61. weight_data_int8_scales = mb.load(num_output, 1);
  62. bottom_blob_int8_scale = mb.load(1, 1)[0];
  63. }
  64. return 0;
  65. }
  66. int Convolution::create_pipeline(const Option& opt)
  67. {
  68. use_int8_inference = opt.use_int8_inference;
  69. if (int8_scale_term == 0)
  70. use_int8_inference = false;
  71. bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
  72. bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
  73. if (weight_data_is_int8 && !use_int8_inference)
  74. {
  75. fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
  76. return -1;
  77. }
  78. // runtime quantize the weight data
  79. if (weight_data_is_float32 && use_int8_inference)
  80. {
  81. // quantize weight to int8
  82. Mat int8_weight_data(weight_data_size, (size_t)1u);
  83. if (int8_weight_data.empty())
  84. return -100;
  85. const int weight_data_size_output = weight_data_size / num_output;
  86. for (int n=0; n<num_output; n++)
  87. {
  88. Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
  89. ncnn::ParamDict pd;
  90. pd.set(0, weight_data_int8_scales[n]);// scale
  91. op->load_param(pd);
  92. op->create_pipeline(opt);
  93. ncnn::Option opt;
  94. opt.blob_allocator = int8_weight_data.allocator;
  95. const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
  96. Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
  97. op->forward(weight_data_n, int8_weight_data_n, opt);
  98. delete op;
  99. }
  100. weight_data = int8_weight_data;
  101. }
  102. // initial the quantize,dequantize op layer
  103. if (use_int8_inference)
  104. {
  105. quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
  106. {
  107. ncnn::ParamDict pd;
  108. pd.set(0, bottom_blob_int8_scale);// scale
  109. quantize->load_param(pd);
  110. quantize->create_pipeline(opt);
  111. }
  112. dequantize_ops.resize(num_output);
  113. for (int n=0; n<num_output; n++)
  114. {
  115. dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);
  116. float top_rescale = 1.f;
  117. if (weight_data_int8_scales[n] == 0)
  118. top_rescale = 0;
  119. else
  120. top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
  121. ncnn::ParamDict pd;
  122. pd.set(0, top_rescale);// scale
  123. pd.set(1, bias_term); // bias_term
  124. pd.set(2, 1); // bias_data_size
  125. dequantize_ops[n]->load_param(pd);
  126. dequantize_ops[n]->create_pipeline(opt);
  127. ncnn::Mat weights[1];
  128. weights[0] = bias_data.range(n, 1);
  129. dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
  130. dequantize_scales.push_back(top_rescale);
  131. }
  132. }
  133. return 0;
  134. }
  135. int Convolution::destroy_pipeline(const Option& opt)
  136. {
  137. if (quantize)
  138. {
  139. quantize->destroy_pipeline(opt);
  140. delete quantize;
  141. quantize = 0;
  142. }
  143. for (int i=0; i<(int)dequantize_ops.size(); i++)
  144. {
  145. dequantize_ops[i]->destroy_pipeline(opt);
  146. delete dequantize_ops[i];
  147. }
  148. dequantize_ops.clear();
  149. for (int i=0; i<(int)requantize_ops.size(); i++)
  150. {
  151. requantize_ops[i]->destroy_pipeline(opt);
  152. delete requantize_ops[i];
  153. }
  154. requantize_ops.clear();
  155. dequantize_scales.clear();
  156. requantize_scales.clear();
  157. return 0;
  158. }
  159. int Convolution::create_requantize_op(void)
  160. {
  161. if (!use_int8_requantize)
  162. {
  163. fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
  164. return -1;
  165. }
  166. requantize_ops.resize(num_output);
  167. for (int n=0; n<num_output; n++)
  168. {
  169. requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize);
  170. float scale_in = 1.f;
  171. float scale_out = 1.f;
  172. if (weight_data_int8_scales[n] == 0)
  173. {
  174. scale_in = 0;
  175. }
  176. else
  177. {
  178. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
  179. }
  180. scale_out = top_blob_int8_scale;
  181. ncnn::ParamDict pd;
  182. pd.set(0, scale_in); // scale in
  183. pd.set(1, scale_out); // scale_out
  184. pd.set(2, bias_term); // bias_term
  185. pd.set(3, 1); // bias_data_size
  186. requantize_ops[n]->load_param(pd);
  187. ncnn::Mat weights[1];
  188. weights[0] = bias_data.range(n, 1);
  189. requantize_ops[n]->load_model(ModelBinFromMatArray(weights));
  190. requantize_scales.push_back(scale_in);
  191. requantize_scales.push_back(scale_out);
  192. }
  193. return 0;
  194. }
  195. int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  196. {
  197. // convolv with NxN kernel
  198. // value = value + bias
  199. // flattened blob, implement as InnerProduct
  200. if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
  201. {
  202. int num_input = weight_data_size / num_output;
  203. if (bottom_blob.w == num_input)
  204. {
  205. // call InnerProduct
  206. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);
  207. // set param
  208. ncnn::ParamDict pd;
  209. pd.set(0, num_output);
  210. pd.set(1, bias_term);
  211. pd.set(2, weight_data_size);
  212. pd.set(8, int8_scale_term);
  213. op->load_param(pd);
  214. // set weights
  215. ncnn::Mat weights[4];
  216. weights[0] = weight_data;
  217. weights[1] = bias_data;
  218. if (int8_scale_term)
  219. {
  220. weights[2] = weight_data_int8_scales;
  221. weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale);
  222. }
  223. op->load_model(ModelBinFromMatArray(weights));
  224. op->create_pipeline(opt);
  225. // forward
  226. op->forward(bottom_blob, top_blob, opt);
  227. delete op;
  228. return 0;
  229. }
  230. }
  231. int w = bottom_blob.w;
  232. int h = bottom_blob.h;
  233. int channels = bottom_blob.c;
  234. size_t elemsize = bottom_blob.elemsize;
  235. // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  236. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  237. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  238. Mat bottom_blob_unbordered = bottom_blob;
  239. if (use_int8_inference && elemsize != 1)
  240. {
  241. Mat bottom_blob_int8;
  242. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  243. if (bottom_blob_int8.empty())
  244. return -100;
  245. // quantize, scale and round to nearest
  246. {
  247. ncnn::Option opt_g = opt;
  248. opt_g.blob_allocator = bottom_blob_int8.allocator;
  249. quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
  250. }
  251. bottom_blob_unbordered = bottom_blob_int8;
  252. }
  253. Mat bottom_blob_bordered = bottom_blob_unbordered;
  254. if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
  255. {
  256. Option opt_b = opt;
  257. opt_b.blob_allocator = opt.workspace_allocator;
  258. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
  259. }
  260. else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
  261. {
  262. // tensorflow padding=SAME or onnx padding=SAME_UPPER
  263. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  264. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  265. if (wpad > 0 || hpad > 0)
  266. {
  267. Option opt_b = opt;
  268. opt_b.blob_allocator = opt.workspace_allocator;
  269. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  270. }
  271. }
  272. else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
  273. {
  274. // onnx padding=SAME_LOWER
  275. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  276. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  277. if (wpad > 0 || hpad > 0)
  278. {
  279. Option opt_b = opt;
  280. opt_b.blob_allocator = opt.workspace_allocator;
  281. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
  282. }
  283. }
  284. if (bottom_blob_bordered.empty())
  285. return -100;
  286. w = bottom_blob_bordered.w;
  287. h = bottom_blob_bordered.h;
  288. int outw = (w - kernel_extent_w) / stride_w + 1;
  289. int outh = (h - kernel_extent_h) / stride_h + 1;
  290. const int maxk = kernel_w * kernel_h;
  291. // kernel offsets
  292. std::vector<int> _space_ofs(maxk);
  293. int* space_ofs = &_space_ofs[0];
  294. {
  295. int p1 = 0;
  296. int p2 = 0;
  297. int gap = w * dilation_h - kernel_w * dilation_w;
  298. for (int i = 0; i < kernel_h; i++)
  299. {
  300. for (int j = 0; j < kernel_w; j++)
  301. {
  302. space_ofs[p1] = p2;
  303. p1++;
  304. p2 += dilation_w;
  305. }
  306. p2 += gap;
  307. }
  308. }
  309. // int8
  310. if (use_int8_inference)
  311. {
  312. if (use_int8_requantize == true)
  313. {
  314. Mat top_blob_tm;
  315. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  316. if (top_blob_tm.empty())
  317. return -100;
  318. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  319. if (top_blob.empty())
  320. return -100;
  321. // num_output
  322. #pragma omp parallel for num_threads(opt.num_threads)
  323. for (int p=0; p<num_output; p++)
  324. {
  325. int* outptr = top_blob_tm.channel(p);
  326. for (int i = 0; i < outh; i++)
  327. {
  328. for (int j = 0; j < outw; j++)
  329. {
  330. int sum = 0;
  331. const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
  332. // channels
  333. for (int q=0; q<channels; q++)
  334. {
  335. const Mat m = bottom_blob_bordered.channel(q);
  336. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  337. for (int k = 0; k < maxk; k++)
  338. {
  339. int val = sptr[ space_ofs[k] ];
  340. int w = kptr[k];
  341. sum += val * w;
  342. }
  343. kptr += maxk;
  344. }
  345. outptr[j] = sum;
  346. }
  347. outptr += outw;
  348. }
  349. // requantize, reverse scale inplace
  350. {
  351. ncnn::Option opt_g = opt;
  352. opt_g.num_threads = 1;
  353. opt_g.blob_allocator = top_blob.allocator;
  354. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  355. Mat top_blob_g = top_blob.channel_range(p, 1);
  356. requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
  357. }
  358. // activation relu
  359. if (activation_type == 1)
  360. {
  361. signed char* outptr_s8 = top_blob.channel(p);
  362. for (int i = 0; i < outh*outw; i++)
  363. {
  364. if (outptr_s8[i] < 0)
  365. outptr_s8[i] = 0;
  366. }
  367. }
  368. }
  369. }
  370. else
  371. {
  372. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  373. if (top_blob.empty())
  374. return -100;
  375. // num_output
  376. #pragma omp parallel for num_threads(opt.num_threads)
  377. for (int p=0; p<num_output; p++)
  378. {
  379. int* outptr = top_blob.channel(p);
  380. for (int i = 0; i < outh; i++)
  381. {
  382. for (int j = 0; j < outw; j++)
  383. {
  384. int sum = 0;
  385. const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
  386. // channels
  387. for (int q=0; q<channels; q++)
  388. {
  389. const Mat m = bottom_blob_bordered.channel(q);
  390. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  391. for (int k = 0; k < maxk; k++)
  392. {
  393. int val = sptr[ space_ofs[k] ];
  394. int w = kptr[k];
  395. sum += val * w;
  396. }
  397. kptr += maxk;
  398. }
  399. outptr[j] = sum;
  400. }
  401. outptr += outw;
  402. }
  403. // dequantize, reverse scale inplace
  404. {
  405. ncnn::Option opt_g = opt;
  406. opt_g.num_threads = 1;
  407. opt_g.blob_allocator = top_blob.allocator;
  408. Mat top_blob_g = top_blob.channel_range(p, 1);
  409. dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
  410. }
  411. // activation relu
  412. if (activation_type == 1)
  413. {
  414. float* outptr_fp32 = top_blob.channel(p);
  415. for (int i = 0; i < outh*outw; i++)
  416. {
  417. outptr_fp32[i] = std::max(outptr_fp32[i], 0.f);
  418. }
  419. }
  420. }
  421. }
  422. return 0;
  423. }
  424. // float32
  425. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  426. if (top_blob.empty())
  427. return -100;
  428. // num_output
  429. #pragma omp parallel for num_threads(opt.num_threads)
  430. for (int p=0; p<num_output; p++)
  431. {
  432. float* outptr = top_blob.channel(p);
  433. for (int i = 0; i < outh; i++)
  434. {
  435. for (int j = 0; j < outw; j++)
  436. {
  437. float sum = 0.f;
  438. if (bias_term)
  439. sum = bias_data[p];
  440. const float* kptr = (const float*)weight_data + maxk * channels * p;
  441. // channels
  442. for (int q=0; q<channels; q++)
  443. {
  444. const Mat m = bottom_blob_bordered.channel(q);
  445. const float* sptr = m.row(i*stride_h) + j*stride_w;
  446. for (int k = 0; k < maxk; k++) // 29.23
  447. {
  448. float val = sptr[ space_ofs[k] ]; // 20.72
  449. float w = kptr[k];
  450. sum += val * w; // 41.45
  451. }
  452. kptr += maxk;
  453. }
  454. if (activation_type == 1)
  455. {
  456. sum = std::max(sum, 0.f);
  457. }
  458. else if (activation_type == 2)
  459. {
  460. float slope = activation_params[0];
  461. sum = sum > 0.f ? sum : sum * slope;
  462. }
  463. else if (activation_type == 3)
  464. {
  465. float min = activation_params[0];
  466. float max = activation_params[1];
  467. if (sum < min)
  468. sum = min;
  469. if (sum > max)
  470. sum = max;
  471. }
  472. else if (activation_type == 4)
  473. {
  474. sum = 1.f / (1.f + exp(-sum));
  475. }
  476. outptr[j] = sum;
  477. }
  478. outptr += outw;
  479. }
  480. }
  481. return 0;
  482. }
  483. } // namespace ncnn