You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 17 kB

[WIP] vulkan compute (#618) * vulkan infrastructure * vkallocator and vkmat * layer interface for vulkan compute * wip... * default vulkan device, command wrapper, upload model weight in load_model to simplify layer interface * simplify command api, vkmat holds staging buffer, relu works * initialize specialization constant, simplify command dispatch, fix staging buffer copy with different shape, convolution works * init extension functions * dynamic local size and group count * group count=1 is invalid * regard device max workgroup size limit * fix relu oooops * decouple command record and staging allocation * create result blob * add pooling shader * buffer is faster than image :) * fix pooling shader * add innerproduct shader * readonly writeonly decoration * simplify buffer creation * decouple command and layer, VK_KHR_descriptor_update_template extension makes descriptor binding update easy :D * fix vulkan building issues in visual studio (#1) * fix building issues on visual studio * ignore benchmark * cancel changes * ... ... * decouple paramdict and vulkandevice * fix staging buffer destroy in model loading * remove vkdev member in option * add padding shader * simplify vulkan layer creation, simplify convolution and pooling shader for no padding, less debug output * add convolutiondepthwise and softmax shader * specialization float type, add leakyrelu * add dropout shader * add batchnorm shader * split vulkan forward * add scale shader * push constant type can be int or float * set_optimal_local_size_xyz * add eltwise shader * concat vulkan forward * fix convolution without bias * add dummy shader for concat and split, more fix ... * optional VK_KHR_descriptor_update_template and VK_KHR_push_descriptor * check VK_KHR_push_descriptor for vkCmdPushDescriptorSetWithTemplateKHR * binaryop and unaryop shader * hide raw command buffer * simple vkbenchncnn benchmark * create device with transfer queue * rename command to vkcompute, add vktransfer and layer upload_model interface * external VkMat, copy and map wrt buffer offset * command copy respect offset and size * decouple weight upload and load, simplify upload weight api, use one big staging buffer for uploading weights * fix build on android * binding count can not vary :( * barrier check state, fix sub-op destruction * declare local_size_xyz constant, fix crash on radv * fix local_size_xyz, second try * more barrier and state fix * fix softmax * reconstruct buffer memory allocator, reuse blob buffer, less verbose output * find unified memory type index * weight staging buffer allocator and weight buffer allocator, respect descriptor buffer offset alignment * use VK_KHR_descriptor_update_template for faster descriptor update if available, multithread pipeline creation * find more useful vulkan extensions and enable them * fix msvc build * respect VK_KHR_dedicated_allocation for weight buffer allocation * fix android build * fix bias name conflicts with metal * decouple pipeline and layer, building shader sources into shader module, dedicated create_pipeline api, simplify pipeline recording * drop dummy shader, inplace softmax, multiple shader module works * fix unique queue family index error * flatten support vulkan * mnasnet run * find shader module by name, each entry point per shader module, fix attribute/id conflict on moltenvk * some minor changes * add some high level api * use dedicated transfer queue to upload weight model * prefer mappable buffer on unified memory * global pooling and convolution fc, reuse staging buffer * implement ring-buffer style blob allocator, add VkBufferMemory capacity * use blob allocator for workspace blob, it works fine :) * vulkan option off * Update layer.cpp * fix build with vulkan off * less verbose output, fix crash on vulkan_compute off * merge benchncnn tool * allocator clear api, use new weight buffer allocator per net * add default locked allocator * mapped mat ptr api, persistent mapped memory works generally :) * travis ci linux vulkan * travis ci vulkan wip ... * more gpu wip ... * more gpu wip ... * wip... * wip... * wip... ... * wip... ios vulkan build... * find glslangValidator on ios build * use dynamic moltenvk library * travis ci wip ... * ios simulator does not support metal at all * fix cpu only extractor * optimize workgroup size, first try * optimize workgroup size, second try * conv1x1s1d1 vec4 * revert build system * fix ncnn2mem build * fix ncnn2mem build
7 years ago
7 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "convolution.h"
  15. #include <algorithm>
  16. #include "layer_type.h"
  17. namespace ncnn {
  18. DEFINE_LAYER_CREATOR(Convolution)
  19. Convolution::Convolution()
  20. {
  21. one_blob_only = true;
  22. support_inplace = false;
  23. use_int8_requantize = false;
  24. quantize = 0;
  25. }
  26. int Convolution::load_param(const ParamDict& pd)
  27. {
  28. num_output = pd.get(0, 0);
  29. kernel_w = pd.get(1, 0);
  30. kernel_h = pd.get(11, kernel_w);
  31. dilation_w = pd.get(2, 1);
  32. dilation_h = pd.get(12, dilation_w);
  33. stride_w = pd.get(3, 1);
  34. stride_h = pd.get(13, stride_w);
  35. pad_w = pd.get(4, 0);
  36. pad_h = pd.get(14, pad_w);
  37. bias_term = pd.get(5, 0);
  38. weight_data_size = pd.get(6, 0);
  39. int8_scale_term = pd.get(8, 0);
  40. activation_type = pd.get(9, 0);
  41. activation_params = pd.get(10, Mat());
  42. return 0;
  43. }
  44. int Convolution::load_model(const ModelBin& mb)
  45. {
  46. weight_data = mb.load(weight_data_size, 0);
  47. if (weight_data.empty())
  48. return -100;
  49. if (bias_term)
  50. {
  51. bias_data = mb.load(num_output, 1);
  52. if (bias_data.empty())
  53. return -100;
  54. }
  55. if (int8_scale_term)
  56. {
  57. weight_data_int8_scales = mb.load(num_output, 1);
  58. bottom_blob_int8_scale = mb.load(1, 1)[0];
  59. }
  60. return 0;
  61. }
  62. int Convolution::create_pipeline(const Option& opt)
  63. {
  64. Option opt_cpu = opt;
  65. opt_cpu.vulkan_compute = false;
  66. use_int8_inference = opt.use_int8_inference;
  67. if (int8_scale_term == 0)
  68. use_int8_inference = false;
  69. bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
  70. bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
  71. if (weight_data_is_int8 && !use_int8_inference)
  72. {
  73. fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
  74. return -1;
  75. }
  76. // runtime quantize the weight data
  77. if (weight_data_is_float32 && use_int8_inference)
  78. {
  79. // quantize weight to int8
  80. Mat int8_weight_data(weight_data_size, (size_t)1u);
  81. if (int8_weight_data.empty())
  82. return -100;
  83. const int weight_data_size_output = weight_data_size / num_output;
  84. for (int n=0; n<num_output; n++)
  85. {
  86. Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
  87. ncnn::ParamDict pd;
  88. pd.set(0, weight_data_int8_scales[n]);// scale
  89. op->load_param(pd);
  90. op->create_pipeline(opt_cpu);
  91. ncnn::Option opt = ncnn::get_default_option();
  92. opt.blob_allocator = int8_weight_data.allocator;
  93. const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
  94. Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
  95. op->forward(weight_data_n, int8_weight_data_n, opt);
  96. delete op;
  97. }
  98. weight_data = int8_weight_data;
  99. }
  100. // initial the quantize,dequantize op layer
  101. if (use_int8_inference)
  102. {
  103. quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
  104. {
  105. ncnn::ParamDict pd;
  106. pd.set(0, bottom_blob_int8_scale);// scale
  107. quantize->load_param(pd);
  108. quantize->create_pipeline(opt_cpu);
  109. }
  110. dequantize_ops.resize(num_output);
  111. for (int n=0; n<num_output; n++)
  112. {
  113. dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);
  114. float top_rescale = 1.f;
  115. if (weight_data_int8_scales[n] == 0)
  116. top_rescale = 0;
  117. else
  118. top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
  119. ncnn::ParamDict pd;
  120. pd.set(0, top_rescale);// scale
  121. pd.set(1, bias_term); // bias_term
  122. pd.set(2, 1); // bias_data_size
  123. dequantize_ops[n]->load_param(pd);
  124. dequantize_ops[n]->create_pipeline(opt_cpu);
  125. ncnn::Mat weights[1];
  126. weights[0] = bias_data.range(n, 1);
  127. dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
  128. dequantize_scales.push_back(top_rescale);
  129. }
  130. }
  131. return 0;
  132. }
  133. int Convolution::destroy_pipeline(const Option& opt)
  134. {
  135. Option opt_cpu = opt;
  136. opt_cpu.vulkan_compute = false;
  137. if (quantize)
  138. {
  139. quantize->destroy_pipeline(opt_cpu);
  140. delete quantize;
  141. quantize = 0;
  142. }
  143. for (int i=0; i<(int)dequantize_ops.size(); i++)
  144. {
  145. dequantize_ops[i]->destroy_pipeline(opt_cpu);
  146. delete dequantize_ops[i];
  147. }
  148. dequantize_ops.clear();
  149. for (int i=0; i<(int)requantize_ops.size(); i++)
  150. {
  151. requantize_ops[i]->destroy_pipeline(opt_cpu);
  152. delete requantize_ops[i];
  153. }
  154. requantize_ops.clear();
  155. dequantize_scales.clear();
  156. requantize_scales.clear();
  157. return 0;
  158. }
  159. int Convolution::create_requantize_op(void)
  160. {
  161. if (!use_int8_requantize)
  162. {
  163. fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
  164. return -1;
  165. }
  166. requantize_ops.resize(num_output);
  167. for (int n=0; n<num_output; n++)
  168. {
  169. requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize);
  170. float scale_in = 1.f;
  171. float scale_out = 1.f;
  172. if (weight_data_int8_scales[n] == 0)
  173. {
  174. scale_in = 0;
  175. }
  176. else
  177. {
  178. scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
  179. }
  180. scale_out = top_blob_int8_scale;
  181. ncnn::ParamDict pd;
  182. pd.set(0, scale_in); // scale in
  183. pd.set(1, scale_out); // scale_out
  184. pd.set(2, bias_term); // bias_term
  185. pd.set(3, 1); // bias_data_size
  186. requantize_ops[n]->load_param(pd);
  187. ncnn::Mat weights[1];
  188. weights[0] = bias_data.range(n, 1);
  189. requantize_ops[n]->load_model(ModelBinFromMatArray(weights));
  190. requantize_scales.push_back(scale_in);
  191. requantize_scales.push_back(scale_out);
  192. }
  193. return 0;
  194. }
  195. int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
  196. {
  197. // convolv with NxN kernel
  198. // value = value + bias
  199. // flattened blob, implement as InnerProduct
  200. if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
  201. {
  202. int num_input = weight_data_size / num_output;
  203. if (bottom_blob.w == num_input)
  204. {
  205. // call InnerProduct
  206. ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);
  207. // set param
  208. ncnn::ParamDict pd;
  209. pd.set(0, num_output);
  210. pd.set(1, bias_term);
  211. pd.set(2, weight_data_size);
  212. pd.set(8, int8_scale_term);
  213. op->load_param(pd);
  214. // set weights
  215. ncnn::Mat weights[4];
  216. weights[0] = weight_data;
  217. weights[1] = bias_data;
  218. if (int8_scale_term)
  219. {
  220. weights[2] = weight_data_int8_scales;
  221. weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale);
  222. }
  223. op->load_model(ModelBinFromMatArray(weights));
  224. Option opt_cpu = opt;
  225. opt_cpu.vulkan_compute = false;
  226. op->create_pipeline(opt_cpu);
  227. // forward
  228. op->forward(bottom_blob, top_blob, opt);
  229. delete op;
  230. return 0;
  231. }
  232. }
  233. int w = bottom_blob.w;
  234. int h = bottom_blob.h;
  235. int channels = bottom_blob.c;
  236. size_t elemsize = bottom_blob.elemsize;
  237. // fprintf(stderr, "Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d\n", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
  238. const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
  239. const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
  240. Mat bottom_blob_unbordered = bottom_blob;
  241. if (use_int8_inference && elemsize != 1)
  242. {
  243. Mat bottom_blob_int8;
  244. bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
  245. if (bottom_blob_int8.empty())
  246. return -100;
  247. // quantize, scale and round to nearest
  248. {
  249. ncnn::Option opt_g = opt;
  250. opt_g.blob_allocator = bottom_blob_int8.allocator;
  251. quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
  252. }
  253. bottom_blob_unbordered = bottom_blob_int8;
  254. }
  255. Mat bottom_blob_bordered = bottom_blob_unbordered;
  256. if (pad_w > 0 || pad_h > 0)
  257. {
  258. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  259. if (bottom_blob_bordered.empty())
  260. return -100;
  261. w = bottom_blob_bordered.w;
  262. h = bottom_blob_bordered.h;
  263. }
  264. else if (pad_w == -233 && pad_h == -233)
  265. {
  266. int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
  267. int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
  268. if (wpad > 0 || hpad > 0)
  269. {
  270. copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads);
  271. if (bottom_blob_bordered.empty())
  272. return -100;
  273. }
  274. w = bottom_blob_bordered.w;
  275. h = bottom_blob_bordered.h;
  276. }
  277. int outw = (w - kernel_extent_w) / stride_w + 1;
  278. int outh = (h - kernel_extent_h) / stride_h + 1;
  279. const int maxk = kernel_w * kernel_h;
  280. // kernel offsets
  281. std::vector<int> _space_ofs(maxk);
  282. int* space_ofs = &_space_ofs[0];
  283. {
  284. int p1 = 0;
  285. int p2 = 0;
  286. int gap = w * dilation_h - kernel_w * dilation_w;
  287. for (int i = 0; i < kernel_h; i++)
  288. {
  289. for (int j = 0; j < kernel_w; j++)
  290. {
  291. space_ofs[p1] = p2;
  292. p1++;
  293. p2 += dilation_w;
  294. }
  295. p2 += gap;
  296. }
  297. }
  298. // int8
  299. if (use_int8_inference)
  300. {
  301. if (use_int8_requantize == true)
  302. {
  303. Mat top_blob_tm;
  304. top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
  305. if (top_blob_tm.empty())
  306. return -100;
  307. top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
  308. if (top_blob.empty())
  309. return -100;
  310. // num_output
  311. #pragma omp parallel for num_threads(opt.num_threads)
  312. for (int p=0; p<num_output; p++)
  313. {
  314. int* outptr = top_blob_tm.channel(p);
  315. for (int i = 0; i < outh; i++)
  316. {
  317. for (int j = 0; j < outw; j++)
  318. {
  319. int sum = 0;
  320. const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
  321. // channels
  322. for (int q=0; q<channels; q++)
  323. {
  324. const Mat m = bottom_blob_bordered.channel(q);
  325. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  326. for (int k = 0; k < maxk; k++)
  327. {
  328. int val = sptr[ space_ofs[k] ];
  329. int w = kptr[k];
  330. sum += val * w;
  331. }
  332. kptr += maxk;
  333. }
  334. outptr[j] = sum;
  335. }
  336. outptr += outw;
  337. }
  338. // requantize, reverse scale inplace
  339. {
  340. ncnn::Option opt_g = opt;
  341. opt_g.num_threads = 1;
  342. opt_g.blob_allocator = top_blob.allocator;
  343. Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
  344. Mat top_blob_g = top_blob.channel_range(p, 1);
  345. requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
  346. }
  347. }
  348. }
  349. else
  350. {
  351. top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
  352. if (top_blob.empty())
  353. return -100;
  354. // num_output
  355. #pragma omp parallel for num_threads(opt.num_threads)
  356. for (int p=0; p<num_output; p++)
  357. {
  358. int* outptr = top_blob.channel(p);
  359. for (int i = 0; i < outh; i++)
  360. {
  361. for (int j = 0; j < outw; j++)
  362. {
  363. int sum = 0;
  364. const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
  365. // channels
  366. for (int q=0; q<channels; q++)
  367. {
  368. const Mat m = bottom_blob_bordered.channel(q);
  369. const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
  370. for (int k = 0; k < maxk; k++)
  371. {
  372. int val = sptr[ space_ofs[k] ];
  373. int w = kptr[k];
  374. sum += val * w;
  375. }
  376. kptr += maxk;
  377. }
  378. outptr[j] = sum;
  379. }
  380. outptr += outw;
  381. }
  382. // dequantize, reverse scale inplace
  383. {
  384. ncnn::Option opt_g = opt;
  385. opt_g.num_threads = 1;
  386. opt_g.blob_allocator = top_blob.allocator;
  387. Mat top_blob_g = top_blob.channel_range(p, 1);
  388. dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
  389. }
  390. }
  391. }
  392. return 0;
  393. }
  394. // float32
  395. top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
  396. if (top_blob.empty())
  397. return -100;
  398. // num_output
  399. #pragma omp parallel for num_threads(opt.num_threads)
  400. for (int p=0; p<num_output; p++)
  401. {
  402. float* outptr = top_blob.channel(p);
  403. for (int i = 0; i < outh; i++)
  404. {
  405. for (int j = 0; j < outw; j++)
  406. {
  407. float sum = 0.f;
  408. if (bias_term)
  409. sum = bias_data[p];
  410. const float* kptr = (const float*)weight_data + maxk * channels * p;
  411. // channels
  412. for (int q=0; q<channels; q++)
  413. {
  414. const Mat m = bottom_blob_bordered.channel(q);
  415. const float* sptr = m.row(i*stride_h) + j*stride_w;
  416. for (int k = 0; k < maxk; k++) // 29.23
  417. {
  418. float val = sptr[ space_ofs[k] ]; // 20.72
  419. float w = kptr[k];
  420. sum += val * w; // 41.45
  421. }
  422. kptr += maxk;
  423. }
  424. if (activation_type == 1)
  425. {
  426. sum = std::max(sum, 0.f);
  427. }
  428. else if (activation_type == 2)
  429. {
  430. float slope = activation_params[0];
  431. sum = sum > 0.f ? sum : sum * slope;
  432. }
  433. else if (activation_type == 3)
  434. {
  435. float min = activation_params[0];
  436. float max = activation_params[1];
  437. if (sum < min)
  438. sum = min;
  439. if (sum > max)
  440. sum = max;
  441. }
  442. else if (activation_type == 4)
  443. {
  444. sum = 1.f / (1.f + exp(-sum));
  445. }
  446. outptr[j] = sum;
  447. }
  448. outptr += outw;
  449. }
  450. }
  451. return 0;
  452. }
  453. } // namespace ncnn