You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_pipecache.cpp 24 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "datareader.h"
  15. #include "gpu.h"
  16. #include "mat.h"
  17. #include "net.h"
  18. #include "pipelinecache.h"
  19. #include "testutil.h"
  20. #include <iostream>
  21. #include <chrono>
  22. #include <vector>
  23. // 一个空数据读取器,用于加载模型结构,权重将全部为0
  24. class DataReaderFromEmpty : public ncnn::DataReader
  25. {
  26. public:
  27. virtual int scan(const char* format, void* p) const
  28. {
  29. (void)format; // unused
  30. (void)p; // unused
  31. return 0;
  32. }
  33. virtual size_t read(void* buf, size_t size) const
  34. {
  35. memset(buf, 0, size);
  36. return size;
  37. }
  38. };
  39. // MobileNetV3 network structure parameters as a raw string (input blob "data", final blob "output"); passed to load_param_mem() in test_gpu_pipecache_performance().
  40. static const char* mobilenet_v3_param = R"delimiter(
  41. 7767517
  42. 145 163
  43. Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
  44. Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
  45. Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
  46. HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
  47. BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
  48. Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
  49. ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
  50. Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
  51. BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
  52. Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
  53. ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
  54. Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
  55. Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
  56. Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
  57. ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
  58. Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
  59. BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
  60. Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
  61. ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
  62. Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
  63. Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
  64. InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
  65. InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
  66. HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1
  67. BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
  68. ReLU 377 1 1 376 377 -23330=4,3,28,28,72
  69. Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
  70. Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
  71. Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
  72. ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
  73. Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
  74. Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
  75. InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
  76. InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
  77. HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1
  78. BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
  79. ReLU 413 1 1 412 413 -23330=4,3,28,28,120
  80. Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
  81. BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
  82. Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
  83. Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
  84. ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
  85. Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
  86. Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
  87. InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
  88. InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
  89. HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1
  90. BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
  91. ReLU 450 1 1 449 450 -23330=4,3,28,28,120
  92. Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
  93. BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
  94. Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
  95. HardSwish 461 1 1 454 461 -23330=4,3,28,28,240
  96. ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
  97. HardSwish 469 1 1 462 469 -23330=4,3,14,14,240
  98. Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
  99. Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  100. Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
  101. HardSwish 479 1 1 472 479 -23330=4,3,14,14,200
  102. ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
  103. HardSwish 487 1 1 480 487 -23330=4,3,14,14,200
  104. Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
  105. BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
  106. Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  107. Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
  108. HardSwish 498 1 1 491 498 -23330=4,3,14,14,184
  109. ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
  110. HardSwish 506 1 1 499 506 -23330=4,3,14,14,184
  111. Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
  112. BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
  113. Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  114. Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
  115. HardSwish 517 1 1 510 517 -23330=4,3,14,14,184
  116. ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
  117. HardSwish 525 1 1 518 525 -23330=4,3,14,14,184
  118. Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
  119. BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
  120. Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
  121. HardSwish 536 1 1 529 536 -23330=4,3,14,14,480
  122. ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
  123. Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
  124. Pooling 545 1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
  125. InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
  126. InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
  127. HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1
  128. BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
  129. HardSwish 572 1 1 566 572 -23330=4,3,14,14,480
  130. Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
  131. Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
  132. Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
  133. HardSwish 582 1 1 575 582 -23330=4,3,14,14,672
  134. ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
  135. Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
  136. Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
  137. InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  138. InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
  139. HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1
  140. BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
  141. HardSwish 618 1 1 612 618 -23330=4,3,14,14,672
  142. Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
  143. BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
  144. Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
  145. HardSwish 629 1 1 622 629 -23330=4,3,14,14,672
  146. ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
  147. Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
  148. Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
  149. InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  150. InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
  151. HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1
  152. BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
  153. HardSwish 665 1 1 659 665 -23330=4,3,14,14,672
  154. Convolution 666 1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
  155. Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
  156. HardSwish 675 1 1 668 675 -23330=4,3,14,14,672
  157. ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
  158. Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
  159. Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
  160. InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  161. InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
  162. HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1
  163. BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
  164. HardSwish 711 1 1 705 711 -23330=4,3,7,7,672
  165. Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
  166. Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
  167. Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
  168. HardSwish 721 1 1 714 721 -23330=4,3,7,7,960
  169. ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
  170. Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
  171. Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
  172. InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
  173. InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
  174. HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1
  175. BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
  176. HardSwish 757 1 1 751 757 -23330=4,3,7,7,960
  177. Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
  178. BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
  179. Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
  180. HardSwish 768 1 1 761 768 -23330=4,3,7,7,960
  181. Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
  182. HardSwish 775 1 1 769 775 -23330=4,1,960,1,1
  183. Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1
  184. InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
  185. HardSwish 790 1 1 784 790 -23330=4,1,1280,1,1
  186. InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
  187. Softmax prob 1 1 791 output -23330=4,1,1000,1,1
  188. )delimiter";
  189. /**
  190. * @brief 使用一个简单的 Sigmoid 网络预热并测试 Pipeline Cache 的基本保存和加载功能
  191. * @return 0 on success, -1 on failure
  192. */
  193. static int warmup_gpu_pipecache()
  194. {
  195. std::cout << "==================================================" << std::endl;
  196. std::cout << " Warmup: Testing Basic Cache IO " << std::endl;
  197. std::cout << "==================================================" << std::endl;
  198. // 1. 创建一个网络,运行一次以生成 pipeline
  199. ncnn::Net net;
  200. net.opt.use_vulkan_compute = true;
  201. net.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
  202. net.load_model((unsigned char*)""); // 用于创建 pipeline
  203. ncnn::Mat input0 = RandomMat(224, 224);
  204. ncnn::Mat output0;
  205. {
  206. ncnn::Extractor ex = net.create_extractor();
  207. ex.input("input0", input0);
  208. ex.extract("output0", output0);
  209. }
  210. if (output0.empty())
  211. {
  212. std::cerr << "Warmup failed: initial extraction failed." << std::endl;
  213. return -1;
  214. }
  215. // 2. 保存 pipeline cache
  216. const char* cache_path = "./sigmoid_pipecache.bin";
  217. if (net.opt.pipeline_cache->save_cache(cache_path) != 0)
  218. {
  219. std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl;
  220. return -1;
  221. }
  222. std::cout << "Warmup: Pipeline cache saved successfully." << std::endl;
  223. // 3. 创建第二个网络,加载刚才保存的 cache
  224. ncnn::Net net2;
  225. net2.opt.use_vulkan_compute = true;
  226. net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device());
  227. net2.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
  228. if (net2.opt.pipeline_cache->load_cache(cache_path) != 0)
  229. {
  230. std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl;
  231. return -1;
  232. }
  233. std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl;
  234. net2.load_model((unsigned char*)""); // 创建 pipeline
  235. // 4. 再次推理并验证结果是否一致
  236. ncnn::Mat output0_2;
  237. {
  238. ncnn::Extractor ex2 = net2.create_extractor();
  239. ex2.input("input0", input0);
  240. ex2.extract("output0", output0_2);
  241. }
  242. if (output0_2.empty())
  243. {
  244. std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl;
  245. return -1;
  246. }
  247. if (CompareMat(output0, output0_2, 0.001) != 0)
  248. {
  249. std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl;
  250. return -1;
  251. }
  252. std::cout << "Warmup PASSED: Outputs are identical." << std::endl;
  253. return 0;
  254. }
  255. /**
  256. * @brief 对比使用和不使用 Pipeline Cache 时的模型加载性能
  257. * @return 0 on success, -1 on failure
  258. */
  259. static int test_gpu_pipecache_performance()
  260. {
  261. ncnn::Mat output_no_cache;
  262. double time_no_cache = 0;
  263. const char* cache_path = "./mobilenet_pipecache.bin";
  264. DataReaderFromEmpty dr;
  265. ncnn::Mat input = RandomMat(224, 224, 3);
  266. // -------------------------------------------------
  267. // 1. 不使用 Pipeline Cache (首次加载)
  268. // -------------------------------------------------
  269. std::cout << "\n==================================================" << std::endl;
  270. std::cout << " Performance Test: Without Pipeline Cache " << std::endl;
  271. std::cout << "==================================================" << std::endl;
  272. {
  273. ncnn::Net net_no_cache;
  274. net_no_cache.opt.use_vulkan_compute = true;
  275. auto start = std::chrono::high_resolution_clock::now();
  276. net_no_cache.load_param_mem(mobilenet_v3_param);
  277. net_no_cache.load_model(dr);
  278. auto end = std::chrono::high_resolution_clock::now();
  279. time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
  280. std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl;
  281. // 推理以获得基准输出
  282. ncnn::Extractor ex = net_no_cache.create_extractor();
  283. ex.input("data", input);
  284. ex.extract("output", output_no_cache);
  285. if (output_no_cache.empty())
  286. {
  287. std::cerr << "Test failed: extraction without cache failed." << std::endl;
  288. return -1;
  289. }
  290. // 保存 cache 以供下一步使用
  291. if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0)
  292. {
  293. std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl;
  294. return -1;
  295. }
  296. std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl;
  297. }
  298. // -------------------------------------------------
  299. // 2. 使用 Pipeline Cache (二次加载)
  300. // -------------------------------------------------
  301. ncnn::Mat output_with_cache;
  302. double time_with_cache = 0;
  303. std::cout << "\n==================================================" << std::endl;
  304. std::cout << " Performance Test: With Pipeline Cache " << std::endl;
  305. std::cout << "==================================================" << std::endl;
  306. {
  307. ncnn::Net net_with_cache;
  308. // 必须在加载模型前设置好 cache
  309. net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device());
  310. net_with_cache.opt.use_vulkan_compute = true;
  311. auto start = std::chrono::high_resolution_clock::now();
  312. // 从文件加载 cache
  313. if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0)
  314. {
  315. std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl;
  316. return -1;
  317. }
  318. net_with_cache.load_param_mem(mobilenet_v3_param);
  319. net_with_cache.load_model(dr);
  320. auto end = std::chrono::high_resolution_clock::now();
  321. time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
  322. std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl;
  323. // 推理
  324. ncnn::Extractor ex2 = net_with_cache.create_extractor();
  325. ex2.input("data", input);
  326. ex2.extract("output", output_with_cache);
  327. if (output_with_cache.empty())
  328. {
  329. std::cerr << "Test failed: extraction with cache failed." << std::endl;
  330. return -1;
  331. }
  332. }
  333. // -------------------------------------------------
  334. // 3. 结果验证与总结
  335. // -------------------------------------------------
  336. std::cout << "\n==================================================" << std::endl;
  337. std::cout << " Verification and Summary " << std::endl;
  338. std::cout << "==================================================" << std::endl;
  339. bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0);
  340. std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl;
  341. std::cout << "--------------------------------------------------" << std::endl;
  342. std::cout << "Performance Summary:" << std::endl;
  343. std::cout << " - Without Cache: " << time_no_cache << " ms" << std::endl;
  344. std::cout << " - With Cache: " << time_with_cache << " ms" << std::endl;
  345. if (time_no_cache > 0) {
  346. double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100;
  347. std::cout << " - Speedup: " << speedup << "%" << std::endl;
  348. }
  349. if (!is_output_same)
  350. {
  351. std::cerr << "\nTest FAILED due to output mismatch." << std::endl;
  352. return -1;
  353. }
  354. std::cout << "\nTest PASSED." << std::endl;
  355. return 0;
  356. }
  357. int main()
  358. {
  359. // 运行预热测试,检查基本IO功能
  360. if (warmup_gpu_pipecache() != 0)
  361. {
  362. return -1;
  363. }
  364. // 运行性能对比测试
  365. return test_gpu_pipecache_performance();
  366. }