You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_pipecache.cpp 23 kB

10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. // Copyright 2021 Tencent
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. #include "datareader.h"
  4. #include "gpu.h"
  5. #include "mat.h"
  6. #include "net.h"
  7. #include "pipelinecache.h"
  8. #include "testutil.h"
  9. #include <iostream>
  10. #include <chrono>
  11. #include <vector>
  12. class DataReaderFromEmpty : public ncnn::DataReader
  13. {
  14. public:
  15. virtual int scan(const char* format, void* p) const
  16. {
  17. (void)format; // unused
  18. (void)p; // unused
  19. return 0;
  20. }
  21. virtual size_t read(void* buf, size_t size) const
  22. {
  23. memset(buf, 0, size);
  24. return size;
  25. }
  26. };
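  // MobileNetV3 network description in ncnn text param form; passed to
  // load_param_mem() by test_gpu_pipecache_performance().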
  28. static const char* mobilenet_v3_param = R"delimiter(
  29. 7767517
  30. 145 163
  31. Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
  32. Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
  33. Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
  34. HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
  35. BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
  36. Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
  37. ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
  38. Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
  39. BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
  40. Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
  41. ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
  42. Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
  43. Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
  44. Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
  45. ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
  46. Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
  47. BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
  48. Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
  49. ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
  50. Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
  51. Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
  52. InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
  53. InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
  54. HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1
  55. BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
  56. ReLU 377 1 1 376 377 -23330=4,3,28,28,72
  57. Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
  58. Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
  59. Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
  60. ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
  61. Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
  62. Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
  63. InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
  64. InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
  65. HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1
  66. BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
  67. ReLU 413 1 1 412 413 -23330=4,3,28,28,120
  68. Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
  69. BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
  70. Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
  71. Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
  72. ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
  73. Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
  74. Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
  75. InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
  76. InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
  77. HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1
  78. BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
  79. ReLU 450 1 1 449 450 -23330=4,3,28,28,120
  80. Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
  81. BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
  82. Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
  83. HardSwish 461 1 1 454 461 -23330=4,3,28,28,240
  84. ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
  85. HardSwish 469 1 1 462 469 -23330=4,3,14,14,240
  86. Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
  87. Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  88. Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
  89. HardSwish 479 1 1 472 479 -23330=4,3,14,14,200
  90. ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
  91. HardSwish 487 1 1 480 487 -23330=4,3,14,14,200
  92. Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
  93. BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
  94. Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  95. Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
  96. HardSwish 498 1 1 491 498 -23330=4,3,14,14,184
  97. ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
  98. HardSwish 506 1 1 499 506 -23330=4,3,14,14,184
  99. Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
  100. BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
  101. Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
  102. Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
  103. HardSwish 517 1 1 510 517 -23330=4,3,14,14,184
  104. ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
  105. HardSwish 525 1 1 518 525 -23330=4,3,14,14,184
  106. Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
  107. BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
  108. Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
  109. HardSwish 536 1 1 529 536 -23330=4,3,14,14,480
  110. ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
  111. Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
  112. Pooling 545 1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
  113. InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
  114. InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
  115. HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1
  116. BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
  117. HardSwish 572 1 1 566 572 -23330=4,3,14,14,480
  118. Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
  119. Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
  120. Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
  121. HardSwish 582 1 1 575 582 -23330=4,3,14,14,672
  122. ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
  123. Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
  124. Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
  125. InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  126. InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
  127. HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1
  128. BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
  129. HardSwish 618 1 1 612 618 -23330=4,3,14,14,672
  130. Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
  131. BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
  132. Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
  133. HardSwish 629 1 1 622 629 -23330=4,3,14,14,672
  134. ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
  135. Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
  136. Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
  137. InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  138. InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
  139. HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1
  140. BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
  141. HardSwish 665 1 1 659 665 -23330=4,3,14,14,672
  142. Convolution 666 1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
  143. Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
  144. HardSwish 675 1 1 668 675 -23330=4,3,14,14,672
  145. ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
  146. Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
  147. Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
  148. InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
  149. InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
  150. HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1
  151. BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
  152. HardSwish 711 1 1 705 711 -23330=4,3,7,7,672
  153. Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
  154. Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
  155. Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
  156. HardSwish 721 1 1 714 721 -23330=4,3,7,7,960
  157. ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
  158. Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
  159. Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
  160. InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
  161. InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
  162. HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1
  163. BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
  164. HardSwish 757 1 1 751 757 -23330=4,3,7,7,960
  165. Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
  166. BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
  167. Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
  168. HardSwish 768 1 1 761 768 -23330=4,3,7,7,960
  169. Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
  170. HardSwish 775 1 1 769 775 -23330=4,1,960,1,1
  171. Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1
  172. InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
  173. HardSwish 790 1 1 784 790 -23330=4,1,1280,1,1
  174. InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
  175. Softmax prob 1 1 791 output -23330=4,1,1000,1,1
  176. )delimiter";
  177. static int warmup_gpu_pipecache()
  178. {
  179. std::cout << "==================================================" << std::endl;
  180. std::cout << " Warmup: Testing Basic Cache IO " << std::endl;
  181. std::cout << "==================================================" << std::endl;
  182. ncnn::Net net;
  183. net.opt.use_vulkan_compute = true;
  184. net.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
  185. net.load_model((unsigned char*)"");
  186. ncnn::Mat input0 = RandomMat(224, 224);
  187. ncnn::Mat output0;
  188. {
  189. ncnn::Extractor ex = net.create_extractor();
  190. ex.input("input0", input0);
  191. ex.extract("output0", output0);
  192. }
  193. if (output0.empty())
  194. {
  195. std::cerr << "Warmup failed: initial extraction failed." << std::endl;
  196. return -1;
  197. }
  198. const char* cache_path = "./sigmoid_pipecache.bin";
  199. if (net.opt.pipeline_cache->save_cache(cache_path) != 0)
  200. {
  201. std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl;
  202. return -1;
  203. }
  204. std::cout << "Warmup: Pipeline cache saved successfully." << std::endl;
  205. ncnn::Net net2;
  206. net2.opt.use_vulkan_compute = true;
  207. net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device());
  208. net2.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
  209. if (net2.opt.pipeline_cache->load_cache(cache_path) != 0)
  210. {
  211. std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl;
  212. return -1;
  213. }
  214. std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl;
  215. net2.load_model((unsigned char*)"");
  216. ncnn::Mat output0_2;
  217. {
  218. ncnn::Extractor ex2 = net2.create_extractor();
  219. ex2.input("input0", input0);
  220. ex2.extract("output0", output0_2);
  221. }
  222. if (output0_2.empty())
  223. {
  224. std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl;
  225. return -1;
  226. }
  227. if (CompareMat(output0, output0_2, 0.001) != 0)
  228. {
  229. std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl;
  230. return -1;
  231. }
  232. std::cout << "Warmup PASSED: Outputs are identical." << std::endl;
  233. return 0;
  234. }
  235. static int test_gpu_pipecache_performance()
  236. {
  237. ncnn::Mat output_no_cache;
  238. double time_no_cache = 0;
  239. const char* cache_path = "./mobilenet_pipecache.bin";
  240. DataReaderFromEmpty dr;
  241. ncnn::Mat input = RandomMat(224, 224, 3);
  242. // -------------------------------------------------
  243. // 1. Without cache
  244. // -------------------------------------------------
  245. std::cout << "\n==================================================" << std::endl;
  246. std::cout << " Performance Test: Without Pipeline Cache " << std::endl;
  247. std::cout << "==================================================" << std::endl;
  248. {
  249. ncnn::Net net_no_cache;
  250. net_no_cache.opt.use_vulkan_compute = true;
  251. auto start = std::chrono::high_resolution_clock::now();
  252. net_no_cache.load_param_mem(mobilenet_v3_param);
  253. net_no_cache.load_model(dr);
  254. auto end = std::chrono::high_resolution_clock::now();
  255. time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli> >(end - start).count();
  256. std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl;
  257. ncnn::Extractor ex = net_no_cache.create_extractor();
  258. ex.input("data", input);
  259. ex.extract("output", output_no_cache);
  260. if (output_no_cache.empty())
  261. {
  262. std::cerr << "Test failed: extraction without cache failed." << std::endl;
  263. return -1;
  264. }
  265. // save cache
  266. if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0)
  267. {
  268. std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl;
  269. return -1;
  270. }
  271. std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl;
  272. }
  273. // -------------------------------------------------
  274. // 2. With Cache
  275. // -------------------------------------------------
  276. ncnn::Mat output_with_cache;
  277. double time_with_cache = 0;
  278. std::cout << "\n==================================================" << std::endl;
  279. std::cout << " Performance Test: With Pipeline Cache " << std::endl;
  280. std::cout << "==================================================" << std::endl;
  281. {
  282. ncnn::Net net_with_cache;
  283. net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device());
  284. net_with_cache.opt.use_vulkan_compute = true;
  285. auto start = std::chrono::high_resolution_clock::now();
  286. // load from cache
  287. if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0)
  288. {
  289. std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl;
  290. return -1;
  291. }
  292. net_with_cache.load_param_mem(mobilenet_v3_param);
  293. net_with_cache.load_model(dr);
  294. auto end = std::chrono::high_resolution_clock::now();
  295. time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli> >(end - start).count();
  296. std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl;
  297. ncnn::Extractor ex2 = net_with_cache.create_extractor();
  298. ex2.input("data", input);
  299. ex2.extract("output", output_with_cache);
  300. if (output_with_cache.empty())
  301. {
  302. std::cerr << "Test failed: extraction with cache failed." << std::endl;
  303. return -1;
  304. }
  305. }
  306. // -------------------------------------------------
  307. // 3. Verification
  308. // -------------------------------------------------
  309. std::cout << "\n==================================================" << std::endl;
  310. std::cout << " Verification and Summary " << std::endl;
  311. std::cout << "==================================================" << std::endl;
  312. bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0);
  313. std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl;
  314. std::cout << "--------------------------------------------------" << std::endl;
  315. std::cout << "Performance Summary:" << std::endl;
  316. std::cout << " - Without Cache: " << time_no_cache << " ms" << std::endl;
  317. std::cout << " - With Cache: " << time_with_cache << " ms" << std::endl;
  318. if (time_no_cache > 0)
  319. {
  320. double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100;
  321. std::cout << " - Speedup: " << speedup << "%" << std::endl;
  322. }
  323. if (!is_output_same)
  324. {
  325. std::cerr << "\nTest FAILED due to output mismatch." << std::endl;
  326. return -1;
  327. }
  328. std::cout << "\nTest PASSED." << std::endl;
  329. return 0;
  330. }
  331. int main()
  332. {
  333. // warming up
  334. if (warmup_gpu_pipecache() != 0)
  335. {
  336. return -1;
  337. }
  338. return test_gpu_pipecache_performance();
  339. }