You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

layernorm_vulkan.cpp 18 kB

10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474
  1. // Copyright 2025 Tencent
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. #include "layernorm_vulkan.h"
  4. #include "layer_shader_type.h"
  5. #include "command.h" // For VkCompute
  6. #include <stdio.h> // For printf
  7. #include <algorithm> // For std::min
  8. namespace ncnn {
  9. // =================================================================================================
  10. // DEBUG HELPER FUNCTION
  11. // This function downloads a VkMat from GPU to CPU and prints its contents.
  12. // WARNING: This is extremely slow and should only be used for debugging.
  13. // =================================================================================================
  14. static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt)
  15. {
  16. return;
  17. if (m.empty())
  18. {
  19. printf("--- %s ---\n", name);
  20. printf("VkMat is empty.\n\n");
  21. return;
  22. }
  23. // Create a CPU Mat with a CPU-compatible allocator to be the destination for the download.
  24. Mat staging_mat;
  25. staging_mat.create_like(m, opt.blob_allocator);
  26. if (staging_mat.empty())
  27. {
  28. NCNN_LOGE("print_vkmat failed to create staging_mat");
  29. return;
  30. }
  31. // Record the download command from the GPU VkMat to the CPU Mat.
  32. cmd.record_download(m, staging_mat, opt);
  33. // Submit and wait for the command to finish.
  34. // This is a blocking call, ensuring data is ready on the CPU side.
  35. cmd.submit_and_wait();
  36. cmd.reset();
  37. Mat cpu_mat;
  38. convert_packing(staging_mat,cpu_mat,1);
  39. printf("--- %s ---\n", name);
  40. printf("Dims: %d, w: %d, h: %d, d: %d, c: %d, cstep: %zu, elemsize: %zu, elempack: %d\n",
  41. m.dims, m.w, m.h, m.d, m.c, m.cstep, m.elemsize, m.elempack);
  42. printf("CPU Dims: %d, w: %d, h: %d, d: %d, c: %d, cstep: %zu, elemsize: %zu, elempack: %d\n",
  43. cpu_mat.dims, cpu_mat.w, cpu_mat.h, cpu_mat.d, cpu_mat.c, cpu_mat.cstep, cpu_mat.elemsize, cpu_mat.elempack);
  44. if (cpu_mat.elemsize == 4u) // float32
  45. {
  46. const float* ptr = cpu_mat;
  47. for (int i = 0; i < cpu_mat.c; i++)
  48. {
  49. printf("cpu_mat[%d]: \n", i);
  50. // 打印矩阵
  51. for (int j = 0; j< cpu_mat.h; j++)
  52. {
  53. for (int k = 0; k< cpu_mat.w;k++)
  54. {
  55. printf("%f ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]);
  56. }
  57. printf("\n");
  58. }
  59. }
  60. }
  61. else if (cpu_mat.elemsize == 2u) // float16 or bfloat16
  62. {
  63. const unsigned short* ptr = cpu_mat;
  64. for (int i = 0; i < cpu_mat.c; i++)
  65. {
  66. printf("cpu_mat[%d]: \n", i);
  67. // 打印矩阵
  68. for (int j = 0; j< cpu_mat.h; j++)
  69. {
  70. for (int k = 0; k< cpu_mat.w;k++)
  71. {
  72. printf("%f ", ncnn::float16_to_float32(ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]));
  73. }
  74. printf("\n");
  75. }
  76. }
  77. }
  78. else if (cpu_mat.elemsize == 1u) // int8
  79. {
  80. const signed char* ptr = cpu_mat;
  81. for (int i = 0; i < cpu_mat.c; i++)
  82. {
  83. printf("cpu_mat[%d]: \n", i);
  84. // 打印矩阵
  85. for (int j = 0; j< cpu_mat.h; j++)
  86. {
  87. for (int k = 0; k< cpu_mat.w;k++)
  88. {
  89. printf("%d ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]);
  90. }
  91. printf("\n");
  92. }
  93. }
  94. }
  95. printf("\n\n");
  96. }
  97. LayerNorm_vulkan::LayerNorm_vulkan()
  98. {
  99. support_vulkan = true;
  100. pipeline_layernorm_reduce_sum4_fp16_to_fp32 = 0;
  101. pipeline_layernorm_reduce_sum4_fp32[0] = 0;
  102. pipeline_layernorm_reduce_sum4_fp32[1] = 0;
  103. pipeline_layernorm_reduce_mean = 0;
  104. pipeline_layernorm_sub_mean_square = 0;
  105. pipeline_layernorm_coeffs = 0;
  106. pipeline_layernorm_norm = 0;
  107. }
  108. int LayerNorm_vulkan::create_pipeline(const Option& opt)
  109. {
  110. std::vector<vk_specialization_type> no_specializations;
  111. pipeline_layernorm_reduce_sum4_fp16_to_fp32 = new Pipeline(vkdev);
  112. pipeline_layernorm_reduce_sum4_fp16_to_fp32->create(LayerShaderType::layernorm_reduce_sum4_fp16_to_fp32, opt, no_specializations);
  113. pipeline_layernorm_reduce_sum4_fp32[0] = new Pipeline(vkdev);
  114. pipeline_layernorm_reduce_sum4_fp32[0]->create(LayerShaderType::layernorm_reduce_sum4_fp32, opt, no_specializations);
  115. pipeline_layernorm_reduce_sum4_fp32[1] = new Pipeline(vkdev);
  116. pipeline_layernorm_reduce_sum4_fp32[1]->create(LayerShaderType::layernorm_reduce_sum4_fp32, opt, no_specializations);
  117. pipeline_layernorm_reduce_mean = new Pipeline(vkdev);
  118. pipeline_layernorm_reduce_mean->create(LayerShaderType::layernorm_reduce_mean, opt, no_specializations);
  119. pipeline_layernorm_sub_mean_square = new Pipeline(vkdev);
  120. pipeline_layernorm_sub_mean_square->create(LayerShaderType::layernorm_sub_mean_square, opt, no_specializations);
  121. std::vector<vk_specialization_type> coeffs_specializations(1);
  122. coeffs_specializations[0].f = eps;
  123. pipeline_layernorm_coeffs = new Pipeline(vkdev);
  124. pipeline_layernorm_coeffs->create(LayerShaderType::layernorm_coeffs, opt, coeffs_specializations);
  125. std::vector<vk_specialization_type> norm_specializations(1);
  126. norm_specializations[0].i = affine;
  127. pipeline_layernorm_norm = new Pipeline(vkdev);
  128. pipeline_layernorm_norm->create(LayerShaderType::layernorm_norm, opt, norm_specializations);
  129. return 0;
  130. }
  131. int LayerNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
  132. {
  133. delete pipeline_layernorm_reduce_sum4_fp16_to_fp32;
  134. pipeline_layernorm_reduce_sum4_fp16_to_fp32 = 0;
  135. delete pipeline_layernorm_reduce_sum4_fp32[0];
  136. delete pipeline_layernorm_reduce_sum4_fp32[1];
  137. pipeline_layernorm_reduce_sum4_fp32[0] = 0;
  138. pipeline_layernorm_reduce_sum4_fp32[1] = 0;
  139. delete pipeline_layernorm_reduce_mean;
  140. pipeline_layernorm_reduce_mean = 0;
  141. delete pipeline_layernorm_sub_mean_square;
  142. pipeline_layernorm_sub_mean_square = 0;
  143. delete pipeline_layernorm_coeffs;
  144. pipeline_layernorm_coeffs = 0;
  145. delete pipeline_layernorm_norm;
  146. pipeline_layernorm_norm = 0;
  147. return 0;
  148. }
  149. int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
  150. {
  151. if (affine == 0)
  152. return 0;
  153. cmd.record_upload(gamma_data, gamma_data_gpu, opt);
  154. cmd.record_upload(beta_data, beta_data_gpu, opt);
  155. return 0;
  156. }
  157. int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const
  158. {
  159. int elemsize_bak = _bottom_top_blob.elemsize;
  160. VkMat bottom_top_blob;
  161. vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt);
  162. int w = bottom_top_blob.w;
  163. int h = bottom_top_blob.h;
  164. int channels = bottom_top_blob.c;
  165. int cstep = bottom_top_blob.cstep;
  166. int dims = bottom_top_blob.dims;
  167. size_t elemsize = bottom_top_blob.elemsize;
  168. if (affine_size == 0)
  169. return 0;
  170. // ================== DEBUG PRINT ==================
  171. print_vkmat(bottom_top_blob, "===> INPUT to LayerNorm <===", cmd, opt);
  172. // ===============================================
  173. int group_size;
  174. int num_groups_per_channel;
  175. if (dims == 1) {
  176. group_size = w;
  177. num_groups_per_channel = 1;
  178. channels = 1;
  179. } else if (dims == 2) {
  180. group_size = w;
  181. num_groups_per_channel = h;
  182. channels = 1;
  183. } else { // dims == 3
  184. if (affine_size == w) {
  185. group_size = w;
  186. num_groups_per_channel = h;
  187. } else { // affine_size == w * h
  188. group_size = w * h;
  189. num_groups_per_channel = 1;
  190. }
  191. }
  192. int num_groups_total = num_groups_per_channel * channels;
  193. VkMat mean_workspace;
  194. mean_workspace.create(num_groups_total, 4u, 1, opt.workspace_vkallocator);
  195. VkMat var_workspace;
  196. var_workspace.create(num_groups_total, 4u, 1, opt.workspace_vkallocator);
  197. // --- 1. CALCULATE MEAN ---
  198. {
  199. int reduced_w = (group_size + 3) / 4;
  200. VkMat sum_workspace;
  201. sum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u, 1, opt.workspace_vkallocator);
  202. std::vector<VkMat> bindings(2);
  203. bindings[0] = bottom_top_blob;
  204. bindings[1] = sum_workspace;
  205. std::vector<vk_constant_type> constants(8);
  206. constants[0].i = group_size;
  207. constants[1].i = num_groups_per_channel;
  208. constants[2].i = channels;
  209. constants[3].i = cstep;
  210. constants[4].i = reduced_w;
  211. constants[5].i = num_groups_per_channel;
  212. constants[6].i = channels;
  213. constants[7].i = sum_workspace.cstep;
  214. VkMat dispatcher;
  215. dispatcher.w = reduced_w;
  216. dispatcher.h = num_groups_per_channel;
  217. dispatcher.c = channels;
  218. int pb = 0;
  219. if (elemsize == 4u) {
  220. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher);
  221. } else {
  222. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher);
  223. }
  224. pb++;
  225. // ================== DEBUG PRINT ==================
  226. print_vkmat(sum_workspace, "1. MEAN: After Initial Reduce", cmd, opt);
  227. // ===============================================
  228. while (sum_workspace.w > 1) {
  229. int current_w = sum_workspace.w;
  230. reduced_w = (current_w + 3) / 4;
  231. VkMat sum_workspace_reduced;
  232. sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u, 1, opt.workspace_vkallocator);
  233. std::vector<VkMat> bindings_iter(2);
  234. bindings_iter[0] = sum_workspace;
  235. bindings_iter[1] = sum_workspace_reduced;
  236. std::vector<vk_constant_type> constants_iter(8);
  237. constants_iter[0].i = current_w;
  238. constants_iter[1].i = num_groups_per_channel;
  239. constants_iter[2].i = channels;
  240. constants_iter[3].i = sum_workspace.cstep;
  241. constants_iter[4].i = reduced_w;
  242. constants_iter[5].i = num_groups_per_channel;
  243. constants_iter[6].i = channels;
  244. constants_iter[7].i = sum_workspace_reduced.cstep;
  245. dispatcher.w = reduced_w;
  246. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings_iter, constants_iter, dispatcher);
  247. pb++;
  248. sum_workspace = sum_workspace_reduced;
  249. // ================== DEBUG PRINT ==================
  250. char msg[100];
  251. sprintf(msg, "1. MEAN: Iterative Reduce Output (w=%d)", sum_workspace.w);
  252. print_vkmat(sum_workspace, msg, cmd, opt);
  253. // ===============================================
  254. }
  255. std::vector<VkMat> mean_bindings(2);
  256. // CRITICAL FIX: Bind separate input (sum_workspace) and output (mean_workspace) buffers.
  257. mean_bindings[0] = sum_workspace;
  258. mean_bindings[1] = mean_workspace;
  259. std::vector<vk_constant_type> mean_constants(5);
  260. mean_constants[0].i = sum_workspace.w;
  261. mean_constants[1].i = num_groups_per_channel;
  262. mean_constants[2].i = channels;
  263. mean_constants[3].i = sum_workspace.cstep;
  264. mean_constants[4].f = (float)group_size;
  265. dispatcher.w = 1;
  266. cmd.record_pipeline(pipeline_layernorm_reduce_mean, mean_bindings, mean_constants, dispatcher);
  267. // ================== DEBUG PRINT ==================
  268. print_vkmat(mean_workspace, "1. MEAN: FINAL mean_workspace", cmd, opt);
  269. // ===============================================
  270. }
  271. // --- 2. CALCULATE VARIANCE ---
  272. {
  273. VkMat square_workspace;
  274. square_workspace.create(w, h, bottom_top_blob.c, elemsize, 1, opt.workspace_vkallocator);
  275. std::vector<VkMat> sq_bindings(3);
  276. sq_bindings[0] = bottom_top_blob;
  277. sq_bindings[1] = mean_workspace;
  278. sq_bindings[2] = square_workspace;
  279. std::vector<vk_constant_type> sq_constants(5);
  280. sq_constants[0].i = w;
  281. sq_constants[1].i = h;
  282. sq_constants[2].i = bottom_top_blob.c;
  283. sq_constants[3].i = cstep;
  284. sq_constants[4].i = affine_size;
  285. cmd.record_pipeline(pipeline_layernorm_sub_mean_square, sq_bindings, sq_constants, square_workspace);
  286. // ================== DEBUG PRINT ==================
  287. print_vkmat(square_workspace, "2. VAR: After sub_mean_square", cmd, opt);
  288. // ===============================================
  289. int reduced_w = (group_size + 3) / 4;
  290. VkMat sqsum_workspace;
  291. sqsum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u, 1, opt.workspace_vkallocator);
  292. std::vector<VkMat> bindings(2);
  293. bindings[0] = square_workspace;
  294. bindings[1] = sqsum_workspace;
  295. std::vector<vk_constant_type> constants(8);
  296. constants[0].i = group_size;
  297. constants[1].i = num_groups_per_channel;
  298. constants[2].i = channels;
  299. constants[3].i = square_workspace.cstep;
  300. constants[4].i = reduced_w;
  301. constants[5].i = num_groups_per_channel;
  302. constants[6].i = channels;
  303. constants[7].i = sqsum_workspace.cstep;
  304. VkMat dispatcher;
  305. dispatcher.w = reduced_w;
  306. dispatcher.h = num_groups_per_channel;
  307. dispatcher.c = channels;
  308. int pb = 0;
  309. if (elemsize == 4u) {
  310. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher);
  311. } else {
  312. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher);
  313. }
  314. pb++;
  315. // ================== DEBUG PRINT ==================
  316. print_vkmat(sqsum_workspace, "2. VAR: After Initial Reduce", cmd, opt);
  317. // ===============================================
  318. while (sqsum_workspace.w > 1) {
  319. int current_w = sqsum_workspace.w;
  320. reduced_w = (current_w + 3) / 4;
  321. VkMat sum_workspace_reduced;
  322. sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u, 1, opt.workspace_vkallocator);
  323. std::vector<VkMat> bindings_iter(2);
  324. bindings_iter[0] = sqsum_workspace;
  325. bindings_iter[1] = sum_workspace_reduced;
  326. std::vector<vk_constant_type> constants_iter(8);
  327. constants_iter[0].i = current_w;
  328. constants_iter[1].i = num_groups_per_channel;
  329. constants_iter[2].i = channels;
  330. constants_iter[3].i = sqsum_workspace.cstep;
  331. constants_iter[4].i = reduced_w;
  332. constants_iter[5].i = num_groups_per_channel;
  333. constants_iter[6].i = channels;
  334. constants_iter[7].i = sum_workspace_reduced.cstep;
  335. dispatcher.w = reduced_w;
  336. cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings_iter, constants_iter, dispatcher);
  337. pb++;
  338. sqsum_workspace = sum_workspace_reduced;
  339. // ================== DEBUG PRINT ==================
  340. char msg[100];
  341. sprintf(msg, "2. VAR: Iterative Reduce Output (w=%d)", sqsum_workspace.w);
  342. print_vkmat(sqsum_workspace, msg, cmd, opt);
  343. // ===============================================
  344. }
  345. std::vector<VkMat> var_bindings(2);
  346. var_bindings[0] = sqsum_workspace;
  347. var_bindings[1] = var_workspace;
  348. std::vector<vk_constant_type> var_constants(5);
  349. var_constants[0].i = sqsum_workspace.w;
  350. var_constants[1].i = num_groups_per_channel;
  351. var_constants[2].i = channels;
  352. var_constants[3].i = sqsum_workspace.cstep;
  353. var_constants[4].f = (float)group_size;
  354. dispatcher.w = 1;
  355. cmd.record_pipeline(pipeline_layernorm_reduce_mean, var_bindings, var_constants, dispatcher);
  356. // ================== DEBUG PRINT ==================
  357. print_vkmat(var_workspace, "2. VAR: FINAL var_workspace", cmd, opt);
  358. // ===============================================
  359. }
  360. // --- 3. CALCULATE COEFFICIENTS (a and b) ---
  361. VkMat coeffs_workspace;
  362. coeffs_workspace.create(num_groups_total * 2, elemsize, 1, opt.workspace_vkallocator);
  363. std::vector<VkMat> coeff_bindings(3);
  364. coeff_bindings[0] = coeffs_workspace;
  365. coeff_bindings[1] = mean_workspace;
  366. coeff_bindings[2] = var_workspace;
  367. std::vector<vk_constant_type> coeff_constants(2);
  368. coeff_constants[0].i = num_groups_per_channel;
  369. coeff_constants[1].i = channels;
  370. VkMat dispatcher_coeffs;
  371. dispatcher_coeffs.w = 1;
  372. dispatcher_coeffs.h = num_groups_per_channel;
  373. dispatcher_coeffs.c = channels;
  374. cmd.record_pipeline(pipeline_layernorm_coeffs, coeff_bindings, coeff_constants, dispatcher_coeffs);
  375. // ================== DEBUG PRINT ==================
  376. print_vkmat(coeffs_workspace, "3. COEFFS: After calculation", cmd, opt);
  377. // ===============================================
  378. // --- 4. APPLY NORMALIZATION ---
  379. std::vector<VkMat> norm_bindings(4);
  380. norm_bindings[0] = bottom_top_blob;
  381. norm_bindings[1] = coeffs_workspace;
  382. norm_bindings[2] = gamma_data_gpu;
  383. norm_bindings[3] = beta_data_gpu;
  384. std::vector<vk_constant_type> norm_constants(5);
  385. norm_constants[0].i = w;
  386. norm_constants[1].i = h;
  387. norm_constants[2].i = bottom_top_blob.c;
  388. norm_constants[3].i = cstep;
  389. norm_constants[4].i = affine_size;
  390. cmd.record_pipeline(pipeline_layernorm_norm, norm_bindings, norm_constants, bottom_top_blob);
  391. // ================== DEBUG PRINT ==================
  392. print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt);
  393. // ===============================================
  394. vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt);
  395. return 0;
  396. }
  397. } // namespace ncnn