You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

get_params.cpp 47 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754
  1. /**
  2. * \file dnn/src/cuda/convolution/get_params.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/cuda/convolution/cudnn_heuristic.h"
  12. #include "megdnn.h"
  13. using namespace megdnn;
  14. using namespace cuda;
  15. using namespace convolution;
  16. bool convolution::heuristic_params_available(
  17. int cuda_major, int cuda_minor, size_t* layer_num_p,
  18. const size_t** layers_dim_p, const float** matrices_p,
  19. const float** biases_p, const float** alpha_p, const float** beta_p,
  20. const ConvolutionType& conv_type, float** hidden_units_p,
  21. float** time_pred_p, float** mask_p) {
  22. MEGDNN_MARK_USED_VAR(cuda_major);
  23. MEGDNN_MARK_USED_VAR(cuda_minor);
  24. MEGDNN_MARK_USED_VAR(layer_num_p);
  25. MEGDNN_MARK_USED_VAR(layers_dim_p);
  26. MEGDNN_MARK_USED_VAR(matrices_p);
  27. MEGDNN_MARK_USED_VAR(biases_p);
  28. MEGDNN_MARK_USED_VAR(alpha_p);
  29. MEGDNN_MARK_USED_VAR(beta_p);
  30. MEGDNN_MARK_USED_VAR(conv_type);
  31. MEGDNN_MARK_USED_VAR(hidden_units_p);
  32. MEGDNN_MARK_USED_VAR(time_pred_p);
  33. MEGDNN_MARK_USED_VAR(mask_p);
  34. #if CUDNN_MAJOR == 6 && CUDNN_MINOR == 0
  35. float cuda5_2_BACKWARD_FILTER_time_pred[7] = {0.0f};
  36. float cuda5_2_BACKWARD_FILTER_mask[7] = {0.0f};
  37. float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f};
  38. const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 7};
  39. const static float cuda5_2_BACKWARD_FILTER_matrices[336] = {
  40. 3.499478e-03, 1.353932e-02, -1.316529e-01, 1.006798e-01,
  41. 1.249662e-02, -3.591197e-01, -4.299506e-01, -3.613592e-01,
  42. -3.783917e-01, -4.249511e-01, 6.287370e-03, -2.861480e-03,
  43. 3.128614e-03, 8.496360e-03, 5.568272e-01, 1.965293e-01,
  44. -6.205962e-02, -1.999864e-01, 9.333656e-03, -6.377945e-02,
  45. 6.122595e-02, 1.122032e-01, -1.683744e-02, -9.395520e-02,
  46. -2.953549e-02, -2.772853e-02, -2.892097e-02, 3.200796e-03,
  47. 5.553298e-03, 6.707606e-01, 3.111190e-01, -5.293804e-01,
  48. -8.127835e-02, -5.839296e-02, 9.633666e-02, 5.957389e-02,
  49. -7.131222e-02, 4.057650e-02, 4.311656e-02, -1.456163e-02,
  50. 5.683148e-02, 6.175192e-02, 9.331264e-02, 9.957494e-02,
  51. 5.202487e-02, 0.0, 0.0, -7.725500e-14,
  52. -8.058319e-17, 0.0, 0.0, 0.0,
  53. 0.0, 0.0, 1.988015e-04, -1.530555e-01,
  54. 3.629641e-03, -1.238047e-03, 1.692593e-02, 3.404703e-01,
  55. 5.441420e-01, -3.275000e-01, -3.742920e-01, -1.714999e-01,
  56. 1.979161e-02, 5.019676e-02, 1.406423e-02, -4.360787e-02,
  57. -5.948093e-03, 1.522342e-01, 1.012455e-02, 5.666151e-02,
  58. -7.033888e-05, 1.519375e-02, -2.360136e-02, -5.682724e-04,
  59. -2.552732e-02, 2.329080e-01, 3.437024e-01, 4.054402e-01,
  60. 3.379739e-01, 1.566344e-03, 3.172801e-02, -1.336258e-02,
  61. 1.401075e-02, 2.876163e-02, 1.293039e+00, 7.118387e-01,
  62. 2.966451e-01, 4.372724e-01, -2.286311e-02, -6.896693e-03,
  63. 3.156468e-02, 3.829155e-02, -9.890525e-04, 1.836302e-02,
  64. 2.394343e-02, 4.963258e-02, 4.368515e-02, 2.950634e-03,
  65. 1.129842e-02, 7.078686e-01, 3.193808e-01, 9.759862e-03,
  66. 2.906150e-01, 1.806232e-01, 1.396071e-01, 2.047469e-01,
  67. -2.561368e-01, -3.322504e-01, 7.250011e-02, -3.389789e-02,
  68. -1.372720e-02, 0.0, -1.690562e-01, -1.013354e-01,
  69. -1.920926e-02, 1.018956e-01, 2.467915e-02, 4.451101e-02,
  70. -4.139300e-02, -1.031867e-02, -5.686982e-03, 2.993172e-01,
  71. 1.746564e-02, -3.393853e-20, 1.905611e-02, -5.220098e-02,
  72. 4.550828e-02, 8.211702e-02, -2.850403e-03, -2.816908e-01,
  73. 6.826700e-02, -1.102444e-02, 7.373374e-03, 9.173237e-03,
  74. -6.144243e-03, 0.0, -1.675391e-02, 2.949211e-02,
  75. -1.925736e-02, 2.259453e-02, 6.339108e-02, -1.233638e-01,
  76. -1.239254e-02, -9.204817e-03, -6.979109e-02, -2.015045e-02,
  77. -1.624232e-02, 0.0, 8.557694e-02, -2.066801e-02,
  78. 2.876340e-01, -1.265177e-01, 7.225822e-03, 7.337274e-02,
  79. -4.342360e-02, -1.974944e-01, -6.721890e-03, -4.495411e-02,
  80. -3.655335e-02, 0.0, -4.551398e-01, 8.440251e-02,
  81. -2.404170e-01, 1.250752e-01, 1.646416e-03, 9.063166e-02,
  82. 2.506036e-02, 8.455078e-03, -1.908465e-02, 6.791655e-03,
  83. 2.511951e-02, 0.0, 7.265597e-03, -1.285137e-03,
  84. -3.404747e-04, 8.924944e-03, 4.234224e-03, -1.186513e-02,
  85. 2.454471e-02, 9.120111e-04, 2.120904e-02, -5.555666e-03,
  86. -1.493565e-02, 0.0, 2.764972e-03, -6.132948e-04,
  87. 6.180623e-03, 3.238724e-03, -1.073131e-02, -1.342798e-04,
  88. 8.969568e-02, 1.010931e-01, -1.038349e-02, -9.198243e-02,
  89. 4.724314e-02, 0.0, 1.175188e-02, -6.051729e-02,
  90. -2.525244e-03, -1.566657e-01, -1.447370e-02, 1.747005e-01,
  91. 1.078679e-01, 2.556116e-01, 3.880575e-02, 9.777729e-03,
  92. 1.078563e-01, 0.0, 4.525005e-01, 8.311278e-03,
  93. 8.198996e-02, -2.884443e-01, -1.808732e-02, -3.114621e-02,
  94. 1.732809e-02, 2.442103e-01, 3.329617e-02, 8.462872e-03,
  95. 6.775563e-02, -7.453864e-19, 1.846050e-01, 2.739331e-02,
  96. 1.029433e-01, -2.251960e-01, 3.331415e-02, -2.261097e-02,
  97. 3.815529e-02, -5.755350e-02, -8.908589e-03, -4.526101e-02,
  98. 1.555560e-02, 0.0, 2.347023e-02, -1.399980e-01,
  99. -2.699343e-02, 2.168779e-02, 2.629133e-03, 3.232189e-02,
  100. 3.693172e-02, -9.767429e-02, 2.461806e-02, 1.045579e-01,
  101. 5.808600e-02, 0.0, -1.331031e-02, 3.555656e-03,
  102. -9.530113e-02, -1.961061e-02, -1.579800e-02, -7.582582e-02,
  103. -3.099381e-02, 9.698432e-01, 7.805698e-01, 1.542833e-01,
  104. -1.025053e-01, -7.509316e-04, -1.675225e-02, -7.818724e-03,
  105. -2.718012e-01, 8.506276e-01, 3.869322e-02, 2.732933e-02,
  106. -4.932691e-02, 7.077541e-01, 4.385699e-01, 8.550947e-02,
  107. -1.737943e-01, -1.007005e-02, 1.884576e-02, 6.328513e-02,
  108. -2.711761e-01, 1.054725e+00, -1.001195e-02, 6.876359e-02,
  109. -4.647969e-01, 7.618478e-01, 1.170148e+00, 5.507177e-02,
  110. -1.284761e-01, 2.255174e-02, 5.041638e-03, 2.431494e-01,
  111. -2.259419e-01, 6.318219e-01, 4.526694e-02, -1.068190e-01,
  112. 9.181661e-05, 7.900977e-01, 5.499427e-01, 2.147153e-02,
  113. -1.855706e-01, -6.816355e-03, 2.600182e-02, 7.784649e-02,
  114. -2.902775e-01, 9.821153e-01, -1.705817e-02, 9.162355e-02,
  115. 0.0, 0.0, 0.0, 0.0,
  116. 0.0, 0.0, 0.0, 0.0,
  117. 0.0, 0.0, 0.0, 0.0,
  118. 4.378970e-02, 7.106545e-01, 5.512720e-01, 1.076883e-01,
  119. -3.036375e-01, 4.190212e-02, -1.192542e-02, 1.002918e-01,
  120. -2.498885e-01, 6.789825e-01, -1.278644e-01, 8.962566e-02,
  121. -4.231460e-02, 2.334089e-01, 3.083326e-03, 2.404322e-02,
  122. -2.668908e-01, 3.057625e-03, -1.283901e-03, 1.349618e-02,
  123. -4.993697e-02, 6.061308e-01, -9.689163e-02, 1.609056e-01};
  124. const static float cuda5_2_BACKWARD_FILTER_biases[31] = {
  125. 3.927711e-02, 4.658543e-01, 3.737917e-02, -4.173907e-02,
  126. 6.516264e-04, 0.0, 1.141180e+00, 5.656777e-03,
  127. -1.466774e-01, -3.637813e-01, 3.348432e-02, -2.374912e-01,
  128. 1.856181e-01, 1.458818e+00, 1.436140e+00, 1.708800e-01,
  129. 3.663654e-02, 2.147604e-02, 5.249544e-02, 9.389526e-02,
  130. -7.182905e-02, 2.513293e+00, -6.255527e-02, -1.452608e-01,
  131. -7.379941e-01, -5.884537e-01, -6.324590e-01, -6.180407e-01,
  132. 0.0, -1.712828e-01, -2.353933e-01};
  133. const static float cuda5_2_BACKWARD_FILTER_alpha[7] = {
  134. 2.189385e+08, 1.987406e+08, 6.368552e+07, 2.164986e+08,
  135. 2.000000e+08, 3.611623e+08, 8.509315e+06};
  136. const static float cuda5_2_BACKWARD_FILTER_beta[7] = {
  137. 1.558573e+00, 1.825239e+00, 1.782366e+00, 1.772095e+00,
  138. 2.000000e+00, 1.856787e+00, 1.625270e+00};
  139. float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f};
  140. float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f};
  141. float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f};
  142. const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6};
  143. const static float cuda5_2_BACKWARD_DATA_matrices[324] = {
  144. 1.090385e-03, -9.525486e-02, 2.116694e-02, 8.324536e-03,
  145. 2.443915e-03, -1.486993e-03, 1.996945e-01, -3.490458e-02,
  146. -2.909729e-01, -4.403929e-01, 3.302580e-03, 8.758115e-03,
  147. 2.016278e-03, 5.139519e-03, 6.631530e-01, 4.163170e-01,
  148. -2.275565e-01, -1.927734e-01, 4.901680e-02, 3.499708e-02,
  149. 4.430823e-02, -6.245822e-01, 2.489910e-02, 2.943479e-01,
  150. 3.011928e-01, -6.154800e-02, -6.945755e-02, 2.156114e-02,
  151. -2.706529e-02, 2.254039e-02, -2.130969e-01, -1.711698e-03,
  152. 3.185264e-01, 3.669779e-01, 2.366176e-01, 2.016553e-01,
  153. 1.742197e-04, 8.993217e-04, -3.757331e-01, -1.517802e-01,
  154. 1.150989e-03, 4.397022e-01, 2.472478e-01, -5.120142e-01,
  155. -5.310764e-01, -2.185705e-02, -1.019608e-02, -1.484592e-01,
  156. -1.720972e-01, 3.073631e-02, 1.679189e-02, 9.030435e-03,
  157. -4.171251e-03, -7.412981e-03, 3.670006e-02, 2.704583e-02,
  158. 1.162922e-01, 8.629673e-02, -1.661878e-01, -1.722751e-01,
  159. -2.494859e-01, 6.303188e-02, 2.379866e-03, -9.154570e-02,
  160. -8.703206e-02, 3.478937e-02, 2.733189e-02, -6.598901e-02,
  161. -2.212522e-02, -3.853705e-02, 2.827537e-02, 2.944724e-02,
  162. 1.588451e-02, 2.663488e-02, 1.933236e-02, 3.978135e-02,
  163. 1.509624e-02, 1.144023e+00, 7.680039e-01, 4.072323e-01,
  164. 3.243737e-01, 4.177893e-02, 4.054888e-02, 1.758260e-01,
  165. 1.351026e-01, 2.773634e-02, 8.728213e-02, 1.938261e-01,
  166. -1.641249e-02, 7.889663e-02, 4.266707e-04, 6.022587e-04,
  167. 6.884130e-04, 2.244700e-04, -3.188357e-01, 1.903596e-01,
  168. 3.979538e-01, -2.875198e-01, -5.881550e-01, -1.732513e-02,
  169. 2.107770e-02, -2.415357e-02, 5.184836e-02, 2.633666e-03,
  170. -4.351313e-01, -3.523280e-01, -1.124865e-01, -5.509025e-02,
  171. -2.874137e-03, -2.260433e-03, 5.087418e-03, 2.825674e-03,
  172. 4.565214e-03, 1.520132e-03, -1.722531e-03, -1.287867e-04,
  173. 1.223576e-03, -5.230475e-04, -2.300250e-03, -6.684309e-03,
  174. -7.956048e-03, -3.028432e-03, 2.238011e-02, -1.166453e-02,
  175. 6.994838e-02, 5.585106e-03, -9.814836e-03, -4.010206e-03,
  176. -3.232308e-03, -1.020571e-02, -1.587651e-02, 6.942352e-02,
  177. 6.370817e-01, 5.906755e-02, -3.062441e-03, 9.914325e-02,
  178. 2.335527e-01, -4.718621e-03, -2.132248e-02, 3.841487e-02,
  179. 7.563891e-02, -7.599686e-02, 1.408871e-01, 5.740594e-02,
  180. 1.902002e-01, 2.145507e-01, 3.427162e-02, 3.367433e-02,
  181. 2.967569e-01, 2.863470e-02, 3.392174e-02, 3.514072e-02,
  182. -1.441963e-01, -4.797359e-02, -5.965770e-03, 1.214167e-01,
  183. 0.0, 0.0, -4.498340e-06, -1.828862e-07,
  184. 0.0, 1.093948e-12, -2.601859e-06, 0.0,
  185. -9.811162e-09, -2.785148e-06, 0.0, -2.360134e-27,
  186. -1.110723e-01, -1.570218e-01, -4.062234e-02, -7.606770e-02,
  187. 5.144730e-01, 9.398572e-02, 1.906881e-01, 1.747961e-02,
  188. 1.106279e-01, -1.254419e-01, 6.205062e-01, -5.617496e-02,
  189. -1.629532e-01, -1.042091e-01, -1.413646e-01, 1.433934e-01,
  190. 1.425548e-01, 2.505819e-02, 5.484238e-04, -9.254320e-02,
  191. 1.448994e-01, 3.132954e-02, -1.425708e-01, -1.685494e-02,
  192. -3.513211e-01, -1.992232e-01, -1.081804e-01, 4.960524e-02,
  193. -5.546688e-01, -1.675645e-02, -3.610602e-02, 2.780567e-02,
  194. 2.227647e-01, 4.038066e-02, -6.002745e-01, -1.275032e-01,
  195. -1.026016e-01, -2.635376e-01, 2.059869e-02, -8.100250e-02,
  196. 8.695480e-02, -4.293829e-02, -1.870224e-02, 7.269356e-02,
  197. 3.979762e-02, 3.270284e-02, 1.190808e-01, -1.059370e-01,
  198. 1.286611e-02, 3.927987e-02, 7.228687e-03, 2.264480e-02,
  199. -1.119717e-01, 8.701903e-02, 2.064170e-02, 5.297894e-02,
  200. 9.965703e-03, 1.206108e-02, -5.411500e-02, -5.476563e-02,
  201. -1.837980e-01, -7.351980e-01, -1.781217e-01, 1.473823e-01,
  202. -4.530039e-01, -3.604104e-02, 2.418269e-02, 2.903621e-02,
  203. 4.367216e-01, -5.112789e-02, -3.706729e-01, -2.049569e-01,
  204. -9.153855e-02, -1.008104e-01, -1.009935e-02, -1.033947e-01,
  205. 5.495172e-02, 1.323372e-02, -5.191914e-02, -1.545710e-02,
  206. 3.271207e-02, 1.939050e-02, -3.092350e-02, 7.518642e-02,
  207. -5.528467e-03, 8.568556e-02, 1.924936e-02, 1.007434e+00,
  208. -6.850208e-07, 5.599304e-01, 3.076834e-01, -4.312680e-01,
  209. 7.534813e-02, -3.293671e-02, 5.830373e-03, -2.450454e-02,
  210. -3.698347e-04, -8.712796e-03, 4.009782e-01, 1.215293e+00,
  211. -5.273760e-07, 2.344936e-01, 1.927198e-01, -3.006327e-01,
  212. -2.927265e-02, -8.696410e-03, -2.446414e-02, 1.890189e-02,
  213. 3.553152e-03, -1.651816e-02, 2.438239e-01, 6.245783e-01,
  214. 1.809883e-07, 3.264363e-01, 7.772639e-01, -2.954962e-01,
  215. 2.704587e-02, -3.836469e-02, -4.457633e-01, 1.726713e-02,
  216. 5.172309e-03, 1.289187e-02, 5.472647e-01, 6.243305e-01,
  217. -4.123602e-08, 4.334479e-01, 7.573158e-02, -2.572208e-01,
  218. 5.492910e-02, -9.502222e-03, -2.104075e-01, -3.131663e-02,
  219. 2.312713e-03, 3.963990e-02, 4.713630e-01, 8.256559e-01,
  220. -2.583514e-08, 4.528451e-01, 7.318445e-02, -2.987004e-01,
  221. 8.577114e-02, -2.907754e-02, -5.389895e-02, 8.495960e-02,
  222. -1.558219e-04, 3.880079e-02, 4.180317e-01, 5.884213e-01,
  223. 3.963620e-07, 4.769594e-01, 3.800152e-01, -3.191836e-01,
  224. -1.669163e-01, 8.362461e-04, -1.668053e-01, -9.146041e-02};
  225. const static float cuda5_2_BACKWARD_DATA_biases[30] = {
  226. 1.238052e+00, 7.745910e-01, 3.356679e-01, -7.175566e-02,
  227. 1.497247e+00, 3.300638e-03, 2.789130e-01, -8.312362e-02,
  228. -7.829870e-02, -3.456568e-01, 1.328189e+00, -2.689771e-01,
  229. 9.444705e-03, -1.149580e-01, 4.422197e-01, 2.072980e+00,
  230. 0.0, 4.782698e-01, -1.116326e+00, 7.193607e-01,
  231. 2.938375e-02, 1.465170e-02, 8.513468e-02, 6.830001e-02,
  232. 4.035618e-01, 1.607704e-01, 9.502214e-01, 6.022118e-01,
  233. 2.584324e-01, 7.981322e-01};
  234. const static float cuda5_2_BACKWARD_DATA_alpha[6] = {
  235. 1.997689e+08, 3.799992e+08, 6.843723e+07, 1.140762e+08,
  236. 5.562133e+08, 3.324116e+08};
  237. const static float cuda5_2_BACKWARD_DATA_beta[6] = {
  238. 1.537834e+00, 1.587649e+00, 1.844705e+00, 1.671656e+00,
  239. 1.672516e+00, 1.705950e+00};
  240. float cuda5_2_FORWARD_time_pred[8] = {0.0f};
  241. float cuda5_2_FORWARD_mask[8] = {0.0f};
  242. float cuda5_2_FORWARD_hidden_units[24] = {0.0f};
  243. const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 8};
  244. const static float cuda5_2_FORWARD_matrices[348] = {
  245. -9.209032e-02, -1.659105e-01, -5.965192e-02, -2.153863e-02,
  246. 8.719379e-02, -3.499233e-02, 7.201853e-03, -1.419160e-02,
  247. -1.818457e-04, -3.145495e-01, 1.526620e-03, -3.928741e-03,
  248. -2.569693e-03, 3.410484e-03, 2.167806e-01, 1.747067e-01,
  249. -2.598841e-01, -3.055519e-01, 5.274500e-04, -9.025287e-03,
  250. -2.483256e-02, 4.541647e-02, 7.308841e-02, -4.819591e-01,
  251. -4.753071e-01, -1.471946e-02, 5.257137e-03, 2.392092e-03,
  252. -1.222254e-02, 1.609546e-02, -3.770980e-03, 1.646060e-02,
  253. 1.753314e-02, 1.508273e-02, 9.316003e-03, -5.777596e-04,
  254. -2.694935e-05, 1.604315e-03, -1.762570e-02, -4.887820e-01,
  255. 4.957791e-03, 2.363977e-01, 3.638881e-01, -4.731908e-01,
  256. -5.269557e-01, -1.159047e-03, 1.838379e-02, -1.427773e-01,
  257. -1.495254e-01, 1.330812e-01, 3.283872e-01, 3.582126e-01,
  258. -1.175109e-01, -1.454948e-01, 2.369200e-02, 1.493328e-02,
  259. 3.108240e-02, 3.270133e-02, -6.615507e-01, 3.380858e-01,
  260. 3.704230e-01, 8.769190e-02, -6.377754e-02, 4.325379e-02,
  261. -2.027540e-03, -1.402376e-01, -9.008316e-02, -2.559709e-03,
  262. -8.711295e-02, -9.627704e-02, -1.539383e-01, -1.632525e-01,
  263. 3.015039e-02, 3.144164e-02, 6.656437e-02, 5.488716e-02,
  264. 1.877632e-01, 5.748791e-01, 3.917130e-01, 2.071713e-01,
  265. 2.771358e-01, -5.960735e-02, 1.106716e-02, 5.781374e-02,
  266. 6.840285e-03, 2.902341e-02, -3.347534e-01, -1.212164e-01,
  267. -8.089989e-02, -1.384973e-01, 1.251527e-02, -2.644526e-01,
  268. 6.949010e-02, 2.681785e-02, 1.081700e-01, -3.502952e-02,
  269. 3.512865e-01, -9.033766e-02, 2.017496e-02, 2.095562e-02,
  270. 1.330583e-02, 2.582395e-02, -2.550245e-03, -1.596605e-03,
  271. -4.966798e-01, -5.384876e-01, -3.006902e-01, -2.735094e-01,
  272. 2.044184e-02, 3.490414e-01, 1.717040e-02, 6.914880e-03,
  273. 1.496788e-02, -7.078647e-02, 6.652176e-02, 6.768194e-03,
  274. -3.086404e-02, 1.317981e-01, -5.902661e-02, -8.681632e-02,
  275. -6.622906e-02, 1.597742e-01, 3.700355e-03, 1.707309e-02,
  276. -5.229016e-02, 2.836531e-02, 9.072421e-03, -1.104825e-01,
  277. 1.009224e-02, -1.915519e-02, -2.592222e-02, -9.112109e-02,
  278. -2.824950e-02, 5.274639e-01, 1.052709e-01, 1.325189e-02,
  279. 3.486569e-01, 1.155336e-01, 7.854062e-02, 1.637263e-02,
  280. -1.599528e-01, 1.090762e-01, 2.625560e-02, 8.724683e-02,
  281. 3.858089e-02, -5.696925e-01, -2.280933e-01, -3.096054e-02,
  282. -5.547203e-01, -6.229282e-02, -1.009606e-01, 5.365341e-02,
  283. 1.673071e-01, -1.734997e-01, -2.949879e-02, -2.640804e-01,
  284. 4.783161e-02, -4.411741e-01, -1.495569e-01, -1.043236e-02,
  285. -2.952088e-01, -2.866718e-02, 4.253592e-02, 3.828135e-02,
  286. 7.448777e-02, -2.757399e-02, -6.067163e-02, -2.007495e-01,
  287. -3.468005e-02, -1.678551e-01, -2.086982e-02, -2.114448e-02,
  288. -2.844830e-02, 3.823385e-03, 8.453450e-03, 1.447659e-03,
  289. 5.760803e-02, 7.803936e-02, -7.363023e-02, -1.894736e-03,
  290. 6.325649e-02, 1.527100e-02, -4.378622e-02, 3.171223e-03,
  291. 8.858634e-01, 7.191087e-02, 2.045580e-01, -3.890414e-03,
  292. -7.661989e-02, 2.667563e-02, -2.549908e-02, -9.384236e-02,
  293. -4.146666e-02, 2.281848e-01, 7.052436e-02, 1.180828e-03,
  294. 1.976338e-01, 1.647339e-02, -2.741527e-02, 1.641885e-02,
  295. -1.197201e-01, -3.670282e-02, 1.672286e-01, 5.267144e-02,
  296. 8.803396e-02, 4.463083e-01, -8.939818e-03, 4.523633e-03,
  297. -1.554685e-01, -1.392173e-02, 4.290194e-03, -9.498623e-03,
  298. -2.200229e-02, -1.022839e-01, 1.553784e-02, 4.006403e-02,
  299. -8.901481e-02, 1.353742e-01, -6.176645e-02, 2.818892e-03,
  300. 4.842044e-02, 1.031219e-02, 4.689164e-03, 2.677023e-01,
  301. -1.331718e-02, 2.130043e-01, 7.004514e-03, -5.422973e-01,
  302. 7.450043e-03, 4.017003e-01, -9.216257e-03, -2.551504e-02,
  303. -2.416791e-01, -1.451814e-01, -1.796521e-01, -1.749250e-02,
  304. 9.023457e-02, 9.444007e-02, -5.293583e-03, -1.027239e-01,
  305. 1.017421e-02, 1.213706e-01, -3.460192e-02, 8.999067e-03,
  306. -1.110771e-01, 2.168397e-01, -4.417743e-02, 8.891370e-02,
  307. -1.271863e-01, -7.239018e-02, -1.346174e-02, 5.777563e-02,
  308. 7.088694e-02, 6.467304e-02, 7.867605e-01, -2.014701e-01,
  309. 1.461604e-01, -6.571004e-02, 6.528026e-01, 6.720600e-01,
  310. 4.151264e-01, -6.271046e-03, -1.568682e-02, 2.438027e-01,
  311. 6.112317e-02, 8.257028e-02, 8.817917e-01, -1.995129e-01,
  312. 9.260281e-02, -6.511735e-02, 6.132895e-01, 5.789503e-01,
  313. 3.354024e-01, 1.621681e-04, -1.380093e-02, 2.710598e-01,
  314. 1.104726e-01, 5.625401e-02, 7.417016e-01, -2.523506e-01,
  315. 1.436054e-01, -7.903862e-02, 5.858161e-01, 4.287509e-01,
  316. 5.370684e-01, -9.449220e-02, -9.393471e-03, 3.037375e-01,
  317. 0.0, 0.0, 0.0, 0.0,
  318. 0.0, 0.0, 0.0, 0.0,
  319. 0.0, 0.0, 0.0, 0.0,
  320. 4.773019e-01, -2.101818e-02, 2.934896e-01, -4.207794e-01,
  321. -2.892000e-01, -1.383682e-02, 3.842597e-01, 5.408122e-01,
  322. -1.901669e-01, -5.255229e-02, 3.103573e-01, 7.447528e-01,
  323. 1.010295e-01, 5.580491e-03, 4.166604e-01, -2.997382e-01,
  324. -3.115629e-01, -2.585651e-02, 5.481771e-01, 6.307158e-01,
  325. 4.869811e-01, 6.668988e-01, -8.661555e-02, 6.073793e-01,
  326. 6.002924e-03, 1.855917e-02, 5.628079e-01, -1.967446e-01,
  327. -1.365761e-01, -3.095432e-02, 6.461580e-01, 7.712716e-01,
  328. 4.082011e-01, 8.834770e-02, -1.021050e-01, 4.353123e-01,
  329. 2.292985e-01, -6.493770e-02, 2.730630e-01, -3.267927e-01,
  330. -3.408634e-01, -6.609171e-02, 5.608538e-01, 7.108021e-01,
  331. 3.760323e-01, 3.335001e-01, 8.168215e-02, 2.858790e-01};
  332. const static float cuda5_2_FORWARD_biases[32] = {
  333. -1.021053e-02, 1.398318e+00, -2.447664e-01, 2.701163e-02,
  334. 1.148165e+00, 6.030037e-01, 2.089586e-01, 5.609234e-02,
  335. -4.842668e-01, 1.262153e-01, 2.643087e-01, 3.539835e-01,
  336. 4.792117e-01, 4.310244e-02, 1.993983e+00, 2.597207e-01,
  337. -2.811204e-01, 7.933383e-02, 1.056050e+00, 1.234862e+00,
  338. 7.894841e-01, 2.019784e-01, -1.216166e-01, 8.840314e-01,
  339. -3.542692e-01, -3.693904e-01, -2.181383e-01, 0.0,
  340. -2.216420e-01, -1.602890e-01, 8.500483e-03, 2.072607e-01};
  341. const static float cuda5_2_FORWARD_alpha[8] = {
  342. 2.549612e+08, 3.579459e+08, 1.927015e+08, 2.000000e+08,
  343. 3.222185e+07, 8.748824e+07, 6.676129e+08, 2.775480e+08};
  344. const static float cuda5_2_FORWARD_beta[8] = {
  345. 1.463412e+00, 1.553222e+00, 1.515109e+00, 2.000000e+00,
  346. 2.117807e+00, 1.622262e+00, 1.626601e+00, 1.669380e+00};
  347. if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 &&
  348. cuda_minor == 2) {
  349. *layer_num_p = 4;
  350. *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units;
  351. *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim;
  352. *matrices_p = cuda5_2_BACKWARD_FILTER_matrices;
  353. *biases_p = cuda5_2_BACKWARD_FILTER_biases;
  354. *alpha_p = cuda5_2_BACKWARD_FILTER_alpha;
  355. *beta_p = cuda5_2_BACKWARD_FILTER_beta;
  356. *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred;
  357. *mask_p = cuda5_2_BACKWARD_FILTER_mask;
  358. } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 &&
  359. cuda_minor == 2) {
  360. *layer_num_p = 4;
  361. *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units;
  362. *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim;
  363. *matrices_p = cuda5_2_BACKWARD_DATA_matrices;
  364. *biases_p = cuda5_2_BACKWARD_DATA_biases;
  365. *alpha_p = cuda5_2_BACKWARD_DATA_alpha;
  366. *beta_p = cuda5_2_BACKWARD_DATA_beta;
  367. *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred;
  368. *mask_p = cuda5_2_BACKWARD_DATA_mask;
  369. } else if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 &&
  370. cuda_minor == 2) {
  371. *layer_num_p = 4;
  372. *hidden_units_p = cuda5_2_FORWARD_hidden_units;
  373. *layers_dim_p = cuda5_2_FORWARD_layers_dim;
  374. *matrices_p = cuda5_2_FORWARD_matrices;
  375. *biases_p = cuda5_2_FORWARD_biases;
  376. *alpha_p = cuda5_2_FORWARD_alpha;
  377. *beta_p = cuda5_2_FORWARD_beta;
  378. *time_pred_p = cuda5_2_FORWARD_time_pred;
  379. *mask_p = cuda5_2_FORWARD_mask;
  380. } else {
  381. return false;
  382. }
  383. return true;
  384. #endif
  385. #if CUDNN_MAJOR == 5 && CUDNN_MINOR == 1
  386. float cuda5_2_FORWARD_time_pred[9] = {0.0f};
  387. float cuda5_2_FORWARD_mask[9] = {0.0f};
  388. float cuda5_2_FORWARD_hidden_units[24] = {0.0f};
  389. const static size_t cuda5_2_FORWARD_layers_dim[4] = {9, 12, 12, 9};
  390. const static float cuda5_2_FORWARD_matrices[360] = {
  391. 3.087359e-03, -2.629997e-01, 9.492566e-02, 4.831330e-02,
  392. 4.493726e-02, -3.714851e-04, 8.981445e-02, -4.888808e-02,
  393. -7.350665e-02, -7.113249e-01, 2.111573e-02, 6.259846e-02,
  394. 2.931650e-02, 1.313162e-01, 1.926165e-02, 3.785147e-01,
  395. 1.765169e-01, 6.096475e-02, 4.104461e-03, 8.656193e-03,
  396. 1.102456e-02, 7.944959e-03, 4.644261e-02, -5.927094e-01,
  397. -6.180425e-01, -4.314502e-01, -4.073743e-01, 3.077646e-02,
  398. -1.029431e-01, 5.112506e-02, -8.541957e-02, 2.589677e-02,
  399. -5.164597e-02, 1.186986e-01, -4.672555e-02, -6.755380e-02,
  400. -2.806628e-04, 1.056535e-02, -1.438679e-01, -1.122842e-01,
  401. 5.779694e-02, 1.705828e-01, 3.862250e-01, -1.106681e-01,
  402. -5.471609e-02, -2.316525e-02, -4.610147e-02, 2.021985e-03,
  403. -5.761939e-03, 1.209045e-01, -7.279532e-02, 9.754839e-02,
  404. -6.032932e-02, -1.589997e-02, 1.985070e-03, 2.788936e-03,
  405. -2.104690e-01, -2.731634e-01, 1.189841e-02, 2.144678e-01,
  406. 1.771111e-01, -3.730702e-01, -3.886393e-01, -4.719765e-06,
  407. -2.289832e-22, 0.0, 0.0, -7.619362e-33,
  408. 0.0, 0.0, 0.0, 0.0,
  409. 1.652513e-02, 2.785243e-02, 6.713332e-02, 3.292293e-02,
  410. -7.087571e-01, 2.954406e-01, 2.942279e-01, 2.148153e-01,
  411. 9.042904e-02, 3.337476e-02, 5.262762e-02, 1.355991e-01,
  412. 6.802084e-02, 3.188081e-01, 1.053071e+00, 5.648708e-01,
  413. 3.254285e-01, 3.829584e-01, -3.902937e-02, 8.569189e-04,
  414. -6.860779e-03, -1.342737e-02, 9.002463e-04, 2.672171e-01,
  415. 1.833601e-02, -4.791870e-02, -4.673452e-01, -5.951233e-04,
  416. 1.327156e-02, 4.884608e-04, -6.395956e-04, -1.247312e-02,
  417. 2.616015e-03, 2.045540e-02, 1.826517e-02, 2.752957e-02,
  418. 4.864566e-03, 1.974226e-01, 8.022508e-02, 8.533795e-02,
  419. 7.867660e-02, 1.206522e-02, 1.408663e-01, 8.814420e-29,
  420. 2.803104e-02, -1.190598e-01, 4.397753e-01, 2.351956e-03,
  421. 2.934275e-02, 1.909389e-02, -1.119068e-01, -5.117084e-02,
  422. 6.178805e-03, -1.955722e-03, -4.881141e-02, 0.0,
  423. -5.396824e-02, 1.768444e-02, -1.764243e-01, -1.029730e-02,
  424. 3.943393e-02, -1.397969e-02, 9.628724e-02, -4.312754e-02,
  425. -1.602866e-01, -1.405657e-02, 1.331697e-01, 0.0,
  426. -2.396953e-02, 1.866630e-02, 3.267511e-02, -6.928004e-03,
  427. 7.034376e-02, -6.569391e-02, -1.199368e-01, 2.414189e-02,
  428. 3.878685e-02, 1.612695e-02, -9.410737e-02, 2.452490e-33,
  429. -3.085373e-02, 1.452446e-02, 5.175281e-02, -2.379139e-02,
  430. -5.039049e-02, 1.873454e-02, 9.242059e-02, -1.805802e-02,
  431. -4.347714e-02, -3.853900e-02, 1.008241e-01, 0.0,
  432. -9.480388e-03, 2.023331e-02, -6.792901e-03, -8.394149e-03,
  433. -7.546303e-02, 6.270129e-03, -3.894017e-01, -4.973264e-02,
  434. -1.555514e-01, -1.105092e-02, -1.950841e-01, -1.148950e-25,
  435. -2.661943e-02, 9.485362e-02, -4.270326e-01, 7.918665e-03,
  436. -1.816450e-01, -4.379404e-02, -3.889270e-02, -1.432468e-02,
  437. 1.501353e-02, -3.272457e-02, -1.477906e-01, 0.0,
  438. -1.104928e-01, 3.061369e-02, -1.783103e-01, -4.144012e-03,
  439. -1.341517e-02, -8.905338e-02, -2.880624e-01, -4.843873e-02,
  440. -8.718476e-02, -4.244976e-02, -5.811334e-02, 8.169911e-07,
  441. 3.018601e-01, -6.359625e-02, -6.384093e-02, -2.376516e-03,
  442. 1.381678e-01, 5.480919e-03, -1.754923e-02, 1.902135e-02,
  443. 1.838670e-01, 1.829514e-02, 9.986089e-01, 0.0,
  444. -3.820317e-02, -8.010733e-02, 2.023727e-01, -8.899641e-03,
  445. -6.265503e-02, 2.848809e-01, -6.972601e-02, 9.673467e-02,
  446. -6.779978e-02, -1.749464e-02, -1.618047e-01, 0.0,
  447. 5.618134e-03, -7.931516e-02, -7.710180e-01, -5.023658e-03,
  448. 2.721053e-02, 2.372581e-03, 1.131147e-01, 3.923619e-02,
  449. 1.188756e-01, 6.569220e-02, 3.954504e-02, 4.407177e-06,
  450. 3.772899e-02, -7.408679e-02, 2.722764e-01, 9.289873e-03,
  451. -1.720112e-08, -1.111527e-10, -3.223340e-33, 0.0,
  452. 0.0, 0.0, 0.0, 0.0,
  453. -3.947499e-10, -1.125618e-07, 0.0, 0.0,
  454. 7.252669e-01, -2.573835e-02, -3.086479e-03, 1.373577e-02,
  455. -2.595616e-02, -1.071919e-01, -1.039699e-01, 4.686809e-01,
  456. 6.939601e-01, 5.092673e-02, 8.983605e-01, 7.748492e-12,
  457. 7.637465e-01, -5.160391e-02, 4.367014e-03, 5.456513e-03,
  458. -1.755392e-02, -1.141231e-01, -9.624086e-02, 4.324957e-01,
  459. 7.202701e-01, 5.805269e-02, 8.917692e-01, 5.552060e-13,
  460. 6.970178e-01, -1.570065e-01, 3.382218e-02, -2.513156e-02,
  461. -1.520863e-02, -1.164639e-01, -1.687423e-01, 4.522114e-01,
  462. 5.808989e-01, 5.248518e-02, 8.544105e-01, 9.402750e-15,
  463. 0.0, 0.0, 0.0, 0.0,
  464. 0.0, 0.0, 0.0, 0.0,
  465. 0.0, 0.0, 0.0, 0.0,
  466. 4.326442e-01, -5.917080e-02, 2.801385e-01, -2.795843e-02,
  467. 1.264143e-02, -3.693263e-01, -1.749216e-01, 2.439530e-01,
  468. 5.274415e-01, 6.522411e-01, 2.642505e-01, -1.186306e-22,
  469. 4.592337e-01, -3.818674e-02, 1.983223e-02, -3.099717e-02,
  470. 3.941813e-02, -5.257453e-01, -3.692166e-02, 2.670639e-01,
  471. 6.403314e-01, 5.740828e-01, 2.307071e-01, -6.111520e-19,
  472. 5.923415e-01, -1.620244e-01, -6.315269e-03, 1.360147e-01,
  473. 3.776298e-02, -2.748910e-01, -9.679949e-02, 3.612375e-01,
  474. 6.582589e-01, 1.544350e-01, 8.423274e-01, 0.0,
  475. 4.770435e-01, -3.441220e-02, 7.110235e-02, 1.750984e-01,
  476. -1.088923e-01, -3.269669e-01, -3.097497e-01, 3.498318e-01,
  477. 6.162855e-01, 5.070065e-01, 4.478149e-01, 0.0,
  478. 0.0, 0.0, 0.0, 0.0,
  479. 0.0, 0.0, 0.0, 0.0,
  480. 0.0, 0.0, 0.0, 0.0};
  481. const static float cuda5_2_FORWARD_biases[33] = {
  482. 2.354680e-01, 4.575782e-01, 6.988282e-01, 2.040031e-01,
  483. 8.584012e-01, 8.249553e-02, 1.267146e+00, 0.0,
  484. 3.549752e-01, -4.857582e-01, 1.279055e+00, 6.212520e-03,
  485. 1.735605e+00, 1.737882e-01, 9.513135e-02, 1.042232e-01,
  486. 2.587379e-02, 1.125817e-01, 4.899196e-01, 8.571400e-01,
  487. 1.188120e+00, 1.079335e+00, 1.945481e+00, 0.0,
  488. -4.535237e-01, -4.646283e-01, -2.796752e-01, 0.0,
  489. -1.881813e-01, 6.431429e-02, 1.600823e-01, 3.773381e-01,
  490. 0.0};
  491. const static float cuda5_2_FORWARD_alpha[9] = {
  492. 2.371974e+08, 3.625653e+08, 1.961586e+08, 2.000000e+08,
  493. 2.259449e+07, 1.865459e+07, 6.657476e+08, 2.487226e+08,
  494. 2.000000e+08};
  495. const static float cuda5_2_FORWARD_beta[9] = {
  496. 1.575003e+00, 1.656241e+00, 1.577959e+00, 2.000000e+00,
  497. 2.396584e+00, 2.221534e+00, 1.692119e+00, 1.879424e+00,
  498. 2.000000e+00};
  499. float cuda5_2_BACKWARD_DATA_time_pred[6] = {0.0f};
  500. float cuda5_2_BACKWARD_DATA_mask[6] = {0.0f};
  501. float cuda5_2_BACKWARD_DATA_hidden_units[24] = {0.0f};
  502. const static size_t cuda5_2_BACKWARD_DATA_layers_dim[4] = {9, 12, 12, 6};
  503. const static float cuda5_2_BACKWARD_DATA_matrices[324] = {
  504. 8.340252e-04, -7.066309e-02, 6.012942e-03, -8.961015e-04,
  505. 5.308781e-02, 8.890389e-03, -1.695608e-02, -2.008141e-01,
  506. -2.327795e-01, 1.816323e-03, 1.741969e-03, -4.547063e-01,
  507. -3.278293e-01, 3.194534e-03, 5.590135e-01, 5.038606e-01,
  508. -6.899682e-01, -6.846661e-01, 1.296691e-02, 9.286657e-03,
  509. 6.076815e-02, 9.537656e-03, -1.845960e-01, 2.334390e-01,
  510. 6.584574e-02, -1.502425e-01, -1.464556e-01, 2.582188e-02,
  511. -2.801069e-01, 2.606461e-01, 5.094615e-02, 9.973006e-03,
  512. -2.273075e-01, 1.013311e-01, -2.977537e-01, -3.584019e-01,
  513. 1.550467e-02, -2.365348e-02, -2.361028e-01, -4.535604e-01,
  514. -1.099842e-01, 3.337491e-02, 3.386805e-02, 5.759778e-02,
  515. 5.773445e-02, -6.057084e-03, -5.215100e-03, -2.488342e-02,
  516. 4.550520e-01, -6.358563e-03, -4.111410e-01, -2.748287e-01,
  517. 6.576765e-01, 6.735925e-01, 1.382121e-02, 1.599379e-02,
  518. 2.175570e-01, 4.235858e-01, -4.743209e-03, 8.406488e-01,
  519. 5.463328e-01, 5.315352e-01, 5.759005e-01, -3.956826e-01,
  520. 1.770215e-03, 4.242290e-03, 5.961310e-03, 2.629623e-03,
  521. 3.968062e-01, 2.857247e-01, -3.694852e-01, -4.826791e-01,
  522. -1.361759e-01, 1.741970e-02, 2.067235e-01, -3.166322e-02,
  523. 1.676094e-02, 1.222352e-01, 3.594849e-01, 5.646787e-02,
  524. 9.237770e-02, 2.705673e-02, 3.022863e-02, 2.661669e-01,
  525. 1.342065e-01, 9.685011e-02, -4.619106e-01, -4.885407e-01,
  526. -1.207667e-01, -3.344076e-02, 1.247313e-03, 9.397045e-04,
  527. 3.326222e-03, 2.384325e-03, -5.191239e-01, 3.588830e-01,
  528. 5.642326e-01, -2.458584e-01, -6.050721e-01, -5.983715e-04,
  529. -3.112906e-04, -8.002273e-02, 2.754113e-01, 1.347607e-01,
  530. 2.869407e-01, 3.228108e-01, 2.589051e-01, 2.689373e-01,
  531. 2.097373e-03, -1.213292e-03, 2.289704e-02, 2.260412e-02,
  532. -4.001153e-03, -3.886382e-02, 1.744227e-02, 1.228004e-03,
  533. 5.637321e-02, 5.326664e-03, 5.775909e-02, -7.129682e-02,
  534. 2.957929e-02, -3.619472e-02, -7.687800e-02, 2.551496e-01,
  535. 2.791522e-02, -1.290575e-01, 7.948833e-02, 9.349618e-02,
  536. 4.568452e-03, -2.620651e-01, 9.037835e-03, 1.652229e-01,
  537. -1.035363e-02, -4.924298e-01, -1.359403e-01, -2.509044e-02,
  538. 6.072426e-02, -1.067680e-01, 9.075266e-02, -5.669300e-01,
  539. -5.016208e-02, -4.982992e-03, -4.493951e-01, 2.403491e-02,
  540. -5.795595e-03, 8.214971e-02, 1.994753e-03, 2.271867e-03,
  541. 8.008438e-03, -1.517102e-01, -2.790549e-02, 7.735109e-02,
  542. -1.794875e-02, 1.122736e-02, -4.320173e-02, -9.230874e-03,
  543. -4.703557e-02, -3.043727e-02, -1.645634e-01, -6.124438e-02,
  544. 2.416326e-01, -2.548371e-01, 2.711228e-01, 2.171408e-01,
  545. -1.613229e-02, -1.133995e-01, -5.881115e-01, 1.196182e-01,
  546. -1.574013e-02, -2.309249e-02, -9.163861e-02, -1.243609e-03,
  547. 2.755058e-03, -8.981592e-02, 4.023712e-02, 1.447185e-01,
  548. 1.773491e-02, -4.728686e-02, 4.132702e-02, 4.325303e-02,
  549. 9.868489e-02, -2.594438e-01, 1.111406e-02, 5.278649e-02,
  550. -5.842348e-02, 7.532353e-02, -3.890866e-02, 7.389170e-03,
  551. -8.200553e-02, -2.977651e-04, 2.846818e-01, -2.641009e-02,
  552. -3.923972e-06, 1.683590e-06, 4.231356e-06, -1.460619e-05,
  553. 1.480699e-05, -4.800242e-05, -3.605007e-05, 4.642337e-06,
  554. -1.237117e-05, -6.065346e-05, 1.122525e-07, -4.718931e-05,
  555. -4.836941e-02, 2.925190e-02, 5.125062e-02, -8.673830e-02,
  556. 4.049347e-02, -1.281789e-01, 4.054615e-02, -1.102404e-01,
  557. 1.797214e-02, 8.068577e-03, 9.849558e-02, 2.462221e-02,
  558. -3.952334e-02, 7.078841e-02, 5.095275e-03, -5.172743e-03,
  559. 1.358633e-01, -4.528875e-01, 4.454420e-01, -5.941349e-01,
  560. -8.203693e-02, -2.733144e-01, -4.668098e-01, 2.087940e-01,
  561. 2.732850e-01, 1.967585e-01, -1.648116e-02, -4.675763e-02,
  562. -2.471467e-02, -3.507713e-02, 1.268763e-01, -4.777270e-04,
  563. -6.884494e-02, -4.142293e-02, 4.568305e-01, -1.171813e-01,
  564. 4.104385e-02, 4.123072e-01, 1.201161e-01, 5.688429e-02,
  565. -6.769225e-02, 1.879334e-01, -1.869847e-01, 2.116578e-01,
  566. 1.023851e-01, -7.956885e-03, 3.125194e-02, -3.698255e-02,
  567. -1.742767e-02, 8.019327e-02, -2.414790e-01, 1.692867e-01,
  568. -1.363161e-01, -2.991336e-02, 1.571377e-01, -4.675832e-05,
  569. 3.410926e-02, -2.423313e-02, 3.784683e-01, 8.980562e-01,
  570. 1.445573e-02, 3.742977e-01, -1.449231e-01, 5.250753e-02,
  571. -9.320556e-02, 1.881413e-01, 1.525415e-01, 1.516415e-05,
  572. -2.865472e-02, -4.090607e-02, 1.368707e-01, 1.152067e+00,
  573. 3.926153e-02, 3.892255e-01, -3.988812e-01, 2.768721e-01,
  574. 1.682807e-01, -8.165011e-02, 2.984257e-01, -2.310482e-05,
  575. -1.301168e-01, -3.295192e-01, 1.955211e-01, 6.782165e-01,
  576. -1.859493e-02, 5.047321e-01, -3.545281e-01, 6.802614e-01,
  577. -2.701511e-02, 5.938844e-02, 1.288360e-01, 6.412582e-05,
  578. 6.354152e-02, -2.929806e-01, 1.172161e-01, 5.812020e-01,
  579. -1.526828e-03, 4.311178e-01, -1.572772e-01, 3.847064e-01,
  580. -1.406437e-01, -8.771673e-02, 1.723672e-01, -2.926565e-05,
  581. 1.170990e-01, -1.168602e-01, 2.353766e-01, 8.977429e-01,
  582. 1.029375e-02, 4.529134e-01, -3.884215e-01, 2.041353e-01,
  583. -2.684749e-02, 9.474846e-02, 1.718571e-01, 9.999280e-06,
  584. -9.272413e-02, -1.050809e-01, 2.637663e-01, 6.296775e-01};
  585. const static float cuda5_2_BACKWARD_DATA_biases[30] = {
  586. 2.758991e-01, 9.040871e-01, 6.578859e-01, 3.464146e-01,
  587. -1.074793e-01, -1.111640e+00, -4.436951e-03, 1.027522e+00,
  588. 5.782945e-02, -6.986979e-02, 1.183250e+00, -9.289587e-02,
  589. 2.339573e-03, 2.321955e-01, 6.579675e-01, 9.597613e-01,
  590. 4.900812e-02, 1.206250e-01, 1.320550e-01, 1.839768e-17,
  591. 1.678722e-01, -3.203184e-03, 7.736452e-01, 2.727852e+00,
  592. 1.589646e-01, -3.824490e-02, 5.180550e-01, 7.756407e-01,
  593. 4.521459e-01, 4.122442e-01};
  594. const static float cuda5_2_BACKWARD_DATA_alpha[6] = {
  595. 1.933176e+08, 4.558126e+08, 6.040167e+07, 4.608431e+07,
  596. 6.338093e+08, 3.281159e+08};
  597. const static float cuda5_2_BACKWARD_DATA_beta[6] = {
  598. 1.608048e+00, 1.659768e+00, 1.943038e+00, 1.953083e+00,
  599. 1.738348e+00, 1.891296e+00};
  600. float cuda5_2_BACKWARD_FILTER_time_pred[6] = {0.0f};
  601. float cuda5_2_BACKWARD_FILTER_mask[6] = {0.0f};
  602. float cuda5_2_BACKWARD_FILTER_hidden_units[24] = {0.0f};
  603. const static size_t cuda5_2_BACKWARD_FILTER_layers_dim[4] = {9, 12, 12, 6};
  604. const static float cuda5_2_BACKWARD_FILTER_matrices[324] = {
  605. 4.047185e-03, 3.388695e-04, 1.210363e-04, -6.148457e-06,
  606. -3.252271e-03, 8.122424e-04, 1.075851e-03, 3.066259e-03,
  607. 1.921126e-03, -1.042791e-04, -3.275821e-01, 4.278608e-03,
  608. -2.106100e-01, 8.295547e-02, 2.430674e-01, -2.748593e-02,
  609. -2.065240e-02, -1.395731e-02, -3.491511e-02, 3.520847e-03,
  610. 1.790237e-02, 1.188376e-02, 5.372314e-02, 1.494784e-02,
  611. 5.035055e-02, 6.581915e-02, 6.861494e-02, -2.199881e-03,
  612. -2.281682e-02, -9.687833e-02, 3.909182e-03, 1.024575e-01,
  613. 3.948949e-02, -4.566963e-02, -1.375550e-01, -6.794923e-02,
  614. 6.135985e-04, -4.608163e-01, 2.404660e-01, 6.274750e-03,
  615. 1.059302e-01, 1.676516e-01, -5.104349e-02, 9.925397e-02,
  616. -1.470984e-02, 1.031084e-04, 4.374801e-02, -5.167035e-01,
  617. -3.632444e-01, 8.170792e-02, 3.783056e-01, 3.212413e-01,
  618. -4.803373e-01, -4.874209e-01, 2.615676e-04, 3.406848e-02,
  619. 8.674651e-02, 3.508870e-03, -6.156053e-01, 3.270718e-01,
  620. 3.457363e-01, 1.898023e-01, -1.473479e-01, -2.987293e-01,
  621. 1.315816e-03, -5.991638e-03, 1.428707e-03, 1.580944e-03,
  622. 6.320467e-01, 2.342933e-01, -7.387988e-01, -4.437208e-01,
  623. -7.261886e-02, 5.008508e-03, 4.693171e-02, -5.879956e-02,
  624. 1.677305e-02, 1.845511e-01, 3.830231e-01, 4.003870e-02,
  625. 9.888364e-02, 7.434040e-04, 7.895462e-02, 2.310843e-01,
  626. 1.044731e-02, 1.716935e-01, 1.390186e-01, -3.862206e-01,
  627. -1.001334e-01, 1.338546e-02, -1.354914e-02, 5.464492e-02,
  628. 3.437773e-03, -2.069449e-03, -3.513253e-02, 1.837639e-02,
  629. -1.552736e-01, -1.349904e-02, -1.025307e-01, -4.804826e-06,
  630. 3.284197e-02, 5.086832e-02, 5.690669e-03, 7.154379e-02,
  631. 1.094594e+00, 1.068281e+00, 3.653902e-01, 3.107198e-01,
  632. -7.299128e-03, -3.042033e-04, 5.593516e-03, 3.541658e-03,
  633. 5.810616e-04, 8.030201e-03, -1.622678e-02, 1.400076e-04,
  634. 2.819623e-03, 4.108455e-03, 5.561182e-03, 2.512096e-03,
  635. -8.622734e-04, 5.333219e-02, 3.076694e-02, 1.795766e-01,
  636. -2.318845e-02, -3.202521e-02, 3.119619e-01, -1.606582e-01,
  637. -1.085588e-01, -9.067213e-02, -1.422861e-02, -3.444208e-02,
  638. -1.635176e-04, -2.596654e-01, 1.995525e-02, 2.055750e-02,
  639. 2.022944e-01, 4.327365e-01, -1.619481e-02, 1.125397e-01,
  640. 7.984060e-03, -2.073076e-01, -1.761664e-02, -4.832107e-02,
  641. 0.0, 0.0, 0.0, 0.0,
  642. 0.0, 0.0, 0.0, 0.0,
  643. 0.0, 0.0, 0.0, 0.0,
  644. 1.638518e-03, -1.793951e-02, 5.772194e-02, 2.851987e-02,
  645. 6.163087e-02, 2.867437e-02, 5.545961e-02, -1.660824e-01,
  646. 9.789789e-02, -1.159482e-01, 5.385513e-02, 6.836513e-02,
  647. 5.594874e-04, -2.741018e-02, 4.838353e-02, 4.298405e-02,
  648. 1.854298e-02, 3.633871e-02, 9.942706e-03, 3.490340e-01,
  649. 8.440907e-02, 2.376168e-02, 4.866724e-02, -2.214078e-01,
  650. -5.650432e-03, -8.008064e-04, 1.477945e-03, 9.983850e-04,
  651. 2.346494e-04, 2.069148e-03, -4.035380e-03, -5.895875e-03,
  652. -2.146410e-04, 8.988932e-04, 7.378523e-05, -3.107871e-05,
  653. 3.014900e-03, -2.577113e-01, 8.653076e-03, -2.681585e-02,
  654. -5.089819e-02, -2.550743e-02, -3.467115e-02, 3.631677e-01,
  655. -5.167207e-02, 9.202915e-02, -2.041105e-02, -1.355488e-01,
  656. -4.411176e-03, 1.459578e-01, -1.287185e-02, -5.766148e-03,
  657. -1.725510e-01, 1.716040e-01, -1.324064e-01, -1.831788e-01,
  658. -4.434610e-02, -7.823753e-02, -2.463202e-02, 2.183346e-02,
  659. 5.483676e-04, -7.481821e-02, -8.179377e-03, -3.340281e-02,
  660. -2.679154e-03, -3.484565e-02, -4.761697e-02, -7.778479e-01,
  661. -9.353197e-02, -1.011887e-01, -3.653892e-02, 3.624209e-01,
  662. -2.063141e-03, -1.785554e-03, 5.357111e-02, -4.105966e-02,
  663. 4.269572e-02, -1.532830e-01, 2.175374e-02, 1.304753e-01,
  664. 5.400207e-02, -4.020891e-02, -2.284152e-02, 1.153921e-01,
  665. 2.909448e-03, -1.312913e-02, -1.562593e-01, -1.018874e-01,
  666. 7.121818e-03, -1.468466e-01, 3.900497e-03, -2.249627e-02,
  667. -5.684932e-02, 2.612863e-02, -1.410081e-01, 2.298795e-02,
  668. 6.794739e-04, 7.064358e-01, 7.429705e-01, 0.0,
  669. 3.578874e-01, 7.372183e-01, -2.632545e-04, -1.001730e-01,
  670. 4.224807e-01, -1.673518e-01, 9.987204e-04, -7.437595e-02,
  671. 4.765817e-05, 5.283366e-01, 5.804700e-01, 0.0,
  672. 5.304079e-01, 8.826514e-01, 1.191588e-04, -2.403303e-02,
  673. 8.384521e-02, -1.913135e-01, -2.046290e-04, -4.905949e-02,
  674. -5.695952e-03, 4.907159e-01, 8.468218e-01, 0.0,
  675. 3.835697e-01, 4.161280e-01, -1.292199e-03, 2.594048e-01,
  676. 4.049456e-01, -4.400651e-01, 4.166223e-01, -1.978285e-01,
  677. 2.546945e-04, 5.691357e-01, 7.418897e-01, 0.0,
  678. 5.059269e-01, 8.695604e-01, -4.737849e-05, -1.666739e-02,
  679. 1.190503e-01, -1.528916e-01, -1.769190e-04, -4.045478e-02,
  680. 0.0, 0.0, 0.0, 0.0,
  681. 0.0, 0.0, 0.0, 0.0,
  682. 0.0, 0.0, 0.0, 0.0,
  683. 3.728615e-02, 3.964641e-01, 6.608990e-01, 0.0,
  684. 6.230336e-01, 5.074117e-01, 8.405939e-03, -1.422498e-01,
  685. 4.705996e-02, -2.407855e-01, -2.056813e-02, 2.624028e-01};
  686. const static float cuda5_2_BACKWARD_FILTER_biases[30] = {
  687. 2.526327e-03, 1.731556e-02, 7.836947e-02, 6.594411e-02,
  688. 1.693102e-01, 7.814206e-01, 6.354987e-01, 9.766987e-01,
  689. 2.128775e-01, -4.894586e-01, -3.741650e-02, -1.046441e-01,
  690. -2.802074e-02, 1.076976e+00, 1.484343e+00, 0.0,
  691. 1.488592e+00, 2.316875e+00, -5.133961e-03, 3.100583e-01,
  692. 6.346995e-01, 3.810246e-01, -2.523698e-01, 3.231826e-01,
  693. -4.245956e-01, -4.564983e-01, 4.500998e-03, -5.841292e-01,
  694. 0.0, -1.728347e-01};
  695. const static float cuda5_2_BACKWARD_FILTER_alpha[6] = {
  696. 2.066506e+08, 2.177061e+08, 5.654493e+07, 2.368001e+08,
  697. 2.000000e+08, 2.537848e+08};
  698. const static float cuda5_2_BACKWARD_FILTER_beta[6] = {
  699. 1.610186e+00, 1.844894e+00, 1.895551e+00, 1.816587e+00,
  700. 2.000000e+00, 2.252824e+00};
  701. if (conv_type == ConvolutionType::FORWARD && cuda_major == 5 &&
  702. cuda_minor == 2) {
  703. *layer_num_p = 4;
  704. *hidden_units_p = cuda5_2_FORWARD_hidden_units;
  705. *layers_dim_p = cuda5_2_FORWARD_layers_dim;
  706. *matrices_p = cuda5_2_FORWARD_matrices;
  707. *biases_p = cuda5_2_FORWARD_biases;
  708. *alpha_p = cuda5_2_FORWARD_alpha;
  709. *beta_p = cuda5_2_FORWARD_beta;
  710. *time_pred_p = cuda5_2_FORWARD_time_pred;
  711. *mask_p = cuda5_2_FORWARD_mask;
  712. } else if (conv_type == ConvolutionType::BACKWARD_DATA && cuda_major == 5 &&
  713. cuda_minor == 2) {
  714. *layer_num_p = 4;
  715. *hidden_units_p = cuda5_2_BACKWARD_DATA_hidden_units;
  716. *layers_dim_p = cuda5_2_BACKWARD_DATA_layers_dim;
  717. *matrices_p = cuda5_2_BACKWARD_DATA_matrices;
  718. *biases_p = cuda5_2_BACKWARD_DATA_biases;
  719. *alpha_p = cuda5_2_BACKWARD_DATA_alpha;
  720. *beta_p = cuda5_2_BACKWARD_DATA_beta;
  721. *time_pred_p = cuda5_2_BACKWARD_DATA_time_pred;
  722. *mask_p = cuda5_2_BACKWARD_DATA_mask;
  723. } else if (conv_type == ConvolutionType::BACKWARD_FILTER && cuda_major == 5 &&
  724. cuda_minor == 2) {
  725. *layer_num_p = 4;
  726. *hidden_units_p = cuda5_2_BACKWARD_FILTER_hidden_units;
  727. *layers_dim_p = cuda5_2_BACKWARD_FILTER_layers_dim;
  728. *matrices_p = cuda5_2_BACKWARD_FILTER_matrices;
  729. *biases_p = cuda5_2_BACKWARD_FILTER_biases;
  730. *alpha_p = cuda5_2_BACKWARD_FILTER_alpha;
  731. *beta_p = cuda5_2_BACKWARD_FILTER_beta;
  732. *time_pred_p = cuda5_2_BACKWARD_FILTER_time_pred;
  733. *mask_p = cuda5_2_BACKWARD_FILTER_mask;
  734. } else {
  735. return false;
  736. }
  737. return true;
  738. #endif
  739. return false;
  740. }
  741. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台。