You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

arithmetic.c 40 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "nnacl/fp32/arithmetic.h"
  17. #include <math.h>
  18. #define ACCURACY_DATA 0.00000001
  19. int ElementOptMul(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  20. #ifdef ENABLE_NEON
  21. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  22. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  23. #endif
  24. int index = 0;
  25. if (param->in_elements_num0_ == 1) {
  26. #ifdef ENABLE_NEON
  27. for (; index <= element_size - 4; index += C4NUM) {
  28. float32x4_t vin1 = vld1q_f32(input1 + index);
  29. float32x4_t vout = vmulq_f32(vin0_opt, vin1);
  30. vst1q_f32(output + index, vout);
  31. }
  32. #endif
  33. for (; index < element_size; index++) {
  34. output[index] = input0[0] * input1[index];
  35. }
  36. } else {
  37. #ifdef ENABLE_NEON
  38. for (; index <= element_size - 4; index += C4NUM) {
  39. float32x4_t vin0 = vld1q_f32(input0 + index);
  40. float32x4_t vout = vmulq_f32(vin0, vin1_opt);
  41. vst1q_f32(output + index, vout);
  42. }
  43. #endif
  44. for (; index < element_size; index++) {
  45. output[index] = input0[index] * input1[0];
  46. }
  47. }
  48. return NNACL_OK;
  49. }
  50. int ElementOptMulRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  51. #ifdef ENABLE_NEON
  52. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  53. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  54. float32x4_t zeros = vdupq_n_f32(0.0f);
  55. #endif
  56. int index = 0;
  57. if (param->in_elements_num0_ == 1) {
  58. #ifdef ENABLE_NEON
  59. for (; index <= element_size - 4; index += C4NUM) {
  60. float32x4_t vin1 = vld1q_f32(input1 + index);
  61. float32x4_t vout = vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros);
  62. vst1q_f32(output + index, vout);
  63. }
  64. #endif
  65. for (; index < element_size; index++) {
  66. output[index] = MSMAX(input0[0] * input1[index], 0);
  67. }
  68. } else {
  69. #ifdef ENABLE_NEON
  70. for (; index <= element_size - 4; index += C4NUM) {
  71. float32x4_t vin0 = vld1q_f32(input0 + index);
  72. float32x4_t vout = vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros);
  73. vst1q_f32(output + index, vout);
  74. }
  75. #endif
  76. for (; index < element_size; index++) {
  77. output[index] = MSMAX(input0[index] * input1[0], 0);
  78. }
  79. }
  80. return NNACL_OK;
  81. }
  82. int ElementOptMulRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  83. #ifdef ENABLE_NEON
  84. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  85. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  86. float32x4_t zeros = vdupq_n_f32(0.0f);
  87. float32x4_t bounds = vdupq_n_f32(6.0f);
  88. #endif
  89. int index = 0;
  90. if (param->in_elements_num0_ == 1) {
  91. #ifdef ENABLE_NEON
  92. for (; index <= element_size - 4; index += C4NUM) {
  93. float32x4_t vin1 = vld1q_f32(input1 + index);
  94. float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0_opt, vin1), zeros), bounds);
  95. vst1q_f32(output + index, vout);
  96. }
  97. #endif
  98. for (; index < element_size; index++) {
  99. output[index] = MSMIN(MSMAX(input0[0] * input1[index], 0), 6);
  100. }
  101. } else {
  102. #ifdef ENABLE_NEON
  103. for (; index <= element_size - 4; index += C4NUM) {
  104. float32x4_t vin0 = vld1q_f32(input0 + index);
  105. float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1_opt), zeros), bounds);
  106. vst1q_f32(output + index, vout);
  107. }
  108. #endif
  109. for (; index < element_size; index++) {
  110. output[index] = MSMIN(MSMAX(input0[index] * input1[0], 0), 6);
  111. }
  112. }
  113. return NNACL_OK;
  114. }
  115. int ElementOptMulInt(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param) {
  116. #ifdef ENABLE_NEON
  117. int32x4_t vin0_opt = vdupq_n_s32(input0[0]);
  118. int32x4_t vin1_opt = vdupq_n_s32(input1[0]);
  119. #endif
  120. int index = 0;
  121. if (param->in_elements_num0_ == 1) {
  122. #ifdef ENABLE_NEON
  123. for (; index <= element_size - 4; index += C4NUM) {
  124. int32x4_t vin1 = vld1q_s32(input1 + index);
  125. int32x4_t vout = vmulq_s32(vin0_opt, vin1);
  126. vst1q_s32(output + index, vout);
  127. }
  128. #endif
  129. for (; index < element_size; index++) {
  130. output[index] = input0[0] * input1[index];
  131. }
  132. } else {
  133. #ifdef ENABLE_NEON
  134. for (; index <= element_size - 4; index += C4NUM) {
  135. int32x4_t vin0 = vld1q_s32(input0 + index);
  136. int32x4_t vout = vmulq_s32(vin0, vin1_opt);
  137. vst1q_s32(output + index, vout);
  138. }
  139. #endif
  140. for (; index < element_size; index++) {
  141. output[index] = input0[index] * input1[0];
  142. }
  143. }
  144. return NNACL_OK;
  145. }
  146. int ElementOptMulReluInt(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param) {
  147. #ifdef ENABLE_NEON
  148. int32x4_t vin0_opt = vdupq_n_s32(input0[0]);
  149. int32x4_t vin1_opt = vdupq_n_s32(input1[0]);
  150. int32x4_t zeros = vdupq_n_s32(0);
  151. #endif
  152. int index = 0;
  153. if (param->in_elements_num0_ == 1) {
  154. #ifdef ENABLE_NEON
  155. for (; index <= element_size - 4; index += C4NUM) {
  156. int32x4_t vin1 = vld1q_s32(input1 + index);
  157. int32x4_t vout = vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros);
  158. vst1q_s32(output + index, vout);
  159. }
  160. #endif
  161. for (; index < element_size; index++) {
  162. output[index] = MSMAX(input0[0] * input1[index], 0);
  163. }
  164. } else {
  165. #ifdef ENABLE_NEON
  166. for (; index <= element_size - 4; index += C4NUM) {
  167. int32x4_t vin0 = vld1q_s32(input0 + index);
  168. int32x4_t vout = vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros);
  169. vst1q_s32(output + index, vout);
  170. }
  171. #endif
  172. for (; index < element_size; index++) {
  173. output[index] = MSMAX(input0[index] * input1[0], 0);
  174. }
  175. }
  176. return NNACL_OK;
  177. }
  178. int ElementOptMulRelu6Int(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param) {
  179. #ifdef ENABLE_NEON
  180. int32x4_t vin0_opt = vdupq_n_s32(input0[0]);
  181. int32x4_t vin1_opt = vdupq_n_s32(input1[0]);
  182. int32x4_t zeros = vdupq_n_s32(0);
  183. int32x4_t bounds = vdupq_n_s32(6);
  184. #endif
  185. int index = 0;
  186. if (param->in_elements_num0_ == 1) {
  187. #ifdef ENABLE_NEON
  188. for (; index <= element_size - 4; index += C4NUM) {
  189. int32x4_t vin1 = vld1q_s32(input1 + index);
  190. int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0_opt, vin1), zeros), bounds);
  191. vst1q_s32(output + index, vout);
  192. }
  193. #endif
  194. for (; index < element_size; index++) {
  195. output[index] = MSMIN(MSMAX(input0[0] * input1[index], 0), 6);
  196. }
  197. } else {
  198. #ifdef ENABLE_NEON
  199. for (; index <= element_size - 4; index += C4NUM) {
  200. int32x4_t vin0 = vld1q_s32(input0 + index);
  201. int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1_opt), zeros), bounds);
  202. vst1q_s32(output + index, vout);
  203. }
  204. #endif
  205. for (; index < element_size; index++) {
  206. output[index] = MSMIN(MSMAX(input0[index] * input1[0], 0), 6);
  207. }
  208. }
  209. return NNACL_OK;
  210. }
  211. int ElementOptSub(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  212. #ifdef ENABLE_NEON
  213. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  214. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  215. #endif
  216. int index = 0;
  217. if (param->in_elements_num0_ == 1) {
  218. #ifdef ENABLE_NEON
  219. for (; index <= element_size - 4; index += C4NUM) {
  220. float32x4_t vin1 = vld1q_f32(input1 + index);
  221. float32x4_t vout = vsubq_f32(vin0_opt, vin1);
  222. vst1q_f32(output + index, vout);
  223. }
  224. #endif
  225. for (; index < element_size; index++) {
  226. output[index] = input0[0] - input1[index];
  227. }
  228. } else {
  229. #ifdef ENABLE_NEON
  230. for (; index <= element_size - 4; index += C4NUM) {
  231. float32x4_t vin0 = vld1q_f32(input0 + index);
  232. float32x4_t vout = vsubq_f32(vin0, vin1_opt);
  233. vst1q_f32(output + index, vout);
  234. }
  235. #endif
  236. for (; index < element_size; index++) {
  237. output[index] = input0[index] - input1[0];
  238. }
  239. }
  240. return NNACL_OK;
  241. }
  242. int ElementOptSubRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  243. #ifdef ENABLE_NEON
  244. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  245. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  246. float32x4_t zeros = vdupq_n_f32(0.0f);
  247. #endif
  248. int index = 0;
  249. if (param->in_elements_num0_ == 1) {
  250. #ifdef ENABLE_NEON
  251. for (; index <= element_size - 4; index += C4NUM) {
  252. float32x4_t vin1 = vld1q_f32(input1 + index);
  253. float32x4_t vout = vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros);
  254. vst1q_f32(output + index, vout);
  255. }
  256. #endif
  257. for (; index < element_size; index++) {
  258. output[index] = MSMAX(input0[0] - input1[index], 0);
  259. }
  260. } else {
  261. #ifdef ENABLE_NEON
  262. for (; index <= element_size - 4; index += C4NUM) {
  263. float32x4_t vin0 = vld1q_f32(input0 + index);
  264. float32x4_t vout = vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros);
  265. vst1q_f32(output + index, vout);
  266. }
  267. #endif
  268. for (; index < element_size; index++) {
  269. output[index] = MSMAX(input0[index] - input1[0], 0);
  270. }
  271. }
  272. return NNACL_OK;
  273. }
  274. int ElementOptSubRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  275. #ifdef ENABLE_NEON
  276. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  277. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  278. float32x4_t zeros = vdupq_n_f32(0.0f);
  279. float32x4_t bounds = vdupq_n_f32(6.0f);
  280. #endif
  281. int index = 0;
  282. if (param->in_elements_num0_ == 1) {
  283. #ifdef ENABLE_NEON
  284. for (; index <= element_size - 4; index += C4NUM) {
  285. float32x4_t vin1 = vld1q_f32(input1 + index);
  286. float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0_opt, vin1), zeros), bounds);
  287. vst1q_f32(output + index, vout);
  288. }
  289. #endif
  290. for (; index < element_size; index++) {
  291. output[index] = MSMIN(MSMAX(input0[0] - input1[index], 0), 6);
  292. }
  293. } else {
  294. #ifdef ENABLE_NEON
  295. for (; index <= element_size - 4; index += C4NUM) {
  296. float32x4_t vin0 = vld1q_f32(input0 + index);
  297. float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1_opt), zeros), bounds);
  298. vst1q_f32(output + index, vout);
  299. }
  300. #endif
  301. for (; index < element_size; index++) {
  302. output[index] = MSMIN(MSMAX(input0[index] - input1[0], 0), 6);
  303. }
  304. }
  305. return NNACL_OK;
  306. }
  307. int ElementOptAdd(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  308. #ifdef ENABLE_NEON
  309. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  310. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  311. #endif
  312. int index = 0;
  313. if (param->in_elements_num0_ == 1) {
  314. #ifdef ENABLE_NEON
  315. for (; index <= element_size - 4; index += C4NUM) {
  316. float32x4_t vin1 = vld1q_f32(input1 + index);
  317. float32x4_t vout = vaddq_f32(vin0_opt, vin1);
  318. vst1q_f32(output + index, vout);
  319. }
  320. #endif
  321. for (; index < element_size; index++) {
  322. output[index] = input0[0] + input1[index];
  323. }
  324. } else {
  325. #ifdef ENABLE_NEON
  326. for (; index <= element_size - 4; index += C4NUM) {
  327. float32x4_t vin0 = vld1q_f32(input0 + index);
  328. float32x4_t vout = vaddq_f32(vin0, vin1_opt);
  329. vst1q_f32(output + index, vout);
  330. }
  331. #endif
  332. for (; index < element_size; index++) {
  333. output[index] = input0[index] + input1[0];
  334. }
  335. }
  336. return NNACL_OK;
  337. }
  338. int ElementOptAddInt(int *input0, int *input1, int *output, int element_size, ArithmeticParameter *param) {
  339. #ifdef ENABLE_NEON
  340. int32x4_t vin0_opt = vdupq_n_s32(input0[0]);
  341. int32x4_t vin1_opt = vdupq_n_s32(input1[0]);
  342. #endif
  343. int index = 0;
  344. if (param->in_elements_num0_ == 1) {
  345. #ifdef ENABLE_NEON
  346. for (; index <= element_size - 4; index += C4NUM) {
  347. int32x4_t vin1 = vld1q_s32(input1 + index);
  348. int32x4_t vout = vaddq_s32(vin0_opt, vin1);
  349. vst1q_s32(output + index, vout);
  350. }
  351. #endif
  352. for (; index < element_size; index++) {
  353. output[index] = input0[0] + input1[index];
  354. }
  355. } else {
  356. #ifdef ENABLE_NEON
  357. for (; index <= element_size - 4; index += C4NUM) {
  358. int32x4_t vin0 = vld1q_s32(input0 + index);
  359. int32x4_t vout = vaddq_s32(vin0, vin1_opt);
  360. vst1q_s32(output + index, vout);
  361. }
  362. #endif
  363. for (; index < element_size; index++) {
  364. output[index] = input0[index] + input1[0];
  365. }
  366. }
  367. return NNACL_OK;
  368. }
  369. int ElementOptAddRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  370. #ifdef ENABLE_NEON
  371. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  372. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  373. float32x4_t zeros = vdupq_n_f32(0.0f);
  374. #endif
  375. int index = 0;
  376. if (param->in_elements_num0_ == 1) {
  377. #ifdef ENABLE_NEON
  378. for (; index <= element_size - 4; index += C4NUM) {
  379. float32x4_t vin1 = vld1q_f32(input1 + index);
  380. float32x4_t vout = vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros);
  381. vst1q_f32(output + index, vout);
  382. }
  383. #endif
  384. for (; index < element_size; index++) {
  385. output[index] = MSMAX(input0[0] + input1[index], 0);
  386. }
  387. } else {
  388. #ifdef ENABLE_NEON
  389. for (; index <= element_size - 4; index += C4NUM) {
  390. float32x4_t vin0 = vld1q_f32(input0 + index);
  391. float32x4_t vout = vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros);
  392. vst1q_f32(output + index, vout);
  393. }
  394. #endif
  395. for (; index < element_size; index++) {
  396. output[index] = MSMAX(input0[index] + input1[0], 0);
  397. }
  398. }
  399. return NNACL_OK;
  400. }
  401. int ElementOptAddRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  402. #ifdef ENABLE_NEON
  403. float32x4_t vin0_opt = vdupq_n_f32(input0[0]);
  404. float32x4_t vin1_opt = vdupq_n_f32(input1[0]);
  405. float32x4_t zeros = vdupq_n_f32(0.0f);
  406. float32x4_t bounds = vdupq_n_f32(6.0f);
  407. #endif
  408. int index = 0;
  409. if (param->in_elements_num0_ == 1) {
  410. #ifdef ENABLE_NEON
  411. for (; index <= element_size - 4; index += C4NUM) {
  412. float32x4_t vin1 = vld1q_f32(input1 + index);
  413. float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0_opt, vin1), zeros), bounds);
  414. vst1q_f32(output + index, vout);
  415. }
  416. #endif
  417. for (; index < element_size; index++) {
  418. output[index] = MSMIN(MSMAX(input0[0] + input1[index], 0), 6);
  419. }
  420. } else {
  421. #ifdef ENABLE_NEON
  422. for (; index <= element_size - 4; index += C4NUM) {
  423. float32x4_t vin0 = vld1q_f32(input0 + index);
  424. float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1_opt), zeros), bounds);
  425. vst1q_f32(output + index, vout);
  426. }
  427. #endif
  428. for (; index < element_size; index++) {
  429. output[index] = MSMIN(MSMAX(input0[index] + input1[0], 0), 6);
  430. }
  431. }
  432. return NNACL_OK;
  433. }
  434. int ElementOptDiv(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  435. if (param->in_elements_num0_ == 1) {
  436. for (int index = 0; index < element_size; index++) {
  437. output[index] = input0[0] / input1[index];
  438. }
  439. } else {
  440. if (input1[0] == 0) {
  441. return NNACL_ERRCODE_DIVISOR_ZERO;
  442. }
  443. for (int index = 0; index < element_size; index++) {
  444. output[index] = input0[index] / input1[0];
  445. }
  446. }
  447. return NNACL_OK;
  448. }
  449. int ElementOptDivRelu(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  450. if (param->in_elements_num0_ == 1) {
  451. for (int index = 0; index < element_size; index++) {
  452. output[index] = input0[0] / input1[index];
  453. output[index] = output[index] > 0 ? output[index] : 0;
  454. }
  455. } else {
  456. for (int index = 0; index < element_size; index++) {
  457. output[index] = input0[index] / input1[0];
  458. output[index] = output[index] > 0 ? output[index] : 0;
  459. }
  460. }
  461. return NNACL_OK;
  462. }
  463. int ElementOptDivRelu6(float *input0, float *input1, float *output, int element_size, ArithmeticParameter *param) {
  464. if (param->in_elements_num0_ == 1) {
  465. for (int index = 0; index < element_size; index++) {
  466. output[index] = MSMIN(MSMAX(input0[0] / input1[index], 0), 6);
  467. }
  468. } else {
  469. for (int index = 0; index < element_size; index++) {
  470. output[index] = MSMIN(MSMAX(input0[index] / input1[0], 0), 6);
  471. }
  472. }
  473. return NNACL_OK;
  474. }
  475. int ElementMul(float *input0, float *input1, float *output, int element_size) {
  476. int index = 0;
  477. #ifdef ENABLE_NEON
  478. for (; index <= element_size - 4; index += C4NUM) {
  479. float32x4_t vin0 = vld1q_f32(input0 + index);
  480. float32x4_t vin1 = vld1q_f32(input1 + index);
  481. float32x4_t vout = vmulq_f32(vin0, vin1);
  482. vst1q_f32(output + index, vout);
  483. }
  484. #endif
  485. for (; index < element_size; index++) {
  486. output[index] = input0[index] * input1[index];
  487. }
  488. return NNACL_OK;
  489. }
  490. int ElementMulRelu(float *input0, float *input1, float *output, int element_size) {
  491. int index = 0;
  492. #ifdef ENABLE_NEON
  493. float32x4_t zeros = vdupq_n_f32(0.0f);
  494. for (; index <= element_size - 4; index += C4NUM) {
  495. float32x4_t vin0 = vld1q_f32(input0 + index);
  496. float32x4_t vin1 = vld1q_f32(input1 + index);
  497. float32x4_t vout = vmulq_f32(vin0, vin1);
  498. vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
  499. vst1q_f32(output + index, vout);
  500. }
  501. #endif
  502. for (; index < element_size; index++) {
  503. float res = input0[index] * input1[index];
  504. output[index] = res > 0 ? res : 0;
  505. }
  506. return NNACL_OK;
  507. }
  508. int ElementMulRelu6(float *input0, float *input1, float *output, int element_size) {
  509. int index = 0;
  510. #ifdef ENABLE_NEON
  511. float32x4_t zeros = vdupq_n_f32(0.0f);
  512. float32x4_t bounds = vdupq_n_f32(6.0f);
  513. for (; index <= element_size - 4; index += C4NUM) {
  514. float32x4_t vin0 = vld1q_f32(input0 + index);
  515. float32x4_t vin1 = vld1q_f32(input1 + index);
  516. float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds);
  517. vst1q_f32(output + index, vout);
  518. }
  519. #endif
  520. for (; index < element_size; index++) {
  521. output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6);
  522. }
  523. return NNACL_OK;
  524. }
  525. int ElementMulInt(int *input0, int *input1, int *output, int element_size) {
  526. int index = 0;
  527. #ifdef ENABLE_NEON
  528. for (; index <= element_size - 4; index += C4NUM) {
  529. int32x4_t vin0 = vld1q_s32(input0 + index);
  530. int32x4_t vin1 = vld1q_s32(input1 + index);
  531. int32x4_t vout = vmulq_s32(vin0, vin1);
  532. vst1q_s32(output + index, vout);
  533. }
  534. #endif
  535. for (; index < element_size; index++) {
  536. output[index] = input0[index] * input1[index];
  537. }
  538. return NNACL_OK;
  539. }
  540. int ElementMulReluInt(int *input0, int *input1, int *output, int element_size) {
  541. int index = 0;
  542. #ifdef ENABLE_NEON
  543. int32x4_t zeros = vdupq_n_s32(0);
  544. for (; index <= element_size - 4; index += C4NUM) {
  545. int32x4_t vin0 = vld1q_s32(input0 + index);
  546. int32x4_t vin1 = vld1q_s32(input1 + index);
  547. int32x4_t vout = vmulq_s32(vin0, vin1);
  548. vout = vbslq_s32(vcgtq_s32(vout, zeros), vout, zeros);
  549. vst1q_s32(output + index, vout);
  550. }
  551. #endif
  552. for (; index < element_size; index++) {
  553. float res = input0[index] * input1[index];
  554. output[index] = res > 0 ? res : 0;
  555. }
  556. return NNACL_OK;
  557. }
  558. int ElementMulRelu6Int(int *input0, int *input1, int *output, int element_size) {
  559. int index = 0;
  560. #ifdef ENABLE_NEON
  561. int32x4_t zeros = vdupq_n_s32(0);
  562. int32x4_t bounds = vdupq_n_s32(6);
  563. for (; index <= element_size - 4; index += C4NUM) {
  564. int32x4_t vin0 = vld1q_s32(input0 + index);
  565. int32x4_t vin1 = vld1q_s32(input1 + index);
  566. int32x4_t vout = vminq_s32(vmaxq_s32(vmulq_s32(vin0, vin1), zeros), bounds);
  567. vst1q_s32(output + index, vout);
  568. }
  569. #endif
  570. for (; index < element_size; index++) {
  571. output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6);
  572. }
  573. return NNACL_OK;
  574. }
  575. int BroadcastMul(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size,
  576. ArithmeticParameter *param) {
  577. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  578. return ElementMul(tile_input0, tile_input1, output, element_size);
  579. }
  580. int ElementAdd(float *input0, float *input1, float *output, int element_size) {
  581. int index = 0;
  582. #ifdef ENABLE_NEON
  583. for (; index <= element_size - 4; index += C4NUM) {
  584. float32x4_t vin0 = vld1q_f32(input0 + index);
  585. float32x4_t vin1 = vld1q_f32(input1 + index);
  586. float32x4_t vout = vaddq_f32(vin0, vin1);
  587. vst1q_f32(output + index, vout);
  588. }
  589. #endif
  590. for (; index < element_size; index++) {
  591. output[index] = input0[index] + input1[index];
  592. }
  593. return NNACL_OK;
  594. }
  595. int ElementAddRelu(float *input0, float *input1, float *output, int element_size) {
  596. int index = 0;
  597. #ifdef ENABLE_NEON
  598. float32x4_t zeros = vdupq_n_f32(0.0f);
  599. for (; index <= element_size - 4; index += C4NUM) {
  600. float32x4_t vin0 = vld1q_f32(input0 + index);
  601. float32x4_t vin1 = vld1q_f32(input1 + index);
  602. float32x4_t vout = vaddq_f32(vin0, vin1);
  603. vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
  604. vst1q_f32(output + index, vout);
  605. }
  606. #endif
  607. for (; index < element_size; index++) {
  608. float res = input0[index] + input1[index];
  609. output[index] = res > 0 ? res : 0;
  610. }
  611. return NNACL_OK;
  612. }
  613. int ElementAddRelu6(float *input0, float *input1, float *output, int element_size) {
  614. int index = 0;
  615. #ifdef ENABLE_NEON
  616. float32x4_t zeros = vdupq_n_f32(0.0f);
  617. float32x4_t bounds = vdupq_n_f32(6.0f);
  618. for (; index <= element_size - 4; index += C4NUM) {
  619. float32x4_t vin0 = vld1q_f32(input0 + index);
  620. float32x4_t vin1 = vld1q_f32(input1 + index);
  621. float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds);
  622. vst1q_f32(output + index, vout);
  623. }
  624. #endif
  625. for (; index < element_size; index++) {
  626. output[index] = MSMIN(MSMAX(input0[index] + input1[index], 0), 6);
  627. }
  628. return NNACL_OK;
  629. }
  630. int ElementAddInt(int *input0, int *input1, int *output, int element_size) {
  631. int index = 0;
  632. #ifdef ENABLE_NEON
  633. for (; index <= element_size - 4; index += C4NUM) {
  634. int32x4_t vin0 = vld1q_s32(input0 + index);
  635. int32x4_t vin1 = vld1q_s32(input1 + index);
  636. int32x4_t vout = vaddq_s32(vin0, vin1);
  637. vst1q_s32(output + index, vout);
  638. }
  639. #endif
  640. for (; index < element_size; index++) {
  641. output[index] = input0[index] + input1[index];
  642. }
  643. return NNACL_OK;
  644. }
  645. int ElementAddInt8(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
  646. for (int i = 0; i < element_size; i++) {
  647. output[i] = input0[i] + input1[i];
  648. }
  649. return NNACL_OK;
  650. }
  651. int BroadcastAdd(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size,
  652. ArithmeticParameter *param) {
  653. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  654. return ElementAdd(tile_input0, tile_input1, output, element_size);
  655. }
  656. int BroadcastAddInt8(int8_t *input0, int8_t *input1, int8_t *tile_input0, int8_t *tile_input1, int8_t *output,
  657. int element_size, ArithmeticParameter *param) {
  658. TileDimensionsInt8(input0, input1, tile_input0, tile_input1, param);
  659. return ElementAddInt8(tile_input0, tile_input1, output, element_size);
  660. }
  661. int ElementSub(float *input0, float *input1, float *output, int element_size) {
  662. int index = 0;
  663. #ifdef ENABLE_NEON
  664. for (; index <= element_size - 4; index += C4NUM) {
  665. float32x4_t vin0 = vld1q_f32(input0 + index);
  666. float32x4_t vin1 = vld1q_f32(input1 + index);
  667. float32x4_t vout = vsubq_f32(vin0, vin1);
  668. vst1q_f32(output + index, vout);
  669. }
  670. #endif
  671. for (; index < element_size; index++) {
  672. output[index] = input0[index] - input1[index];
  673. }
  674. return NNACL_OK;
  675. }
  676. int ElementSubRelu(float *input0, float *input1, float *output, int element_size) {
  677. int index = 0;
  678. #ifdef ENABLE_NEON
  679. float32x4_t zeros = vdupq_n_f32(0.0f);
  680. for (; index <= element_size - 4; index += C4NUM) {
  681. float32x4_t vin0 = vld1q_f32(input0 + index);
  682. float32x4_t vin1 = vld1q_f32(input1 + index);
  683. float32x4_t vout = vsubq_f32(vin0, vin1);
  684. vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros);
  685. vst1q_f32(output + index, vout);
  686. }
  687. #endif
  688. for (; index < element_size; index++) {
  689. float res = input0[index] - input1[index];
  690. output[index] = res > 0 ? res : 0;
  691. }
  692. return NNACL_OK;
  693. }
  694. int ElementSubRelu6(float *input0, float *input1, float *output, int element_size) {
  695. int index = 0;
  696. #ifdef ENABLE_NEON
  697. float32x4_t zeros = vdupq_n_f32(0.0f);
  698. float32x4_t bounds = vdupq_n_f32(6.0f);
  699. for (; index <= element_size - 4; index += C4NUM) {
  700. float32x4_t vin0 = vld1q_f32(input0 + index);
  701. float32x4_t vin1 = vld1q_f32(input1 + index);
  702. float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds);
  703. vst1q_f32(output + index, vout);
  704. }
  705. #endif
  706. for (; index < element_size; index++) {
  707. output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6);
  708. }
  709. return NNACL_OK;
  710. }
  711. int BroadcastSub(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size,
  712. ArithmeticParameter *param) {
  713. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  714. return ElementSub(tile_input0, tile_input1, output, element_size);
  715. }
  716. int ElementDiv(float *input0, float *input1, float *output, int element_size) {
  717. for (int i = 0; i < element_size; i++) {
  718. output[i] = input0[i] / input1[i];
  719. }
  720. return NNACL_OK;
  721. }
  722. int ElementDivRelu(float *input0, float *input1, float *output, int element_size) {
  723. for (int i = 0; i < element_size; i++) {
  724. float res = input0[i] / input1[i];
  725. output[i] = res > 0 ? res : 0;
  726. }
  727. return NNACL_OK;
  728. }
  729. int ElementDivRelu6(float *input0, float *input1, float *output, int element_size) {
  730. for (int i = 0; i < element_size; i++) {
  731. output[i] = MSMIN(MSMAX(input0[i] / input1[i], 0), 6);
  732. }
  733. return NNACL_OK;
  734. }
  735. int BroadcastDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size,
  736. ArithmeticParameter *param) {
  737. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  738. return ElementDiv(tile_input0, tile_input1, output, element_size);
  739. }
  740. int ElementFloorMod(float *input0, float *input1, float *output, int element_size) {
  741. for (int i = 0; i < element_size; i++) {
  742. output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i];
  743. }
  744. return NNACL_OK;
  745. }
  746. int ElementFloorModInt(int *input0, int *input1, int *output, int element_size) {
  747. for (int i = 0; i < element_size; i++) {
  748. output[i] = input0[i] - (input0[i] / input1[i]) * input1[i];
  749. }
  750. return NNACL_OK;
  751. }
  752. int BroadcastFloorMod(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  753. int element_size, ArithmeticParameter *param) {
  754. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  755. return ElementFloorMod(tile_input0, tile_input1, output, element_size);
  756. }
  757. int ElementFloorDiv(float *input0, float *input1, float *output, int element_size) {
  758. for (int i = 0; i < element_size; i++) {
  759. output[i] = floorf(input0[i] / input1[i]);
  760. }
  761. return NNACL_OK;
  762. }
  763. int ElementFloorDivInt(int *input0, int *input1, int *output, int element_size) {
  764. for (int i = 0; i < element_size; i++) {
  765. output[i] = input0[i] / input1[i];
  766. }
  767. return NNACL_OK;
  768. }
  769. int BroadcastFloorDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  770. int element_size, ArithmeticParameter *param) {
  771. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  772. return ElementFloorDiv(tile_input0, tile_input1, output, element_size);
  773. }
  774. int ElementLogicalAnd(float *input0, float *input1, float *output, int element_size) {
  775. int index = 0;
  776. #ifdef ENABLE_NEON
  777. float32x4_t vtrue = vdupq_n_f32(1);
  778. float32x4_t vfalse = vdupq_n_f32(0);
  779. uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1));
  780. uint32x4_t zeros = vdupq_n_u32(0);
  781. for (; index <= element_size - 4; index += C4NUM) {
  782. uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0 + index)), mask);
  783. uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1 + index)), mask);
  784. float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue);
  785. vst1q_f32(output + index, vout);
  786. }
  787. #endif
  788. for (; index < element_size; index++) {
  789. output[index] = (float)((bool)(input0[index]) & (bool)(input1[index]));
  790. }
  791. return NNACL_OK;
  792. }
  793. int ElementSquaredDifference(float *input0, float *input1, float *output, int element_size) {
  794. ElementSub(input0, input1, output, element_size);
  795. return ElementMul(output, output, output, element_size);
  796. }
  797. int BroadcastSquaredDifference(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  798. int element_size, ArithmeticParameter *param) {
  799. BroadcastSub(input0, input1, tile_input0, tile_input1, output, element_size, param);
  800. return ElementMul(output, output, output, element_size);
  801. }
  802. int BroadcastLogicalAnd(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  803. int element_size, ArithmeticParameter *param) {
  804. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  805. return ElementLogicalAnd(tile_input0, tile_input1, output, element_size);
  806. }
  807. int ElementLogicalOr(float *input0, float *input1, float *output, int element_size) {
  808. int index = 0;
  809. #ifdef ENABLE_NEON
  810. float32x4_t vtrue = vdupq_n_f32(1);
  811. float32x4_t vfalse = vdupq_n_f32(0);
  812. uint32x4_t mask = vmovq_n_u32(((uint32_t)(1u << 31) - 1));
  813. uint32x4_t zeros = vdupq_n_u32(0);
  814. for (; index <= element_size - 4; index += C4NUM) {
  815. uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0 + index)), mask);
  816. uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1 + index)), mask);
  817. float32x4_t vout = vbslq_f32(vceqq_u32(vorrq_u32(vin0, vin1), zeros), vfalse, vtrue);
  818. vst1q_f32(output + index, vout);
  819. }
  820. #endif
  821. for (; index < element_size; index++) {
  822. output[index] = (float)((bool)(input0[index]) | (bool)(input1[index]));
  823. }
  824. return NNACL_OK;
  825. }
  826. int BroadcastLogicalOr(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  827. int element_size, ArithmeticParameter *param) {
  828. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  829. return ElementLogicalOr(tile_input0, tile_input1, output, element_size);
  830. }
  831. int ElementMaximum(float *input0, float *input1, float *output, int element_size) {
  832. int index = 0;
  833. #ifdef ENABLE_NEON
  834. for (; index <= element_size - 4; index += C4NUM) {
  835. float32x4_t vin0 = vld1q_f32(input0 + index);
  836. float32x4_t vin1 = vld1q_f32(input1 + index);
  837. float32x4_t vout = vmaxq_f32(vin0, vin1);
  838. vst1q_f32(output + index, vout);
  839. }
  840. #endif
  841. for (; index < element_size; index++) {
  842. output[index] = input0[index] > input1[index] ? input0[index] : input1[index];
  843. }
  844. return NNACL_OK;
  845. }
  846. int BroadcastMaximum(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  847. int element_size, ArithmeticParameter *param) {
  848. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  849. return ElementMaximum(tile_input0, tile_input1, output, element_size);
  850. }
  851. int ElementMinimum(float *input0, float *input1, float *output, int element_size) {
  852. int index = 0;
  853. #ifdef ENABLE_NEON
  854. for (; index <= element_size - 4; index += C4NUM) {
  855. float32x4_t vin0 = vld1q_f32(input0 + index);
  856. float32x4_t vin1 = vld1q_f32(input1 + index);
  857. float32x4_t vout = vminq_f32(vin0, vin1);
  858. vst1q_f32(output + index, vout);
  859. }
  860. #endif
  861. for (; index < element_size; index++) {
  862. output[index] = input0[index] > input1[index] ? input1[index] : input0[index];
  863. }
  864. return NNACL_OK;
  865. }
  866. int BroadcastMinimum(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  867. int element_size, ArithmeticParameter *param) {
  868. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  869. return ElementMinimum(tile_input0, tile_input1, output, element_size);
  870. }
  871. float FloatNotEqualCheck(float in0, float in1) {
  872. float tmp = in0 - in1;
  873. if (tmp <= ACCURACY_DATA && tmp >= -ACCURACY_DATA) {
  874. return (float)false;
  875. }
  876. return (float)true;
  877. }
  878. int ElementNotEqual(float *input0, float *input1, float *output, int element_size) {
  879. int index = 0;
  880. #ifdef ENABLE_NEON
  881. float32x4_t vtrue = vdupq_n_f32(1);
  882. float32x4_t vfalse = vdupq_n_f32(0);
  883. for (; index <= element_size - 4; index += C4NUM) {
  884. float32x4_t vin0 = vld1q_f32(input0 + index);
  885. float32x4_t vin1 = vld1q_f32(input1 + index);
  886. float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vfalse, vtrue);
  887. vst1q_f32(output + index, vout);
  888. }
  889. #endif
  890. for (; index < element_size; index++) {
  891. output[index] = (float)(input0[index] != input1[index]);
  892. }
  893. return NNACL_OK;
  894. }
  895. int BroadcastNotEqual(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  896. int element_size, ArithmeticParameter *param) {
  897. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  898. return ElementNotEqual(tile_input0, tile_input1, output, element_size);
  899. }
  900. float FloatEqualCheck(float in0, float in1) {
  901. float tmp = in0 - in1;
  902. if (tmp <= ACCURACY_DATA && tmp >= -ACCURACY_DATA) {
  903. return (float)true;
  904. }
  905. return (float)false;
  906. }
  907. int ElementEqual(float *input0, float *input1, float *output, int element_size) {
  908. int index = 0;
  909. #ifdef ENABLE_NEON
  910. float32x4_t vtrue = vdupq_n_f32(1);
  911. float32x4_t vfalse = vdupq_n_f32(0);
  912. for (; index <= element_size - 4; index += C4NUM) {
  913. float32x4_t vin0 = vld1q_f32(input0 + index);
  914. float32x4_t vin1 = vld1q_f32(input1 + index);
  915. float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vtrue, vfalse);
  916. vst1q_f32(output + index, vout);
  917. }
  918. #endif
  919. for (; index < element_size; index++) {
  920. output[index] = (float)(input0[index] == input1[index]);
  921. }
  922. return NNACL_OK;
  923. }
  924. int BroadcastEqual(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  925. int element_size, ArithmeticParameter *param) {
  926. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  927. return ElementEqual(tile_input0, tile_input1, output, element_size);
  928. }
  929. int ElementLess(float *input0, float *input1, float *output, int element_size) {
  930. int index = 0;
  931. #ifdef ENABLE_NEON
  932. float32x4_t vtrue = vdupq_n_f32(1);
  933. float32x4_t vfalse = vdupq_n_f32(0);
  934. for (; index <= element_size - 4; index += C4NUM) {
  935. float32x4_t vin0 = vld1q_f32(input0 + index);
  936. float32x4_t vin1 = vld1q_f32(input1 + index);
  937. float32x4_t vout = vbslq_f32(vcltq_f32(vin0, vin1), vtrue, vfalse);
  938. vst1q_f32(output + index, vout);
  939. }
  940. #endif
  941. for (; index < element_size; index++) {
  942. output[index] = (float)(input0[index] < input1[index]);
  943. }
  944. return NNACL_OK;
  945. }
  946. int BroadcastLess(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size,
  947. ArithmeticParameter *param) {
  948. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  949. return ElementLess(tile_input0, tile_input1, output, element_size);
  950. }
  951. int ElementLessEqual(float *input0, float *input1, float *output, int element_size) {
  952. int index = 0;
  953. #ifdef ENABLE_NEON
  954. float32x4_t vtrue = vdupq_n_f32(1);
  955. float32x4_t vfalse = vdupq_n_f32(0);
  956. for (; index <= element_size - 4; index += C4NUM) {
  957. float32x4_t vin0 = vld1q_f32(input0 + index);
  958. float32x4_t vin1 = vld1q_f32(input1 + index);
  959. float32x4_t vout = vbslq_f32(vcleq_f32(vin0, vin1), vtrue, vfalse);
  960. vst1q_f32(output + index, vout);
  961. }
  962. #endif
  963. for (; index < element_size; index++) {
  964. output[index] = (float)(input0[index] <= input1[index]);
  965. }
  966. return NNACL_OK;
  967. }
  968. int BroadcastLessEqual(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  969. int element_size, ArithmeticParameter *param) {
  970. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  971. return ElementLessEqual(tile_input0, tile_input1, output, element_size);
  972. }
  973. int ElementGreater(float *input0, float *input1, float *output, int element_size) {
  974. int index = 0;
  975. #ifdef ENABLE_NEON
  976. float32x4_t vtrue = vdupq_n_f32(1);
  977. float32x4_t vfalse = vdupq_n_f32(0);
  978. for (; index <= element_size - 4; index += C4NUM) {
  979. float32x4_t vin0 = vld1q_f32(input0 + index);
  980. float32x4_t vin1 = vld1q_f32(input1 + index);
  981. float32x4_t vout = vbslq_f32(vcgtq_f32(vin0, vin1), vtrue, vfalse);
  982. vst1q_f32(output + index, vout);
  983. }
  984. #endif
  985. for (; index < element_size; index++) {
  986. output[index] = (float)(input0[index] > input1[index]);
  987. }
  988. return NNACL_OK;
  989. }
  990. int BroadcastGreater(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  991. int element_size, ArithmeticParameter *param) {
  992. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  993. return ElementGreater(tile_input0, tile_input1, output, element_size);
  994. }
  995. int ElementGreaterEqual(float *input0, float *input1, float *output, int element_size) {
  996. int index = 0;
  997. #ifdef ENABLE_NEON
  998. float32x4_t vtrue = vdupq_n_f32(1);
  999. float32x4_t vfalse = vdupq_n_f32(0);
  1000. for (; index <= element_size - 4; index += C4NUM) {
  1001. float32x4_t vin0 = vld1q_f32(input0 + index);
  1002. float32x4_t vin1 = vld1q_f32(input1 + index);
  1003. float32x4_t vout = vbslq_f32(vcgeq_f32(vin0, vin1), vtrue, vfalse);
  1004. vst1q_f32(output + index, vout);
  1005. }
  1006. #endif
  1007. for (; index < element_size; index++) {
  1008. output[index] = (float)(input0[index] >= input1[index]);
  1009. }
  1010. return NNACL_OK;
  1011. }
  1012. int BroadcastGreaterEqual(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output,
  1013. int element_size, ArithmeticParameter *param) {
  1014. TileDimensions(input0, input1, tile_input0, tile_input1, param);
  1015. return ElementGreaterEqual(tile_input0, tile_input1, output, element_size);
  1016. }
  1017. #undef ACCURACY_DATA
  1018. #ifdef ENABLE_NNACL_INFER_SHAPE
  1019. int ArithmeticInferShape(int **in_shape, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
  1020. int *in_datatype, int *out_datatype, OpParameter *param) {
  1021. *out_format = in_format[0];
  1022. *out_datatype = in_datatype[0];
  1023. ArithmeticParameter *arithmetic_parameter = (ArithmeticParameter *)param;
  1024. int ndim0 = dim_size[0];
  1025. int ndim1 = dim_size[1];
  1026. int *in_shape0 = in_shape[0];
  1027. int *in_shape1 = in_shape[1];
  1028. if (ndim0 < ndim1) {
  1029. arithmetic_parameter->ndim_ = ndim1;
  1030. int fill_dim_num = ndim1 - ndim0;
  1031. int j = 0;
  1032. for (int i = 0; i < ndim1; ++i) {
  1033. if (i < fill_dim_num) {
  1034. arithmetic_parameter->in_shape0_[i] = 1;
  1035. } else {
  1036. arithmetic_parameter->in_shape0_[i] = in_shape0[j++];
  1037. }
  1038. arithmetic_parameter->in_shape1_[i] = in_shape1[i];
  1039. }
  1040. } else if (ndim0 > ndim1) {
  1041. arithmetic_parameter->ndim_ = ndim0;
  1042. int fill_dim_num = ndim0 - ndim1;
  1043. int j = 0;
  1044. for (int i = 0; i < ndim0; ++i) {
  1045. if (i < fill_dim_num) {
  1046. arithmetic_parameter->in_shape1_[i] = 1;
  1047. } else {
  1048. arithmetic_parameter->in_shape1_[i] = in_shape1[j++];
  1049. }
  1050. arithmetic_parameter->in_shape0_[i] = in_shape0[i];
  1051. }
  1052. } else {
  1053. arithmetic_parameter->ndim_ = ndim0;
  1054. for (int i = 0; i < ndim0; ++i) {
  1055. arithmetic_parameter->in_shape0_[i] = in_shape0[i];
  1056. arithmetic_parameter->in_shape1_[i] = in_shape1[i];
  1057. }
  1058. }
  1059. int j = 0;
  1060. for (size_t i = 0; i < arithmetic_parameter->ndim_; ++i) {
  1061. if (arithmetic_parameter->in_shape0_[i] != arithmetic_parameter->in_shape1_[i]) {
  1062. if (arithmetic_parameter->in_shape0_[i] == 1) {
  1063. out_shape[j++] = arithmetic_parameter->in_shape1_[i];
  1064. } else if (arithmetic_parameter->in_shape1_[i] == 1) {
  1065. out_shape[j++] = arithmetic_parameter->in_shape0_[i];
  1066. } else {
  1067. return NNACL_PARAM_INVALID;
  1068. }
  1069. } else {
  1070. out_shape[j++] = arithmetic_parameter->in_shape0_[i];
  1071. }
  1072. }
  1073. return NNACL_OK;
  1074. }
  1075. #endif