You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

arithmetic_fp16.c 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "nnacl/fp16/arithmetic_fp16.h"
  17. #include <math.h>
  18. #include "nnacl/arithmetic_common.h"
  19. void TileOneDimensionFp16(float16_t *inData, float16_t *outData, int dim, size_t ndim, int *inShape, int *inStrides,
  20. int *outStrides, int *multiple) {
  21. int srcDimSize = inShape[dim];
  22. if (dim == ndim - 1) {
  23. for (int i = 0; i < multiple[dim]; i++) {
  24. memcpy(outData, inData, srcDimSize * sizeof(float16_t));
  25. outData += srcDimSize;
  26. }
  27. return;
  28. }
  29. for (size_t i = 0; i < srcDimSize; i++) {
  30. for (size_t j = 0; j < multiple[dim]; j++) {
  31. TileOneDimensionFp16(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim,
  32. inShape, inStrides, outStrides, multiple);
  33. }
  34. }
  35. }
  36. void TileDimensionsFp16(float16_t *data0, float16_t *data1, float16_t *tile_data0, float16_t *tile_data1,
  37. ArithmeticParameter *param) {
  38. CalcMultiplesAndStrides(param);
  39. TileOneDimensionFp16(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_, param->out_strides_,
  40. param->multiples0_);
  41. TileOneDimensionFp16(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_, param->out_strides_,
  42. param->multiples1_);
  43. }
  44. int ElementMulFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  45. int index = 0;
  46. #ifdef ENABLE_NEON
  47. for (; index <= element_size - 8; index += C8NUM) {
  48. float16x8_t vin0 = vld1q_f16(input0 + index);
  49. float16x8_t vin1 = vld1q_f16(input1 + index);
  50. float16x8_t vout = vmulq_f16(vin0, vin1);
  51. vst1q_f16(output + index, vout);
  52. }
  53. #endif
  54. for (; index < element_size; index++) {
  55. output[index] = input0[index] * input1[index];
  56. }
  57. return NNACL_OK;
  58. }
  59. int ElementOptMulFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  60. ArithmeticParameter *param) {
  61. #ifdef ENABLE_NEON
  62. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  63. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  64. #endif
  65. int index = 0;
  66. if (param->in_elements_num0_ == 1) {
  67. #ifdef ENABLE_NEON
  68. for (; index <= element_size - 8; index += C8NUM) {
  69. float16x8_t vin1 = vld1q_f16(input1 + index);
  70. float16x8_t vout = vmulq_f16(vin0_opt, vin1);
  71. vst1q_f16(output + index, vout);
  72. }
  73. #endif
  74. for (; index < element_size; index++) {
  75. output[index] = input0[0] * input1[index];
  76. }
  77. } else {
  78. #ifdef ENABLE_NEON
  79. for (; index <= element_size - 8; index += C8NUM) {
  80. float16x8_t vin0 = vld1q_f16(input0 + index);
  81. float16x8_t vout = vmulq_f16(vin0, vin1_opt);
  82. vst1q_f16(output + index, vout);
  83. }
  84. #endif
  85. for (; index < element_size; index++) {
  86. output[index] = input0[index] * input1[0];
  87. }
  88. }
  89. return NNACL_OK;
  90. }
  91. int ElementMulReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  92. #ifdef ENABLE_NEON
  93. float16x8_t zeros = vdupq_n_f16(0.0);
  94. #endif
  95. int index = 0;
  96. #ifdef ENABLE_NEON
  97. for (; index <= element_size - 8; index += C8NUM) {
  98. float16x8_t vin0 = vld1q_f16(input0 + index);
  99. float16x8_t vin1 = vld1q_f16(input1 + index);
  100. float16x8_t vout = vmulq_f16(vin0, vin1);
  101. vout = vmaxq_f16(vout, zeros);
  102. vst1q_f16(output + index, vout);
  103. }
  104. #endif
  105. for (; index < element_size; index++) {
  106. float16_t res = input0[index] * input1[index];
  107. output[index] = res > 0 ? res : 0;
  108. }
  109. return NNACL_OK;
  110. }
  111. int ElementOptMulReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  112. ArithmeticParameter *param) {
  113. #ifdef ENABLE_NEON
  114. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  115. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  116. float16x8_t zeros = vdupq_n_f16(0.0);
  117. #endif
  118. int index = 0;
  119. if (param->in_elements_num0_ == 1) {
  120. #ifdef ENABLE_NEON
  121. for (; index <= element_size - 8; index += C8NUM) {
  122. float16x8_t vin1 = vld1q_f16(input1 + index);
  123. float16x8_t vout = vmulq_f16(vin0_opt, vin1);
  124. vout = vmaxq_f16(vout, zeros);
  125. vst1q_f16(output + index, vout);
  126. }
  127. #endif
  128. for (; index < element_size; index++) {
  129. float16_t res = input0[0] * input1[index];
  130. output[index] = res > 0 ? res : 0;
  131. }
  132. } else {
  133. #ifdef ENABLE_NEON
  134. for (; index <= element_size - 8; index += C8NUM) {
  135. float16x8_t vin0 = vld1q_f16(input0 + index);
  136. float16x8_t vout = vmulq_f16(vin0, vin1_opt);
  137. vout = vmaxq_f16(vout, zeros);
  138. vst1q_f16(output + index, vout);
  139. }
  140. #endif
  141. for (; index < element_size; index++) {
  142. float16_t res = input0[index] * input1[0];
  143. output[index] = res > 0 ? res : 0;
  144. }
  145. }
  146. return NNACL_OK;
  147. }
  148. int ElementMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  149. int index = 0;
  150. #ifdef ENABLE_NEON
  151. float16x8_t zeros = vdupq_n_f16(0.0);
  152. float16x8_t bounds = vdupq_n_f16(6.0);
  153. for (; index <= element_size - 8; index += C8NUM) {
  154. float16x8_t vin0 = vld1q_f16(input0 + index);
  155. float16x8_t vin1 = vld1q_f16(input1 + index);
  156. float16x8_t vout = vmulq_f16(vin0, vin1);
  157. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  158. vst1q_f16(output + index, vout);
  159. }
  160. #endif
  161. for (; index < element_size; index++) {
  162. output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6);
  163. }
  164. return NNACL_OK;
  165. }
  166. int ElementOptMulRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  167. ArithmeticParameter *param) {
  168. #ifdef ENABLE_NEON
  169. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  170. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  171. float16x8_t zeros = vdupq_n_f16(0.0);
  172. float16x8_t bounds = vdupq_n_f16(6.0);
  173. #endif
  174. int index = 0;
  175. if (param->in_elements_num0_ == 1) {
  176. #ifdef ENABLE_NEON
  177. for (; index <= element_size - 8; index += C8NUM) {
  178. float16x8_t vin1 = vld1q_f16(input1 + index);
  179. float16x8_t vout = vmulq_f16(vin0_opt, vin1);
  180. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  181. vst1q_f16(output + index, vout);
  182. }
  183. #endif
  184. for (; index < element_size; index++) {
  185. output[index] = MSMIN(MSMAX(input0[0] * input1[index], 0), 6);
  186. }
  187. } else {
  188. #ifdef ENABLE_NEON
  189. for (; index <= element_size - 8; index += C8NUM) {
  190. float16x8_t vin0 = vld1q_f16(input0 + index);
  191. float16x8_t vout = vmulq_f16(vin0, vin1_opt);
  192. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  193. vst1q_f16(output + index, vout);
  194. }
  195. #endif
  196. for (; index < element_size; index++) {
  197. output[index] = MSMIN(MSMAX(input0[index] * input1[0], 0), 6);
  198. }
  199. }
  200. return NNACL_OK;
  201. }
  202. int ElementAddFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  203. int index = 0;
  204. #ifdef ENABLE_NEON
  205. for (; index <= element_size - 8; index += C8NUM) {
  206. float16x8_t vin0 = vld1q_f16(input0 + index);
  207. float16x8_t vin1 = vld1q_f16(input1 + index);
  208. float16x8_t vout = vaddq_f16(vin0, vin1);
  209. vst1q_f16(output + index, vout);
  210. }
  211. #endif
  212. for (; index < element_size; index++) {
  213. output[index] = input0[index] + input1[index];
  214. }
  215. return NNACL_OK;
  216. }
  217. int ElementOptAddFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  218. ArithmeticParameter *param) {
  219. #ifdef ENABLE_NEON
  220. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  221. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  222. #endif
  223. int index = 0;
  224. if (param->in_elements_num0_ == 1) {
  225. #ifdef ENABLE_NEON
  226. for (; index <= element_size - 8; index += C8NUM) {
  227. float16x8_t vin1 = vld1q_f16(input1 + index);
  228. float16x8_t vout = vaddq_f16(vin0_opt, vin1);
  229. vst1q_f16(output + index, vout);
  230. }
  231. #endif
  232. for (; index < element_size; index++) {
  233. output[index] = input0[0] + input1[index];
  234. }
  235. } else {
  236. #ifdef ENABLE_NEON
  237. for (; index <= element_size - 8; index += C8NUM) {
  238. float16x8_t vin0 = vld1q_f16(input0 + index);
  239. float16x8_t vout = vaddq_f16(vin0, vin1_opt);
  240. vst1q_f16(output + index, vout);
  241. }
  242. #endif
  243. for (; index < element_size; index++) {
  244. output[index] = input0[index] + input1[0];
  245. }
  246. }
  247. return NNACL_OK;
  248. }
  249. int ElementAddReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  250. int index = 0;
  251. #ifdef ENABLE_NEON
  252. float16x8_t zeros = vdupq_n_f16(0.0);
  253. for (; index <= element_size - 8; index += C8NUM) {
  254. float16x8_t vin0 = vld1q_f16(input0 + index);
  255. float16x8_t vin1 = vld1q_f16(input1 + index);
  256. float16x8_t vout = vaddq_f16(vin0, vin1);
  257. vout = vmaxq_f16(vout, zeros);
  258. vst1q_f16(output + index, vout);
  259. }
  260. #endif
  261. for (; index < element_size; index++) {
  262. float16_t res = input0[index] + input1[index];
  263. output[index] = res > 0 ? res : 0;
  264. }
  265. return NNACL_OK;
  266. }
  267. int ElementOptAddReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  268. ArithmeticParameter *param) {
  269. #ifdef ENABLE_NEON
  270. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  271. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  272. float16x8_t zeros = vdupq_n_f16(0.0);
  273. #endif
  274. int index = 0;
  275. if (param->in_elements_num0_ == 1) {
  276. #ifdef ENABLE_NEON
  277. for (; index <= element_size - 8; index += C8NUM) {
  278. float16x8_t vin1 = vld1q_f16(input1 + index);
  279. float16x8_t vout = vaddq_f16(vin0_opt, vin1);
  280. vout = vmaxq_f16(vout, zeros);
  281. vst1q_f16(output + index, vout);
  282. }
  283. #endif
  284. for (; index < element_size; index++) {
  285. float16_t res = input0[0] + input1[index];
  286. output[index] = res > 0 ? res : 0;
  287. }
  288. } else {
  289. #ifdef ENABLE_NEON
  290. for (; index <= element_size - 8; index += C8NUM) {
  291. float16x8_t vin0 = vld1q_f16(input0 + index);
  292. float16x8_t vout = vaddq_f16(vin0, vin1_opt);
  293. vout = vmaxq_f16(vout, zeros);
  294. vst1q_f16(output + index, vout);
  295. }
  296. #endif
  297. for (; index < element_size; index++) {
  298. float16_t res = input0[index] + input1[0];
  299. output[index] = res > 0 ? res : 0;
  300. }
  301. }
  302. return NNACL_OK;
  303. }
  304. int ElementAddRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  305. int index = 0;
  306. #ifdef ENABLE_NEON
  307. float16x8_t zeros = vdupq_n_f16(0.0);
  308. float16x8_t bounds = vdupq_n_f16(6.0);
  309. for (; index <= element_size - 8; index += C8NUM) {
  310. float16x8_t vin0 = vld1q_f16(input0 + index);
  311. float16x8_t vin1 = vld1q_f16(input1 + index);
  312. float16x8_t vout = vaddq_f16(vin0, vin1);
  313. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  314. vst1q_f16(output + index, vout);
  315. }
  316. #endif
  317. for (; index < element_size; index++) {
  318. output[index] = MSMIN(MSMAX(input0[index] + input1[index], 0), 6);
  319. }
  320. return NNACL_OK;
  321. }
  322. int ElementOptAddRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  323. ArithmeticParameter *param) {
  324. #ifdef ENABLE_NEON
  325. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  326. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  327. float16x8_t zeros = vdupq_n_f16(0.0);
  328. float16x8_t bounds = vdupq_n_f16(6.0);
  329. #endif
  330. int index = 0;
  331. if (param->in_elements_num0_ == 1) {
  332. #ifdef ENABLE_NEON
  333. for (; index <= element_size - 8; index += C8NUM) {
  334. float16x8_t vin1 = vld1q_f16(input1 + index);
  335. float16x8_t vout = vaddq_f16(vin0_opt, vin1);
  336. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  337. vst1q_f16(output + index, vout);
  338. }
  339. #endif
  340. for (; index < element_size; index++) {
  341. output[index] = MSMIN(MSMAX(input0[0] + input1[index], 0), 6);
  342. }
  343. } else {
  344. #ifdef ENABLE_NEON
  345. for (; index <= element_size - 8; index += C8NUM) {
  346. float16x8_t vin0 = vld1q_f16(input0 + index);
  347. float16x8_t vout = vaddq_f16(vin0, vin1_opt);
  348. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  349. vst1q_f16(output + index, vout);
  350. }
  351. #endif
  352. for (; index < element_size; index++) {
  353. output[index] = MSMIN(MSMAX(input0[index] + input1[0], 0), 6);
  354. }
  355. }
  356. return NNACL_OK;
  357. }
  358. int ElementSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  359. int index = 0;
  360. #ifdef ENABLE_NEON
  361. for (; index <= element_size - 8; index += C8NUM) {
  362. float16x8_t vin0 = vld1q_f16(input0 + index);
  363. float16x8_t vin1 = vld1q_f16(input1 + index);
  364. float16x8_t vout = vsubq_f16(vin0, vin1);
  365. vst1q_f16(output + index, vout);
  366. }
  367. #endif
  368. for (; index < element_size; index++) {
  369. output[index] = input0[index] - input1[index];
  370. }
  371. return NNACL_OK;
  372. }
  373. int ElementOptSubFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  374. ArithmeticParameter *param) {
  375. #ifdef ENABLE_NEON
  376. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  377. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  378. #endif
  379. int index = 0;
  380. if (param->in_elements_num0_ == 1) {
  381. #ifdef ENABLE_NEON
  382. for (; index <= element_size - 8; index += C8NUM) {
  383. float16x8_t vin1 = vld1q_f16(input1 + index);
  384. float16x8_t vout = vsubq_f16(vin0_opt, vin1);
  385. vst1q_f16(output + index, vout);
  386. }
  387. #endif
  388. for (; index < element_size; index++) {
  389. output[index] = input0[0] - input1[index];
  390. }
  391. } else {
  392. #ifdef ENABLE_NEON
  393. for (; index <= element_size - 8; index += C8NUM) {
  394. float16x8_t vin0 = vld1q_f16(input0 + index);
  395. float16x8_t vout = vsubq_f16(vin0, vin1_opt);
  396. vst1q_f16(output + index, vout);
  397. }
  398. #endif
  399. for (; index < element_size; index++) {
  400. output[index] = input0[index] - input1[0];
  401. }
  402. }
  403. return NNACL_OK;
  404. }
  405. int ElementSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  406. int index = 0;
  407. #ifdef ENABLE_NEON
  408. float16x8_t zeros = vdupq_n_f16(0.0);
  409. for (; index <= element_size - 8; index += C8NUM) {
  410. float16x8_t vin0 = vld1q_f16(input0 + index);
  411. float16x8_t vin1 = vld1q_f16(input1 + index);
  412. float16x8_t vout = vsubq_f16(vin0, vin1);
  413. vout = vmaxq_f16(vout, zeros);
  414. vst1q_f16(output + index, vout);
  415. }
  416. #endif
  417. for (; index < element_size; index++) {
  418. float16_t res = input0[index] - input1[index];
  419. output[index] = res > 0 ? res : 0;
  420. }
  421. return NNACL_OK;
  422. }
  423. int ElementOptSubReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  424. ArithmeticParameter *param) {
  425. #ifdef ENABLE_NEON
  426. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  427. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  428. float16x8_t zeros = vdupq_n_f16(0.0);
  429. #endif
  430. int index = 0;
  431. if (param->in_elements_num0_ == 1) {
  432. #ifdef ENABLE_NEON
  433. for (; index <= element_size - 8; index += C8NUM) {
  434. float16x8_t vin1 = vld1q_f16(input1 + index);
  435. float16x8_t vout = vsubq_f16(vin0_opt, vin1);
  436. vout = vmaxq_f16(vout, zeros);
  437. vst1q_f16(output + index, vout);
  438. }
  439. #endif
  440. for (; index < element_size; index++) {
  441. float16_t res = input0[0] - input1[index];
  442. output[index] = res > 0 ? res : 0;
  443. }
  444. } else {
  445. #ifdef ENABLE_NEON
  446. for (; index <= element_size - 8; index += C8NUM) {
  447. float16x8_t vin0 = vld1q_f16(input0 + index);
  448. float16x8_t vout = vsubq_f16(vin0, vin1_opt);
  449. vout = vmaxq_f16(vout, zeros);
  450. vst1q_f16(output + index, vout);
  451. }
  452. #endif
  453. for (; index < element_size; index++) {
  454. float16_t res = input0[index] - input1[0];
  455. output[index] = res > 0 ? res : 0;
  456. }
  457. }
  458. return NNACL_OK;
  459. }
  460. int ElementSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  461. int index = 0;
  462. #ifdef ENABLE_NEON
  463. float16x8_t zeros = vdupq_n_f16(0.0);
  464. float16x8_t bounds = vdupq_n_f16(6.0);
  465. for (; index <= element_size - 8; index += C8NUM) {
  466. float16x8_t vin0 = vld1q_f16(input0 + index);
  467. float16x8_t vin1 = vld1q_f16(input1 + index);
  468. float16x8_t vout = vsubq_f16(vin0, vin1);
  469. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  470. vst1q_f16(output + index, vout);
  471. }
  472. #endif
  473. for (; index < element_size; index++) {
  474. output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6);
  475. }
  476. return NNACL_OK;
  477. }
  478. int ElementOptSubRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  479. ArithmeticParameter *param) {
  480. #ifdef ENABLE_NEON
  481. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  482. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  483. float16x8_t zeros = vdupq_n_f16(0.0);
  484. float16x8_t bounds = vdupq_n_f16(6.0);
  485. #endif
  486. int index = 0;
  487. if (param->in_elements_num0_ == 1) {
  488. #ifdef ENABLE_NEON
  489. for (; index <= element_size - 8; index += C8NUM) {
  490. float16x8_t vin1 = vld1q_f16(input1 + index);
  491. float16x8_t vout = vsubq_f16(vin0_opt, vin1);
  492. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  493. vst1q_f16(output + index, vout);
  494. }
  495. #endif
  496. for (; index < element_size; index++) {
  497. output[index] = MSMIN(MSMAX(input0[0] - input1[index], 0), 6);
  498. }
  499. } else {
  500. #ifdef ENABLE_NEON
  501. for (; index <= element_size - 8; index += C8NUM) {
  502. float16x8_t vin0 = vld1q_f16(input0 + index);
  503. float16x8_t vout = vsubq_f16(vin0, vin1_opt);
  504. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  505. vst1q_f16(output + index, vout);
  506. }
  507. #endif
  508. for (; index < element_size; index++) {
  509. output[index] = MSMIN(MSMAX(input0[index] - input1[0], 0), 6);
  510. }
  511. }
  512. return NNACL_OK;
  513. }
  514. int ElementDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  515. int index = 0;
  516. #ifdef ENABLE_NEON
  517. for (; index <= element_size - 8; index += C8NUM) {
  518. float16x8_t vin0 = vld1q_f16(input0 + index);
  519. float16x8_t vin1 = vld1q_f16(input1 + index);
  520. float16x8_t vout = vdivq_f16(vin0, vin1);
  521. vst1q_f16(output + index, vout);
  522. }
  523. #endif
  524. for (; index < element_size; index++) {
  525. output[index] = input0[index] / input1[index];
  526. }
  527. return NNACL_OK;
  528. }
  529. int ElementOptDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  530. ArithmeticParameter *param) {
  531. #ifdef ENABLE_NEON
  532. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  533. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  534. #endif
  535. int index = 0;
  536. if (param->in_elements_num0_ == 1) {
  537. #ifdef ENABLE_NEON
  538. for (; index <= element_size - 8; index += C8NUM) {
  539. float16x8_t vin1 = vld1q_f16(input1 + index);
  540. float16x8_t vout = vdivq_f16(vin0_opt, vin1);
  541. vst1q_f16(output + index, vout);
  542. }
  543. #endif
  544. for (; index < element_size; index++) {
  545. output[index] = input0[0] / input1[index];
  546. }
  547. } else {
  548. if (input1[0] == 0) {
  549. return NNACL_ERRCODE_DIVISOR_ZERO;
  550. }
  551. #ifdef ENABLE_NEON
  552. for (; index <= element_size - 8; index += C8NUM) {
  553. float16x8_t vin0 = vld1q_f16(input0 + index);
  554. float16x8_t vout = vdivq_f16(vin0, vin1_opt);
  555. vst1q_f16(output + index, vout);
  556. }
  557. #endif
  558. for (; index < element_size; index++) {
  559. output[index] = input0[index] / input1[0];
  560. }
  561. }
  562. return NNACL_OK;
  563. }
  564. int ElementDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  565. int index = 0;
  566. #ifdef ENABLE_NEON
  567. float16x8_t zeros = vdupq_n_f16(0.0);
  568. for (; index <= element_size - 8; index += C8NUM) {
  569. float16x8_t vin0 = vld1q_f16(input0 + index);
  570. float16x8_t vin1 = vld1q_f16(input1 + index);
  571. float16x8_t vout = vdivq_f16(vin0, vin1);
  572. vout = vmaxq_f16(vout, zeros);
  573. vst1q_f16(output + index, vout);
  574. }
  575. #endif
  576. for (; index < element_size; index++) {
  577. if (input1[index] == 0) {
  578. return NNACL_ERRCODE_DIVISOR_ZERO;
  579. }
  580. float16_t res = input0[index] / input1[index];
  581. output[index] = res > 0 ? res : 0;
  582. }
  583. return NNACL_OK;
  584. }
  585. int ElementOptDivReluFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  586. ArithmeticParameter *param) {
  587. #ifdef ENABLE_NEON
  588. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  589. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  590. float16x8_t zeros = vdupq_n_f16(0.0);
  591. #endif
  592. int index = 0;
  593. if (param->in_elements_num0_ == 1) {
  594. #ifdef ENABLE_NEON
  595. for (; index <= element_size - 8; index += C8NUM) {
  596. float16x8_t vin1 = vld1q_f16(input1 + index);
  597. float16x8_t vout = vmaxq_f16(vdivq_f16(vin0_opt, vin1), zeros);
  598. vst1q_f16(output + index, vout);
  599. }
  600. #endif
  601. for (; index < element_size; index++) {
  602. if (input1[index] == 0) {
  603. return NNACL_ERRCODE_DIVISOR_ZERO;
  604. }
  605. output[index] = MSMAX(input0[0] / input1[index], 0);
  606. }
  607. } else {
  608. if (input1[0] == 0) {
  609. return NNACL_ERRCODE_DIVISOR_ZERO;
  610. }
  611. #ifdef ENABLE_NEON
  612. for (; index <= element_size - 8; index += C8NUM) {
  613. float16x8_t vin0 = vld1q_f16(input0 + index);
  614. float16x8_t vout = vmaxq_f16(vdivq_f16(vin0, vin1_opt), zeros);
  615. vst1q_f16(output + index, vout);
  616. }
  617. #endif
  618. for (; index < element_size; index++) {
  619. output[index] = MSMAX(input0[index] / input1[0], 0);
  620. }
  621. }
  622. return NNACL_OK;
  623. }
  624. int ElementDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  625. int index = 0;
  626. #ifdef ENABLE_NEON
  627. float16x8_t zeros = vdupq_n_f16(0.0);
  628. float16x8_t bounds = vdupq_n_f16(6.0);
  629. for (; index <= element_size - 8; index += C8NUM) {
  630. float16x8_t vin0 = vld1q_f16(input0 + index);
  631. float16x8_t vin1 = vld1q_f16(input1 + index);
  632. float16x8_t vout = vdivq_f16(vin0, vin1);
  633. vout = vminq_f16(vmaxq_f16(vout, zeros), bounds);
  634. vst1q_f16(output + index, vout);
  635. }
  636. #endif
  637. for (; index < element_size; index++) {
  638. if (input1[index] == 0) {
  639. return NNACL_ERRCODE_DIVISOR_ZERO;
  640. }
  641. output[index] = MSMIN(MSMAX(input0[index] / input1[index], 0), 6);
  642. }
  643. return NNACL_OK;
  644. }
  645. int ElementOptDivRelu6Fp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  646. ArithmeticParameter *param) {
  647. #ifdef ENABLE_NEON
  648. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  649. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  650. float16x8_t zeros = vdupq_n_f16(0.0);
  651. float16x8_t bounds = vdupq_n_f16(6.0);
  652. #endif
  653. int index = 0;
  654. if (param->in_elements_num0_ == 1) {
  655. #ifdef ENABLE_NEON
  656. for (; index <= element_size - 8; index += C8NUM) {
  657. float16x8_t vin1 = vld1q_f16(input1 + index);
  658. float16x8_t vout = vminq_f16(vmaxq_f16(vdivq_f16(vin0_opt, vin1), zeros), bounds);
  659. vst1q_f16(output + index, vout);
  660. }
  661. #endif
  662. for (; index < element_size; index++) {
  663. if (input1[index] == 0) {
  664. return NNACL_ERRCODE_DIVISOR_ZERO;
  665. }
  666. output[index] = MSMIN(MSMAX(input0[0] / input1[index], 0), 6);
  667. }
  668. } else {
  669. if (input1[0] == 0) {
  670. return NNACL_ERRCODE_DIVISOR_ZERO;
  671. }
  672. #ifdef ENABLE_NEON
  673. for (; index <= element_size - 8; index += C8NUM) {
  674. float16x8_t vin0 = vld1q_f16(input0 + index);
  675. float16x8_t vout = vminq_f16(vmaxq_f16(vdivq_f16(vin0, vin1_opt), zeros), bounds);
  676. vst1q_f16(output + index, vout);
  677. }
  678. #endif
  679. for (; index < element_size; index++) {
  680. output[index] = MSMIN(MSMAX(input0[index] / input1[0], 0), 6);
  681. }
  682. }
  683. return NNACL_OK;
  684. }
  685. int ElementFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  686. for (int i = 0; i < element_size; ++i) {
  687. if (input1[i] == 0) {
  688. return NNACL_ERRCODE_DIVISOR_ZERO;
  689. }
  690. output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i];
  691. }
  692. return NNACL_OK;
  693. }
  694. int ElementOptFloorModFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  695. ArithmeticParameter *param) {
  696. if (param->in_elements_num1_ == 1) {
  697. for (int i = 0; i < element_size; ++i) {
  698. output[i] = input0[i] - floorf(input0[i] / input1[0]) * input1[0];
  699. }
  700. } else {
  701. for (int i = 0; i < element_size; ++i) {
  702. output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i];
  703. }
  704. }
  705. return NNACL_OK;
  706. }
  707. int ElementFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  708. for (int i = 0; i < element_size; ++i) {
  709. output[i] = floorf(input0[i] / input1[i]);
  710. }
  711. return NNACL_OK;
  712. }
  713. int ElementOptFloorDivFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  714. ArithmeticParameter *param) {
  715. if (param->in_elements_num1_ == 1) {
  716. for (int i = 0; i < element_size; ++i) {
  717. output[i] = floorf(input0[i] / input1[0]);
  718. }
  719. } else {
  720. for (int i = 0; i < element_size; ++i) {
  721. output[i] = floorf(input0[i] / input1[i]);
  722. }
  723. }
  724. return NNACL_OK;
  725. }
  726. int ElementLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  727. int index = 0;
  728. #ifdef ENABLE_NEON
  729. float16x8_t vtrue = vdupq_n_f16(1);
  730. float16x8_t vfalse = vdupq_n_f16(0);
  731. uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  732. uint16x8_t zeros = vdupq_n_u16(0);
  733. for (; index <= element_size - 8; index += C8NUM) {
  734. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
  735. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
  736. float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
  737. vst1q_f16(output + index, vout);
  738. }
  739. #endif
  740. for (; index < element_size; index++) {
  741. output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[index]));
  742. }
  743. return NNACL_OK;
  744. }
  745. int ElementOptLogicalAndFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  746. ArithmeticParameter *param) {
  747. #ifdef ENABLE_NEON
  748. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  749. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  750. float16x8_t vtrue = vdupq_n_f16(1);
  751. float16x8_t vfalse = vdupq_n_f16(0);
  752. uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  753. uint16x8_t zeros = vdupq_n_u16(0);
  754. #endif
  755. int index = 0;
  756. if (param->in_elements_num0_ == 1) {
  757. #ifdef ENABLE_NEON
  758. for (; index <= element_size - 8; index += C8NUM) {
  759. float16x8_t vin1_ = vld1q_f16(input1 + index);
  760. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
  761. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
  762. float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
  763. vst1q_f16(output + index, vout);
  764. }
  765. #endif
  766. for (; index < element_size; index++) {
  767. output[index] = (float16_t)((bool)(input0[0]) & (bool)(input1[index]));
  768. }
  769. } else {
  770. #ifdef ENABLE_NEON
  771. for (; index <= element_size - 8; index += C8NUM) {
  772. float16x8_t vin0_ = vld1q_f16(input0 + index);
  773. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
  774. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
  775. float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
  776. vst1q_f16(output + index, vout);
  777. }
  778. #endif
  779. for (; index < element_size; index++) {
  780. output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[0]));
  781. }
  782. }
  783. return NNACL_OK;
  784. }
  785. int ElementLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  786. int index = 0;
  787. #ifdef ENABLE_NEON
  788. float16x8_t vtrue = vdupq_n_f16(1);
  789. float16x8_t vfalse = vdupq_n_f16(0);
  790. uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  791. uint16x8_t zeros = vdupq_n_u16(0);
  792. for (; index <= element_size - 8; index += C8NUM) {
  793. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
  794. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
  795. float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
  796. vst1q_f16(output + index, vout);
  797. }
  798. #endif
  799. for (; index < element_size; index++) {
  800. output[index] = (float16_t)((bool)(input0[index]) | (bool)(input1[index]));
  801. }
  802. return NNACL_OK;
  803. }
  804. int ElementOptLogicalOrFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  805. ArithmeticParameter *param) {
  806. #ifdef ENABLE_NEON
  807. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  808. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  809. float16x8_t vtrue = vdupq_n_f16(1);
  810. float16x8_t vfalse = vdupq_n_f16(0);
  811. uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  812. uint16x8_t zeros = vdupq_n_u16(0);
  813. #endif
  814. int index = 0;
  815. if (param->in_elements_num0_ == 1) {
  816. #ifdef ENABLE_NEON
  817. for (; index <= element_size - 8; index += C8NUM) {
  818. float16x8_t vin1_ = vld1q_f16(input1 + index);
  819. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
  820. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
  821. float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
  822. vst1q_f16(output + index, vout);
  823. }
  824. #endif
  825. for (; index < element_size; index++) {
  826. output[index] = (float16_t)((bool)(input0[0]) | (bool)(input1[index]));
  827. }
  828. } else {
  829. #ifdef ENABLE_NEON
  830. for (; index <= element_size - 8; index += C8NUM) {
  831. float16x8_t vin0_ = vld1q_f16(input0 + index);
  832. uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
  833. uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
  834. float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
  835. vst1q_f16(output + index, vout);
  836. }
  837. #endif
  838. for (; index < element_size; index++) {
  839. output[index] = (float16_t)((bool)(input0[index]) | (bool)(input1[0]));
  840. }
  841. }
  842. return NNACL_OK;
  843. }
  844. int ElementSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  845. ElementSubFp16(input0, input1, output, element_size);
  846. return ElementMulFp16(output, output, output, element_size);
  847. }
  848. int ElementOptSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  849. ArithmeticParameter *param) {
  850. ElementOptSubFp16(input0, input1, output, element_size, param);
  851. return ElementMulFp16(output, output, output, element_size);
  852. }
  853. int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  854. int index = 0;
  855. #ifdef ENABLE_NEON
  856. for (; index <= element_size - 8; index += C8NUM) {
  857. float16x8_t vin0 = vld1q_f16(input0 + index);
  858. float16x8_t vin1 = vld1q_f16(input1 + index);
  859. float16x8_t vout = vmaxq_f16(vin0, vin1);
  860. vst1q_f16(output + index, vout);
  861. }
  862. #endif
  863. for (; index < element_size; index++) {
  864. output[index] = MSMAX(input0[index], input1[index]);
  865. }
  866. return NNACL_OK;
  867. }
  868. int ElementOptMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  869. ArithmeticParameter *param) {
  870. #ifdef ENABLE_NEON
  871. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  872. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  873. #endif
  874. int index = 0;
  875. if (param->in_elements_num0_ == 1) {
  876. #ifdef ENABLE_NEON
  877. for (; index <= element_size - 8; index += C8NUM) {
  878. float16x8_t vin1 = vld1q_f16(input1 + index);
  879. float16x8_t vout = vmaxq_f16(vin0_opt, vin1);
  880. vst1q_f16(output + index, vout);
  881. }
  882. #endif
  883. for (; index < element_size; index++) {
  884. output[index] = MSMAX(input0[0], input1[index]);
  885. }
  886. } else {
  887. #ifdef ENABLE_NEON
  888. for (; index <= element_size - 8; index += C8NUM) {
  889. float16x8_t vin0 = vld1q_f16(input0 + index);
  890. float16x8_t vout = vmaxq_f16(vin0, vin1_opt);
  891. vst1q_f16(output + index, vout);
  892. }
  893. #endif
  894. for (; index < element_size; index++) {
  895. output[index] = MSMAX(input0[index], input1[0]);
  896. }
  897. }
  898. return NNACL_OK;
  899. }
  900. int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  901. int index = 0;
  902. #ifdef ENABLE_NEON
  903. for (; index <= element_size - 8; index += C8NUM) {
  904. float16x8_t vin0 = vld1q_f16(input0 + index);
  905. float16x8_t vin1 = vld1q_f16(input1 + index);
  906. float16x8_t vout = vminq_f16(vin0, vin1);
  907. vst1q_f16(output + index, vout);
  908. }
  909. #endif
  910. for (; index < element_size; index++) {
  911. output[index] = MSMIN(input0[index], input1[index]);
  912. }
  913. return NNACL_OK;
  914. }
  915. int ElementOptMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  916. ArithmeticParameter *param) {
  917. #ifdef ENABLE_NEON
  918. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  919. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  920. #endif
  921. int index = 0;
  922. if (param->in_elements_num0_ == 1) {
  923. #ifdef ENABLE_NEON
  924. for (; index <= element_size - 8; index += C8NUM) {
  925. float16x8_t vin1 = vld1q_f16(input1 + index);
  926. float16x8_t vout = vminq_f16(vin0_opt, vin1);
  927. vst1q_f16(output + index, vout);
  928. }
  929. #endif
  930. for (; index < element_size; index++) {
  931. output[index] = MSMIN(input0[0], input1[index]);
  932. }
  933. } else {
  934. #ifdef ENABLE_NEON
  935. for (; index <= element_size - 8; index += C8NUM) {
  936. float16x8_t vin0 = vld1q_f16(input0 + index);
  937. float16x8_t vout = vminq_f16(vin0, vin1_opt);
  938. vst1q_f16(output + index, vout);
  939. }
  940. #endif
  941. for (; index < element_size; index++) {
  942. output[index] = MSMIN(input0[index], input1[0]);
  943. }
  944. }
  945. return NNACL_OK;
  946. }
  947. int ElementNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  948. int index = 0;
  949. #ifdef ENABLE_NEON
  950. float16x8_t vtrue = vdupq_n_f16(1);
  951. float16x8_t vfalse = vdupq_n_f16(0);
  952. for (; index <= element_size - 8; index += C8NUM) {
  953. float16x8_t vin0 = vld1q_f16(input0 + index);
  954. float16x8_t vin1 = vld1q_f16(input1 + index);
  955. float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vfalse, vtrue);
  956. vst1q_f16(output + index, vout);
  957. }
  958. #endif
  959. for (; index < element_size; index++) {
  960. output[index] = (float16_t)(input0[index] != input1[index]);
  961. }
  962. return NNACL_OK;
  963. }
  964. int ElementOptNotEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  965. ArithmeticParameter *param) {
  966. #ifdef ENABLE_NEON
  967. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  968. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  969. float16x8_t vtrue = vdupq_n_f16(1);
  970. float16x8_t vfalse = vdupq_n_f16(0);
  971. #endif
  972. int index = 0;
  973. if (param->in_elements_num0_ == 1) {
  974. #ifdef ENABLE_NEON
  975. for (; index <= element_size - 8; index += C8NUM) {
  976. float16x8_t vin1 = vld1q_f16(input1 + index);
  977. float16x8_t vout = vbslq_f16(vceqq_f16(vin0_opt, vin1), vfalse, vtrue);
  978. vst1q_f16(output + index, vout);
  979. }
  980. #endif
  981. for (; index < element_size; index++) {
  982. output[index] = (float16_t)(input0[0] != input1[index]);
  983. }
  984. } else {
  985. #ifdef ENABLE_NEON
  986. for (; index <= element_size - 8; index += C8NUM) {
  987. float16x8_t vin0 = vld1q_f16(input0 + index);
  988. float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1_opt), vfalse, vtrue);
  989. vst1q_f16(output + index, vout);
  990. }
  991. #endif
  992. for (; index < element_size; index++) {
  993. output[index] = (float16_t)(input0[index] != input1[0]);
  994. }
  995. }
  996. return NNACL_OK;
  997. }
  998. int ElementEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  999. int index = 0;
  1000. #ifdef ENABLE_NEON
  1001. float16x8_t vtrue = vdupq_n_f16(1);
  1002. float16x8_t vfalse = vdupq_n_f16(0);
  1003. for (; index <= element_size - 8; index += C8NUM) {
  1004. float16x8_t vin0 = vld1q_f16(input0 + index);
  1005. float16x8_t vin1 = vld1q_f16(input1 + index);
  1006. float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1), vtrue, vfalse);
  1007. vst1q_f16(output + index, vout);
  1008. }
  1009. #endif
  1010. for (; index < element_size; index++) {
  1011. output[index] = (float16_t)(input0[index] == input1[index]);
  1012. }
  1013. return NNACL_OK;
  1014. }
  1015. int ElementOptEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  1016. ArithmeticParameter *param) {
  1017. #ifdef ENABLE_NEON
  1018. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  1019. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  1020. float16x8_t vtrue = vdupq_n_f16(1);
  1021. float16x8_t vfalse = vdupq_n_f16(0);
  1022. #endif
  1023. int index = 0;
  1024. if (param->in_elements_num0_ == 1) {
  1025. #ifdef ENABLE_NEON
  1026. for (; index <= element_size - 8; index += C8NUM) {
  1027. float16x8_t vin1 = vld1q_f16(input1 + index);
  1028. float16x8_t vout = vbslq_f16(vceqq_f16(vin0_opt, vin1), vtrue, vfalse);
  1029. vst1q_f16(output + index, vout);
  1030. }
  1031. #endif
  1032. for (; index < element_size; index++) {
  1033. output[index] = (float16_t)(input0[0] == input1[index]);
  1034. }
  1035. } else {
  1036. #ifdef ENABLE_NEON
  1037. for (; index <= element_size - 8; index += C8NUM) {
  1038. float16x8_t vin0 = vld1q_f16(input0 + index);
  1039. float16x8_t vout = vbslq_f16(vceqq_f16(vin0, vin1_opt), vtrue, vfalse);
  1040. vst1q_f16(output + index, vout);
  1041. }
  1042. #endif
  1043. for (; index < element_size; index++) {
  1044. output[index] = (float16_t)(input0[index] == input1[0]);
  1045. }
  1046. }
  1047. return NNACL_OK;
  1048. }
  1049. int ElementLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  1050. int index = 0;
  1051. #ifdef ENABLE_NEON
  1052. float16x8_t vtrue = vdupq_n_f16(1);
  1053. float16x8_t vfalse = vdupq_n_f16(0);
  1054. for (; index <= element_size - 8; index += C8NUM) {
  1055. float16x8_t vin0 = vld1q_f16(input0 + index);
  1056. float16x8_t vin1 = vld1q_f16(input1 + index);
  1057. float16x8_t vout = vbslq_f16(vcltq_f16(vin0, vin1), vtrue, vfalse);
  1058. vst1q_f16(output + index, vout);
  1059. }
  1060. #endif
  1061. for (; index < element_size; index++) {
  1062. output[index] = (float16_t)(input0[index] < input1[index]);
  1063. }
  1064. return NNACL_OK;
  1065. }
  1066. int ElementOptLessFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  1067. ArithmeticParameter *param) {
  1068. #ifdef ENABLE_NEON
  1069. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  1070. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  1071. float16x8_t vtrue = vdupq_n_f16(1);
  1072. float16x8_t vfalse = vdupq_n_f16(0);
  1073. #endif
  1074. int index = 0;
  1075. if (param->in_elements_num0_ == 1) {
  1076. #ifdef ENABLE_NEON
  1077. for (; index <= element_size - 8; index += C8NUM) {
  1078. float16x8_t vin1 = vld1q_f16(input1 + index);
  1079. float16x8_t vout = vbslq_f16(vcltq_f16(vin0_opt, vin1), vtrue, vfalse);
  1080. vst1q_f16(output + index, vout);
  1081. }
  1082. #endif
  1083. for (; index < element_size; index++) {
  1084. output[index] = (float16_t)(input0[0] < input1[index]);
  1085. }
  1086. } else {
  1087. #ifdef ENABLE_NEON
  1088. for (; index <= element_size - 8; index += C8NUM) {
  1089. float16x8_t vin0 = vld1q_f16(input0 + index);
  1090. float16x8_t vout = vbslq_f16(vcltq_f16(vin0, vin1_opt), vtrue, vfalse);
  1091. vst1q_f16(output + index, vout);
  1092. }
  1093. #endif
  1094. for (; index < element_size; index++) {
  1095. output[index] = (float16_t)(input0[index] < input1[0]);
  1096. }
  1097. }
  1098. return NNACL_OK;
  1099. }
  1100. int ElementLessEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  1101. int index = 0;
  1102. #ifdef ENABLE_NEON
  1103. float16x8_t vtrue = vdupq_n_f16(1);
  1104. float16x8_t vfalse = vdupq_n_f16(0);
  1105. for (; index <= element_size - 8; index += C8NUM) {
  1106. float16x8_t vin0 = vld1q_f16(input0 + index);
  1107. float16x8_t vin1 = vld1q_f16(input1 + index);
  1108. float16x8_t vout = vbslq_f16(vcleq_f16(vin0, vin1), vtrue, vfalse);
  1109. vst1q_f16(output + index, vout);
  1110. }
  1111. #endif
  1112. for (; index < element_size; index++) {
  1113. output[index] = (float16_t)(input0[index] <= input1[index]);
  1114. }
  1115. return NNACL_OK;
  1116. }
  1117. int ElementOptLessEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  1118. ArithmeticParameter *param) {
  1119. #ifdef ENABLE_NEON
  1120. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  1121. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  1122. float16x8_t vtrue = vdupq_n_f16(1);
  1123. float16x8_t vfalse = vdupq_n_f16(0);
  1124. #endif
  1125. int index = 0;
  1126. if (param->in_elements_num0_ == 1) {
  1127. #ifdef ENABLE_NEON
  1128. for (; index <= element_size - 8; index += C8NUM) {
  1129. float16x8_t vin1 = vld1q_f16(input1 + index);
  1130. float16x8_t vout = vbslq_f16(vcleq_f16(vin0_opt, vin1), vtrue, vfalse);
  1131. vst1q_f16(output + index, vout);
  1132. }
  1133. #endif
  1134. for (; index < element_size; index++) {
  1135. output[index] = (float16_t)(input0[0] <= input1[index]);
  1136. }
  1137. } else {
  1138. #ifdef ENABLE_NEON
  1139. for (; index <= element_size - 8; index += C8NUM) {
  1140. float16x8_t vin0 = vld1q_f16(input0 + index);
  1141. float16x8_t vout = vbslq_f16(vcleq_f16(vin0, vin1_opt), vtrue, vfalse);
  1142. vst1q_f16(output + index, vout);
  1143. }
  1144. #endif
  1145. for (; index < element_size; index++) {
  1146. output[index] = (float16_t)(input0[index] <= input1[0]);
  1147. }
  1148. }
  1149. return NNACL_OK;
  1150. }
  1151. int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  1152. int index = 0;
  1153. #ifdef ENABLE_NEON
  1154. float16x8_t vtrue = vdupq_n_f16(1);
  1155. float16x8_t vfalse = vdupq_n_f16(0);
  1156. for (; index <= element_size - 8; index += C8NUM) {
  1157. float16x8_t vin0 = vld1q_f16(input0 + index);
  1158. float16x8_t vin1 = vld1q_f16(input1 + index);
  1159. float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1), vtrue, vfalse);
  1160. vst1q_f16(output + index, vout);
  1161. }
  1162. #endif
  1163. for (; index < element_size; index++) {
  1164. output[index] = (float16_t)(input0[index] > input1[index]);
  1165. }
  1166. return NNACL_OK;
  1167. }
  1168. int ElementOptGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  1169. ArithmeticParameter *param) {
  1170. #ifdef ENABLE_NEON
  1171. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  1172. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  1173. float16x8_t vtrue = vdupq_n_f16(1);
  1174. float16x8_t vfalse = vdupq_n_f16(0);
  1175. #endif
  1176. int index = 0;
  1177. if (param->in_elements_num0_ == 1) {
  1178. #ifdef ENABLE_NEON
  1179. for (; index <= element_size - 8; index += C8NUM) {
  1180. float16x8_t vin1 = vld1q_f16(input1 + index);
  1181. float16x8_t vout = vbslq_f16(vcgtq_f16(vin0_opt, vin1), vtrue, vfalse);
  1182. vst1q_f16(output + index, vout);
  1183. }
  1184. #endif
  1185. for (; index < element_size; index++) {
  1186. output[index] = (float16_t)(input0[0] > input1[index]);
  1187. }
  1188. } else {
  1189. #ifdef ENABLE_NEON
  1190. for (; index <= element_size - 8; index += C8NUM) {
  1191. float16x8_t vin0 = vld1q_f16(input0 + index);
  1192. float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1_opt), vtrue, vfalse);
  1193. vst1q_f16(output + index, vout);
  1194. }
  1195. #endif
  1196. for (; index < element_size; index++) {
  1197. output[index] = (float16_t)(input0[index] > input1[0]);
  1198. }
  1199. }
  1200. return NNACL_OK;
  1201. }
  1202. int ElementGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) {
  1203. int index = 0;
  1204. #ifdef ENABLE_NEON
  1205. float16x8_t vtrue = vdupq_n_f16(1);
  1206. float16x8_t vfalse = vdupq_n_f16(0);
  1207. for (; index <= element_size - 8; index += C8NUM) {
  1208. float16x8_t vin0 = vld1q_f16(input0 + index);
  1209. float16x8_t vin1 = vld1q_f16(input1 + index);
  1210. float16x8_t vout = vbslq_f16(vcgeq_f16(vin0, vin1), vtrue, vfalse);
  1211. vst1q_f16(output + index, vout);
  1212. }
  1213. #endif
  1214. for (; index < element_size; index++) {
  1215. output[index] = (float16_t)(input0[index] >= input1[index]);
  1216. }
  1217. return NNACL_OK;
  1218. }
  1219. int ElementOptGreaterEqualFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size,
  1220. ArithmeticParameter *param) {
  1221. #ifdef ENABLE_NEON
  1222. float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  1223. float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  1224. float16x8_t vtrue = vdupq_n_f16(1);
  1225. float16x8_t vfalse = vdupq_n_f16(0);
  1226. #endif
  1227. int index = 0;
  1228. if (param->in_elements_num0_ == 1) {
  1229. #ifdef ENABLE_NEON
  1230. for (; index <= element_size - 8; index += C8NUM) {
  1231. float16x8_t vin1 = vld1q_f16(input1 + index);
  1232. float16x8_t vout = vbslq_f16(vcgeq_f16(vin0_opt, vin1), vtrue, vfalse);
  1233. vst1q_f16(output + index, vout);
  1234. }
  1235. #endif
  1236. for (; index < element_size; index++) {
  1237. output[index] = (float16_t)(input0[0] >= input1[index]);
  1238. }
  1239. } else {
  1240. #ifdef ENABLE_NEON
  1241. for (; index <= element_size - 8; index += C8NUM) {
  1242. float16x8_t vin0 = vld1q_f16(input0 + index);
  1243. float16x8_t vout = vbslq_f16(vcgeq_f16(vin0, vin1_opt), vtrue, vfalse);
  1244. vst1q_f16(output + index, vout);
  1245. }
  1246. #endif
  1247. for (; index < element_size; index++) {
  1248. output[index] = (float16_t)(input0[index] >= input1[0]);
  1249. }
  1250. }
  1251. return NNACL_OK;
  1252. }