You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

AddElewise.py 8.8 kB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. from __future__ import absolute_import
  2. from .Node import Op
  3. from .. import ndarray
  4. from .._base import DNNL_LIB
  5. from ..cpu_links import matrix_elementwise_add as\
  6. cpu_matrix_elementwise_add
  7. from ..cpu_links import matrix_elementwise_add_by_const as\
  8. cpu_matrix_elementwise_add_by_const
  9. from ..gpu_links import matrix_elementwise_add_by_const,\
  10. indexedslice_oneside_add,\
  11. array_set,\
  12. matrix_elementwise_add_simple,\
  13. matrix_elementwise_add_lazy
  14. from .DataTransfer import DataD2HSparseOp
  15. from .EmbeddingLookUp import EmbeddingLookUp_Gradient
  16. import numpy as np
  class AddOp(Op):
      """Graph node computing the element-wise sum of its two input nodes.

      ``self.compute`` is not fixed at construction time. ``forward_hook``
      binds it for indexed-slice inputs and for CPU execution; in the
      remaining case it defers to ``infer_shape``, which picks one of the
      ``_compute_on_gpu_*`` methods from the input shapes.
      """

      def __init__(self, node_A, node_B, ctx=None):
          """Create the addition node.

          Parameters:
          ----
          node_A : Node
              First operand.
          node_B : Node
              Second operand.
          ctx :
              Forwarded unchanged to ``Op.__init__``.
          """
          super().__init__(AddOp, [node_A, node_B], ctx)
          self.lazy_execution = True
          # Switched to True by forward_hook when the kernel choice has to
          # wait for the shapes seen in infer_shape.
          self.compute_to_be_config = False

      def _compute_with_index(self, input_vals, output_val, stream_handle=None):
          """Add two inputs when at least one is an ``ndarray.IndexedSlices``.

          Parameters:
          ----
          input_vals : sequence of two values
              The operands; each is either an ``ndarray.IndexedSlices`` or a
              dense array.
          output_val :
              Destination array, overwritten with the sum.
          stream_handle :
              Forwarded to the GPU kernels; unused on CPU.

          Raises:
          ----
          AssertionError
              If neither input is an ``ndarray.IndexedSlices``.
          """

          def cpu_oneside_add(sparse, dense):
              # Adds sparse.values into the numpy array `dense` at
              # sparse.indices, in place.
              # NOTE(review): the deduplicate/free pair presumably merges
              # repeated indices so the fancy-indexed += counts every row
              # once -- confirm against IndexedSlices.
              sparse.cpu_deduplicate()
              # `np.int` was only an alias of the builtin `int` and was
              # removed in NumPy 1.24; the builtin gives the same dtype.
              row_ids = sparse.indices.asnumpy().astype(int)
              dense[row_ids] += sparse.values.asnumpy()
              sparse.free_deduplicate()

          first_indexed = isinstance(input_vals[0], ndarray.IndexedSlices)
          second_indexed = isinstance(input_vals[1], ndarray.IndexedSlices)
          if not (first_indexed or second_indexed):
              # Explicit raise instead of `assert False` so the check
              # survives `python -O`.
              raise AssertionError(
                  "AddOp._compute_with_index needs at least one IndexedSlices input")

          if self.on_cpu:
              if first_indexed and second_indexed:
                  # Both operands are sparse: accumulate into a zero buffer.
                  cpu_output = np.zeros(output_val.shape).astype(np.float32)
                  cpu_oneside_add(input_vals[0], cpu_output)
                  cpu_oneside_add(input_vals[1], cpu_output)
              elif first_indexed:
                  # NOTE(review): the dense CPU path uses .asnumpy() while
                  # this one uses .numpy() -- confirm both exist on the
                  # dense array type.
                  cpu_output = input_vals[1].numpy()
                  cpu_oneside_add(input_vals[0], cpu_output)
              else:
                  cpu_output = input_vals[0].numpy()
                  cpu_oneside_add(input_vals[1], cpu_output)
              output_val[:] = cpu_output
          else:
              if first_indexed and second_indexed:
                  # Both operands are sparse: start from an all-zero output.
                  array_set(output_val, 0, stream_handle)
                  indexedslice_oneside_add(
                      input_vals[0], output_val, stream_handle)
                  indexedslice_oneside_add(
                      input_vals[1], output_val, stream_handle)
              elif first_indexed:
                  # Seed the output with the dense operand, then add the
                  # sparse one on top of it.
                  input_vals[1].copyto(output_val)
                  indexedslice_oneside_add(
                      input_vals[0], output_val, stream_handle)
              else:
                  input_vals[0].copyto(output_val)
                  indexedslice_oneside_add(
                      input_vals[1], output_val, stream_handle)

      def _compute_on_cpu_simple(self, input_vals, output_val, stream_handle=None):
          """Add two dense inputs on CPU.

          Uses the DNNL kernels when ``DNNL_LIB`` reports them as available
          and the shapes fit (equal shapes, or one operand of shape ``(1,)``);
          otherwise falls back to adding the numpy views.
          ``stream_handle`` is accepted for signature parity and not used.
          """
          assert self.on_cpu
          lhs, rhs = input_vals[0], input_vals[1]
          if DNNL_LIB['DnnlMatrixElementwiseAdd'] and lhs.shape == rhs.shape:
              cpu_matrix_elementwise_add(lhs, rhs, output_val)
          elif DNNL_LIB['DnnlMatrixElementwiseAddByConst'] and (rhs.shape == (1,) or lhs.shape == (1,)):
              if rhs.shape == (1,):
                  const_val = rhs.asnumpy()[0]
                  cpu_matrix_elementwise_add_by_const(lhs, const_val, output_val)
              else:
                  # Only reachable when lhs.shape == (1,).
                  const_val = lhs.asnumpy()[0]
                  cpu_matrix_elementwise_add_by_const(rhs, const_val, output_val)
          else:
              # output_val[:] allows modify in-place
              output_val[:] = lhs.asnumpy() + rhs.asnumpy()

      def _compute_on_gpu_add_const(self, input_vals, output_val, stream_handle=None):
          """Add a ``(1,)``-shaped operand, read as a scalar, to the other one.

          Raises:
          ----
          AssertionError
              If neither input has shape ``(1,)``.
          """
          assert self.on_gpu
          if input_vals[1].shape == (1,):
              tensor, scalar_holder = input_vals[0], input_vals[1]
          elif input_vals[0].shape == (1,):
              tensor, scalar_holder = input_vals[1], input_vals[0]
          else:
              # Explicit raise instead of `assert False` so the check
              # survives `python -O`.
              raise AssertionError(
                  "AddOp._compute_on_gpu_add_const needs an input of shape (1,)")
          const_val = scalar_holder.asnumpy()[0]
          matrix_elementwise_add_by_const(
              tensor, const_val, output_val, stream_handle)

      def _compute_on_gpu_simple(self, input_vals, output_val, stream_handle=None):
          """Add two inputs with ``matrix_elementwise_add_simple``."""
          assert self.on_gpu
          matrix_elementwise_add_simple(
              input_vals[0], input_vals[1], output_val, stream_handle)

      def _compute_on_gpu_lazy(self, input_vals, output_val, stream_handle=None):
          """Add two inputs with ``matrix_elementwise_add_lazy``.

          The stride buffer ``self.gpu_buffer`` is (re)built first when
          ``self.check_reset`` is set.
          """
          assert self.on_gpu
          self._reset_gpu_buffer(input_vals[0], input_vals[1], output_val)
          matrix_elementwise_add_lazy(
              input_vals[0], input_vals[1], output_val, self.gpu_buffer, stream_handle)

      def _compute_on_gpu_broadcast_to_0(self, input_vals, output_val, stream_handle=None):
          """Broadcast the second input to the first input's shape, then add.

          The broadcast is requested through the second input's
          ``broadcast_to`` method with ``self.middle_result`` passed as its
          second argument; the lazy add kernel then consumes
          ``self.middle_result`` in place of the second input.
          """
          assert self.on_gpu
          input_vals[1].broadcast_to(input_vals[0].shape, self.middle_result)
          self._reset_gpu_buffer(input_vals[0], self.middle_result, output_val)
          matrix_elementwise_add_lazy(
              input_vals[0], self.middle_result, output_val, self.gpu_buffer, stream_handle)

      def _compute_on_gpu_broadcast_to_1(self, input_vals, output_val, stream_handle=None):
          """Broadcast the first input to the second input's shape, then add.

          Mirror image of ``_compute_on_gpu_broadcast_to_0``.
          """
          assert self.on_gpu
          input_vals[0].broadcast_to(input_vals[1].shape, self.middle_result)
          self._reset_gpu_buffer(self.middle_result, input_vals[1], output_val)
          matrix_elementwise_add_lazy(
              self.middle_result, input_vals[1], output_val, self.gpu_buffer, stream_handle)

      def _reset_gpu_buffer(self, input_val1, input_val2, output_val):
          """Rebuild ``self.gpu_buffer`` if ``self.check_reset`` is set.

          The buffer holds the concatenated ``stride`` sequences of the two
          inputs and the output as ``np.uintc`` values on ``self.ctx``.
          ``self.check_reset`` is cleared afterwards, so the buffer is reused
          until ``infer_shape`` requests a rebuild again.
          """
          if not self.check_reset:
              return
          strides = list(input_val1.stride) + \
              list(input_val2.stride) + list(output_val.stride)
          self.gpu_buffer = ndarray.array(
              strides, self.ctx, data_type=np.uintc)
          self.check_reset = False

      def gradient(self, output_grad):
          """Return the gradient node for each of the two inputs.

          NOTE(review): ``output_grad`` is handed to both inputs unchanged,
          also when one input was broadcast -- confirm that a reduction to
          the smaller input's shape happens elsewhere if it is required.
          """
          return [output_grad, output_grad]

      def infer_shape(self, input_shapes):
          """Compute the output shape and, on the deferred path, pick the kernel.

          Parameters:
          ----
          input_shapes : sequence of two shapes
              Shapes of the two inputs.

          Returns:
          ----
          The output shape: the common shape when both are equal, the other
          operand's shape when one operand has shape ``(1,)``, otherwise the
          shape of the operand with more elements.

          Raises:
          ----
          AssertionError
              If ``input_shapes`` does not hold exactly two shapes, or if the
              shapes cannot be broadcast against each other.
          """
          assert len(input_shapes) == 2
          first_shape, second_shape = input_shapes[0], input_shapes[1]
          no_broadcast = first_shape == second_shape
          has_const = first_shape == (1,) or second_shape == (1,)
          first_long = False
          if no_broadcast:
              output = first_shape
          elif has_const:
              # This case used to leave `output` unassigned, so the final
              # `return output` raised UnboundLocalError. The result takes
              # the shape of the operand that is not the (1,) constant.
              output = first_shape if second_shape == (1,) else second_shape
          else:
              # The operand with more elements is treated as the broadcast
              # target; on a tie the second operand is chosen.
              first_long = bool(np.prod(first_shape) > np.prod(second_shape))
              if first_long:
                  long_shapes, short_shapes = first_shape, second_shape
              else:
                  long_shapes, short_shapes = second_shape, first_shape
              # Align the shorter shape with the trailing dimensions of the
              # longer one; each of its dimensions must be 1 or match.
              offset = len(long_shapes) - len(short_shapes)
              for i, dim in enumerate(short_shapes):
                  if dim != 1 and dim != long_shapes[offset + i]:
                      # Explicit raise so the validation survives
                      # `python -O`.
                      raise AssertionError(
                          "can't add variables of shapes " +
                          str(input_shapes[0])+str(input_shapes[1]))
              output = long_shapes

          if self.compute_to_be_config:
              if has_const:
                  self.compute = self._compute_on_gpu_add_const
              elif no_broadcast:
                  if self.inputs[0].inplace or self.inputs[1].inplace:
                      self.compute = self._compute_on_gpu_lazy
                      self.check_reset = True
                  else:
                      self.compute = self._compute_on_gpu_simple
              else:
                  self.middle_result = ndarray.NDArray(None)
                  if first_long:
                      self.compute = self._compute_on_gpu_broadcast_to_0
                  else:
                      self.compute = self._compute_on_gpu_broadcast_to_1
                  # Both broadcast kernels go through _reset_gpu_buffer.
                  self.check_reset = True
          return output

      def forward_hook(self, config):
          """Bind ``self.compute`` once the execution configuration is known.

          Indexed-slice producers (``EmbeddingLookUp_Gradient`` or
          ``DataD2HSparseOp`` as either input) select the indexed path, CPU
          execution selects the simple CPU path, and anything else is
          deferred to ``infer_shape``.
          """
          super().forward_hook(config)
          sparse_producers = (EmbeddingLookUp_Gradient, DataD2HSparseOp)
          if isinstance(self.inputs[0], sparse_producers) or \
                  isinstance(self.inputs[1], sparse_producers):
              self.compute = self._compute_with_index
          elif self.on_cpu:
              self.compute = self._compute_on_cpu_simple
          else:
              # determine in infer_shape
              self.compute_to_be_config = True
              self.check_reset = False
  def add_op(node_A, node_B, ctx=None):
      """Build the addition node for two input nodes.

      Parameters:
      ----
      node_A : Node
          The first Node to be added.
      node_B : Node
          The second Node to be added.
      ctx :
          Passed through to ``AddOp`` as its ``ctx`` keyword argument.

      Returns:
      ----
      The newly constructed ``AddOp`` instance.

      """
      sum_node = AddOp(node_A, node_B, ctx=ctx)
      return sum_node