from __future__ import absolute_import
from .Node import Op
from .. import ndarray
from .._base import DNNL_LIB
from ..cpu_links import matrix_elementwise_add as cpu_matrix_elementwise_add
from ..cpu_links import matrix_elementwise_add_by_const as \
    cpu_matrix_elementwise_add_by_const
from ..gpu_links import (
    matrix_elementwise_add_by_const,
    indexedslice_oneside_add,
    array_set,
    matrix_elementwise_add_simple,
    matrix_elementwise_add_lazy,
)
from .DataTransfer import DataD2HSparseOp
from .EmbeddingLookUp import EmbeddingLookUp_Gradient
import numpy as np


class AddOp(Op):
    def __init__(self, node_A, node_B, ctx=None):
        super().__init__(AddOp, [node_A, node_B], ctx)
        # This op supports lazy (strided) execution on GPU.
        self.lazy_execution = True
        # The concrete compute function is selected later, in forward_hook
        # and (for the dense GPU paths) in infer_shape.
        self.compute_to_be_config = False

    def _compute_with_index(self, input_vals, output_val, stream_handle=None):
        def cpu_oneside_add(sparse, dense):
            # Merge duplicate indices first, then scatter-add the values
            # into the dense array.
            sparse.cpu_deduplicate()
            dense[sparse.indices.asnumpy().astype(
                np.int64)] += sparse.values.asnumpy()
            sparse.free_deduplicate()
        first_indexed = isinstance(input_vals[0], ndarray.IndexedSlices)
        second_indexed = isinstance(input_vals[1], ndarray.IndexedSlices)
        if self.on_cpu:
            if first_indexed and not second_indexed:
                cpu_output = input_vals[1].numpy()
                cpu_oneside_add(input_vals[0], cpu_output)
                output_val[:] = cpu_output
            elif not first_indexed and second_indexed:
                cpu_output = input_vals[0].numpy()
                cpu_oneside_add(input_vals[1], cpu_output)
                output_val[:] = cpu_output
            elif first_indexed and second_indexed:
                cpu_output = np.zeros(output_val.shape, dtype=np.float32)
                cpu_oneside_add(input_vals[0], cpu_output)
                cpu_oneside_add(input_vals[1], cpu_output)
                output_val[:] = cpu_output
            else:
                assert False, "at least one input should be an IndexedSlices"
        else:
            if first_indexed and not second_indexed:
                input_vals[1].copyto(output_val)
                indexedslice_oneside_add(
                    input_vals[0], output_val, stream_handle)
            elif not first_indexed and second_indexed:
                input_vals[0].copyto(output_val)
                indexedslice_oneside_add(
                    input_vals[1], output_val, stream_handle)
            elif first_indexed and second_indexed:
                # No dense input to start from, so zero the output buffer
                # before accumulating both sparse updates.
                array_set(output_val, 0, stream_handle)
                indexedslice_oneside_add(
                    input_vals[0], output_val, stream_handle)
                indexedslice_oneside_add(
                    input_vals[1], output_val, stream_handle)
            else:
                assert False, "at least one input should be an IndexedSlices"
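
    # Note on the sparse path above: an IndexedSlices update may contain the
    # same row index more than once, and those rows must accumulate. The CPU
    # path makes this explicit via cpu_deduplicate before the fancy-indexed
    # +=; the GPU kernel indexedslice_oneside_add is assumed to handle
    # duplicate indices itself. A numpy-only sketch of the same semantics is
    # given at the bottom of this file.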

    def _compute_on_cpu_simple(self, input_vals, output_val, stream_handle=None):
        assert self.on_cpu
        if DNNL_LIB['DnnlMatrixElementwiseAdd'] and input_vals[0].shape == input_vals[1].shape:
            cpu_matrix_elementwise_add(
                input_vals[0], input_vals[1], output_val)
        elif DNNL_LIB['DnnlMatrixElementwiseAddByConst'] and (input_vals[1].shape == (1,) or input_vals[0].shape == (1,)):
            if input_vals[1].shape == (1,):
                const_val = input_vals[1].asnumpy()[0]
                cpu_matrix_elementwise_add_by_const(
                    input_vals[0], const_val, output_val)
            elif input_vals[0].shape == (1,):
                const_val = input_vals[0].asnumpy()[0]
                cpu_matrix_elementwise_add_by_const(
                    input_vals[1], const_val, output_val)
        else:
            # Fall back to numpy; output_val[:] writes into the existing
            # buffer in place.
            output_val[:] = input_vals[0].asnumpy() + input_vals[1].asnumpy()

    def _compute_on_gpu_add_const(self, input_vals, output_val, stream_handle=None):
        assert self.on_gpu
        if input_vals[1].shape == (1,):
            const_val = input_vals[1].asnumpy()[0]
            matrix_elementwise_add_by_const(
                input_vals[0], const_val, output_val, stream_handle)
        elif input_vals[0].shape == (1,):
            const_val = input_vals[0].asnumpy()[0]
            matrix_elementwise_add_by_const(
                input_vals[1], const_val, output_val, stream_handle)
        else:
            assert False

    def _compute_on_gpu_simple(self, input_vals, output_val, stream_handle=None):
        assert self.on_gpu
        matrix_elementwise_add_simple(
            input_vals[0], input_vals[1], output_val, stream_handle)

    def _compute_on_gpu_lazy(self, input_vals, output_val, stream_handle=None):
        assert self.on_gpu
        self._reset_gpu_buffer(input_vals[0], input_vals[1], output_val)
        matrix_elementwise_add_lazy(
            input_vals[0], input_vals[1], output_val, self.gpu_buffer, stream_handle)

    def _compute_on_gpu_broadcast_to_0(self, input_vals, output_val, stream_handle=None):
        # Broadcast the second input to the first input's shape, then add.
        assert self.on_gpu
        input_vals[1].broadcast_to(input_vals[0].shape, self.middle_result)
        self._reset_gpu_buffer(input_vals[0], self.middle_result, output_val)
        matrix_elementwise_add_lazy(
            input_vals[0], self.middle_result, output_val, self.gpu_buffer, stream_handle)

    def _compute_on_gpu_broadcast_to_1(self, input_vals, output_val, stream_handle=None):
        # Broadcast the first input to the second input's shape, then add.
        assert self.on_gpu
        input_vals[0].broadcast_to(input_vals[1].shape, self.middle_result)
        self._reset_gpu_buffer(self.middle_result, input_vals[1], output_val)
        matrix_elementwise_add_lazy(
            self.middle_result, input_vals[1], output_val, self.gpu_buffer, stream_handle)

    def _reset_gpu_buffer(self, input_val1, input_val2, output_val):
        if self.check_reset:
            strides = (list(input_val1.stride)
                       + list(input_val2.stride)
                       + list(output_val.stride))
            self.gpu_buffer = ndarray.array(
                strides, self.ctx, data_type=np.uintc)
            self.check_reset = False
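
    # Buffer layout sketch (a reading aid, not framework API): for two
    # C-contiguous (32, 10) inputs and a matching output, gpu_buffer holds
    # the three stride tuples back to back as unsigned ints:
    #     [10, 1, 10, 1, 10, 1]
    # so the lazy kernel can recover each tensor's memory layout from a
    # single device array. It is rebuilt only when check_reset was set in
    # infer_shape, i.e. when shapes (and hence strides) may have changed.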

    def gradient(self, output_grad):
        # d(a + b)/da = d(a + b)/db = 1, so the gradient passes through
        # unchanged to both inputs.
        return [output_grad, output_grad]

    def infer_shape(self, input_shapes):
        """Infer the output shape; handles equal shapes, scalar (shape (1,))
        inputs, and trailing-dimension broadcasting."""
        assert len(input_shapes) == 2
        no_broadcast = input_shapes[0] == input_shapes[1]
        has_const = input_shapes[0] == (1,) or input_shapes[1] == (1,)
        if no_broadcast:
            output = input_shapes[0]
        elif has_const:
            # Adding a scalar: the output takes the other input's shape.
            output = input_shapes[1] if input_shapes[0] == (
                1,) else input_shapes[0]
        else:
            first_size = np.prod(input_shapes[0])
            second_size = np.prod(input_shapes[1])
            if first_size > second_size:
                long_shapes = input_shapes[0]
                short_shapes = input_shapes[1]
                first_long = True
            else:
                long_shapes = input_shapes[1]
                short_shapes = input_shapes[0]
                first_long = False
            # The shorter shape must match the trailing dimensions of the
            # longer one (or be 1 there) for broadcasting to be valid.
            offset = len(long_shapes) - len(short_shapes)
            for i in range(len(short_shapes)):
                if short_shapes[i] != 1 and short_shapes[i] != long_shapes[offset + i]:
                    assert False, "cannot add tensors of shapes {} and {}".format(
                        input_shapes[0], input_shapes[1])
            output = long_shapes
        if self.compute_to_be_config:
            if has_const:
                self.compute = self._compute_on_gpu_add_const
            elif no_broadcast:
                if self.inputs[0].inplace or self.inputs[1].inplace:
                    self.compute = self._compute_on_gpu_lazy
                    self.check_reset = True
                else:
                    self.compute = self._compute_on_gpu_simple
            else:
                self.middle_result = ndarray.NDArray(None)
                if first_long:
                    self.compute = self._compute_on_gpu_broadcast_to_0
                else:
                    self.compute = self._compute_on_gpu_broadcast_to_1
                self.check_reset = True
        return output
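
    # Shape-inference examples (illustrative only; numpy broadcasting agrees
    # with the trailing-dimension rule above):
    #     (32, 10) + (32, 10) -> (32, 10)   simple / lazy kernel
    #     (32, 10) + (1,)     -> (32, 10)   add-by-const kernel
    #     (32, 10) + (10,)    -> (32, 10)   broadcast kernel, input 1 expanded
    #     (32, 10) + (5,)     -> assertion failure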

    def forward_hook(self, config):
        super().forward_hook(config)
        if isinstance(self.inputs[0], (EmbeddingLookUp_Gradient, DataD2HSparseOp)) or \
                isinstance(self.inputs[1], (EmbeddingLookUp_Gradient, DataD2HSparseOp)):
            self.compute = self._compute_with_index
        elif self.on_cpu:
            self.compute = self._compute_on_cpu_simple
        else:
            # The exact GPU compute function is chosen in infer_shape, once
            # the input shapes are known.
            self.compute_to_be_config = True
            self.check_reset = False
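
    # Dispatch summary (a reading aid): inputs that produce IndexedSlices
    # (EmbeddingLookUp_Gradient, DataD2HSparseOp) take the sparse path;
    # plain CPU tensors take the DNNL/numpy path; dense GPU tensors are
    # resolved in infer_shape to one of add-const, simple, lazy, or the two
    # broadcast variants.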


def add_op(node_A, node_B, ctx=None):
    """Make a new instance of AddOp and return it.

    Parameters:
    ----
    node_A : Node
        The first node to be added.
    node_B : Node
        The second node to be added.

    Returns:
    ----
    A new Node instance created by AddOp.

    """
    return AddOp(node_A, node_B, ctx=ctx)
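

if __name__ == "__main__":
    # A minimal numpy-only sketch of the sparse one-sided add performed in
    # _compute_with_index. Duplicate row indices must accumulate, which is
    # why the CPU path deduplicates before the fancy-indexed += above;
    # np.add.at shows the intended semantics directly, since it is an
    # unbuffered scatter-add that handles repeated indices correctly.
    dense = np.zeros((4, 3), dtype=np.float32)
    indices = np.array([0, 2, 0])                # row 0 appears twice
    values = np.ones((3, 3), dtype=np.float32)   # one update row per index
    np.add.at(dense, indices, values)
    assert dense[0, 0] == 2.0 and dense[2, 0] == 1.0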