Browse Source

acc reducesum

tags/v1.4.0
fangzehua 4 years ago
parent
commit
f4a9b57ffe
11 changed files with 66 additions and 30 deletions
  1. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
  2. +2
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
  3. +7
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc
  4. +2
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc
  5. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.cc
  6. +28
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.c
  7. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.h
  8. +19
    -18
      mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
  9. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h
  10. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc
  11. +1
    -1
      mindspore/ccsrc/frontend/parallel/cache_embedding/cache_embedding.cc

+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc View File

@@ -31,7 +31,7 @@ void Square(const T *in, T *out, size_t size) {
out[i] = in[i] * in[i];
}
};
CPUKernelUtils::ParallelFor(task, size);
CPUKernelUtils::ParallelFor(task, size, MAX_SQUARE_SERIAL_SIZE);
}

template <typename T>


+ 2
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h View File

@@ -20,7 +20,8 @@
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

const float MAX_NEG_SERIAL_SIZE = 10000;
const float MAX_NEG_SERIAL_SIZE = 20000;
const float MAX_SQUARE_SERIAL_SIZE = 20000;

namespace mindspore {
namespace kernel {


+ 7
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc View File

@@ -52,6 +52,11 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
output_dim_1 += input_flat_shape_list[j][1];
}
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
std::vector<T *> input_addr_list;
for (size_t j = 0; j < input_num; ++j) {
auto tmp_addr = reinterpret_cast<T *>(inputs[j]->addr);
input_addr_list.emplace_back(tmp_addr);
}
// each input's row of shape after flat are same
auto before_axis = input_flat_shape_list[0][0];
auto task = [&](size_t start, size_t end) {
@@ -61,13 +66,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
if (input_flat_shape_list[j][1] == 0) {
continue;
}
auto input_j_addr = reinterpret_cast<T *>(inputs[j]->addr);
auto copy_num = input_flat_shape_list[j][1];
auto copy_size = copy_num * sizeof(T);
auto offset = copy_num * i;
auto ret = memcpy_s(output_ptr, copy_num * sizeof(T), input_j_addr + offset, copy_num * sizeof(T));
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Memcpy failed.";
}
(void)memcpy(output_ptr, input_addr_list[j] + offset, copy_size);
output_ptr += copy_num;
}
}


+ 2
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc View File

@@ -238,9 +238,9 @@ bool EltWiseGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
const auto input1 = reinterpret_cast<T *>(inputs[1]->addr);
auto output = reinterpret_cast<T *>(outputs[0]->addr);

CPUKernelUtils::ParallelFor(
CPUKernelUtils::ParallelForAutoSearch(
std::bind(elt_map.at(kernel_name_), this, input0, input1, output, std::placeholders::_1, std::placeholders::_2),
outputs[0]->size / sizeof(T));
outputs[0]->size / sizeof(T), &parallel_search_info_);
return true;
}
} // namespace kernel


+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.cc View File

@@ -63,6 +63,9 @@ void MapCacheIdxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_LOG(EXCEPTION) << "Dimension of HashMap must be 2, (n, 4)";
}
hashmap_length_ = hashmap_shape[0];
if (hashmap_length_ <= 0) {
MS_LOG(INFO) << "Value of hashmap_length_ must > 0!";
}
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}



+ 28
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.c View File

@@ -486,3 +486,31 @@ int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const fl
}
return NNACL_OK;
}

// [A, B] -> [A]
// Sums the `col_len` contiguous floats starting at `src_data` and writes the
// total to dst_data[0].
// Returns NNACL_NULL_PTR when either pointer is NULL, otherwise NNACL_OK.
int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data) {
  if (!src_data || !dst_data) {
    return NNACL_NULL_PTR;
  }
  size_t idx = 0;
  float sum = 0;
#ifdef ENABLE_AVX
  // Accumulate C8NUM elements per step; whatever does not fill a whole step
  // is left for the scalar tail loop below.
  const size_t vec_end = col_len - col_len % C8NUM;
  float lanes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 acc = MS_MOV256_F32(lanes[0]);
  while (idx < vec_end) {
    MS_FLOAT32X8 chunk = MS_LD256_F32(src_data + idx);
    acc = MS_ADD256_F32(acc, chunk);
    idx += C8NUM;
  }
  // Spill the vector accumulator and fold its lanes into the scalar sum.
  MS_ST256_F32(lanes, acc);
  for (size_t lane = 0; lane < 8; ++lane) {
    sum += lanes[lane];
  }
#endif
  // Scalar tail (or the whole row when ENABLE_AVX is not defined).
  while (idx < col_len) {
    sum += src_data[idx];
    ++idx;
  }
  *dst_data = sum;
  return NNACL_OK;
}

+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.h View File

@@ -47,6 +47,7 @@ int ReduceSumSquare(int outer_size, int inner_size, int axis_size, const float *
int ReduceAll(int outer_size, int inner_size, int axis_size, const bool *src_data, bool *dst_data, int tid,
int thread_num);
int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const float *src_data, float *dst_data);
int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data);

#ifdef ENABLE_NNACL_INFER_SHAPE
int ReduceInferShape(int **in_shape, size_t *dim_size, int *out_shape, int *in_format, int *out_format,


+ 19
- 18
mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc View File

@@ -19,6 +19,7 @@
#include <vector>
#include <algorithm>
#include <utility>
#include "nnacl/fp32/reduce_fp32.h"

namespace mindspore {
namespace kernel {
@@ -73,20 +74,12 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
}
}
}

template <typename T>
void ReduceCPUKernel<T>::SimpleReduce(size_t start, size_t end, size_t stride, const T *input_addr, T *output_addr) {
auto pos = start * stride;
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr[pos];
pos++;
for (size_t j = 1; j < stride; ++j) {
reduce_func_(input_addr, pos, &output_addr[i]);
pos++;
}
if (reduce_type_ == kReduceMean) {
output_addr[i] /= stride;
// special accelerate for axis = 1 and input has 2 dims
if constexpr (std::is_same<T, float>::value) {
if ((reduce_type_ == kReduceMean || reduce_type_ == kReduceSum) && axis_.size() == 1 && axis_[0] == 1 &&
input_shape_.size() == 2) {
simple_execute_ = true;
}
}
}
@@ -136,11 +129,19 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
}

size_t output_size = outputs[0]->size / sizeof(T);
// special accelerate for axis = 1 and input has 2 dims
if (axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == 2) {
auto task = [&](size_t start, size_t end) { SimpleReduce(start, end, stride, input_addr, output_addr); };
CPUKernelUtils::ParallelForAutoSearch(task, output_size, &parallel_search_info_);
return true;
if constexpr (std::is_same<T, float>::value) {
if (simple_execute_) {
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
if (reduce_type_ == kReduceMean) {
output_addr[i] /= stride;
}
}
};
CPUKernelUtils::ParallelForAutoSearch(task, output_size, &parallel_search_info_);
return true;
}
}
// Calculate transpose shape
std::vector<size_t> transpose_shape(input_shape_.size());


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h View File

@@ -32,7 +32,6 @@ class ReduceCPUKernel : public CPUKernel {
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
void SimpleReduce(size_t start, size_t end, size_t stride, const T *input_addr, T *output_addr);

private:
void AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size);
@@ -42,6 +41,7 @@ class ReduceCPUKernel : public CPUKernel {
std::vector<int64_t> axis_;
ReduceType reduce_type_{kReduceAll};
std::function<void(const T *, size_t, T *)> reduce_func_;
bool simple_execute_{false};
};

MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc View File

@@ -66,7 +66,7 @@ void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t size) {
auto task = [&](size_t start, size_t end) {
(void)DoSplit(input, reinterpret_cast<void **>(output), &input_shape_[0], start, end - start, &param, sizeof(T));
};
CPUKernelUtils::ParallelFor(task, param.split_count_ * param.num_split_);
CPUKernelUtils::ParallelForAutoSearch(task, param.split_count_ * param.num_split_, &parallel_search_info_);
return;
}



+ 1
- 1
mindspore/ccsrc/frontend/parallel/cache_embedding/cache_embedding.cc View File

@@ -352,10 +352,10 @@ AnfNodePtr CreateTupleGetItem(const FuncGraphPtr &func_graph, const AnfNodePtr &
// Appends to *outputs the result of CreateTupleGetItem(func_graph, input, i)
// for every index i of the elements held by input's AbstractTuple abstract.
//
// func_graph: forwarded unchanged to CreateTupleGetItem.
// input:      node whose abstract() is cast to abstract::AbstractTuple.
// outputs:    destination vector; new nodes are appended, existing entries kept.
//
// Raises (via MS_EXCEPTION_IF_NULL) when input, outputs, or the cast abstract
// is null, instead of dereferencing a null pointer.
void CreateTupleGetItems(const FuncGraphPtr &func_graph, const AnfNodePtr &input, std::vector<AnfNodePtr> *outputs) {
  MS_EXCEPTION_IF_NULL(input);
  MS_EXCEPTION_IF_NULL(outputs);
  auto input_abstract_tuple = dyn_cast<abstract::AbstractTuple>(input->abstract());
  // Guard against a failed cast before touching elements().
  MS_EXCEPTION_IF_NULL(input_abstract_tuple);
  auto size = input_abstract_tuple->elements().size();
  for (size_t i = 0; i < size; ++i) {
    outputs->emplace_back(CreateTupleGetItem(func_graph, input, i));
  }
}

AnfNodePtr CreateEmbeddingLookup(const FuncGraphPtr &graph, AnfNodePtr params, AnfNodePtr indices) {


Loading…
Cancel
Save