Browse Source

[CPU] fix some defects

tags/v1.5.0-rc1
zhanyuan 4 years ago
parent
commit
f7da9c0777
5 changed files with 26 additions and 21 deletions
  1. +8
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.cc
  2. +5
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt
  3. +3
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c
  4. +3
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h
  5. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc

+ 8
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.cc View File

@@ -58,25 +58,26 @@ bool GatherV2CPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,

template <typename T>
void GatherV2CPUKernel<T>::ParallelRun(int8_t *input_addr, int8_t *output_addr, int thread_num) {
int outer_size = 1, inner_size = 1;
for (int64_t i = 0; i < axis_; ++i) {
size_t outer_size = 1, inner_size = 1;
auto axis = static_cast<size_t>(axis_);
for (size_t i = 0; i < axis; ++i) {
outer_size *= input_shape_.at(i);
}
for (size_t i = axis_ + 1; i < input_shape_.size(); ++i) {
for (size_t i = axis + 1; i < input_shape_.size(); ++i) {
inner_size *= input_shape_.at(i);
}
int indices_element_size = 1;
size_t indices_element_size = 1;
for (size_t i = 0; i < indices_shape_.size(); i++) {
indices_element_size *= indices_shape_.at(i);
}
const int limit = input_shape_.at(axis_);
int stride = UP_DIV(outer_size, thread_num);
auto limit = input_shape_.at(axis);
size_t stride = UP_DIV(outer_size, thread_num);
std::vector<common::Task> tasks;
int thread_index = 0;
while (thread_index < thread_num) {
int count = MSMIN(stride, outer_size - stride * thread_index);
if (count <= 0) break;
auto thread_stride = stride * thread_index;
auto thread_stride = static_cast<size_t>(stride * thread_index);
int8_t *in = input_addr + thread_stride * limit * inner_size * sizeof(T);
int8_t *out = output_addr + thread_stride * indices_element_size * inner_size * sizeof(T);
auto block = [&, in, count, out]() {


+ 5
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt View File

@@ -109,10 +109,13 @@ if(ENABLE_CPU)
target_compile_options(nnacl_mid PRIVATE -mavx512f)
endif()
target_compile_options(nnacl_mid PRIVATE -fPIC)
add_library(nnacl SHARED $<TARGET_OBJECTS:nnacl_mid>)
if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
target_link_options(nnacl_mid PRIVATE -Wl,-z,relro,-z,now)
target_link_options(nnacl PRIVATE -Wl,-z,relro,-z,now,-z,noexecstack)
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
target_link_options(nnacl PRIVATE -s)
endif()
endif()
add_library(nnacl SHARED $<TARGET_OBJECTS:nnacl_mid>)
endif()

########################### arm fp16 build optimize library ########################


+ 3
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.c View File

@@ -525,7 +525,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input,
return;
}

void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst) {
void RowMajor2Col16x4MajorInt8(const int8_t *src, int row, int col, int8_t *dst) {
int row_16 = UP_ROUND(row, C16NUM);
int stride = sizeof(int8_t) * 16 * 4;
for (int r = 0; r < row_16; ++r) {
@@ -542,7 +542,7 @@ void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst) {
}

// dst: weight_zp * input_row_sums
void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order) {
void CalcInputSums(const int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order) {
for (int r = 0; r < row; ++r) {
int sum = 0;
for (int c = 0; c < col; ++c) {
@@ -558,7 +558,7 @@ void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, Dat
}

// dst: bias + depth*input_zp*weight_zp - input_zp*weight_col_sums
void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias,
void CalcWeightBiasSums(const int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias,
int *dst, DataOrder order, bool filter_per_channel) {
for (int c = 0; c < col; ++c) {
int sum = 0;


+ 3
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/matmul_int8.h View File

@@ -29,9 +29,9 @@ extern "C" {
void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
const int *input_sum, const int *bias);
void RowMajor2Row16x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst);
void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias,
void RowMajor2Col16x4MajorInt8(const int8_t *src, int row, int col, int8_t *dst);
void CalcInputSums(const int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
void CalcWeightBiasSums(const int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias,
int *dst, DataOrder order, bool filter_per_channel);
void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums,
const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier,


+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/split_cpu_kernel.cc View File

@@ -47,27 +47,28 @@ bool SplitCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
// Populates a SplitParameter from this kernel's members (output_num_, axis_, input_shape_)
// and dispatches DoSplit through ParallelLaunchAutoSearch.
//
// input  - source buffer, forwarded unchanged to DoSplit.
// output - array of destination pointers, forwarded to DoSplit as void **.
// The third (size_t) parameter is unnamed and unused.
//
// NOTE(review): this text is a rendered diff hunk (+7 / -6). Wherever a statement appears
// twice in a row, the first copy is the removed line and the copy carrying the explicit
// static_cast / size_t is its replacement; only one of each pair exists in the real file.
template <typename T>
void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t /* size */) {
SplitParameter param;
param.num_split_ = output_num_;
param.split_dim_ = axis_;
param.num_split_ = static_cast<int>(output_num_);
param.split_dim_ = static_cast<int>(axis_);
// The last dimension gets stride 1; the loop below then fills the remaining strides
// back to front, each one being the following stride times the following dimension's extent.
// NOTE(review): input_shape_.size() - 1 wraps around if input_shape_ is empty — presumably
// excluded by an earlier shape check; confirm against the caller.
param.strides_[input_shape_.size() - 1] = 1;
for (int i = input_shape_.size() - 2; i >= 0; i--) { // from -2 to 0 dim
param.strides_[i] = param.strides_[i + 1] * input_shape_[i + 1];
}
// split_sizes owns the array; param.split_sizes_ only borrows the raw pointer, so the
// unique_ptr has to stay alive for as long as param is in use (it lives until return).
auto split_sizes = std::make_unique<int[]>(param.num_split_);
auto split_sizes = std::make_unique<int[]>(static_cast<size_t>(param.num_split_));
param.split_sizes_ = split_sizes.get();
// Every output receives the same extent along the split dimension: the dimension's size
// divided by output_num_ using integer division.
int split_size = input_shape_[param.split_dim_] / output_num_;
int split_size = input_shape_[static_cast<size_t>(param.split_dim_)] / output_num_;
for (int i = 0; i < param.num_split_; i++) {
param.split_sizes_[i] = split_size;
}
// split_count_ becomes the product of all dimensions that precede axis_.
param.split_count_ = 1;
for (int i = 0; i < axis_; ++i) {
for (size_t i = 0; i < static_cast<size_t>(axis_); ++i) {
param.split_count_ *= input_shape_[i];
}
// The task receives a [start, end) range and passes it to DoSplit as (start, end - start).
// It captures by reference, so param, input and output must remain valid while it runs.
auto task = [&](size_t start, size_t end) {
(void)DoSplit(input, reinterpret_cast<void **>(output), &input_shape_[0], SizeToInt(start), SizeToInt(end - start),
&param, SizeToInt(sizeof(T)));
};
// Total amount of work handed to the launcher: split_count_ * num_split_.
ParallelLaunchAutoSearch(task, param.split_count_ * param.num_split_, this, &parallel_search_info_);
ParallelLaunchAutoSearch(task, static_cast<size_t>(param.split_count_ * param.num_split_), this,
&parallel_search_info_);
return;
}



Loading…
Cancel
Save