| @@ -58,25 +58,26 @@ bool GatherV2CPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| template <typename T> | |||
| void GatherV2CPUKernel<T>::ParallelRun(int8_t *input_addr, int8_t *output_addr, int thread_num) { | |||
| int outer_size = 1, inner_size = 1; | |||
| for (int64_t i = 0; i < axis_; ++i) { | |||
| size_t outer_size = 1, inner_size = 1; | |||
| auto axis = static_cast<size_t>(axis_); | |||
| for (size_t i = 0; i < axis; ++i) { | |||
| outer_size *= input_shape_.at(i); | |||
| } | |||
| for (size_t i = axis_ + 1; i < input_shape_.size(); ++i) { | |||
| for (size_t i = axis + 1; i < input_shape_.size(); ++i) { | |||
| inner_size *= input_shape_.at(i); | |||
| } | |||
| int indices_element_size = 1; | |||
| size_t indices_element_size = 1; | |||
| for (size_t i = 0; i < indices_shape_.size(); i++) { | |||
| indices_element_size *= indices_shape_.at(i); | |||
| } | |||
| const int limit = input_shape_.at(axis_); | |||
| int stride = UP_DIV(outer_size, thread_num); | |||
| auto limit = input_shape_.at(axis); | |||
| size_t stride = UP_DIV(outer_size, thread_num); | |||
| std::vector<common::Task> tasks; | |||
| int thread_index = 0; | |||
| while (thread_index < thread_num) { | |||
| int count = MSMIN(stride, outer_size - stride * thread_index); | |||
| if (count <= 0) break; | |||
| auto thread_stride = stride * thread_index; | |||
| auto thread_stride = static_cast<size_t>(stride * thread_index); | |||
| int8_t *in = input_addr + thread_stride * limit * inner_size * sizeof(T); | |||
| int8_t *out = output_addr + thread_stride * indices_element_size * inner_size * sizeof(T); | |||
| auto block = [&, in, count, out]() { | |||
| @@ -109,10 +109,13 @@ if(ENABLE_CPU) | |||
| target_compile_options(nnacl_mid PRIVATE -mavx512f) | |||
| endif() | |||
| target_compile_options(nnacl_mid PRIVATE -fPIC) | |||
| add_library(nnacl SHARED $<TARGET_OBJECTS:nnacl_mid>) | |||
| if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
| target_link_options(nnacl_mid PRIVATE -Wl,-z,relro,-z,now) | |||
| target_link_options(nnacl PRIVATE -Wl,-z,relro,-z,now,-z,noexecstack) | |||
| if("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
| target_link_options(nnacl PRIVATE -s) | |||
| endif() | |||
| endif() | |||
| add_library(nnacl SHARED $<TARGET_OBJECTS:nnacl_mid>) | |||
| endif() | |||
| ########################### arm fp16 build optimize library ######################## | |||
| @@ -525,7 +525,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, | |||
| return; | |||
| } | |||
| void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst) { | |||
| void RowMajor2Col16x4MajorInt8(const int8_t *src, int row, int col, int8_t *dst) { | |||
| int row_16 = UP_ROUND(row, C16NUM); | |||
| int stride = sizeof(int8_t) * 16 * 4; | |||
| for (int r = 0; r < row_16; ++r) { | |||
| @@ -542,7 +542,7 @@ void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst) { | |||
| } | |||
| // dst: weight_zp * input_row_sums | |||
| void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order) { | |||
| void CalcInputSums(const int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order) { | |||
| for (int r = 0; r < row; ++r) { | |||
| int sum = 0; | |||
| for (int c = 0; c < col; ++c) { | |||
| @@ -558,7 +558,7 @@ void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, Dat | |||
| } | |||
| // dst: bias + depth*input_zp*weight_zp - input_zp*weight_col_sums | |||
| void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias, | |||
| void CalcWeightBiasSums(const int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias, | |||
| int *dst, DataOrder order, bool filter_per_channel) { | |||
| for (int c = 0; c < col; ++c) { | |||
| int sum = 0; | |||
| @@ -29,9 +29,9 @@ extern "C" { | |||
| void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | |||
| const int *input_sum, const int *bias); | |||
| void RowMajor2Row16x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst); | |||
| void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); | |||
| void CalcWeightBiasSums(int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias, | |||
| void RowMajor2Col16x4MajorInt8(const int8_t *src, int row, int col, int8_t *dst); | |||
| void CalcInputSums(const int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); | |||
| void CalcWeightBiasSums(const int8_t *weight, int row, int col, int input_zp, const int *weight_zp_ptr, const int *bias, | |||
| int *dst, DataOrder order, bool filter_per_channel); | |||
| void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int col, int deep16, const int *a_sums, | |||
| const int *bias, int act_min, int act_max, int out_zp, const int32_t *multiplier, | |||
| @@ -47,27 +47,28 @@ bool SplitCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| template <typename T> | |||
| void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t /* size */) { | |||
| SplitParameter param; | |||
| param.num_split_ = output_num_; | |||
| param.split_dim_ = axis_; | |||
| param.num_split_ = static_cast<int>(output_num_); | |||
| param.split_dim_ = static_cast<int>(axis_); | |||
| param.strides_[input_shape_.size() - 1] = 1; | |||
| for (int i = input_shape_.size() - 2; i >= 0; i--) { // from -2 to 0 dim | |||
| param.strides_[i] = param.strides_[i + 1] * input_shape_[i + 1]; | |||
| } | |||
| auto split_sizes = std::make_unique<int[]>(param.num_split_); | |||
| auto split_sizes = std::make_unique<int[]>(static_cast<size_t>(param.num_split_)); | |||
| param.split_sizes_ = split_sizes.get(); | |||
| int split_size = input_shape_[param.split_dim_] / output_num_; | |||
| int split_size = input_shape_[static_cast<size_t>(param.split_dim_)] / output_num_; | |||
| for (int i = 0; i < param.num_split_; i++) { | |||
| param.split_sizes_[i] = split_size; | |||
| } | |||
| param.split_count_ = 1; | |||
| for (int i = 0; i < axis_; ++i) { | |||
| for (size_t i = 0; i < static_cast<size_t>(axis_); ++i) { | |||
| param.split_count_ *= input_shape_[i]; | |||
| } | |||
| auto task = [&](size_t start, size_t end) { | |||
| (void)DoSplit(input, reinterpret_cast<void **>(output), &input_shape_[0], SizeToInt(start), SizeToInt(end - start), | |||
| &param, SizeToInt(sizeof(T))); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, param.split_count_ * param.num_split_, this, &parallel_search_info_); | |||
| ParallelLaunchAutoSearch(task, static_cast<size_t>(param.split_count_ * param.num_split_), this, | |||
| &parallel_search_info_); | |||
| return; | |||
| } | |||