zzy34407230
/
mindspore2022

 
			
			   
				 
					
						
						
							
							/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "nnacl/fp32/softmax_fp32.h"
#include <math.h>
#include <float.h>
#include "nnacl/fp32/exp_fp32.h"

void SoftmaxNorm(const float *src, float *dst, int batch, int channel) {
  int cur_batch_offset = 0;
  for (int i = 0; i < batch; i++, cur_batch_offset += channel) {
    int j = 0;
#ifdef ENABLE_ARM64
    float32x4_t max4 = vdupq_n_f32(-FLT_MAX);
    int count = (channel / C4NUM) * C4NUM;
    for (; j < count; j += C4NUM) {
      float32x4_t input4 = vld1q_f32(src + cur_batch_offset + j);
      max4 = vmaxq_f32(max4, input4);
    }
    float max = vmaxvq_f32(max4);
#else
    float max = -FLT_MAX;
#endif
    for (; j < channel; j++) {
      float input = src[cur_batch_offset + j];
      if (input > max) {
        max = input;
      }
    }
    int k = 0;
#ifdef ENABLE_NEON
    int count2 = (channel / C4NUM) * C4NUM;
    for (; k < count2; k += C4NUM) {
      float32x4_t input4 = vld1q_f32(src + cur_batch_offset + k);
      float32x4_t output4 = vsubq_f32(input4, vdupq_n_f32(max));
      vst1q_f32(dst + cur_batch_offset + k, output4);
    }
#endif
    for (; k < channel; k++) {
      int offset = cur_batch_offset + k;
      dst[offset] = src[offset] - max;
    }
  }
}

void SumAndDiv(const float *src, float *dst, int batch, int channel) {
  int cur_batch_offset = 0;
  for (int i = 0; i < batch; i++, cur_batch_offset += channel) {
    float sum = 0;
    int j = 0;
#ifdef ENABLE_NEON
    float32x4_t sum4 = vdupq_n_f32(0);
    int count = (channel / C4NUM) * C4NUM;
    for (; j < count; j += C4NUM) {
      sum4 = vaddq_f32(sum4, vld1q_f32(src + cur_batch_offset + j));
    }
    sum = sum4[0] + sum4[1] + sum4[2] + sum4[3];
#endif
    for (; j < channel; j++) {
      sum += src[cur_batch_offset + j];
    }
    int k = 0;
#ifdef ENABLE_NEON
    const float div = 1.0f / sum;
    for (; k < count; k += C4NUM) {
      vst1q_f32(dst + cur_batch_offset + k, vmulq_n_f32(vld1q_f32(src + cur_batch_offset + k), div));
    }
#endif
    for (; k < channel; k++) {
      dst[cur_batch_offset + k] = src[cur_batch_offset + k] / sum;
    }
  }
}

void SoftmaxLastAxis(const float *src, float *dst, int batch, int channel) {
  SoftmaxNorm(src, dst, batch, channel);
  ExpFp32(dst, dst, batch * channel);
  SumAndDiv(dst, dst, batch, channel);
}

// output = exp(input) / reduce_sum(exp(input), axis)
void Softmax(const float *input_ptr, float *output_ptr, float *sum_data, const SoftmaxParameter *parameter) {
  int axis = parameter->axis_;
  int n_dim = parameter->n_dim_;
  const int *input_shape = parameter->input_shape_;
  int inner_size = 1;
  int outter_size = 1;

  for (int i = 0; i < axis; i++) {
    outter_size *= input_shape[i];
  }
  for (int i = axis + 1; i < n_dim; i++) {
    inner_size *= input_shape[i];
  }
  for (int i = 0; i < outter_size; i++) {
    int outter_offset = i * input_shape[axis] * inner_size;
    int sum_outter_offset = i * inner_size;
    for (int k = 0; k < inner_size; k++) {
      int inner_offset = outter_offset + k;
      float max_data = input_ptr[inner_offset];
      for (int j = 0; j < input_shape[axis]; j++) {
        int axis_offset = inner_offset + j * inner_size;
        max_data = max_data > input_ptr[axis_offset] ? max_data : input_ptr[axis_offset];
      }
      for (int j = 0; j < input_shape[axis]; j++) {
        int axis_offset = inner_offset + j * inner_size;
        output_ptr[axis_offset] = exp(input_ptr[axis_offset] - max_data);
        sum_data[k + sum_outter_offset] += output_ptr[axis_offset];
      }
    }
  }
  for (int i = 0; i < outter_size; i++) {
    int outter_offset = i * input_shape[axis] * inner_size;
    int sum_outter_offset = i * inner_size;
    for (int j = 0; j < input_shape[axis]; j++) {
      int axis_offset = outter_offset + j * inner_size;
      for (int k = 0; k < inner_size; k++) {
        int inner_offset = axis_offset + k;
        output_ptr[inner_offset] = output_ptr[inner_offset] / sum_data[k + sum_outter_offset];
      }
    }
  }
}