/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/common_func.h"
#include "nnacl/quantization/fixed_point.h"

// Flattens a 4-D index (dim0, dim1, dim2, dim3) into a linear row-major offset.
// Only shape[1], shape[2] and shape[3] are read; shape[0] is never accessed.
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3) {
  // Fold one axis per step: scale by that axis' extent, then add its index.
  int flat = dim0;
  flat = flat * shape[1] + dim1;
  flat = flat * shape[2] + dim2;
  flat = flat * shape[3] + dim3;
  return flat;
}

// Linear row-major offset of the first element along the last axis for the
// 3-D prefix index (dim0, dim1, dim2), i.e. the 4-D offset with a last index of 0.
// Only shape[1], shape[2] and shape[3] are read; shape[0] is never accessed.
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2) {
  int flat = dim0;
  flat = flat * shape[1] + dim1;
  flat = flat * shape[2] + dim2;
  // Scale by the innermost extent; no index is added for the last axis.
  flat *= shape[3];
  return flat;
}

// Array-argument convenience form of offset(): dims[0..3] supply the four indices.
int offset4d(const int *shape, const int *dims) {
  const int d0 = dims[0];
  const int d1 = dims[1];
  const int d2 = dims[2];
  const int d3 = dims[3];
  return offset(shape, d0, d1, d2, d3);
}

// Returns the smaller of a and b; when they are equal, b is returned.
int8_t MinInt8(int8_t a, int8_t b) {
  if (a < b) {
    return a;
  }
  return b;
}

// Returns the larger of a and b; when they are equal, a is returned.
int8_t MaxInt8(int8_t a, int8_t b) {
  if (a < b) {
    return b;
  }
  return a;
}

// Element-wise ReLU: dst[i] = (data[i] < 0) ? 0 : data[i] for every i in [0, ele_num).
//
// data     input values; left unmodified unless it is the same buffer as dst
// dst      output buffer with room for ele_num floats; may be the same pointer as
//          data for in-place use
// ele_num  number of elements to process; a value <= 0 is a no-op
//
// Results always go to dst, on both the NEON and the scalar path, and no element
// outside [0, ele_num) is ever read or written.
void ReluFp32(float *data, float *dst, int ele_num) {
  int i = 0;
#ifdef ENABLE_NEON
  // Vector path: handle C4NUM lanes per iteration while a full group remains.
  // Written as "ele_num - i" so the bound check cannot overflow for large counts.
  const float32x4_t zero_data = vdupq_n_f32(0);
  for (; ele_num - i >= C4NUM; i += C4NUM) {
    float32x4_t relu_data = vld1q_f32(data + i);
    relu_data = vmaxq_f32(relu_data, zero_data);
    vst1q_f32(dst + i, relu_data);
  }
#endif
  // Scalar path: the remainder after the vector loop, or everything without NEON.
  for (; i < ele_num; ++i) {
    dst[i] = data[i] < 0 ? 0 : data[i];
  }
}

// Element-wise ReLU6: dst[i] = data[i] clamped to the range [0, 6] for every i in
// [0, ele_num).
//
// data     input values; left unmodified unless it is the same buffer as dst
// dst      output buffer with room for ele_num floats; may be the same pointer as
//          data for in-place use
// ele_num  number of elements to process; a value <= 0 is a no-op
//
// Results always go to dst, on both the NEON and the scalar path, and no element
// outside [0, ele_num) is ever read or written.
void Relu6Fp32(float *data, float *dst, int ele_num) {
  int i = 0;
#ifdef ENABLE_NEON
  // Vector path: handle C4NUM lanes per iteration while a full group remains.
  // Written as "ele_num - i" so the bound check cannot overflow for large counts.
  const float32x4_t zero_data = vdupq_n_f32(0);
  const float32x4_t six_data = vdupq_n_f32(6);
  for (; ele_num - i >= C4NUM; i += C4NUM) {
    float32x4_t relu6_data = vld1q_f32(data + i);
    relu6_data = vmaxq_f32(relu6_data, zero_data);
    relu6_data = vminq_f32(relu6_data, six_data);
    vst1q_f32(dst + i, relu6_data);
  }
#endif
  // Scalar path: the remainder after the vector loop, or everything without NEON.
  for (; i < ele_num; ++i) {
    float value = data[i];
    value = value < 0 ? 0 : value;  // lower bound
    value = value > 6 ? 6 : value;  // upper bound
    dst[i] = value;
  }
}