Merge pull request !7018 from zhaozhenlong/lite/op/scale_stack_fp16tags/v1.1.0
| @@ -0,0 +1,223 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp16/scale_fp16.h" | |||||
| void ScaleInner(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size, int inner_size) { | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size * inner_size; | |||||
| for (int i = 0; i < axis_size; i++) { | |||||
| int axis_offset = out_offset + i * inner_size; | |||||
| int in_index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; in_index < inner_size - 8; in_index += 8) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vdupq_n_f16(scale[i]); | |||||
| float16x8_t offset_8 = vdupq_n_f16(offset[i]); | |||||
| float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8); | |||||
| vst1q_f16(out_data + in_offset, reslut); | |||||
| } | |||||
| #endif | |||||
| for (; in_index < inner_size; in_index++) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void ScaleAxis(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size) { | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size; | |||||
| int index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; index < axis_size - 8; index += 8) { | |||||
| int in_offset = out_offset + index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vld1q_f16(scale + index); | |||||
| float16x8_t offset_8 = vld1q_f16(offset + index); | |||||
| float16x8_t reslut = vfmaq_f16(offset_8, data, scale_8); | |||||
| vst1q_f16(out_data + in_offset, reslut); | |||||
| } | |||||
| #endif | |||||
| for (; index < axis_size; index++) { | |||||
| int in_offset = out_offset + index; | |||||
| out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index]; | |||||
| } | |||||
| } | |||||
| } | |||||
| void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param) { | |||||
| int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); | |||||
| int outer_start = task_id * outer_step; | |||||
| int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); | |||||
| if (scale_param->inner_size_ == 1) { | |||||
| ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); | |||||
| } else { | |||||
| ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, | |||||
| scale_param->inner_size_); | |||||
| } | |||||
| } | |||||
| void ScaleInnerRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size, int inner_size) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; | |||||
| #endif | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size * inner_size; | |||||
| for (int i = 0; i < axis_size; i++) { | |||||
| int axis_offset = out_offset + i * inner_size; | |||||
| int in_index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; in_index < inner_size - 8; in_index += 8) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vdupq_n_f16(scale[i]); | |||||
| float16x8_t offset_8 = vdupq_n_f16(offset[i]); | |||||
| float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); | |||||
| float16x8_t result = vmaxq_f16(tmp, zeros); | |||||
| vst1q_f16(out_data + in_offset, result); | |||||
| } | |||||
| #endif | |||||
| for (; in_index < inner_size; in_index++) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| float tmp = in_data[in_offset] * scale[i] + offset[i]; | |||||
| out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void ScaleAxisRelu(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; | |||||
| #endif | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size; | |||||
| int index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; index < axis_size - 8; index += 8) { | |||||
| int in_offset = out_offset + index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vld1q_f16(scale + index); | |||||
| float16x8_t offset_8 = vld1q_f16(offset + index); | |||||
| float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); | |||||
| float16x8_t result = vmaxq_f16(tmp, zeros); | |||||
| vst1q_f16(out_data + in_offset, result); | |||||
| } | |||||
| #endif | |||||
| for (; index < axis_size; index++) { | |||||
| int in_offset = out_offset + index; | |||||
| float tmp = in_data[in_offset] * scale[index] + offset[index]; | |||||
| out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f; | |||||
| } | |||||
| } | |||||
| } | |||||
| void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param) { | |||||
| int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); | |||||
| int outer_start = task_id * outer_step; | |||||
| int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); | |||||
| if (scale_param->inner_size_ == 1) { | |||||
| ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); | |||||
| } else { | |||||
| ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, | |||||
| scale_param->inner_size_); | |||||
| } | |||||
| } | |||||
| void ScaleInnerRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size, int inner_size) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; | |||||
| float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; | |||||
| #endif | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size * inner_size; | |||||
| for (int i = 0; i < axis_size; i++) { | |||||
| int axis_offset = out_offset + i * inner_size; | |||||
| int in_index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; in_index < inner_size - 8; in_index += 8) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vdupq_n_f16(scale[i]); | |||||
| float16x8_t offset_8 = vdupq_n_f16(offset[i]); | |||||
| float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); | |||||
| float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds); | |||||
| vst1q_f16(out_data + in_offset, result); | |||||
| } | |||||
| #endif | |||||
| for (; in_index < inner_size; in_index++) { | |||||
| int in_offset = axis_offset + in_index; | |||||
| float tmp = in_data[in_offset] * scale[i] + offset[i]; | |||||
| out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void ScaleAxisRelu6(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int outer_start, | |||||
| int outer_end, int axis_size) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0}; | |||||
| float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6}; | |||||
| #endif | |||||
| for (int out = outer_start; out < outer_end; out++) { | |||||
| int out_offset = out * axis_size; | |||||
| int index = 0; | |||||
| #ifdef ENABLE_ARM64 | |||||
| for (; index < axis_size - 8; index += 8) { | |||||
| int in_offset = out_offset + index; | |||||
| float16x8_t data = vld1q_f16(in_data + in_offset); | |||||
| float16x8_t scale_8 = vld1q_f16(scale + index); | |||||
| float16x8_t offset_8 = vld1q_f16(offset + index); | |||||
| float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8); | |||||
| float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds); | |||||
| vst1q_f16(out_data + in_offset, result); | |||||
| } | |||||
| #endif | |||||
| for (; index < axis_size; index++) { | |||||
| int in_offset = out_offset + index; | |||||
| float tmp = in_data[in_offset] * scale[index] + offset[index]; | |||||
| out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f); | |||||
| } | |||||
| } | |||||
| } | |||||
| void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param) { | |||||
| int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_); | |||||
| int outer_start = task_id * outer_step; | |||||
| int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_); | |||||
| if (scale_param->inner_size_ == 1) { | |||||
| ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_); | |||||
| } else { | |||||
| ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_, | |||||
| scale_param->inner_size_); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,38 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_SCALE_FP16_H_ | |||||
| #define MINDSPORE_LITE_NNACL_SCALE_FP16_H_ | |||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/scale.h" | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| void DoScaleFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param); | |||||
| void DoScaleReluFp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param); | |||||
| void DoScaleRelu6Fp16(float16_t *in_data, float16_t *out_data, float16_t *scale, float16_t *offset, int task_id, | |||||
| ScaleParameter *scale_param); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_SCALE_FP16_H_ | |||||
| @@ -0,0 +1,54 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "nnacl/fp16/stack_fp16.h" | |||||
| #include "nnacl/arithmetic_common.h" | |||||
| size_t GetStackCopyNum(int axis, int *in_shape, size_t shape_size) { | |||||
| size_t one_input_size = 1; | |||||
| for (size_t i = 0; i < shape_size; ++i) { | |||||
| one_input_size *= in_shape[i]; | |||||
| } | |||||
| int in_strides[4]; | |||||
| ComputeStrides(in_shape, in_strides, shape_size); | |||||
| size_t copy_num = axis > 0 ? in_strides[axis - 1] : one_input_size; | |||||
| return copy_num; | |||||
| } | |||||
| size_t GetStackPreAxisCount(const int *in_shape, int axis) { | |||||
| size_t pre_axis_count = 1; | |||||
| for (size_t i = 0; i < axis; ++i) { | |||||
| pre_axis_count *= in_shape[i]; | |||||
| } | |||||
| return pre_axis_count; | |||||
| } | |||||
| void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis, | |||||
| float16_t *output) { | |||||
| size_t copy_num = GetStackCopyNum(axis, in_shape, shape_size); | |||||
| size_t copy_size = copy_num * sizeof(float16_t); | |||||
| size_t pre_axis_count = GetStackPreAxisCount(in_shape, axis); | |||||
| size_t in_offset = 0; | |||||
| size_t out_offset = 0; | |||||
| for (size_t i = 0; i < pre_axis_count; ++i) { | |||||
| for (size_t j = 0; j < input_num; ++j) { | |||||
| memcpy(output + out_offset, inputs[j] + in_offset, copy_size); | |||||
| out_offset += copy_num; | |||||
| } | |||||
| in_offset += copy_num; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,33 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ | |||||
| #define MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ | |||||
| #include "nnacl/op_base.h" | |||||
| #ifdef ENABLE_NEON | |||||
| #include <arm_neon.h> | |||||
| #endif | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif | |||||
| void DoStackFp16(const float16_t *const *inputs, size_t input_num, int *in_shape, size_t shape_size, int axis, | |||||
| float16_t *output); | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MINDSPORE_LITE_NNACL_FP16_STACK_FP16_H_ | |||||
| @@ -18,11 +18,6 @@ | |||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| typedef struct StackParameter { | |||||
| OpParameter op_parameter_; | |||||
| int32_t axis_; | |||||
| } StackParameter; | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,26 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ | |||||
| #define MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ | |||||
| #include "nnacl/op_base.h" | |||||
| typedef struct StackParameter { | |||||
| OpParameter op_parameter_; | |||||
| int32_t axis_; | |||||
| } StackParameter; | |||||
| #endif // MINDSPORE_LITE_NNACL_STACK_PARAMETER_H_ | |||||
| @@ -126,7 +126,7 @@ | |||||
| #include "nnacl/prelu_parameter.h" | #include "nnacl/prelu_parameter.h" | ||||
| #include "nnacl/shape.h" | #include "nnacl/shape.h" | ||||
| #include "nnacl/fp32/constant_of_shape.h" | #include "nnacl/fp32/constant_of_shape.h" | ||||
| #include "nnacl/fp32/stack.h" | |||||
| #include "nnacl/stack_parameter.h" | |||||
| #include "nnacl/unstack.h" | #include "nnacl/unstack.h" | ||||
| #include "nnacl/depth_to_space.h" | #include "nnacl/depth_to_space.h" | ||||
| #include "nnacl/conv_parameter.h" | #include "nnacl/conv_parameter.h" | ||||
| @@ -0,0 +1,214 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/fp16/scale_fp16.h" | |||||
| #include <string.h> | |||||
| #include <vector> | |||||
| #include "schema/model_generated.h" | |||||
| #include "src/kernel_registry.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "src/runtime/runtime_api.h" | |||||
| #include "src/runtime/kernel/arm/fp16/common_fp16.h" | |||||
| #include "nnacl/fp16/scale_fp16.h" | |||||
| #include "nnacl/fp16/cast_fp16.h" | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_Scale; | |||||
| namespace mindspore::kernel { | |||||
| int ScaleFp16CPUKernel::InitScaleOffset() { | |||||
| auto input_tensor = in_tensors_.at(0); | |||||
| malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32; | |||||
| auto scale_tensor = in_tensors_.at(1); | |||||
| malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32; | |||||
| if (in_tensors_.size() == 2) { | |||||
| malloc_offset_ = true; | |||||
| } else { | |||||
| auto offset_tensor = in_tensors_.at(2); | |||||
| malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32; | |||||
| } | |||||
| auto output_tensor = out_tensors_.at(0); | |||||
| malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32; | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleFp16CPUKernel::Init() { | |||||
| if (in_tensors_.size() < 2 || in_tensors_.size() > 3) { | |||||
| MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << in_tensors_.size() << " is given."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (!InferShapeDone()) { | |||||
| return RET_OK; | |||||
| } | |||||
| ReSize(); | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleFp16CPUKernel::ReSize() { | |||||
| auto ret = CalculateParameter(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Scale fp16 CalculateParameter failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleFp16CPUKernel::Scale(int task_id) { | |||||
| switch (scale_param_->activation_type_) { | |||||
| case schema::ActivationType_RELU6: | |||||
| DoScaleRelu6Fp16(input_, output_, scale_, offset_, task_id, scale_param_); | |||||
| break; | |||||
| case schema::ActivationType_RELU: | |||||
| DoScaleReluFp16(input_, output_, scale_, offset_, task_id, scale_param_); | |||||
| break; | |||||
| case schema::ActivationType_NO_ACTIVATION: | |||||
| DoScaleFp16(input_, output_, scale_, offset_, task_id, scale_param_); | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "ScaleFp16 does not support activation type " << scale_param_->activation_type_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleRun(void *cdata, int task_id) { | |||||
| auto scale = reinterpret_cast<ScaleFp16CPUKernel *>(cdata); | |||||
| auto ret = scale->Scale(task_id); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "ScaleRun error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleFp16CPUKernel::Run() { | |||||
| auto ret = Prepare(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Prepare fail!ret: " << ret; | |||||
| return ret; | |||||
| } | |||||
| ret = InitScaleOffset(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed."; | |||||
| return RET_ERROR; | |||||
| } | |||||
| ret = MallocAssignTmpBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Scale Fp16 malloc tmp buffer failed"; | |||||
| FreeTmpBuffer(); | |||||
| return ret; | |||||
| } | |||||
| ret = ParallelLaunch(this->context_->thread_pool_, ScaleRun, this, op_parameter_->thread_num_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| // if output tensor is fp32, we need to transform | |||||
| if (malloc_output_) { | |||||
| auto out_tensor = out_tensors_.at(0); | |||||
| Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum()); | |||||
| } | |||||
| FreeTmpBuffer(); | |||||
| return RET_OK; | |||||
| } | |||||
| int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { | |||||
| input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); | |||||
| if (input_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_); | |||||
| if (scale_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (in_tensors_.size() == 3) { | |||||
| offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), context_); | |||||
| if (offset_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else { | |||||
| offset_ = | |||||
| reinterpret_cast<float16_t *>(context_->allocator->Malloc(in_tensors_.at(1)->ElementsNum() * sizeof(float16_t))); | |||||
| if (offset_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Malloc data failed"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)); | |||||
| } | |||||
| output_ = MallocOutputFp16(out_tensors_.at(0), context_); | |||||
| if (output_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void ScaleFp16CPUKernel::FreeTmpBuffer() { | |||||
| if (malloc_input_ && input_ != nullptr) { | |||||
| context_->allocator->Free(input_); | |||||
| input_ = nullptr; | |||||
| } | |||||
| if (malloc_scale_ && scale_ != nullptr) { | |||||
| context_->allocator->Free(scale_); | |||||
| scale_ = nullptr; | |||||
| } | |||||
| if (malloc_offset_ && offset_ != nullptr) { | |||||
| context_->allocator->Free(offset_); | |||||
| offset_ = nullptr; | |||||
| } | |||||
| if (malloc_output_ && output_ != nullptr) { | |||||
| context_->allocator->Free(output_); | |||||
| output_ = nullptr; | |||||
| } | |||||
| } | |||||
| kernel::LiteKernel *CpuScaleFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||||
| const mindspore::lite::PrimitiveC *primitive) { | |||||
| MS_ASSERT(desc.type == schema::PrimitiveType_Scale); | |||||
| if (opParameter == nullptr) { | |||||
| MS_LOG(ERROR) << "opParameter is nullptr"; | |||||
| return nullptr; | |||||
| } | |||||
| auto *kernel = new (std::nothrow) ScaleFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "New kernel fails."; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = kernel->Init(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||||
| delete kernel; | |||||
| return nullptr; | |||||
| } | |||||
| return kernel; | |||||
| } | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, CpuScaleFp16KernelCreator) | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,58 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "src/runtime/kernel/arm/fp32/scale.h" | |||||
| #include "nnacl/scale.h" | |||||
| namespace mindspore::kernel { | |||||
| class ScaleFp16CPUKernel : public ScaleCPUKernel { | |||||
| public: | |||||
| ScaleFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | |||||
| : ScaleCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~ScaleFp16CPUKernel() = default; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| int InitScaleOffset() override; | |||||
| int Scale(int task_id); | |||||
| private: | |||||
| int MallocAssignTmpBuffer(); | |||||
| void FreeTmpBuffer(); | |||||
| private: | |||||
| bool malloc_input_ = false; | |||||
| bool malloc_scale_ = false; | |||||
| bool malloc_offset_ = false; | |||||
| bool malloc_output_ = false; | |||||
| float16_t *input_ = nullptr; | |||||
| float16_t *scale_ = nullptr; | |||||
| float16_t *offset_ = nullptr; | |||||
| float16_t *output_ = nullptr; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_SCALE_FP16_H_ | |||||
| @@ -0,0 +1,134 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/fp16/stack_fp16.h" | |||||
| #include <vector> | |||||
| #include "schema/model_generated.h" | |||||
| #include "src/kernel_registry.h" | |||||
| #include "nnacl/stack_parameter.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "src/runtime/kernel/arm/fp16/common_fp16.h" | |||||
| #include "nnacl/fp16/cast_fp16.h" | |||||
| #include "nnacl/fp16/stack_fp16.h" | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_Stack; | |||||
| namespace mindspore::kernel { | |||||
| int StackFp16CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | |||||
| return RET_OK; | |||||
| } | |||||
| return ReSize(); | |||||
| } | |||||
| void StackFp16CPUKernel::InitMallocFlags() { | |||||
| malloc_buffers_.resize(in_tensors_.size()); | |||||
| for (size_t i = 0; i < in_tensors_.size(); ++i) { | |||||
| malloc_buffers_[i] = in_tensors_[i]->data_type() == kNumberTypeFloat32; | |||||
| } | |||||
| malloc_out = out_tensors_[0]->data_type() == kNumberTypeFloat32; | |||||
| } | |||||
| int StackFp16CPUKernel::MallocAssignBuffer() { | |||||
| buffers_.resize(in_tensors_.size(), nullptr); | |||||
| for (size_t i = 0; i < in_tensors_.size(); ++i) { | |||||
| buffers_[i] = ConvertInputFp32toFp16(in_tensors_[i], context_); | |||||
| if (buffers_[i] == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| } | |||||
| out_buffer_ = nullptr; | |||||
| out_buffer_ = MallocOutputFp16(out_tensors_[0], context_); | |||||
| if (out_buffer_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void StackFp16CPUKernel::FreeBuffer() { | |||||
| for (size_t i = 0; i < buffers_.size(); ++i) { | |||||
| if (malloc_buffers_[i] && buffers_[i] != nullptr) { | |||||
| context_->allocator->Free(buffers_[i]); | |||||
| buffers_[i] = nullptr; | |||||
| } | |||||
| } | |||||
| if (malloc_out && out_buffer_ != nullptr) { | |||||
| context_->allocator->Free(out_buffer_); | |||||
| out_buffer_ = nullptr; | |||||
| } | |||||
| } | |||||
| int StackFp16CPUKernel::Run() { | |||||
| auto ret = Prepare(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Prepare fail!ret: " << ret; | |||||
| return ret; | |||||
| } | |||||
| size_t inputs_num = in_tensors_.size(); | |||||
| auto input0 = in_tensors_[0]; | |||||
| if (inputs_num == 1) { | |||||
| memcpy(out_tensors_[0]->MutableData(), input0->MutableData(), input0->Size()); | |||||
| return RET_OK; | |||||
| } | |||||
| InitMallocFlags(); | |||||
| ret = MallocAssignBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| FreeBuffer(); | |||||
| return ret; | |||||
| } | |||||
| auto input0_shape = input0->shape(); | |||||
| DoStackFp16(buffers_.data(), inputs_num, input0_shape.data(), input0_shape.size(), axis_, out_buffer_); | |||||
| // if output tensor is fp32, we need to transform | |||||
| if (malloc_out) { | |||||
| auto out_tensor = out_tensors_.at(0); | |||||
| Float16ToFloat32(out_buffer_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum()); | |||||
| } | |||||
| FreeBuffer(); | |||||
| return RET_OK; | |||||
| } | |||||
| kernel::LiteKernel *CpuStackFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||||
| const mindspore::lite::PrimitiveC *primitive) { | |||||
| if (op_parameter == nullptr) { | |||||
| MS_LOG(ERROR) << "Input op_parameter is nullptr!"; | |||||
| return nullptr; | |||||
| } | |||||
| MS_ASSERT(desc.type == schema::PrimitiveType_Stack); | |||||
| auto *kernel = new (std::nothrow) StackFp16CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "new StackFp16CPUKernel fail!"; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = kernel->Init(); | |||||
| if (ret != RET_OK) { | |||||
| delete kernel; | |||||
| MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: " | |||||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_)); | |||||
| return nullptr; | |||||
| } | |||||
| return kernel; | |||||
| } | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Stack, CpuStackFp16KernelCreator) | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,49 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "src/runtime/kernel/arm/fp32/stack.h" | |||||
| namespace mindspore::kernel { | |||||
| class StackFp16CPUKernel : public StackCPUKernel { | |||||
| public: | |||||
| StackFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | |||||
| : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~StackFp16CPUKernel() = default; | |||||
| int Init() override; | |||||
| int Run() override; | |||||
| private: | |||||
| void InitMallocFlags(); | |||||
| int MallocAssignBuffer(); | |||||
| void FreeBuffer(); | |||||
| private: | |||||
| std::vector<bool> malloc_buffers_; | |||||
| std::vector<float16_t *> buffers_; | |||||
| float16_t *out_buffer_; | |||||
| bool malloc_out; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_STACK_FP16_H_ | |||||
| @@ -138,7 +138,7 @@ int ScaleCPUKernel::Init() { | |||||
| int ScaleCPUKernel::ReSize() { | int ScaleCPUKernel::ReSize() { | ||||
| auto ret = CalculateParameter(); | auto ret = CalculateParameter(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Scale fp32 InitParameter failed."; | |||||
| MS_LOG(ERROR) << "Scale fp32 CalculateParameter failed."; | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| @@ -37,15 +37,17 @@ class ScaleCPUKernel : public LiteKernel { | |||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int CalculateParameter(); | int CalculateParameter(); | ||||
| int InitScaleOffset(); | |||||
| virtual int InitScaleOffset(); | |||||
| int Scale(int task_id); | int Scale(int task_id); | ||||
| protected: | |||||
| ScaleParameter *scale_param_; | |||||
| private: | private: | ||||
| float *input_ptr_ = nullptr; | float *input_ptr_ = nullptr; | ||||
| float *scale_ = nullptr; | float *scale_ = nullptr; | ||||
| float *offset_ = nullptr; | float *offset_ = nullptr; | ||||
| float *output_ptr_ = nullptr; | float *output_ptr_ = nullptr; | ||||
| ScaleParameter *scale_param_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -18,6 +18,7 @@ | |||||
| #include "schema/model_generated.h" | #include "schema/model_generated.h" | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "nnacl/fp32/stack.h" | #include "nnacl/fp32/stack.h" | ||||
| #include "nnacl/stack_parameter.h" | |||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| @@ -33,7 +33,7 @@ class StackCPUKernel : public LiteKernel { | |||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| private: | |||||
| protected: | |||||
| int axis_; | int axis_; | ||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||