From: @yeyunpeng2020 Reviewed-by: Signed-off-by:tags/v1.1.0
| @@ -755,8 +755,6 @@ build_lite_java_arm64() { | |||
| [ -n "${JAVA_PATH}" ] && rm -rf ${JAVA_PATH}/java/app/libs/arm64-v8a/ | |||
| mkdir -p ${JAVA_PATH}/java/app/libs/arm64-v8a/ | |||
| cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ | |||
| cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-fp16.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ | |||
| cp ${BASEPATH}/output/mindspore-lite-${VERSION_STR}-runtime-arm64-cpu/lib/libmindspore-lite-optimize.so ${JAVA_PATH}/java/app/libs/arm64-v8a/ | |||
| echo mindspore-lite-${VERSION_STR}-runtime-arm64-cpu | |||
| [ -n "${VERSION_STR}" ] && rm -rf mindspore-lite-${VERSION_STR}-runtime-arm64-cpu | |||
| } | |||
| @@ -77,8 +77,6 @@ if (PLATFORM_ARM64) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite.a DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/core/ir/dtype/type_id.h DESTINATION ${INC_DIR}/ir/dtype COMPONENT ${COMPONENT_NAME}) | |||
| install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/schema/ DESTINATION ${INC_DIR}/schema COMPONENT ${COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "inner" EXCLUDE) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-optimize.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) | |||
| install(FILES ${TOP_DIR}/mindspore/lite/build/src/libmindspore-lite-fp16.so DESTINATION ${LIB_DIR} COMPONENT ${COMPONENT_NAME}) | |||
| install(DIRECTORY ${flatbuffers_INC} DESTINATION ${FLATBF_DIR} COMPONENT ${COMPONENT_NAME}) | |||
| if (ENABLE_TOOLS) | |||
| install(TARGETS benchmark RUNTIME DESTINATION ${MAIN_DIR}-${COMPONENT_NAME}/benchmark COMPONENT ${COMPONENT_NAME}) | |||
| @@ -21,8 +21,6 @@ string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMA | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16") | |||
| set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16") | |||
| add_library(nnacl_optimize STATIC ${SDOT_FILES}) | |||
| target_link_libraries(nnacl_optimize mindspore-lite) | |||
| add_library(nnacl_optimize_mid OBJECT ${SDOT_FILES}) | |||
| add_library(nnacl_fp16 STATIC ${FP16_FILES}) | |||
| target_link_libraries(nnacl_fp16 mindspore-lite) | |||
| add_library(nnacl_fp16_mid OBJECT ${FP16_FILES}) | |||
| @@ -1,99 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ | |||
| #define MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ | |||
| #ifndef _WIN32 | |||
| #include <dlfcn.h> | |||
| #endif | |||
| #ifdef __ANDROID__ | |||
| #include <asm/hwcap.h> | |||
| #include "nnacl/nnacl_utils.h" | |||
| #endif | |||
| #include "src/common/log_adapter.h" | |||
| #define OPTIMIZE_SHARED_LIBRARY_PATH "libmindspore-lite-optimize.so" | |||
| #define FLOAT16_SHARED_LIBRARY_PATH "libmindspore-lite-fp16.so" | |||
| class OptimizeModule { | |||
| public: | |||
| OptimizeModule() { | |||
| bool support_optimize_ops = false; | |||
| #ifdef ENABLE_ARM64 | |||
| int hwcap_type = 16; | |||
| uint32_t hwcap = getHwCap(hwcap_type); | |||
| if (hwcap & HWCAP_ASIMDDP) { | |||
| MS_LOG(INFO) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap; | |||
| support_optimize_ops = true; | |||
| } else { | |||
| MS_LOG(INFO) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap; | |||
| } | |||
| #endif | |||
| if (support_optimize_ops == false) { | |||
| return; | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY); | |||
| if (optimized_op_handler_ == nullptr) { | |||
| MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror(); | |||
| } | |||
| #endif | |||
| } | |||
| ~OptimizeModule() = default; | |||
| static OptimizeModule *GetInstance() { | |||
| static OptimizeModule opt_module; | |||
| return &opt_module; | |||
| } | |||
| void *optimized_op_handler_ = nullptr; | |||
| }; | |||
| class Float16Module { | |||
| public: | |||
| Float16Module() { | |||
| bool support_fp16 = false; | |||
| #ifdef ENABLE_ARM64 | |||
| int hwcap_type = 16; | |||
| uint32_t hwcap = getHwCap(hwcap_type); | |||
| if (hwcap & HWCAP_FPHP) { | |||
| MS_LOG(INFO) << "Hw cap support FP16, hwcap: 0x" << hwcap; | |||
| support_fp16 = true; | |||
| } | |||
| #endif | |||
| if (support_fp16 == false) { | |||
| return; | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| float16_op_handler_ = dlopen(FLOAT16_SHARED_LIBRARY_PATH, RTLD_LAZY); | |||
| if (float16_op_handler_ == nullptr) { | |||
| MS_LOG(INFO) << "Open optimize shared library failed: " << dlerror(); | |||
| } | |||
| #endif | |||
| } | |||
| ~Float16Module() = default; | |||
| static Float16Module *GetInstance() { | |||
| static Float16Module fp16_module; | |||
| return &fp16_module; | |||
| } | |||
| void *float16_op_handler_ = nullptr; | |||
| }; | |||
| #endif // MINDSPORE_LITE_NNACL_OPTIMIZED_KERNEL_H_ | |||
| @@ -17,6 +17,7 @@ if (PLATFORM_ARM32 OR PLATFORM_ARM64) | |||
| endif () | |||
| set(LITE_SRC | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/utils.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/graph_util.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/log_adapter.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/common/string_util.cc | |||
| @@ -114,22 +115,10 @@ endif () | |||
| ########################## build optimize and float16 library #################################3 | |||
| if (PLATFORM_ARM64) | |||
| add_library(mindspore-lite-optimize SHARED) | |||
| target_link_libraries(mindspore-lite-optimize cpu_opt_kernel_mid) | |||
| target_link_libraries(mindspore-lite-optimize nnacl_optimize) | |||
| target_link_libraries(mindspore-lite cpu_opt_kernel_mid nnacl_optimize_mid) | |||
| target_link_libraries(mindspore-lite_static cpu_opt_kernel_mid nnacl_optimize_mid) | |||
| add_library(mindspore-lite-fp16 SHARED) | |||
| target_link_libraries(mindspore-lite-fp16 cpu_fp16_kernel_mid) | |||
| target_link_libraries(mindspore-lite-fp16 nnacl_fp16) | |||
| endif () | |||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND (PLATFORM_ARM64)) | |||
| add_custom_command(TARGET mindspore-lite-optimize POST_BUILD COMMAND | |||
| ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip | |||
| ${CMAKE_BINARY_DIR}/src/libmindspore-lite-optimize.so) | |||
| add_custom_command(TARGET mindspore-lite-fp16 POST_BUILD COMMAND | |||
| ${ANDROID_NDK}/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/aarch64-linux-android/bin/strip | |||
| ${CMAKE_BINARY_DIR}/src/libmindspore-lite-fp16.so) | |||
| target_link_libraries(mindspore-lite cpu_fp16_kernel_mid nnacl_fp16_mid) | |||
| target_link_libraries(mindspore-lite_static cpu_fp16_kernel_mid nnacl_fp16_mid) | |||
| endif () | |||
| @@ -16,6 +16,7 @@ | |||
| #ifdef __ANDROID__ | |||
| #include <sys/auxv.h> | |||
| #include <asm/hwcap.h> | |||
| #endif | |||
| #include "src/common/utils.h" | |||
| @@ -257,5 +258,38 @@ uint32_t getHwCap(int hwcap_type) { | |||
| return ret; | |||
| } | |||
| #endif | |||
| bool IsSupportSDot() { | |||
| bool status = false; | |||
| #ifdef ENABLE_ARM64 | |||
| int hwcap_type = 16; | |||
| uint32_t hwcap = getHwCap(hwcap_type); | |||
| if (hwcap & HWCAP_ASIMDDP) { | |||
| MS_LOG(DEBUG) << "Hw cap support SMID Dot Product, hwcap: 0x" << hwcap; | |||
| status = true; | |||
| } else { | |||
| MS_LOG(DEBUG) << "Hw cap NOT support SIMD Dot Product, hwcap: 0x" << hwcap; | |||
| status = false; | |||
| } | |||
| #endif | |||
| return status; | |||
| } | |||
| bool IsSupportFloat16() { | |||
| bool status = false; | |||
| #ifdef ENABLE_ARM64 | |||
| int hwcap_type = 16; | |||
| uint32_t hwcap = getHwCap(hwcap_type); | |||
| if (hwcap & HWCAP_FPHP) { | |||
| MS_LOG(DEBUG) << "Hw cap support FP16, hwcap: 0x" << hwcap; | |||
| status = true; | |||
| } else { | |||
| MS_LOG(DEBUG) << "Hw cap NOT support FP16, hwcap: 0x" << hwcap; | |||
| status = false; | |||
| } | |||
| #endif | |||
| return status; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -44,6 +44,9 @@ void ShortToFloat32(const int16_t *srcdata, float *dstdata, size_t elementSize); | |||
| void Float32ToShort(const float *srcdata, int16_t *dstdata, size_t elementSize); | |||
| bool IsSupportSDot(); | |||
| bool IsSupportFloat16(); | |||
| #if defined(__arm__) || defined(__aarch64__) | |||
| uint32_t getHwCap(int hwcap_type); | |||
| #endif | |||
| @@ -20,7 +20,7 @@ | |||
| #include <asm/hwcap.h> | |||
| #include "common/utils.h" | |||
| #include "src/common/log_adapter.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #endif | |||
| using mindspore::kernel::kCPU; | |||
| @@ -36,17 +36,15 @@ KernelRegistry *KernelRegistry::GetInstance() { | |||
| int KernelRegistry::Init() { | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimized_lib_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimized_lib_handler != nullptr) { | |||
| MS_LOG(INFO) << "load optimize lib success."; | |||
| if (mindspore::lite::IsSupportSDot()) { | |||
| MS_LOG(INFO) << "The current device supports Sdot."; | |||
| } else { | |||
| MS_LOG(INFO) << "load optimize lib failed."; | |||
| MS_LOG(INFO) << "The current device NOT supports Sdot."; | |||
| } | |||
| void *float16_op_handler = Float16Module::GetInstance()->float16_op_handler_; | |||
| if (float16_op_handler != nullptr) { | |||
| MS_LOG(INFO) << "load float16 lib success."; | |||
| if (mindspore::lite::IsSupportFloat16()) { | |||
| MS_LOG(INFO) << "The current device supports float16."; | |||
| } else { | |||
| MS_LOG(INFO) << "load float16 lib failed."; | |||
| MS_LOG(INFO) << "The current device NOT supports float16."; | |||
| } | |||
| #endif | |||
| return RET_OK; | |||
| @@ -21,7 +21,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| @@ -21,7 +21,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| @@ -23,7 +23,7 @@ | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "nnacl/fp16/conv_fp16.h" | |||
| #include "nnacl/fp16/winograd_utils_fp16.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #include "nnacl/minimal_filtering_generator.h" | |||
| namespace mindspore::kernel { | |||
| @@ -14,7 +14,9 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifdef ENABLE_ARM64 | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -102,11 +102,11 @@ int QuantDTypeCastFp16CPUKernel::QuantDTypeCast(int task_id) { | |||
| return RET_OK; | |||
| } | |||
| int QuantDTypeCastRun(void *cdata, int task_id) { | |||
| int QuantDTypeCastFP16Run(void *cdata, int task_id) { | |||
| auto g_kernel = reinterpret_cast<QuantDTypeCastFp16CPUKernel *>(cdata); | |||
| auto ret = g_kernel->QuantDTypeCast(task_id); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "QuantDTypeCastRun error task_id[" << task_id << "] error_code[" << ret << "]"; | |||
| MS_LOG(ERROR) << "QuantDTypeCastFP16Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| @@ -126,7 +126,7 @@ int QuantDTypeCastFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastRun, this, thread_n_num_); | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastFP16Run, this, thread_n_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||
| @@ -17,6 +17,9 @@ | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #include "src/common/file_utils.h" | |||
| #ifdef ENABLE_ARM64 | |||
| #include "src/runtime/kernel/arm/int8/opt_op_handler.h" | |||
| #endif | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| @@ -74,18 +77,9 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { | |||
| support_optimize_ = false; | |||
| matmul_func_ = MatMulInt8_8x8_r; | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| if (mindspore::lite::IsSupportSDot()) { | |||
| support_optimize_ = true; | |||
| matmul_func_ = MatMulRInt8_optimize_handler; | |||
| } else { | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| @@ -25,7 +25,7 @@ | |||
| #include "nnacl/int8/conv_int8.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| namespace mindspore::kernel { | |||
| class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| @@ -23,6 +23,9 @@ | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #ifdef ENABLE_ARM64 | |||
| #include "src/runtime/kernel/arm/int8/opt_op_handler.h" | |||
| #endif | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| @@ -39,18 +42,9 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() { | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| tile_num_ = 4; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| if (mindspore::lite::IsSupportSDot()) { | |||
| matmul_func_ = MatMulRInt8_optimize_handler; | |||
| support_optimize_ = true; | |||
| } else { | |||
| tile_num_ = 4; | |||
| support_optimize_ = false; | |||
| @@ -260,8 +254,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> & | |||
| kernel::LiteKernel *kernel; | |||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| if (mindspore::lite::IsSupportSDot()) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| @@ -20,7 +20,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #include "nnacl/int8/conv_int8.h" | |||
| namespace mindspore::kernel { | |||
| @@ -16,7 +16,8 @@ | |||
| #include "src/runtime/kernel/arm/int8/deconvolution_int8.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/runtime/kernel/arm/int8/opt_op_handler.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| @@ -95,18 +96,9 @@ void DeConvInt8CPUKernel::CheckSupportOptimize() { | |||
| support_optimize_ = true; | |||
| matmul_func_ = MatMulInt8_16x4; | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulR4Int8_optimize_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = MatMulR4Int8Neon64; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| if (mindspore::lite::IsSupportSDot()) { | |||
| support_optimize_ = true; | |||
| matmul_func_ = MatMulR4Int8_optimize_handler; | |||
| } else { | |||
| support_optimize_ = false; | |||
| matmul_func_ = MatMulR4Int8Neon64; | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/int8/opt_op_handler.h" | |||
| #include <stdlib.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/op_base.h" | |||
| #ifdef __cplusplus | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <stdlib.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/op_base.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #ifdef ENABLE_ARM64 | |||
| void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| size_t ksize, size_t ic4, size_t output_channel, size_t offset, | |||
| const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp, | |||
| int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after, | |||
| size_t asymmetric, size_t per_channel, size_t per_channel_offset); | |||
| void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16, | |||
| const int *input_sum, const int *bias); | |||
| void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, | |||
| int32_t maxi, size_t per_channel); | |||
| #endif | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -275,7 +275,8 @@ kernel::LiteKernel *Scheduler::ScheduleNode(const std::vector<Tensor *> &in_tens | |||
| } | |||
| } | |||
| #endif | |||
| if ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16) { | |||
| if (mindspore::lite::IsSupportFloat16() && | |||
| ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) { | |||
| kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type}; | |||
| auto *kernel = | |||
| KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, primitive, context_, fp16_cpu_desc); | |||
| @@ -17,7 +17,8 @@ | |||
| #include "src/sub_graph_kernel.h" | |||
| #include "src/tensor.h" | |||
| #ifdef ENABLE_ARM64 | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/runtime/kernel/arm/fp16/fp16_op_handler.h" | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| @@ -183,9 +184,9 @@ void CpuFp16SubGraph::FreeOriginInputData() { | |||
| } | |||
| int CpuFp16SubGraph::PreProcess() { | |||
| auto fp32_to_fp16_cast_func = Float16CastUtil::GetInstance()->float32_to_float16_func_; | |||
| if (fp32_to_fp16_cast_func == nullptr) { | |||
| MS_LOG(ERROR) << "Can not find cast fp32 to fp16 func"; | |||
| #ifdef ENABLE_ARM64 | |||
| if (!mindspore::lite::IsSupportFloat16()) { | |||
| MS_LOG(ERROR) << "Unsupport fp16 in this devices"; | |||
| return RET_ERROR; | |||
| } | |||
| MS_ASSERT(origin_input_data_.empty()); | |||
| @@ -203,7 +204,7 @@ int CpuFp16SubGraph::PreProcess() { | |||
| return RET_ERROR; | |||
| } | |||
| MS_ASSERT(tensor->data_c() != nullptr); | |||
| fp32_to_fp16_cast_func(float32_data, tensor->data_c(), tensor->ElementsNum()); | |||
| Float32ToFloat16_fp16_handler(float32_data, tensor->data_c(), tensor->ElementsNum()); | |||
| auto *data_store = DataStore::CreateDataStore(float32_data, tensor->allocator(), this->context_->allocator.get()); | |||
| if (data_store == nullptr) { | |||
| MS_LOG(ERROR) << "Create DataStore failed"; | |||
| @@ -223,12 +224,15 @@ int CpuFp16SubGraph::PreProcess() { | |||
| } | |||
| } | |||
| return RET_OK; | |||
| #else | |||
| return RET_OK; | |||
| #endif | |||
| } | |||
| int CpuFp16SubGraph::PostProcess() { | |||
| auto fp16_to_fp32_cast_func = Float16CastUtil::GetInstance()->float16_to_float32_func_; | |||
| if (fp16_to_fp32_cast_func == nullptr) { | |||
| MS_LOG(ERROR) << "Can not find cast fp16 to fp32 func"; | |||
| #ifdef ENABLE_ARM64 | |||
| if (!mindspore::lite::IsSupportFloat16()) { | |||
| MS_LOG(ERROR) << "Unsupport fp16 in this devices"; | |||
| return RET_ERROR; | |||
| } | |||
| for (auto tensor : this->out_tensors_) { | |||
| @@ -249,7 +253,7 @@ int CpuFp16SubGraph::PostProcess() { | |||
| return RET_ERROR; | |||
| } | |||
| MS_ASSERT(tensor->data_c() != nullptr); | |||
| fp16_to_fp32_cast_func(float16_data, tensor->data_c(), tensor->ElementsNum()); | |||
| Float16ToFloat32_fp16_handler(float16_data, tensor->data_c(), tensor->ElementsNum()); | |||
| if (tensor->allocator() != nullptr) { | |||
| tensor->allocator()->Free(float16_data); | |||
| } else { | |||
| @@ -273,5 +277,8 @@ int CpuFp16SubGraph::PostProcess() { | |||
| } | |||
| this->FreeOriginInputData(); | |||
| return RET_OK; | |||
| #else | |||
| return RET_OK; | |||
| #endif | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -24,41 +24,10 @@ | |||
| #include "src/executor.h" | |||
| #include "src/common/log_adapter.h" | |||
| #ifdef ENABLE_ARM64 | |||
| #include "nnacl/optimized_kernel.h" | |||
| #include "src/common/utils.h" | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| using Float16CastFunc = void (*)(const void *, void *, int); | |||
| class Float16CastUtil { | |||
| public: | |||
| static Float16CastUtil *GetInstance() { | |||
| static Float16CastUtil float16_cast_util; | |||
| return &float16_cast_util; | |||
| } | |||
| private: | |||
| Float16CastUtil() { | |||
| #ifdef ENABLE_ARM64 | |||
| void *fp16_op_handler = Float16Module::GetInstance()->float16_op_handler_; | |||
| if (fp16_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&float16_to_float32_func_)) = dlsym(fp16_op_handler, "Float16ToFloat32_fp16_handler"); | |||
| *(reinterpret_cast<void **>(&float32_to_float16_func_)) = dlsym(fp16_op_handler, "Float32ToFloat16_fp16_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load float16 cast func failed! " << dlopen_error << "."; | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| ~Float16CastUtil() = default; | |||
| public: | |||
| Float16CastFunc float16_to_float32_func_ = nullptr; | |||
| Float16CastFunc float32_to_float16_func_ = nullptr; | |||
| }; | |||
| // store origin data and allocator of input tensor of subgraph for PreProcess and PostProcess | |||
| struct DataStore { | |||
| void *data_ = nullptr; | |||
| @@ -66,17 +66,6 @@ if (PLATFORM_ARM32) | |||
| ) | |||
| endif() | |||
| if (ENABLE_FP16) | |||
| file(GLOB KERNEL_OP_FP16_SRC | |||
| ${LITE_DIR}/src/runtime/kernel/arm/fp16/*.cc | |||
| ${LITE_DIR}/nnacl/fp16/*.c | |||
| ) | |||
| set(KERNEL_OP_SRC | |||
| ${KERNEL_OP_SRC} | |||
| ${KERNEL_OP_FP16_SRC} | |||
| ) | |||
| endif () | |||
| if ("${X86_64_SIMD}" STREQUAL "sse") | |||
| file(GLOB TEST_ASSEMBLY_SRC ${LITE_DIR}/nnacl/x86_64_sse/*.c) | |||
| set_property(SOURCE ${TEST_ASSEMBLY_SRC} PROPERTY LANGUAGE C) | |||
| @@ -295,7 +284,7 @@ add_executable(lite-test ${TEST_SRC}) | |||
| target_link_libraries(lite-test dl ${GTEST_LIBRARY}) | |||
| if (PLATFORM_ARM64) | |||
| target_link_libraries(lite-test mslite_internal) | |||
| target_link_libraries(lite-test mslite_internal nnacl_fp16_mid nnacl_optimize_mid) | |||
| endif() | |||
| if (PLATFORM_ARM) | |||
| @@ -727,8 +727,6 @@ function Run_arm64() { | |||
| fi | |||
| cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite.so ${benchmark_test_path}/libmindspore-lite.so || exit 1 | |||
| cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-fp16.so ${benchmark_test_path}/libmindspore-lite-fp16.so || exit 1 | |||
| cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/lib/libmindspore-lite-optimize.so ${benchmark_test_path}/libmindspore-lite-optimize.so || exit 1 | |||
| cp -a ${arm64_path}/mindspore-lite-${version}-runtime-arm64-${process_unit_arm64}/benchmark/benchmark ${benchmark_test_path}/benchmark || exit 1 | |||
| # adb push all needed files to the phone | |||
| @@ -28,6 +28,7 @@ file(GLOB_RECURSE CONVERTER_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../common/flag_parser.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../common/storage.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../../src/ir/primitive_t_value.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/utils.cc | |||
| ../optimizer/common/node_pass_extends.cc | |||
| ../optimizer/common/pass_manager_extends.cc | |||