| @@ -39,6 +39,7 @@ if (PLATFORM_ARM64) | |||||
| # assembly | # assembly | ||||
| file(GLOB ASSEMBLY_SRC | file(GLOB ASSEMBLY_SRC | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S | ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32Opt.S | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatVecMulFp32.S | |||||
| ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32.S) | ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/assembly/arm64/MatmulFp32.S) | ||||
| set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) | set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) | ||||
| set(KERNEL_SRC ${KERNEL_SRC} ${ASSEMBLY_SRC}) | set(KERNEL_SRC ${KERNEL_SRC} ${ASSEMBLY_SRC}) | ||||
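Note: the `set_property(... PROPERTY LANGUAGE C)` line routes these .S sources through the C toolchain, so the `#ifdef __aarch64__` / `#ifndef __APPLE__` guards in the new assembly file below are preprocessed like ordinary C sources.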
| @@ -0,0 +1,203 @@ | |||||
| #ifdef __aarch64__ | |||||
| .text | |||||
| .align 5 | |||||
| .global MatVecMulFp32Neon64 | |||||
| #ifndef __APPLE__ | |||||
| .type MatVecMulFp32Neon64, %function | |||||
| #endif | |||||
| // void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col) | |||||
| // x0: a | |||||
| // x1: b | |||||
| // x2: c | |||||
| // x3: bias | |||||
| // w4: act_type | |||||
| // w5: depth | |||||
| // w6: col | |||||
| MatVecMulFp32Neon64: | |||||
| sub sp, sp, #128 | |||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | |||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | |||||
| mov w14, #4 // sizeof(float) | |||||
| mul w8, w14, w5 // rhs depthx1 block stride | |||||
| mov w14, #4 | |||||
| mul w13, w8, w14 // rhs depthx4 block stride | |||||
+
+Loop:
+    mov x15, x0  // reload a ptr
+    mov x7, x1   // reload b ptr
+    mov w9, w5   // reload depth
+    cmp w6, #4
+    blt Loop1x1
+
+Loop1x4:
+    dup v10.8h, wzr  // v10..v13: dot-product accumulators for 4 output columns
+    dup v11.8h, wzr
+    dup v12.8h, wzr
+    dup v13.8h, wzr
+    dup v14.8h, wzr  // v14: accumulator for the depth-remainder elements
+    add x10, x7, x8  // x7, x10, x11, x12: base of 4 consecutive rhs columns
+    add x11, x10, x8
+    add x12, x11, x8
+
+Depth8_1x4:
+    cmp w9, #8
+    blt Depth4_1x4
+    ld1 {v0.4s, v1.4s}, [x15], #32
+    ld1 {v2.4s, v3.4s}, [x7], #32
+    ld1 {v4.4s, v5.4s}, [x10], #32
+    fmla v10.4s, v0.4s, v2.4s
+    fmla v10.4s, v1.4s, v3.4s
+    fmla v11.4s, v0.4s, v4.4s
+    fmla v11.4s, v1.4s, v5.4s
+    ld1 {v6.4s, v7.4s}, [x11], #32
+    ld1 {v8.4s, v9.4s}, [x12], #32
+    fmla v12.4s, v0.4s, v6.4s
+    fmla v12.4s, v1.4s, v7.4s
+    fmla v13.4s, v0.4s, v8.4s
+    fmla v13.4s, v1.4s, v9.4s
+    sub w9, w9, #8
+    cbz w9, End1x4
+    b Depth8_1x4
+
+Depth4_1x4:
+    cmp w9, #4
+    blt Depth1_1x4
+    ld1 {v0.4s}, [x15], #16
+    ld1 {v1.4s}, [x7], #16
+    ld1 {v2.4s}, [x10], #16
+    ld1 {v3.4s}, [x11], #16
+    ld1 {v4.4s}, [x12], #16
+    fmla v10.4s, v1.4s, v0.4s
+    fmla v11.4s, v2.4s, v0.4s
+    fmla v12.4s, v3.4s, v0.4s
+    fmla v13.4s, v4.4s, v0.4s
+    sub w9, w9, #4
+    cbz w9, End1x4
+    b Depth8_1x4
+
+Depth1_1x4:
+    ld1 {v0.s}[0], [x15], #4
+    ld1 {v1.s}[0], [x7], #4
+    ld1 {v1.s}[1], [x10], #4
+    ld1 {v1.s}[2], [x11], #4
+    ld1 {v1.s}[3], [x12], #4
+    fmla v14.4s, v1.4s, v0.s[0]
+    sub w9, w9, #1
+    cbz w9, End1x4
+    b Depth1_1x4
+
+End1x4:
+    faddp v15.4s, v10.4s, v11.4s  // pairwise-reduce the four per-column accumulators
+    faddp v16.4s, v12.4s, v13.4s
+    faddp v17.4s, v15.4s, v16.4s  // v17: one dot product per lane
+    fadd v14.4s, v14.4s, v17.4s   // fold in the depth-remainder accumulator
+    cbz x3, Act1x4
+    ld1 {v15.4s}, [x3], #16
+    fadd v14.4s, v14.4s, v15.4s   // add bias
+
+Act1x4:
+    cmp w4, #3
+    beq Relu6_1x4
+    cmp w4, #1
+    beq Relu1x4
+    b Write1x4
+
+Relu6_1x4:
+    fmov v15.4s, #6.0  // Relu6 upper bound; a movi #0x46, lsl #8 pattern would be a denormal in fp32
+    fmin v14.4s, v14.4s, v15.4s
+
+Relu1x4:
+    dup v15.4s, wzr
+    fmax v14.4s, v14.4s, v15.4s
+
+Write1x4:
+    st1 {v14.4s}, [x2], #16
+    sub w6, w6, #4
+    cbz w6, End
+    add x1, x1, x13
+    b Loop
+
+Loop1x1:
+    dup v4.4s, wzr
+    dup v5.4s, wzr
+
+Depth8_1x1:
+    cmp w9, #8
+    blt Depth4_1x1
+    ld1 {v0.4s, v1.4s}, [x15], #32
+    ld1 {v2.4s, v3.4s}, [x7], #32
+    fmla v4.4s, v2.4s, v0.4s
+    fmla v4.4s, v3.4s, v1.4s
+    sub w9, w9, #8
+    cbz w9, End1x1
+    b Depth8_1x1
+
+Depth4_1x1:
+    cmp w9, #4
+    blt Depth1_1x1
+    ld1 {v0.4s}, [x15], #16
+    ld1 {v1.4s}, [x7], #16
+    fmla v4.4s, v1.4s, v0.4s
+    sub w9, w9, #4
+    cbz w9, End1x1
+    b Depth8_1x1
+
+Depth1_1x1:
+    ld1 {v0.s}[0], [x15], #4
+    ld1 {v1.s}[0], [x7], #4
+    fmla v5.4s, v1.4s, v0.s[0]
+    sub w9, w9, #1
+    cbz w9, End1x1
+    b Depth1_1x1
+
+End1x1:
+    faddp v6.4s, v4.4s, v4.4s
+    faddp v7.4s, v6.4s, v6.4s
+    fadd v7.4s, v7.4s, v5.4s
+    cbz x3, Act1x1
+    ld1 {v8.s}[0], [x3], #4
+    fadd v7.4s, v7.4s, v8.4s  // add bias
+
+Act1x1:
+    cmp w4, #3
+    beq Relu6_1x1
+    cmp w4, #1
+    beq Relu1x1
+    b Write1x1
+
+Relu6_1x1:
+    fmov v8.4s, #6.0  // Relu6 upper bound (see Relu6_1x4)
+    fmin v7.4s, v7.4s, v8.4s
+
+Relu1x1:
+    dup v8.4s, wzr
+    fmax v7.4s, v7.4s, v8.4s
+
+Write1x1:
+    st1 {v7.s}[0], [x2], #4
+    sub w6, w6, #1
+    cbz w6, End
+    add x1, x1, x8
+    b Loop
+
+End:
+    sub sp, sp, #128
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
+    ret
+#endif
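For review, a minimal C sketch of the contract the kernel above implements; the function name and the NULL-bias convention here are illustrative, not part of the patch. b is packed so that output column j owns the contiguous run b[j * depth] .. b[j * depth + depth - 1], and the act_type values mirror the cmp w4, #3 / cmp w4, #1 branches above (3 = Relu6, 1 = Relu).

    #include <stddef.h>

    // Illustrative reference for MatVecMulFp32Neon64: c[j] = act(dot(a, column j of b) + bias[j]).
    static void MatVecMulFp32Ref(const float *a, const float *b, float *c, const float *bias,
                                 int act_type, int depth, int col) {
      for (int j = 0; j < col; ++j) {
        float acc = 0.0f;
        for (int k = 0; k < depth; ++k) {
          acc += a[k] * b[j * depth + k];  // column j is contiguous along depth
        }
        if (bias != NULL) {
          acc += bias[j];  // matches the cbz x3 guard: bias may be absent
        }
        if (act_type == 3 && acc > 6.0f) acc = 6.0f;  // Relu6 clamps, then falls through to Relu
        if ((act_type == 1 || act_type == 3) && acc < 0.0f) acc = 0.0f;
        c[j] = acc;
      }
    }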
| @@ -1,12 +1,12 @@ | |||||
| #ifdef __aarch64__ | #ifdef __aarch64__ | ||||
| .text | .text | ||||
| .align 5 | .align 5 | ||||
| .global MatmulFp16Neon64_1xN | |||||
| .global MatVecMulFp16Neon64 | |||||
| #ifndef __APPLE__ | #ifndef __APPLE__ | ||||
| .type MatmulFp16Neon64_1xN, %function | |||||
| .type MatVecMulFp16Neon64, %function | |||||
| #endif | #endif | ||||
| // void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) | |||||
| // void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, int col) | |||||
| // x0: a | // x0: a | ||||
| // x1: b | // x1: b | ||||
| // x2: c | // x2: c | ||||
| @@ -15,7 +15,7 @@ | |||||
| // w5: depth | // w5: depth | ||||
| // w6: col | // w6: col | ||||
| MatmulFp16Neon64_1xN: | |||||
| MatVecMulFp16Neon64: | |||||
| sub sp, sp, #128 | sub sp, sp, #128 | ||||
| st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 | ||||
| st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 | ||||
| @@ -289,9 +289,9 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa | |||||
| return; | return; | ||||
| } | } | ||||
| void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, | |||||
| int col) { | |||||
| MatmulFp16Neon64_1xN(a, b, c, bias, act_type, depth, col); | |||||
| void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | |||||
| int depth, int col) { | |||||
| MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col); | |||||
| } | } | ||||
| void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { | void RowMajor2Col16MajorFp16Opt(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { | ||||
| @@ -17,14 +17,14 @@ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ | #ifndef MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ | ||||
| #define MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ | #define MINDSPORE_LITE_NNACL_FP16_MATMUL_H_ | ||||
| #include <string.h> | |||||
| #include <float.h> | #include <float.h> | ||||
| #include <string.h> | |||||
| #ifdef ENABLE_NEON | #ifdef ENABLE_NEON | ||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||
| #endif | #endif | ||||
| #include "nnacl/errorcode.h" | #include "nnacl/errorcode.h" | ||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/matmul_parameter.h" | #include "nnacl/matmul_parameter.h" | ||||
| #include "nnacl/op_base.h" | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| @@ -35,8 +35,8 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl | |||||
| void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | ||||
| int depth, int row, int col, int stride, int out_type); | int depth, int row, int col, int stride, int out_type); | ||||
| void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, int depth, | |||||
| int col); | |||||
| void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, | |||||
| int depth, int col); | |||||
| void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16); | void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16); | ||||
| @@ -48,8 +48,8 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons | |||||
| void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, | void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, | ||||
| size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); | size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); | ||||
| void MatmulFp16Neon64_1xN(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, | |||||
| int depth, int col); | |||||
| void MatVecMulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, | |||||
| int depth, int col); | |||||
| void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src); | void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src); | ||||
| @@ -16,6 +16,14 @@ | |||||
| #include "nnacl/fp32/matmul.h" | #include "nnacl/fp32/matmul.h" | ||||
| void RowMajor2ColMajor(float *src_ptr, float *dst_ptr, int row, int col) { | |||||
| for (int r = 0; r < row; ++r) { | |||||
| for (int c = 0; c < col; ++c) { | |||||
| dst_ptr[c * row + r] = src_ptr[r * col + c]; | |||||
| } | |||||
| } | |||||
| } | |||||
| void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col) { | void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col) { | ||||
| for (int r = 0; r < row; r++) { | for (int r = 0; r < row; r++) { | ||||
| float *src = src_ptr + r * col; | float *src = src_ptr + r * col; | ||||
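The new RowMajor2ColMajor is a plain transpose: element (r, c) of the row-major source lands at index c * row + r in the destination. A small worked example (illustrative values only):

    float src[6] = {1, 2, 3,
                    4, 5, 6};  // 2 x 3, row-major
    float dst[6];
    RowMajor2ColMajor(src, dst, 2, 3);
    // dst == {1, 4, 2, 5, 3, 6}: each source column is now contiguous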
| @@ -562,6 +570,12 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT | |||||
| #endif | #endif | ||||
| } | } | ||||
| void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| MatVecMulFp32Neon64(a, b, c, bias, (int)act_type, depth, col); | |||||
| #endif | |||||
| } | |||||
| #ifdef ENABLE_NNACL_INFER_SHAPE | #ifdef ENABLE_NNACL_INFER_SHAPE | ||||
| static void SwapDims(int *dims, int index1, int index2) { | static void SwapDims(int *dims, int index1, int index2) { | ||||
| int tmp = dims[index1]; | int tmp = dims[index1]; | ||||
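Note that MatVecMul only dispatches on ARM64; in any other build the body is empty and c is left untouched. The kernels below stay safe because they set is_vector_a_ / is_vector_input_ only under ENABLE_ARM64, so every other target keeps taking the packed MatMulOpt path.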
| @@ -17,17 +17,19 @@ | |||||
| #ifndef MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ | #ifndef MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ | ||||
| #define MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ | #define MINDSPORE_LITE_NNACL_FP32_MATMUL_H_ | ||||
| #include <string.h> | |||||
| #include <float.h> | #include <float.h> | ||||
| #include <string.h> | |||||
| #include "nnacl/errorcode.h" | #include "nnacl/errorcode.h" | ||||
| #include "nnacl/op_base.h" | |||||
| #include "nnacl/matmul_parameter.h" | #include "nnacl/matmul_parameter.h" | ||||
| #include "nnacl/op_base.h" | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, | void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row, | ||||
| int col, size_t stride, int out_type); | int col, size_t stride, int out_type); | ||||
| void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col); | |||||
| void RowMajor2ColMajor(float *src_ptr, float *dst_ptr, int row, int col); | |||||
| void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col); | void RowMajor2Row4Major(float *src_ptr, float *dst_ptr, int row, int col); | ||||
| void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col); | void RowMajor2Row8Major(float *src_ptr, float *dst_ptr, int row, int col); | ||||
| void RowMajor2Row12Major(float *src_ptr, float *dst_ptr, int row, int col); | void RowMajor2Row12Major(float *src_ptr, float *dst_ptr, int row, int col); | ||||
| @@ -39,6 +41,7 @@ void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bi | |||||
| int col, size_t stride, size_t writeNhwc, size_t WriteWino); | int col, size_t stride, size_t writeNhwc, size_t WriteWino); | ||||
| void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | ||||
| int col, size_t stride, size_t write_mode); | int col, size_t stride, size_t write_mode); | ||||
| void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col); | |||||
| #elif ENABLE_ARM32 | #elif ENABLE_ARM32 | ||||
| void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row, | ||||
| int col, int stride, size_t writeNhwc, size_t WriteWino); | int col, int stride, size_t writeNhwc, size_t WriteWino); | ||||
| @@ -27,13 +27,13 @@ FullconnectionCPUKernel::~FullconnectionCPUKernel() { | |||||
| } | } | ||||
| void FullconnectionCPUKernel::FreeBuf() { | void FullconnectionCPUKernel::FreeBuf() { | ||||
| if (a_c12_ptr_ != nullptr) { | |||||
| free(a_c12_ptr_); | |||||
| a_c12_ptr_ = nullptr; | |||||
| if (a_pack_ptr_ != nullptr) { | |||||
| free(a_pack_ptr_); | |||||
| a_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| if (b_r8_ptr_ != nullptr) { | |||||
| free(b_r8_ptr_); | |||||
| b_r8_ptr_ = nullptr; | |||||
| if (b_pack_ptr_ != nullptr) { | |||||
| free(b_pack_ptr_); | |||||
| b_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| if (bias_ptr_ != nullptr) { | if (bias_ptr_ != nullptr) { | ||||
| free(bias_ptr_); | free(bias_ptr_); | ||||
| @@ -56,37 +56,50 @@ int FullconnectionCPUKernel::ReSize() { | |||||
| thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8)); | thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8)); | ||||
| thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_); | thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_); | ||||
| bias_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->col_8_ * sizeof(float))); | |||||
| memset(bias_ptr_, 0, fc_param_->col_8_ * sizeof(float)); | |||||
| #ifdef ENABLE_ARM64 | |||||
| if (fc_param_->row_ == 1) { | |||||
| is_vector_input_ = true; | |||||
| } | |||||
| #endif | |||||
   if (in_tensors_.size() == 3) {
+    int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_;
+    bias_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * sizeof(float)));
+    memset(bias_ptr_, 0, col_tmp * sizeof(float));  // the memcpy below fills only col_ entries; zero the padding
     memcpy(bias_ptr_, in_tensors_[2]->MutableData(), fc_param_->col_ * sizeof(float));
   }
 
 #ifdef ENABLE_ARM32
-  a_c12_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)));
-  if (a_c12_ptr_ == nullptr) {
+  a_pack_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)));
+  if (a_pack_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
-  memset(a_c12_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float));
+  memset(a_pack_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float));
 #else
-  a_c12_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_12_ * fc_param_->deep_ * sizeof(float)));
-  if (a_c12_ptr_ == nullptr) {
+  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
+  a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
+  if (a_pack_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
-  memset(a_c12_ptr_, 0, fc_param_->row_12_ * fc_param_->deep_ * sizeof(float));
+  memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
 #endif
 
-  b_r8_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->col_8_ * fc_param_->deep_ * sizeof(float)));
-  if (b_r8_ptr_ == nullptr) {
+  int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_;
+  b_pack_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * fc_param_->deep_ * sizeof(float)));
+  if (b_pack_ptr_ == nullptr) {
     FreeBuf();
     return RET_MEMORY_FAILED;
   }
-  memset(b_r8_ptr_, 0, fc_param_->col_8_ * fc_param_->deep_ * sizeof(float));
+  memset(b_pack_ptr_, 0, col_tmp * fc_param_->deep_ * sizeof(float));
 
   fc_param_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
   fc_param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
-  if (fc_param_->a_const_) InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->MutableData()), a_c12_ptr_);
-  if (fc_param_->b_const_) InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), b_r8_ptr_);
+  if (fc_param_->a_const_) {
+    InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->MutableData()), a_pack_ptr_);
+    a_ptr_ = a_pack_ptr_;
+  }
+  if (fc_param_->b_const_) {
+    InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), b_pack_ptr_);
+    b_ptr_ = b_pack_ptr_;
+  }
   return RET_OK;
 }
| @@ -98,14 +111,24 @@ int FullconnectionCPUKernel::Init() { | |||||
| } | } | ||||
| void FullconnectionCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { | void FullconnectionCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { | ||||
| if (is_vector_input_) { | |||||
| memcpy(dst_ptr, src_ptr, fc_param_->deep_ * sizeof(float)); | |||||
| return; | |||||
| } | |||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| RowMajor2Col4Major(src_ptr, a_c12_ptr_, fc_param_->row_, fc_param_->deep_); | |||||
| RowMajor2Col4Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); | |||||
| #else | #else | ||||
| RowMajor2Col12Major(src_ptr, a_c12_ptr_, fc_param_->row_, fc_param_->deep_); | |||||
| RowMajor2Col12Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_); | |||||
| #endif | #endif | ||||
| } | } | ||||
| void FullconnectionCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { | void FullconnectionCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { | ||||
| if (is_vector_input_) { | |||||
| memcpy(dst_ptr, src_ptr, fc_param_->col_ * fc_param_->deep_ * sizeof(float)); | |||||
| return; | |||||
| } | |||||
| RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); | RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); | ||||
| } | } | ||||
| @@ -125,9 +148,16 @@ int FullconnectionCPUKernel::DoMatmul(int task_id) { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| MatMulOpt(a_c12_ptr_, b_r8_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_, | |||||
| c_r_ptr + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, | |||||
| fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, OutType_Nhwc); | |||||
| auto b = b_ptr_ + task_id * thread_stride_ * C8NUM * fc_param_->deep_; | |||||
| auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * C8NUM; | |||||
| auto c = c_ptr_ + task_id * thread_stride_ * C8NUM; | |||||
| if (is_vector_input_) { | |||||
| MatVecMul(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc); | |||||
| } else { | |||||
| MatMulOpt(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, | |||||
| OutType_Nhwc); | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -139,11 +169,24 @@ int FullconnectionCPUKernel::Run() { | |||||
| } | } | ||||
| auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | ||||
| auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | ||||
| c_r_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||||
| if (!fc_param_->a_const_) InitMatrixA(a_ptr, a_c12_ptr_); | |||||
| if (!fc_param_->b_const_) InitMatrixB(b_ptr, b_r8_ptr_); | |||||
| c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||||
| if (!fc_param_->a_const_) { | |||||
| if (is_vector_input_) { | |||||
| a_ptr_ = a_ptr; | |||||
| } else { | |||||
| InitMatrixA(a_ptr, a_pack_ptr_); | |||||
| a_ptr_ = a_pack_ptr_; | |||||
| } | |||||
| } | |||||
| if (!fc_param_->b_const_) { | |||||
| if (is_vector_input_) { | |||||
| b_ptr_ = b_ptr; | |||||
| } else { | |||||
| InitMatrixB(b_ptr, b_pack_ptr_); | |||||
| b_ptr_ = b_pack_ptr_; | |||||
| } | |||||
| } | |||||
| ParallelLaunch(this->context_->thread_pool_, FcFp32MatmulRun, this, thread_count_); | ParallelLaunch(this->context_->thread_pool_, FcFp32MatmulRun, this, thread_count_); | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -19,8 +19,8 @@ | |||||
| #include <vector> | #include <vector> | ||||
| #include "src/runtime/kernel/arm/base/fullconnection_base.h" | #include "src/runtime/kernel/arm/base/fullconnection_base.h" | ||||
| #include "include/errorcode.h" | |||||
| #include "include/context.h" | #include "include/context.h" | ||||
| #include "include/errorcode.h" | |||||
| #include "nnacl/fp32/matmul.h" | #include "nnacl/fp32/matmul.h" | ||||
| using mindspore::lite::InnerContext; | using mindspore::lite::InnerContext; | ||||
| @@ -47,10 +47,13 @@ class FullconnectionCPUKernel : public FullconnectionBaseCPUKernel { | |||||
| void InitMatrixB(float *src_ptr, float *dst_ptr); | void InitMatrixB(float *src_ptr, float *dst_ptr); | ||||
| private: | private: | ||||
| float *a_c12_ptr_ = nullptr; | |||||
| float *b_r8_ptr_ = nullptr; | |||||
| float *c_r_ptr = nullptr; | |||||
| float *a_pack_ptr_ = nullptr; | |||||
| float *b_pack_ptr_ = nullptr; | |||||
| float *c_ptr_ = nullptr; | |||||
| float *bias_ptr_ = nullptr; | float *bias_ptr_ = nullptr; | ||||
| float *a_ptr_ = nullptr; | |||||
| float *b_ptr_ = nullptr; | |||||
| bool is_vector_input_ = false; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ | ||||
| @@ -15,9 +15,9 @@ | |||||
| */ | */ | ||||
| #include "src/runtime/kernel/arm/fp32/matmul.h" | #include "src/runtime/kernel/arm/fp32/matmul.h" | ||||
| #include "include/errorcode.h" | |||||
| #include "nnacl/fp32/matmul.h" | #include "nnacl/fp32/matmul.h" | ||||
| #include "src/runtime/runtime_api.h" | #include "src/runtime/runtime_api.h" | ||||
| #include "include/errorcode.h" | |||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_INPUT_TENSOR_ERROR; | using mindspore::lite::RET_INPUT_TENSOR_ERROR; | ||||
| @@ -28,13 +28,13 @@ namespace mindspore::kernel { | |||||
| MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); } | MatmulCPUKernel::~MatmulCPUKernel() { FreeTmpBuffer(); } | ||||
| void MatmulCPUKernel::FreeTmpBuffer() { | void MatmulCPUKernel::FreeTmpBuffer() { | ||||
| if (a_c12_ptr_ != nullptr) { | |||||
| free(a_c12_ptr_); | |||||
| a_c12_ptr_ = nullptr; | |||||
| if (a_pack_ptr_ != nullptr) { | |||||
| free(a_pack_ptr_); | |||||
| a_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| if (b_r8_ptr_ != nullptr) { | |||||
| free(b_r8_ptr_); | |||||
| b_r8_ptr_ = nullptr; | |||||
| if (b_pack_ptr_ != nullptr) { | |||||
| free(b_pack_ptr_); | |||||
| b_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| if (bias_ptr_ != nullptr) { | if (bias_ptr_ != nullptr) { | ||||
| free(bias_ptr_); | free(bias_ptr_); | ||||
| @@ -50,24 +50,30 @@ int MatmulCPUKernel::MallocMatrixABuffer() { | |||||
| } | } | ||||
| params_->batch = batch; | params_->batch = batch; | ||||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | ||||
| #ifdef ENABLE_ARM64 | |||||
| if (params_->row_ == 1) { | |||||
| is_vector_a_ = true; | |||||
| } | |||||
| #endif | |||||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | ||||
| params_->row_4_ = UP_ROUND(params_->row_, C4NUM); | params_->row_4_ = UP_ROUND(params_->row_, C4NUM); | ||||
| params_->row_12_ = UP_ROUND(params_->row_, C12NUM); | params_->row_12_ = UP_ROUND(params_->row_, C12NUM); | ||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| a_c12_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float))); | |||||
| if (a_c12_ptr_ == nullptr) { | |||||
| a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float))); | |||||
| if (a_pack_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | FreeTmpBuffer(); | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| memset(a_c12_ptr_, 0, params_->row_4_ * params_->deep_ * sizeof(float)); | |||||
| memset(a_pack_ptr_, 0, params_->row_4_ * params_->deep_ * sizeof(float)); | |||||
| #else | #else | ||||
| a_c12_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_12_ * params_->deep_ * sizeof(float))); | |||||
| if (a_c12_ptr_ == nullptr) { | |||||
| int row_tmp = is_vector_a_ ? 1 : params_->row_12_; | |||||
| a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); | |||||
| if (a_pack_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | FreeTmpBuffer(); | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| memset(a_c12_ptr_, 0, params_->row_12_ * params_->deep_ * sizeof(float)); | |||||
| memset(a_pack_ptr_, 0, params_->batch * row_tmp * params_->deep_ * sizeof(float)); | |||||
| #endif | #endif | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -86,12 +92,13 @@ int MatmulCPUKernel::MallocMatrixBBuffer() { | |||||
| params_->col_8_ = UP_ROUND(params_->col_, 8); | params_->col_8_ = UP_ROUND(params_->col_, 8); | ||||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | ||||
| b_r8_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float))); | |||||
| if (b_r8_ptr_ == nullptr) { | |||||
| int col_tmp = is_vector_a_ ? params_->col_ : params_->col_8_; | |||||
| b_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); | |||||
| if (b_pack_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | FreeTmpBuffer(); | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float)); | |||||
| memset(b_pack_ptr_, 0, params_->batch * col_tmp * params_->deep_ * sizeof(float)); | |||||
| thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8)); | thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8)); | ||||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_); | thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_); | ||||
| @@ -99,24 +106,22 @@ int MatmulCPUKernel::MallocMatrixBBuffer() { | |||||
| } | } | ||||
| int MatmulCPUKernel::InitBias() { | int MatmulCPUKernel::InitBias() { | ||||
| auto c_shape = out_tensors_[0]->shape(); | |||||
| if (c_shape.empty()) { | |||||
| return RET_OK; | |||||
| } | |||||
| auto col_8 = UP_ROUND(c_shape[c_shape.size() - 1], 8); | |||||
| bias_ptr_ = reinterpret_cast<float *>(malloc(col_8 * sizeof(float))); | |||||
| if (bias_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| memset(bias_ptr_, 0, col_8 * sizeof(float)); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| auto c_shape = out_tensors_[0]->shape(); | |||||
| auto bias_shape = in_tensors_[1]->shape(); | auto bias_shape = in_tensors_[1]->shape(); | ||||
| if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) { | if (bias_shape[bias_shape.size() - 1] != c_shape[c_shape.size() - 1]) { | ||||
| MS_LOG(ERROR) << "The bias'dimension is not equal with colum"; | MS_LOG(ERROR) << "The bias'dimension is not equal with colum"; | ||||
| FreeTmpBuffer(); | FreeTmpBuffer(); | ||||
| return RET_INPUT_TENSOR_ERROR; | return RET_INPUT_TENSOR_ERROR; | ||||
| } | } | ||||
+    auto col = c_shape[c_shape.size() - 1];
+    auto col_8 = UP_ROUND(col, 8);
+    auto col_tmp = is_vector_a_ ? col : col_8;
+    bias_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * sizeof(float)));
+    if (bias_ptr_ == nullptr) {
+      FreeTmpBuffer();
+      return RET_MEMORY_FAILED;
+    }
+    memset(bias_ptr_, 0, col_tmp * sizeof(float));  // the memcpy below fills only col entries; zero the padding
     memcpy(bias_ptr_, in_tensors_[2]->data_c(), in_tensors_[2]->ElementsNum() * sizeof(float));
   }
   return RET_OK;
| @@ -124,9 +129,9 @@ int MatmulCPUKernel::InitBias() { | |||||
| int MatmulCPUKernel::ReSize() { | int MatmulCPUKernel::ReSize() { | ||||
| if (params_->a_const_ == false || params_->a_has_shape_ == false) { | if (params_->a_const_ == false || params_->a_has_shape_ == false) { | ||||
| if (a_c12_ptr_ != nullptr) { | |||||
| free(a_c12_ptr_); | |||||
| a_c12_ptr_ = nullptr; | |||||
| if (a_pack_ptr_ != nullptr) { | |||||
| free(a_pack_ptr_); | |||||
| a_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| auto ret = MallocMatrixABuffer(); | auto ret = MallocMatrixABuffer(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| @@ -135,9 +140,9 @@ int MatmulCPUKernel::ReSize() { | |||||
| } | } | ||||
| } | } | ||||
| if (params_->b_const_ == false || params_->b_has_shape_ == false) { | if (params_->b_const_ == false || params_->b_has_shape_ == false) { | ||||
| if (b_r8_ptr_ != nullptr) { | |||||
| free(b_r8_ptr_); | |||||
| b_r8_ptr_ = nullptr; | |||||
| if (b_pack_ptr_ != nullptr) { | |||||
| free(b_pack_ptr_); | |||||
| b_pack_ptr_ = nullptr; | |||||
| } | } | ||||
| auto ret = MallocMatrixBBuffer(); | auto ret = MallocMatrixBBuffer(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| @@ -158,6 +163,11 @@ int MatmulCPUKernel::ReSize() { | |||||
| } | } | ||||
| void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { | void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { | ||||
| if (is_vector_a_) { | |||||
| memcpy(dst_ptr, src_ptr, params_->batch * params_->deep_ * sizeof(float)); | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < params_->batch; i++) { | for (int i = 0; i < params_->batch; i++) { | ||||
| float *src = src_ptr + i * params_->deep_ * params_->row_; | float *src = src_ptr + i * params_->deep_ * params_->row_; | ||||
| #ifdef ENABLE_ARM32 | #ifdef ENABLE_ARM32 | ||||
| @@ -180,6 +190,19 @@ void MatmulCPUKernel::InitMatrixA(float *src_ptr, float *dst_ptr) { | |||||
| } | } | ||||
| void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { | void MatmulCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { | ||||
| if (is_vector_a_) { | |||||
| if (params_->b_transpose_) { | |||||
| memcpy(dst_ptr, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float)); | |||||
| } else { | |||||
| for (int i = 0; i < params_->batch; i++) { | |||||
| float *src = src_ptr + i * params_->deep_ * params_->col_; | |||||
| float *dst = dst_ptr + i * params_->deep_ * params_->col_; | |||||
| RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| for (int i = 0; i < params_->batch; i++) { | for (int i = 0; i < params_->batch; i++) { | ||||
| float *src = src_ptr + i * params_->deep_ * params_->col_; | float *src = src_ptr + i * params_->deep_ * params_->col_; | ||||
| float *dst = dst_ptr + i * params_->deep_ * params_->col_8_; | float *dst = dst_ptr + i * params_->deep_ * params_->col_8_; | ||||
| @@ -213,10 +236,12 @@ int MatmulCPUKernel::Init() { | |||||
| params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); | params_->a_const_ = (in_tensors_[0]->data_c() != nullptr); | ||||
| params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); | params_->b_const_ = (in_tensors_[1]->data_c() != nullptr); | ||||
| if (params_->a_const_ == true) { | if (params_->a_const_ == true) { | ||||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_c12_ptr_); | |||||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_); | |||||
| a_ptr_ = a_pack_ptr_; | |||||
| } | } | ||||
| if (params_->b_const_ == true) { | if (params_->b_const_ == true) { | ||||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_r8_ptr_); | |||||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_); | |||||
| b_ptr_ = b_pack_ptr_; | |||||
| } | } | ||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -234,9 +259,14 @@ int MatmulCPUKernel::RunImpl(int task_id) { | |||||
| if (cur_oc <= 0) { | if (cur_oc <= 0) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| MatMulOpt(a_ptr_, b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_, | |||||
| c_ptr_ + task_id * thread_stride_ * C8NUM, bias_ptr_ + task_id * thread_stride_ * C8NUM, ActType_No, | |||||
| params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); | |||||
| auto b = cur_b_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_; | |||||
| auto c = cur_c_ptr_ + task_id * thread_stride_ * C8NUM; | |||||
| auto bias = bias_ptr_ ? bias_ptr_ + task_id * thread_stride_ * C8NUM : NULL; | |||||
| if (is_vector_a_) { | |||||
| MatVecMul(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, cur_oc); | |||||
| } else { | |||||
| MatMulOpt(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -261,16 +291,32 @@ int MatmulCPUKernel::Run() { | |||||
| auto c_src = reinterpret_cast<float *>(out_tensors_[0]->data_c()); | auto c_src = reinterpret_cast<float *>(out_tensors_[0]->data_c()); | ||||
| if (params_->a_const_ == false || is_train()) { | if (params_->a_const_ == false || is_train()) { | ||||
| InitMatrixA(a_src, a_c12_ptr_); | |||||
| if (is_vector_a_) { | |||||
| a_ptr_ = a_src; | |||||
| } else { | |||||
| InitMatrixA(a_src, a_pack_ptr_); | |||||
| a_ptr_ = a_pack_ptr_; | |||||
| } | |||||
| } | } | ||||
   if (params_->b_const_ == false || is_train()) {
-    InitMatrixB(b_src, b_r8_ptr_);
+    if (is_vector_a_ && params_->b_transpose_) {
+      b_ptr_ = b_src;  // already col x deep: each output column is contiguous along depth
+    } else {
+      InitMatrixB(b_src, b_pack_ptr_);  // the vector case without b_transpose_ still needs RowMajor2ColMajor
+      b_ptr_ = b_pack_ptr_;
+    }
   }
 
   for (int i = 0; i < params_->batch; ++i) {
-    a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_;
-    b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_;
-    c_ptr_ = c_src + i * params_->row_ * params_->col_;
+    if (is_vector_a_) {
+      cur_a_ptr_ = a_ptr_ + i * params_->deep_;
+      cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_;
+      cur_c_ptr_ = c_src + i * params_->row_ * params_->col_;
+    } else {
+      cur_a_ptr_ = a_ptr_ + i * params_->row_12_ * params_->deep_;
+      cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_8_;
+      cur_c_ptr_ = c_src + i * params_->row_ * params_->col_;
+    }
     ParallelLaunch(this->context_->thread_pool_, MatmulFloatRun, this, thread_count_);
   }
   return RET_OK;
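The per-batch strides above differ between the two paths: the packed layout advances A by row_12_ * deep_ and B by deep_ * col_8_ per batch (rows and columns rounded up to the block sizes), while the vector path walks the unpacked tensors directly, so A advances by deep_ and B by deep_ * col_.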
| @@ -280,10 +326,10 @@ void MatmulCPUKernel::eval() { | |||||
| // Copy weights after training | // Copy weights after training | ||||
| LiteKernel::eval(); | LiteKernel::eval(); | ||||
| if (params_->a_const_ == true) { | if (params_->a_const_ == true) { | ||||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->MutableData()), a_c12_ptr_); | |||||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->MutableData()), a_pack_ptr_); | |||||
| } | } | ||||
| if (params_->b_const_ == true) { | if (params_->b_const_ == true) { | ||||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), b_r8_ptr_); | |||||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), b_pack_ptr_); | |||||
| } | } | ||||
| } | } | ||||
| @@ -18,8 +18,8 @@ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ | #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include "src/lite_kernel.h" | |||||
| #include "nnacl/matmul_parameter.h" | #include "nnacl/matmul_parameter.h" | ||||
| #include "src/lite_kernel.h" | |||||
| #include "src/runtime/kernel/arm/base/matmul_base.h" | #include "src/runtime/kernel/arm/base/matmul_base.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| @@ -45,12 +45,15 @@ class MatmulCPUKernel : public MatmulBaseCPUKernel { | |||||
| void FreeTmpBuffer(); | void FreeTmpBuffer(); | ||||
| private: | private: | ||||
| float *a_c12_ptr_ = nullptr; | |||||
| float *b_r8_ptr_ = nullptr; | |||||
| float *a_pack_ptr_ = nullptr; | |||||
| float *b_pack_ptr_ = nullptr; | |||||
| float *bias_ptr_ = nullptr; | float *bias_ptr_ = nullptr; | ||||
| float *a_ptr_ = nullptr; | float *a_ptr_ = nullptr; | ||||
| float *b_ptr_ = nullptr; | float *b_ptr_ = nullptr; | ||||
| float *c_ptr_ = nullptr; | |||||
| float *cur_a_ptr_ = nullptr; | |||||
| float *cur_b_ptr_ = nullptr; | |||||
| float *cur_c_ptr_ = nullptr; | |||||
| bool is_vector_a_ = false; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -17,11 +17,11 @@ | |||||
| #include <sys/time.h> | #include <sys/time.h> | ||||
| #include <iostream> | #include <iostream> | ||||
| #include <memory> | #include <memory> | ||||
| #include "src/common/log_adapter.h" | |||||
| #include "common/common_test.h" | #include "common/common_test.h" | ||||
| #include "nnacl/fp32/matmul.h" | |||||
| #include "src/common/file_utils.h" | #include "src/common/file_utils.h" | ||||
| #include "src/common/log_adapter.h" | |||||
| #include "src/runtime/kernel/arm/fp32/fullconnection.h" | #include "src/runtime/kernel/arm/fp32/fullconnection.h" | ||||
| #include "nnacl/fp32/matmul.h" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| using mindspore::lite::Tensor; | using mindspore::lite::Tensor; | ||||
| @@ -144,4 +144,59 @@ TEST_F(TestFcFp32, FcTest2) { | |||||
| fc->Run(); | fc->Run(); | ||||
| CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001); | CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001); | ||||
| } | } | ||||
| int FcTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_, | |||||
| MatMulParameter *matmal_param, float **correct) { | |||||
| Tensor *in_t = new Tensor(kNumberTypeFloat, {1, 1, 1, 20}, schema::Format_NHWC, lite::Tensor::Category::CONST); | |||||
| in_t->MallocData(); | |||||
| float in[] = {1, 0, 3, 0, 4, 5, 2, 5, 2, 5, 1, 5, 0, 1, 2, 0, 2, 1, 0, 5}; | |||||
| memcpy(in_t->MutableData(), in, sizeof(float) * in_t->ElementsNum()); | |||||
| inputs_->push_back(in_t); | |||||
| Tensor *weight_t = new Tensor(kNumberTypeFloat, {16, 20}, schema::Format_NHWC, lite::Tensor::Category::CONST); | |||||
| weight_t->MallocData(); | |||||
| float weight[] = {0, 5, 5, 3, 0, 5, 3, 1, 0, 1, 3, 0, 5, 5, 2, 4, 0, 1, 1, 2, 3, 0, 5, 5, 4, 4, 1, 4, 1, 1, 5, 3, | |||||
| 3, 1, 0, 3, 1, 2, 4, 5, 3, 4, 4, 0, 3, 5, 0, 3, 4, 1, 0, 1, 3, 4, 0, 5, 2, 5, 0, 4, 2, 2, 2, 2, | |||||
| 4, 4, 5, 2, 1, 1, 5, 1, 4, 4, 5, 1, 2, 4, 0, 3, 1, 1, 0, 2, 1, 5, 2, 0, 1, 1, 5, 5, 4, 0, 0, 4, | |||||
| 2, 3, 2, 1, 4, 0, 5, 0, 2, 3, 1, 2, 1, 2, 1, 4, 2, 3, 5, 5, 4, 5, 2, 0, 3, 0, 2, 0, 1, 3, 0, 4, | |||||
| 1, 5, 2, 5, 4, 2, 5, 1, 4, 5, 3, 1, 0, 4, 4, 4, 1, 3, 4, 2, 2, 4, 1, 4, 0, 1, 0, 2, 4, 5, 2, 1, | |||||
| 0, 3, 5, 2, 4, 2, 1, 4, 2, 0, 1, 0, 2, 3, 0, 3, 2, 5, 5, 4, 3, 0, 0, 2, 0, 1, 5, 2, 2, 1, 3, 0, | |||||
| 3, 0, 5, 3, 3, 3, 5, 5, 3, 4, 0, 1, 2, 1, 2, 4, 3, 5, 4, 3, 0, 0, 4, 4, 2, 3, 5, 4, 3, 5, 1, 2, | |||||
| 1, 5, 0, 5, 1, 1, 5, 5, 0, 0, 1, 3, 2, 2, 2, 3, 4, 2, 2, 3, 2, 4, 3, 0, 2, 0, 3, 2, 1, 5, 2, 4, | |||||
| 4, 5, 2, 5, 0, 5, 3, 3, 0, 3, 2, 5, 5, 1, 1, 0, 2, 3, 0, 1, 1, 2, 4, 1, 3, 3, 5, 5, 0, 1, 0, 0, | |||||
| 1, 2, 3, 3, 5, 2, 2, 5, 1, 4, 3, 3, 0, 2, 5, 4, 3, 1, 2, 4, 0, 2, 1, 3, 1, 2, 1, 0, 5, 5, 4, 5}; | |||||
| memcpy(weight_t->MutableData(), weight, sizeof(float) * weight_t->ElementsNum()); | |||||
| inputs_->push_back(weight_t); | |||||
| Tensor *out_t = new Tensor(kNumberTypeFloat, {1, 16}, schema::Format_NHWC, lite::Tensor::Category::CONST); | |||||
| out_t->MallocData(); | |||||
| outputs_->push_back(out_t); | |||||
| matmal_param->b_transpose_ = true; | |||||
| matmal_param->a_transpose_ = false; | |||||
| matmal_param->has_bias_ = false; | |||||
| matmal_param->act_type_ = ActType_No; | |||||
| return out_t->ElementsNum(); | |||||
| } | |||||
+
+TEST_F(TestFcFp32, FcTest3) {
+  std::vector<lite::Tensor *> inputs_;
+  std::vector<lite::Tensor *> outputs_;
+  auto matmul_param = new MatMulParameter();
+  float *correct;
+  int total_size = FcTestInit3(&inputs_, &outputs_, matmul_param, &correct);
+  lite::InnerContext *ctx = new lite::InnerContext;
+  ctx->thread_num_ = 1;
+  ASSERT_EQ(lite::RET_OK, ctx->Init());
+  kernel::FullconnectionCPUKernel *fc =
+    new kernel::FullconnectionCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx, nullptr);
+  fc->Init();
+  struct timeval start, end;
+  gettimeofday(&start, NULL);
+  for (int i = 0; i < 100000; ++i) fc->Run();
+  gettimeofday(&end, NULL);
+  // printf("## elapsed: %llu\n", 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec);
+}
 }  // namespace mindspore