Browse Source

Fix output write paths for big-endian devices (replace scalar `str` stores with `st1` element stores)

pull/13699/head
lixian 4 years ago
parent
commit
08ed63ff91
10 changed files with 407 additions and 592 deletions
  1. +1
    -1
      include/api/context.h
  2. +2
    -2
      mindspore/lite/include/ms_tensor.h
  3. +24
    -48
      mindspore/lite/nnacl/assembly/arm64/AdderFp32.S
  4. +1
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S
  5. +1
    -1
      mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S
  6. +49
    -96
      mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
  7. +196
    -248
      mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S
  8. +130
    -192
      mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
  9. +2
    -2
      mindspore/lite/src/cxx_api/context.cc
  10. +1
    -1
      mindspore/lite/src/tensor.h

+ 1
- 1
include/api/context.h View File

@@ -82,7 +82,7 @@ class MS_API CPUDeviceInfo : public DeviceInfoContext {
public: public:
enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; }; enum DeviceType GetDeviceType() const override { return DeviceType::kCPU; };


/// \brief Set the thread affinity of CPU cores.
/// \brief Set the thread affinity to CPU cores.
/// ///
/// \param mode: 0: no affinities, 1: big cores first, 2: little cores first /// \param mode: 0: no affinities, 1: big cores first, 2: little cores first
void SetThreadAffinity(int mode); void SetThreadAffinity(int mode);


+ 2
- 2
mindspore/lite/include/ms_tensor.h View File

@@ -53,7 +53,7 @@ class MS_API MSTensor {
virtual Vector<int> shape() const = 0; virtual Vector<int> shape() const = 0;


/// \brief Set the shape of MSTensor. /// \brief Set the shape of MSTensor.
virtual void set_shape(const Vector<int> &name) = 0;
virtual void set_shape(const Vector<int> &shape) = 0;


/// \brief Get the number of elements in MSTensor. /// \brief Get the number of elements in MSTensor.
/// ///
@@ -71,7 +71,7 @@ class MS_API MSTensor {
virtual String tensor_name() const = 0; virtual String tensor_name() const = 0;


/// \brief Set the name of MSTensor. /// \brief Set the name of MSTensor.
virtual void set_tensor_name(const String name) = 0;
virtual void set_tensor_name(const String &name) = 0;


/// \brief Get the pointer of data in MSTensor. /// \brief Get the pointer of data in MSTensor.
/// ///


+ 24
- 48
mindspore/lite/nnacl/assembly/arm64/AdderFp32.S View File

@@ -458,115 +458,91 @@ LoopRow4:
b WriteEnd b WriteEnd
Write2: Write2:
add x2, x2, #8 add x2, x2, #8
str d9, [x11]
st1 {v9.2s}, [x11], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d11, [x11]
st1 {v11.2s}, [x11], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d13, [x11]
st1 {v13.2s}, [x11], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d15, [x11]
st1 {v15.2s}, [x11], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.2s}, [x11], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.2s}, [x11], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.2s}, [x11], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.2s}, [x11], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.2s}, [x11], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.2s}, [x11], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.2s}, [x11], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
add x11, x11, x8
st1 {v31.2s}, [x11], x8
add x11, x11, #8 add x11, x11, #8
b WriteEnd b WriteEnd
Write3: Write3:
add x2, x2, #12 add x2, x2, #12
add x19, x11, #8 add x19, x11, #8
str d9, [x11]
st1 {v9.2s}, [x11], x8
st1 {v9.s}[2], [x19], x8 st1 {v9.s}[2], [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d11, [x11]
st1 {v11.2s}, [x11], x8
st1 {v11.s}[2], [x19], x8 st1 {v11.s}[2], [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d13, [x11]
st1 {v13.2s}, [x11], x8
st1 {v13.s}[2], [x19], x8 st1 {v13.s}[2], [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d15, [x11]
st1 {v15.2s}, [x11], x8
st1 {v15.s}[2], [x19], x8 st1 {v15.s}[2], [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.2s}, [x11], x8
st1 {v17.s}[2], [x19], x8 st1 {v17.s}[2], [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.2s}, [x11], x8
st1 {v19.s}[2], [x19], x8 st1 {v19.s}[2], [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.2s}, [x11], x8
st1 {v21.s}[2], [x19], x8 st1 {v21.s}[2], [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.2s}, [x11], x8
st1 {v23.s}[2], [x19], x8 st1 {v23.s}[2], [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.2s}, [x11], x8
st1 {v25.s}[2], [x19], x8 st1 {v25.s}[2], [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.2s}, [x11], x8
st1 {v27.s}[2], [x19], x8 st1 {v27.s}[2], [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.2s}, [x11], x8
st1 {v29.s}[2], [x19], x8 st1 {v29.s}[2], [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
st1 {v31.2s}, [x11], x8
st1 {v31.s}[2], [x19] st1 {v31.s}[2], [x19]
add x11, x11, x8
add x11, x11, #12 add x11, x11, #12
b WriteEnd b WriteEnd
Write4: Write4:


+ 1
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect3x3.S View File

@@ -129,7 +129,7 @@ asm_function ConvDwFp32Indirect3x3
tbnz w11, #1, Write2 tbnz w11, #1, Write2
tbnz w11, #0, Write1 tbnz w11, #0, Write1
Write2: Write2:
str d29, [x0], #8
st1 {v29.2s}, [x0], #8
ext v29.16b, v29.16b, v29.16b, #8 ext v29.16b, v29.16b, v29.16b, #8
tbz w11, #0, NextPixel tbz w11, #0, NextPixel
Write1: Write1:


+ 1
- 1
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Indirect5x5.S View File

@@ -260,7 +260,7 @@ asm_function ConvDwFp32Indirect5x5
tbnz w2, #1, Write2 tbnz w2, #1, Write2
tbnz w2, #0, Write1 tbnz w2, #0, Write1
Write2: Write2:
str d29, [x0], #8
st1 {v29.2s}, [x0], #8
ext v29.16b, v29.16b, v29.16b, #8 ext v29.16b, v29.16b, v29.16b, #8
tbz w2, #0, NextPixel tbz w2, #0, NextPixel
Write1: Write1:


+ 49
- 96
mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S View File

@@ -740,115 +740,91 @@ LoopRow4:
b WriteEnd b WriteEnd
Write2: Write2:
add x2, x2, #8 add x2, x2, #8
str d8, [x11]
st1 {v8.2s}, [x11], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d10, [x11]
st1 {v10.2s}, [x11], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d12, [x11]
st1 {v12.2s}, [x11], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d14, [x11]
st1 {v14.2s}, [x11], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d16, [x11]
st1 {v16.2s}, [x11], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.2s}, [x11], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.2s}, [x11], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.2s}, [x11], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.2s}, [x11], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.2s}, [x11], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.2s}, [x11], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
add x11, x11, x8
st1 {v30.2s}, [x11], x8
add x11, x11, #8 add x11, x11, #8
b WriteEnd b WriteEnd
Write3: Write3:
add x2, x2, #12 add x2, x2, #12
add x19, x11, #8 add x19, x11, #8
str d8, [x11]
st1 {v8.2s}, [x11], x8
st1 {v8.s}[2], [x19], x8 st1 {v8.s}[2], [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d10, [x11]
st1 {v10.2s}, [x11], x8
st1 {v10.s}[2], [x19], x8 st1 {v10.s}[2], [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d12, [x11]
st1 {v12.2s}, [x11], x8
st1 {v12.s}[2], [x19], x8 st1 {v12.s}[2], [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d14, [x11]
st1 {v14.2s}, [x11], x8
st1 {v14.s}[2], [x19], x8 st1 {v14.s}[2], [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d16, [x11]
st1 {v16.2s}, [x11], x8
st1 {v16.s}[2], [x19], x8 st1 {v16.s}[2], [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.2s}, [x11], x8
st1 {v18.s}[2], [x19], x8 st1 {v18.s}[2], [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.2s}, [x11], x8
st1 {v20.s}[2], [x19], x8 st1 {v20.s}[2], [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.2s}, [x11], x8
st1 {v22.s}[2], [x19], x8 st1 {v22.s}[2], [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.2s}, [x11], x8
st1 {v24.s}[2], [x19], x8 st1 {v24.s}[2], [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.2s}, [x11], x8
st1 {v26.s}[2], [x19], x8 st1 {v26.s}[2], [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.2s}, [x11], x8
st1 {v28.s}[2], [x19], x8 st1 {v28.s}[2], [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
st1 {v30.2s}, [x11], x8
st1 {v30.s}[2], [x19] st1 {v30.s}[2], [x19]
add x11, x11, x8
add x11, x11, #12 add x11, x11, #12
b WriteEnd b WriteEnd
Write4: Write4:
@@ -955,62 +931,51 @@ LoopRow4:
add x2, x2, #24 add x2, x2, #24
add x19, x11, #16 add x19, x11, #16
st1 {v8.4s}, [x11], x8 st1 {v8.4s}, [x11], x8
str d9, [x19]
st1 {v9.2s}, [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v10.4s}, [x11], x8 st1 {v10.4s}, [x11], x8
str d11, [x19]
st1 {v11.2s}, [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v12.4s}, [x11], x8 st1 {v12.4s}, [x11], x8
str d13, [x19]
st1 {v13.2s}, [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v14.4s}, [x11], x8 st1 {v14.4s}, [x11], x8
str d15, [x19]
st1 {v15.2s}, [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v16.4s}, [x11], x8 st1 {v16.4s}, [x11], x8
str d17, [x19]
st1 {v17.2s}, [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v18.4s}, [x11], x8 st1 {v18.4s}, [x11], x8
str d19, [x19]
st1 {v19.2s}, [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v20.4s}, [x11], x8 st1 {v20.4s}, [x11], x8
str d21, [x19]
st1 {v21.2s}, [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v22.4s}, [x11], x8 st1 {v22.4s}, [x11], x8
str d23, [x19]
st1 {v23.2s}, [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v24.4s}, [x11], x8 st1 {v24.4s}, [x11], x8
str d25, [x19]
st1 {v25.2s}, [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v26.4s}, [x11], x8 st1 {v26.4s}, [x11], x8
str d27, [x19]
st1 {v27.2s}, [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v28.4s}, [x11], x8 st1 {v28.4s}, [x11], x8
str d29, [x19]
st1 {v29.2s}, [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v30.4s}, [x11], x8 st1 {v30.4s}, [x11], x8
str d31, [x19]
st1 {v31.2s}, [x19]
add x11, x11, #24 add x11, x11, #24
b WriteEnd b WriteEnd
Write7: Write7:
@@ -1018,75 +983,63 @@ LoopRow4:
add x19, x11, #16 add x19, x11, #16
add x20, x11, #24 add x20, x11, #24
st1 {v8.4s}, [x11], x8 st1 {v8.4s}, [x11], x8
str d9, [x19]
st1 {v9.2s}, [x19], x8
st1 {v9.s}[2], [x20], x8 st1 {v9.s}[2], [x20], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v10.4s}, [x11], x8 st1 {v10.4s}, [x11], x8
str d11, [x19]
st1 {v11.2s}, [x19], x8
st1 {v11.s}[2], [x20], x8 st1 {v11.s}[2], [x20], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v12.4s}, [x11], x8 st1 {v12.4s}, [x11], x8
str d13, [x19]
st1 {v13.2s}, [x19], x8
st1 {v13.s}[2], [x20], x8 st1 {v13.s}[2], [x20], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v14.4s}, [x11], x8 st1 {v14.4s}, [x11], x8
str d15, [x19]
st1 {v15.2s}, [x19], x8
st1 {v15.s}[2], [x20], x8 st1 {v15.s}[2], [x20], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v16.4s}, [x11], x8 st1 {v16.4s}, [x11], x8
str d17, [x19]
st1 {v17.2s}, [x19], x8
st1 {v17.s}[2], [x20], x8 st1 {v17.s}[2], [x20], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v18.4s}, [x11], x8 st1 {v18.4s}, [x11], x8
str d19, [x19]
st1 {v19.2s}, [x19], x8
st1 {v19.s}[2], [x20], x8 st1 {v19.s}[2], [x20], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v20.4s}, [x11], x8 st1 {v20.4s}, [x11], x8
str d21, [x19]
st1 {v21.2s}, [x19], x8
st1 {v21.s}[2], [x20], x8 st1 {v21.s}[2], [x20], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v22.4s}, [x11], x8 st1 {v22.4s}, [x11], x8
str d23, [x19]
st1 {v23.2s}, [x19], x8
st1 {v23.s}[2], [x20], x8 st1 {v23.s}[2], [x20], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v24.4s}, [x11], x8 st1 {v24.4s}, [x11], x8
str d25, [x19]
st1 {v25.2s}, [x19], x8
st1 {v25.s}[2], [x20], x8 st1 {v25.s}[2], [x20], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v26.4s}, [x11], x8 st1 {v26.4s}, [x11], x8
str d27, [x19]
st1 {v27.2s}, [x19], x8
st1 {v27.s}[2], [x20], x8 st1 {v27.s}[2], [x20], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v28.4s}, [x11], x8 st1 {v28.4s}, [x11], x8
str d29, [x19]
st1 {v29.2s}, [x19], x8
st1 {v29.s}[2], [x20], x8 st1 {v29.s}[2], [x20], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x19, x19, x8
st1 {v30.4s}, [x11], x8 st1 {v30.4s}, [x11], x8
str d31, [x19]
add x19, x19, x8
st1 {v31.s}[2], [x20], x8
st1 {v31.2s}, [x19]
st1 {v31.s}[2], [x20]
add x11, x11, #28 add x11, x11, #28
b WriteEnd b WriteEnd
WriteC8: WriteC8:


+ 196
- 248
mindspore/lite/nnacl/assembly/fp16/IndirectGemmFp16_16x8.S View File

@@ -334,353 +334,301 @@ IndirectGemmStart:
add x0, x0, #2 add x0, x0, #2
b WriteEnd b WriteEnd
Write2: Write2:
str s16, [x15]
add x15, x15, x7
str s17, [x15]
add x15, x15, x7
str s18, [x15]
add x15, x15, x7
str s19, [x15]
add x15, x15, x7
str s20, [x15]
add x15, x15, x7
str s21, [x15]
add x15, x15, x7
str s22, [x15]
add x15, x15, x7
str s23, [x15]
add x15, x15, x7
str s24, [x15]
add x15, x15, x7
str s25, [x15]
add x15, x15, x7
str s26, [x15]
add x15, x15, x7
str s27, [x15]
add x15, x15, x7
str s28, [x15]
add x15, x15, x7
str s29, [x15]
add x15, x15, x7
str s30, [x15]
add x15, x15, x7
str s31, [x15]
add x17, x15, #2
st1 {v16.h}[0], [x15], x7
st1 {v16.h}[1], [x17], x7
st1 {v17.h}[0], [x15], x7
st1 {v17.h}[1], [x17], x7
st1 {v18.h}[0], [x15], x7
st1 {v18.h}[1], [x17], x7
st1 {v19.h}[0], [x15], x7
st1 {v19.h}[1], [x17], x7
st1 {v20.h}[0], [x15], x7
st1 {v20.h}[1], [x17], x7
st1 {v21.h}[0], [x15], x7
st1 {v21.h}[1], [x17], x7
st1 {v22.h}[0], [x15], x7
st1 {v22.h}[1], [x17], x7
st1 {v23.h}[0], [x15], x7
st1 {v23.h}[1], [x17], x7
st1 {v24.h}[0], [x15], x7
st1 {v24.h}[1], [x17], x7
st1 {v25.h}[0], [x15], x7
st1 {v25.h}[1], [x17], x7
st1 {v26.h}[0], [x15], x7
st1 {v26.h}[1], [x17], x7
st1 {v27.h}[0], [x15], x7
st1 {v27.h}[1], [x17], x7
st1 {v28.h}[0], [x15], x7
st1 {v28.h}[1], [x17], x7
st1 {v29.h}[0], [x15], x7
st1 {v29.h}[1], [x17], x7
st1 {v30.h}[0], [x15], x7
st1 {v30.h}[1], [x17], x7
st1 {v31.h}[0], [x15]
st1 {v31.h}[1], [x17]
add x0, x0, #4 add x0, x0, #4
b WriteEnd b WriteEnd
Write3: Write3:
add x17, x15, #4 add x17, x15, #4
str s16, [x15]
add x15, x15, x7
add x16, x15, #2
st1 {v16.h}[0], [x15], x7
st1 {v16.h}[1], [x16], x7
st1 {v16.h}[2], [x17], x7 st1 {v16.h}[2], [x17], x7
str s17, [x15]
add x15, x15, x7
st1 {v17.h}[0], [x15], x7
st1 {v17.h}[1], [x16], x7
st1 {v17.h}[2], [x17], x7 st1 {v17.h}[2], [x17], x7
str s18, [x15]
add x15, x15, x7
st1 {v18.h}[0], [x15], x7
st1 {v18.h}[1], [x16], x7
st1 {v18.h}[2], [x17], x7 st1 {v18.h}[2], [x17], x7
str s19, [x15]
add x15, x15, x7
st1 {v19.h}[0], [x15], x7
st1 {v19.h}[1], [x16], x7
st1 {v19.h}[2], [x17], x7 st1 {v19.h}[2], [x17], x7
str s20, [x15]
add x15, x15, x7
st1 {v20.h}[0], [x15], x7
st1 {v20.h}[1], [x16], x7
st1 {v20.h}[2], [x17], x7 st1 {v20.h}[2], [x17], x7
str s21, [x15]
add x15, x15, x7
st1 {v21.h}[0], [x15], x7
st1 {v21.h}[1], [x16], x7
st1 {v21.h}[2], [x17], x7 st1 {v21.h}[2], [x17], x7
str s22, [x15]
add x15, x15, x7
st1 {v22.h}[0], [x15], x7
st1 {v22.h}[1], [x16], x7
st1 {v22.h}[2], [x17], x7 st1 {v22.h}[2], [x17], x7
str s23, [x15]
add x15, x15, x7
st1 {v23.h}[0], [x15], x7
st1 {v23.h}[1], [x16], x7
st1 {v23.h}[2], [x17], x7 st1 {v23.h}[2], [x17], x7
str s24, [x15]
add x15, x15, x7
st1 {v24.h}[0], [x15], x7
st1 {v24.h}[1], [x16], x7
st1 {v24.h}[2], [x17], x7 st1 {v24.h}[2], [x17], x7
str s25, [x15]
add x15, x15, x7
st1 {v25.h}[0], [x15], x7
st1 {v25.h}[1], [x16], x7
st1 {v25.h}[2], [x17], x7 st1 {v25.h}[2], [x17], x7
str s26, [x15]
add x15, x15, x7
st1 {v26.h}[0], [x15], x7
st1 {v26.h}[1], [x16], x7
st1 {v26.h}[2], [x17], x7 st1 {v26.h}[2], [x17], x7
str s27, [x15]
add x15, x15, x7
st1 {v27.h}[0], [x15], x7
st1 {v27.h}[1], [x16], x7
st1 {v27.h}[2], [x17], x7 st1 {v27.h}[2], [x17], x7
str s28, [x15]
add x15, x15, x7
st1 {v28.h}[0], [x15], x7
st1 {v28.h}[1], [x16], x7
st1 {v28.h}[2], [x17], x7 st1 {v28.h}[2], [x17], x7
str s29, [x15]
add x15, x15, x7
st1 {v29.h}[0], [x15], x7
st1 {v29.h}[1], [x16], x7
st1 {v29.h}[2], [x17], x7 st1 {v29.h}[2], [x17], x7
str s30, [x15]
add x15, x15, x7
st1 {v30.h}[0], [x15], x7
st1 {v30.h}[1], [x16], x7
st1 {v30.h}[2], [x17], x7 st1 {v30.h}[2], [x17], x7
str s31, [x15]
st1 {v31.h}[0], [x15]
st1 {v31.h}[1], [x16]
st1 {v31.h}[2], [x17] st1 {v31.h}[2], [x17]
add x0, x0, #6 add x0, x0, #6
b WriteEnd b WriteEnd
Write4: Write4:
str d16, [x15]
add x15, x15, x7
str d17, [x15]
add x15, x15, x7
str d18, [x15]
add x15, x15, x7
str d19, [x15]
add x15, x15, x7
str d20, [x15]
add x15, x15, x7
str d21, [x15]
add x15, x15, x7
str d22, [x15]
add x15, x15, x7
str d23, [x15]
add x15, x15, x7
str d24, [x15]
add x15, x15, x7
str d25, [x15]
add x15, x15, x7
str d26, [x15]
add x15, x15, x7
str d27, [x15]
add x15, x15, x7
str d28, [x15]
add x15, x15, x7
str d29, [x15]
add x15, x15, x7
str d30, [x15]
add x15, x15, x7
str d31, [x15]
st1 {v16.4h}, [x15], x7
st1 {v17.4h}, [x15], x7
st1 {v18.4h}, [x15], x7
st1 {v19.4h}, [x15], x7
st1 {v20.4h}, [x15], x7
st1 {v21.4h}, [x15], x7
st1 {v22.4h}, [x15], x7
st1 {v23.4h}, [x15], x7
st1 {v24.4h}, [x15], x7
st1 {v25.4h}, [x15], x7
st1 {v26.4h}, [x15], x7
st1 {v27.4h}, [x15], x7
st1 {v28.4h}, [x15], x7
st1 {v29.4h}, [x15], x7
st1 {v30.4h}, [x15], x7
st1 {v31.4h}, [x15]
add x0, x0, #8 add x0, x0, #8
b WriteEnd b WriteEnd
Write5: Write5:
add x17, x15, #8 add x17, x15, #8
str d16, [x15]
add x15, x15, x7
st1 {v16.4h}, [x15], x7
st1 {v16.h}[4], [x17], x7 st1 {v16.h}[4], [x17], x7
str d17, [x15]
add x15, x15, x7
st1 {v17.4h}, [x15], x7
st1 {v17.h}[4], [x17], x7 st1 {v17.h}[4], [x17], x7
str d18, [x15]
add x15, x15, x7
st1 {v18.4h}, [x15], x7
st1 {v18.h}[4], [x17], x7 st1 {v18.h}[4], [x17], x7
str d19, [x15]
add x15, x15, x7
st1 {v19.4h}, [x15], x7
st1 {v19.h}[4], [x17], x7 st1 {v19.h}[4], [x17], x7
str d20, [x15]
add x15, x15, x7
st1 {v20.4h}, [x15], x7
st1 {v20.h}[4], [x17], x7 st1 {v20.h}[4], [x17], x7
str d21, [x15]
add x15, x15, x7
st1 {v21.4h}, [x15], x7
st1 {v21.h}[4], [x17], x7 st1 {v21.h}[4], [x17], x7
str d22, [x15]
add x15, x15, x7
st1 {v22.4h}, [x15], x7
st1 {v22.h}[4], [x17], x7 st1 {v22.h}[4], [x17], x7
str d23, [x15]
add x15, x15, x7
st1 {v23.4h}, [x15], x7
st1 {v23.h}[4], [x17], x7 st1 {v23.h}[4], [x17], x7
str d24, [x15]
add x15, x15, x7
st1 {v24.4h}, [x15], x7
st1 {v24.h}[4], [x17], x7 st1 {v24.h}[4], [x17], x7
str d25, [x15]
add x15, x15, x7
st1 {v25.4h}, [x15], x7
st1 {v25.h}[4], [x17], x7 st1 {v25.h}[4], [x17], x7
str d26, [x15]
add x15, x15, x7
st1 {v26.4h}, [x15], x7
st1 {v26.h}[4], [x17], x7 st1 {v26.h}[4], [x17], x7
str d27, [x15]
add x15, x15, x7
st1 {v27.4h}, [x15], x7
st1 {v27.h}[4], [x17], x7 st1 {v27.h}[4], [x17], x7
str d28, [x15]
add x15, x15, x7
st1 {v28.4h}, [x15], x7
st1 {v28.h}[4], [x17], x7 st1 {v28.h}[4], [x17], x7
str d29, [x15]
add x15, x15, x7
st1 {v29.4h}, [x15], x7
st1 {v29.h}[4], [x17], x7 st1 {v29.h}[4], [x17], x7
str d30, [x15]
add x15, x15, x7
st1 {v30.4h}, [x15], x7
st1 {v30.h}[4], [x17], x7 st1 {v30.h}[4], [x17], x7
str d31, [x15]
st1 {v31.4h}, [x15]
st1 {v31.h}[4], [x17] st1 {v31.h}[4], [x17]
add x0, x0, #10 add x0, x0, #10
b WriteEnd b WriteEnd
Write6: Write6:
add x17, x15, #8 add x17, x15, #8
str d16, [x15]
add x15, x15, x7
add x16, x15, #10
st1 {v16.4h}, [x15], x7
ins v0.s[0], v16.s[2] ins v0.s[0], v16.s[2]
str s0, [x17]
add x17, x17, x7
str d17, [x15]
add x15, x15, x7
st1 {v0.h}[0], [x17], x7
st1 {v0.h}[1], [x16], x7
st1 {v17.4h}, [x15], x7
ins v1.s[0], v17.s[2] ins v1.s[0], v17.s[2]
str s1, [x17]
add x17, x17, x7
str d18, [x15]
add x15, x15, x7
st1 {v1.h}[0], [x17], x7
st1 {v1.h}[1], [x16], x7
st1 {v18.4h}, [x15], x7
ins v2.s[0], v18.s[2] ins v2.s[0], v18.s[2]
str s2, [x17]
add x17, x17, x7
str d19, [x15]
add x15, x15, x7
st1 {v2.h}[0], [x17], x7
st1 {v2.h}[1], [x16], x7
st1 {v19.4h}, [x15], x7
ins v3.s[0], v19.s[2] ins v3.s[0], v19.s[2]
str s3, [x17]
add x17, x17, x7
str d20, [x15]
add x15, x15, x7
st1 {v3.h}[0], [x17], x7
st1 {v3.h}[1], [x16], x7
st1 {v20.4h}, [x15], x7
ins v4.s[0], v20.s[2] ins v4.s[0], v20.s[2]
str s4, [x17]
add x17, x17, x7
str d21, [x15]
add x15, x15, x7
st1 {v4.h}[0], [x17], x7
st1 {v4.h}[1], [x16], x7
st1 {v21.4h}, [x15], x7
ins v5.s[0], v21.s[2] ins v5.s[0], v21.s[2]
str s5, [x17]
add x17, x17, x7
str d22, [x15]
add x15, x15, x7
st1 {v5.h}[0], [x17], x7
st1 {v5.h}[1], [x16], x7
st1 {v22.4h}, [x15], x7
ins v6.s[0], v22.s[2] ins v6.s[0], v22.s[2]
str s6, [x17]
add x17, x17, x7
str d23, [x15]
add x15, x15, x7
st1 {v6.h}[0], [x17], x7
st1 {v6.h}[1], [x16], x7
st1 {v23.4h}, [x15], x7
ins v7.s[0], v23.s[2] ins v7.s[0], v23.s[2]
str s7, [x17]
add x17, x17, x7
str d24, [x15]
add x15, x15, x7
st1 {v7.h}[0], [x17], x7
st1 {v7.h}[1], [x16], x7
st1 {v24.4h}, [x15], x7
ins v8.s[0], v24.s[2] ins v8.s[0], v24.s[2]
str s8, [x17]
add x17, x17, x7
str d25, [x15]
add x15, x15, x7
st1 {v8.h}[0], [x17], x7
st1 {v8.h}[1], [x16], x7
st1 {v25.4h}, [x15], x7
ins v9.s[0], v25.s[2] ins v9.s[0], v25.s[2]
str s9, [x17]
add x17, x17, x7
str d26, [x15]
add x15, x15, x7
st1 {v9.h}[0], [x17], x7
st1 {v9.h}[1], [x16], x7
st1 {v26.4h}, [x15], x7
ins v10.s[0], v26.s[2] ins v10.s[0], v26.s[2]
str s10, [x17]
add x17, x17, x7
str d27, [x15]
add x15, x15, x7
st1 {v10.h}[0], [x17], x7
st1 {v10.h}[1], [x16], x7
st1 {v27.4h}, [x15], x7
ins v11.s[0], v27.s[2] ins v11.s[0], v27.s[2]
str s11, [x17]
add x17, x17, x7
str d28, [x15]
add x15, x15, x7
st1 {v11.h}[0], [x17], x7
st1 {v11.h}[1], [x16], x7
st1 {v28.4h}, [x15], x7
ins v12.s[0], v28.s[2] ins v12.s[0], v28.s[2]
str s12, [x17]
add x17, x17, x7
str d29, [x15]
add x15, x15, x7
st1 {v12.h}[0], [x17], x7
st1 {v12.h}[1], [x16], x7
st1 {v29.4h}, [x15], x7
ins v13.s[0], v29.s[2] ins v13.s[0], v29.s[2]
str s13, [x17]
add x17, x17, x7
str d30, [x15]
add x15, x15, x7
st1 {v13.h}[0], [x17], x7
st1 {v13.h}[1], [x16], x7
st1 {v30.4h}, [x15], x7
ins v14.s[0], v30.s[2] ins v14.s[0], v30.s[2]
str s14, [x17]
add x17, x17, x7
str d31, [x15]
st1 {v14.h}[0], [x17], x7
st1 {v14.h}[1], [x16], x7
st1 {v31.4h}, [x15]
ins v15.s[0], v31.s[2] ins v15.s[0], v31.s[2]
str s15, [x17]
st1 {v14.h}[0], [x17]
st1 {v14.h}[1], [x16]
add x0, x0, #12 add x0, x0, #12
b WriteEnd b WriteEnd
Write7: Write7:
add x17, x15, #8 add x17, x15, #8
add x18, x15, #10
add x16, x15, #12 add x16, x15, #12
str d16, [x15]
add x15, x15, x7
st1 {v16.4h}, [x15], x7
ins v0.s[0], v16.s[2] ins v0.s[0], v16.s[2]
str s0, [x17]
add x17, x17, x7
st1 {v0.h}[0], [x17], x7
st1 {v0.h}[1], [x18], x7
st1 {v16.h}[6], [x16], x7 st1 {v16.h}[6], [x16], x7
str d17, [x15]
add x15, x15, x7
st1 {v17.4h}, [x15], x7
ins v1.s[0], v17.s[2] ins v1.s[0], v17.s[2]
str s1, [x17]
add x17, x17, x7
st1 {v1.h}[0], [x17], x7
st1 {v1.h}[1], [x18], x7
st1 {v17.h}[6], [x16], x7 st1 {v17.h}[6], [x16], x7
str d18, [x15]
add x15, x15, x7
st1 {v18.4h}, [x15], x7
ins v2.s[0], v18.s[2] ins v2.s[0], v18.s[2]
str s2, [x17]
add x17, x17, x7
st1 {v2.h}[0], [x17], x7
st1 {v2.h}[1], [x18], x7
st1 {v18.h}[6], [x16], x7 st1 {v18.h}[6], [x16], x7
str d19, [x15]
add x15, x15, x7
st1 {v19.4h}, [x15], x7
ins v3.s[0], v19.s[2] ins v3.s[0], v19.s[2]
str s3, [x17]
add x17, x17, x7
st1 {v3.h}[0], [x17], x7
st1 {v3.h}[1], [x18], x7
st1 {v19.h}[6], [x16], x7 st1 {v19.h}[6], [x16], x7
str d20, [x15]
add x15, x15, x7
st1 {v20.4h}, [x15], x7
ins v4.s[0], v20.s[2] ins v4.s[0], v20.s[2]
str s4, [x17]
add x17, x17, x7
st1 {v4.h}[0], [x17], x7
st1 {v4.h}[1], [x18], x7
st1 {v20.h}[6], [x16], x7 st1 {v20.h}[6], [x16], x7
str d21, [x15]
add x15, x15, x7
st1 {v21.4h}, [x15], x7
ins v5.s[0], v21.s[2] ins v5.s[0], v21.s[2]
str s5, [x17]
add x17, x17, x7
st1 {v5.h}[0], [x17], x7
st1 {v5.h}[1], [x18], x7
st1 {v21.h}[6], [x16], x7 st1 {v21.h}[6], [x16], x7
str d22, [x15]
add x15, x15, x7
st1 {v22.4h}, [x15], x7
ins v6.s[0], v22.s[2] ins v6.s[0], v22.s[2]
str s6, [x17]
add x17, x17, x7
st1 {v6.h}[0], [x17], x7
st1 {v6.h}[1], [x18], x7
st1 {v22.h}[6], [x16], x7 st1 {v22.h}[6], [x16], x7
str d23, [x15]
add x15, x15, x7
st1 {v23.4h}, [x15], x7
ins v7.s[0], v23.s[2] ins v7.s[0], v23.s[2]
str s7, [x17]
add x17, x17, x7
st1 {v7.h}[0], [x17], x7
st1 {v7.h}[1], [x18], x7
st1 {v23.h}[6], [x16], x7 st1 {v23.h}[6], [x16], x7
str d24, [x15]
add x15, x15, x7
st1 {v24.4h}, [x15], x7
ins v8.s[0], v24.s[2] ins v8.s[0], v24.s[2]
str s8, [x17]
add x17, x17, x7
st1 {v8.h}[0], [x17], x7
st1 {v8.h}[1], [x18], x7
st1 {v24.h}[6], [x16], x7 st1 {v24.h}[6], [x16], x7
str d25, [x15]
add x15, x15, x7
st1 {v25.4h}, [x15], x7
ins v9.s[0], v25.s[2] ins v9.s[0], v25.s[2]
str s9, [x17]
add x17, x17, x7
st1 {v9.h}[0], [x17], x7
st1 {v9.h}[1], [x18], x7
st1 {v25.h}[6], [x16], x7 st1 {v25.h}[6], [x16], x7
str d26, [x15]
add x15, x15, x7
st1 {v26.4h}, [x15], x7
ins v10.s[0], v26.s[2] ins v10.s[0], v26.s[2]
str s10, [x17]
add x17, x17, x7
st1 {v10.h}[0], [x17], x7
st1 {v10.h}[1], [x18], x7
st1 {v26.h}[6], [x16], x7 st1 {v26.h}[6], [x16], x7
str d27, [x15]
add x15, x15, x7
st1 {v27.4h}, [x15], x7
ins v11.s[0], v27.s[2] ins v11.s[0], v27.s[2]
str s11, [x17]
add x17, x17, x7
st1 {v11.h}[0], [x17], x7
st1 {v11.h}[1], [x18], x7
st1 {v27.h}[6], [x16], x7 st1 {v27.h}[6], [x16], x7
str d28, [x15]
add x15, x15, x7
st1 {v28.4h}, [x15], x7
ins v12.s[0], v28.s[2] ins v12.s[0], v28.s[2]
str s12, [x17]
add x17, x17, x7
st1 {v12.h}[0], [x17], x7
st1 {v12.h}[1], [x18], x7
st1 {v28.h}[6], [x16], x7 st1 {v28.h}[6], [x16], x7
str d29, [x15]
add x15, x15, x7
st1 {v29.4h}, [x15], x7
ins v13.s[0], v29.s[2] ins v13.s[0], v29.s[2]
str s13, [x17]
add x17, x17, x7
st1 {v13.h}[0], [x17], x7
st1 {v13.h}[1], [x18], x7
st1 {v29.h}[6], [x16], x7 st1 {v29.h}[6], [x16], x7
str d30, [x15]
add x15, x15, x7
st1 {v30.4h}, [x15], x7
ins v14.s[0], v30.s[2] ins v14.s[0], v30.s[2]
str s14, [x17]
add x17, x17, x7
st1 {v14.h}[0], [x17], x7
st1 {v14.h}[1], [x18], x7
st1 {v30.h}[6], [x16], x7 st1 {v30.h}[6], [x16], x7
str d31, [x15]
st1 {v31.4h}, [x15]
ins v15.s[0], v31.s[2] ins v15.s[0], v31.s[2]
str s15, [x17]
st1 {v15.h}[0], [x17]
st1 {v15.h}[1], [x18]
st1 {v31.h}[6], [x16] st1 {v31.h}[6], [x16]
add x0, x0, #14 add x0, x0, #14
b WriteEnd b WriteEnd


+ 130
- 192
mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S View File

@@ -677,400 +677,354 @@ LoopRow:
b WriteEnd b WriteEnd
Write2: Write2:
add x2, x2, #4 add x2, x2, #4
str s16, [x11]
add x19, x11, #2
st1 {v16.h}[0], [x11], x8
st1 {v16.h}[1], [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s17, [x11]
st1 {v17.h}[0], [x11], x8
st1 {v17.h}[1], [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s18, [x11]
st1 {v18.h}[0], [x11], x8
st1 {v18.h}[1], [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s19, [x11]
st1 {v19.h}[0], [x11], x8
st1 {v19.h}[1], [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s20, [x11]
st1 {v20.h}[0], [x11], x8
st1 {v20.h}[1], [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s21, [x11]
st1 {v21.h}[0], [x11], x8
st1 {v21.h}[1], [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s22, [x11]
st1 {v22.h}[0], [x11], x8
st1 {v22.h}[1], [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s23, [x11]
st1 {v23.h}[0], [x11], x8
st1 {v23.h}[1], [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s24, [x11]
st1 {v24.h}[0], [x11], x8
st1 {v24.h}[1], [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s25, [x11]
st1 {v25.h}[0], [x11], x8
st1 {v25.h}[1], [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s26, [x11]
st1 {v26.h}[0], [x11], x8
st1 {v26.h}[1], [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s27, [x11]
st1 {v27.h}[0], [x11], x8
st1 {v27.h}[1], [x19], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s28, [x11]
st1 {v28.h}[0], [x11], x8
st1 {v28.h}[1], [x19], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s29, [x11]
st1 {v29.h}[0], [x11], x8
st1 {v29.h}[1], [x19], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s30, [x11]
st1 {v30.h}[0], [x11], x8
st1 {v30.h}[1], [x19], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s31, [x11]
add x11, x11, x8
st1 {v31.h}[0], [x11], x8
st1 {v31.h}[1], [x19]
add x11, x11, #4 add x11, x11, #4
b WriteEnd b WriteEnd
Write3: Write3:
add x2, x2, #6 add x2, x2, #6
add x19, x11, #4 add x19, x11, #4
str s16, [x11]
add x20, x11, #2
st1 {v16.h}[0], [x11], x8
st1 {v16.h}[1], [x20], x8
st1 {v16.h}[2], [x19], x8 st1 {v16.h}[2], [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s17, [x11]
st1 {v17.h}[0], [x11], x8
st1 {v17.h}[1], [x20], x8
st1 {v17.h}[2], [x19], x8 st1 {v17.h}[2], [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s18, [x11]
st1 {v18.h}[0], [x11], x8
st1 {v18.h}[1], [x20], x8
st1 {v18.h}[2], [x19], x8 st1 {v18.h}[2], [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s19, [x11]
st1 {v19.h}[0], [x11], x8
st1 {v19.h}[1], [x20], x8
st1 {v19.h}[2], [x19], x8 st1 {v19.h}[2], [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s20, [x11]
st1 {v20.h}[0], [x11], x8
st1 {v20.h}[1], [x20], x8
st1 {v20.h}[2], [x19], x8 st1 {v20.h}[2], [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s21, [x11]
st1 {v21.h}[0], [x11], x8
st1 {v21.h}[1], [x20], x8
st1 {v21.h}[2], [x19], x8 st1 {v21.h}[2], [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s22, [x11]
st1 {v22.h}[0], [x11], x8
st1 {v22.h}[1], [x20], x8
st1 {v22.h}[2], [x19], x8 st1 {v22.h}[2], [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s23, [x11]
st1 {v23.h}[0], [x11], x8
st1 {v23.h}[1], [x20], x8
st1 {v23.h}[2], [x19], x8 st1 {v23.h}[2], [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s24, [x11]
st1 {v24.h}[0], [x11], x8
st1 {v24.h}[1], [x20], x8
st1 {v24.h}[2], [x19], x8 st1 {v24.h}[2], [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s25, [x11]
st1 {v25.h}[0], [x11], x8
st1 {v25.h}[1], [x20], x8
st1 {v25.h}[2], [x19], x8 st1 {v25.h}[2], [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s26, [x11]
st1 {v26.h}[0], [x11], x8
st1 {v26.h}[1], [x20], x8
st1 {v26.h}[2], [x19], x8 st1 {v26.h}[2], [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s27, [x11]
st1 {v27.h}[0], [x11], x8
st1 {v27.h}[1], [x20], x8
st1 {v27.h}[2], [x19], x8 st1 {v27.h}[2], [x19], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s28, [x11]
st1 {v28.h}[0], [x11], x8
st1 {v28.h}[1], [x20], x8
st1 {v28.h}[2], [x19], x8 st1 {v28.h}[2], [x19], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s29, [x11]
st1 {v29.h}[0], [x11], x8
st1 {v29.h}[1], [x20], x8
st1 {v29.h}[2], [x19], x8 st1 {v29.h}[2], [x19], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s30, [x11]
st1 {v30.h}[0], [x11], x8
st1 {v30.h}[1], [x20], x8
st1 {v30.h}[2], [x19], x8 st1 {v30.h}[2], [x19], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str s31, [x11]
st1 {v31.h}[0], [x11], x8
st1 {v31.h}[1], [x20]
st1 {v31.h}[2], [x19] st1 {v31.h}[2], [x19]
add x11, x11, x8
add x11, x11, #6 add x11, x11, #6
b WriteEnd b WriteEnd
Write4: Write4:
add x2, x2, #8 add x2, x2, #8
str d16, [x11]
st1 {v16.4h}, [x11], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.4h}, [x11], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.4h}, [x11], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.4h}, [x11], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.4h}, [x11], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.4h}, [x11], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.4h}, [x11], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.4h}, [x11], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.4h}, [x11], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.4h}, [x11], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.4h}, [x11], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.4h}, [x11], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.4h}, [x11], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.4h}, [x11], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
st1 {v30.4h}, [x11], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
add x11, x11, x8
st1 {v31.4h}, [x11], x8
add x11, x11, #8 add x11, x11, #8
b WriteEnd b WriteEnd
Write5: Write5:
add x2, x2, #10 add x2, x2, #10
add x19, x11, #8 add x19, x11, #8
str d16, [x11]
st1 {v16.4h}, [x11], x8
st1 {v16.h}[4], [x19], x8 st1 {v16.h}[4], [x19], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.4h}, [x11], x8
st1 {v17.h}[4], [x19], x8 st1 {v17.h}[4], [x19], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.4h}, [x11], x8
st1 {v18.h}[4], [x19], x8 st1 {v18.h}[4], [x19], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.4h}, [x11], x8
st1 {v19.h}[4], [x19], x8 st1 {v19.h}[4], [x19], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.4h}, [x11], x8
st1 {v20.h}[4], [x19], x8 st1 {v20.h}[4], [x19], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.4h}, [x11], x8
st1 {v21.h}[4], [x19], x8 st1 {v21.h}[4], [x19], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.4h}, [x11], x8
st1 {v22.h}[4], [x19], x8 st1 {v22.h}[4], [x19], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.4h}, [x11], x8
st1 {v23.h}[4], [x19], x8 st1 {v23.h}[4], [x19], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.4h}, [x11], x8
st1 {v24.h}[4], [x19], x8 st1 {v24.h}[4], [x19], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.4h}, [x11], x8
st1 {v25.h}[4], [x19], x8 st1 {v25.h}[4], [x19], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.4h}, [x11], x8
st1 {v26.h}[4], [x19], x8 st1 {v26.h}[4], [x19], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.4h}, [x11], x8
st1 {v27.h}[4], [x19], x8 st1 {v27.h}[4], [x19], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.4h}, [x11], x8
st1 {v28.h}[4], [x19], x8 st1 {v28.h}[4], [x19], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.4h}, [x11], x8
st1 {v29.h}[4], [x19], x8 st1 {v29.h}[4], [x19], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
st1 {v30.4h}, [x11], x8
st1 {v30.h}[4], [x19], x8 st1 {v30.h}[4], [x19], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
st1 {v31.4h}, [x11], x8
st1 {v31.h}[4], [x19] st1 {v31.h}[4], [x19]
add x11, x11, x8
add x11, x11, #10 add x11, x11, #10
b WriteEnd b WriteEnd
Write6: Write6:
add x2, x2, #12 add x2, x2, #12
add x19, x11, #8 add x19, x11, #8
add x20, x11, #10 add x20, x11, #10
str d16, [x11]
st1 {v16.4h}, [x11], x8
st1 {v16.h}[4], [x19], x8 st1 {v16.h}[4], [x19], x8
st1 {v16.h}[5], [x20], x8 st1 {v16.h}[5], [x20], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.4h}, [x11], x8
st1 {v17.h}[4], [x19], x8 st1 {v17.h}[4], [x19], x8
st1 {v17.h}[5], [x20], x8 st1 {v17.h}[5], [x20], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.4h}, [x11], x8
st1 {v18.h}[4], [x19], x8 st1 {v18.h}[4], [x19], x8
st1 {v18.h}[5], [x20], x8 st1 {v18.h}[5], [x20], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.4h}, [x11], x8
st1 {v19.h}[4], [x19], x8 st1 {v19.h}[4], [x19], x8
st1 {v19.h}[5], [x20], x8 st1 {v19.h}[5], [x20], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.4h}, [x11], x8
st1 {v20.h}[4], [x19], x8 st1 {v20.h}[4], [x19], x8
st1 {v20.h}[5], [x20], x8 st1 {v20.h}[5], [x20], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.4h}, [x11], x8
st1 {v21.h}[4], [x19], x8 st1 {v21.h}[4], [x19], x8
st1 {v21.h}[5], [x20], x8 st1 {v21.h}[5], [x20], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.4h}, [x11], x8
st1 {v22.h}[4], [x19], x8 st1 {v22.h}[4], [x19], x8
st1 {v22.h}[5], [x20], x8 st1 {v22.h}[5], [x20], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.4h}, [x11], x8
st1 {v23.h}[4], [x19], x8 st1 {v23.h}[4], [x19], x8
st1 {v23.h}[5], [x20], x8 st1 {v23.h}[5], [x20], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.4h}, [x11], x8
st1 {v24.h}[4], [x19], x8 st1 {v24.h}[4], [x19], x8
st1 {v24.h}[5], [x20], x8 st1 {v24.h}[5], [x20], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.4h}, [x11], x8
st1 {v25.h}[4], [x19], x8 st1 {v25.h}[4], [x19], x8
st1 {v25.h}[5], [x20], x8 st1 {v25.h}[5], [x20], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.4h}, [x11], x8
st1 {v26.h}[4], [x19], x8 st1 {v26.h}[4], [x19], x8
st1 {v26.h}[5], [x20], x8 st1 {v26.h}[5], [x20], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.4h}, [x11], x8
st1 {v27.h}[4], [x19], x8 st1 {v27.h}[4], [x19], x8
st1 {v27.h}[5], [x20], x8 st1 {v27.h}[5], [x20], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.4h}, [x11], x8
st1 {v28.h}[4], [x19], x8 st1 {v28.h}[4], [x19], x8
st1 {v28.h}[5], [x20], x8 st1 {v28.h}[5], [x20], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.4h}, [x11], x8
st1 {v29.h}[4], [x19], x8 st1 {v29.h}[4], [x19], x8
st1 {v29.h}[5], [x20], x8 st1 {v29.h}[5], [x20], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
st1 {v30.4h}, [x11], x8
st1 {v30.h}[4], [x19], x8 st1 {v30.h}[4], [x19], x8
st1 {v30.h}[5], [x20], x8 st1 {v30.h}[5], [x20], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
st1 {v31.4h}, [x11], x8
st1 {v31.h}[4], [x19] st1 {v31.h}[4], [x19]
st1 {v31.h}[5], [x20] st1 {v31.h}[5], [x20]
add x11, x11, x8
add x11, x11, #12 add x11, x11, #12
b WriteEnd b WriteEnd
Write7: Write7:
@@ -1078,116 +1032,100 @@ LoopRow:
add x19, x11, #8 add x19, x11, #8
add x20, x11, #10 add x20, x11, #10
add x10, x11, #12 add x10, x11, #12
str d16, [x11]
st1 {v16.4h}, [x11], x8
st1 {v16.h}[4], [x19], x8 st1 {v16.h}[4], [x19], x8
st1 {v16.h}[5], [x20], x8 st1 {v16.h}[5], [x20], x8
st1 {v16.h}[6], [x10], x8 st1 {v16.h}[6], [x10], x8
cmp x6, #1 cmp x6, #1
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d17, [x11]
st1 {v17.4h}, [x11], x8
st1 {v17.h}[4], [x19], x8 st1 {v17.h}[4], [x19], x8
st1 {v17.h}[5], [x20], x8 st1 {v17.h}[5], [x20], x8
st1 {v17.h}[6], [x10], x8 st1 {v17.h}[6], [x10], x8
cmp x6, #2 cmp x6, #2
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d18, [x11]
st1 {v18.4h}, [x11], x8
st1 {v18.h}[4], [x19], x8 st1 {v18.h}[4], [x19], x8
st1 {v18.h}[5], [x20], x8 st1 {v18.h}[5], [x20], x8
st1 {v18.h}[6], [x10], x8 st1 {v18.h}[6], [x10], x8
cmp x6, #3 cmp x6, #3
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d19, [x11]
st1 {v19.4h}, [x11], x8
st1 {v19.h}[4], [x19], x8 st1 {v19.h}[4], [x19], x8
st1 {v19.h}[5], [x20], x8 st1 {v19.h}[5], [x20], x8
st1 {v19.h}[6], [x10], x8 st1 {v19.h}[6], [x10], x8
cmp x6, #4 cmp x6, #4
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d20, [x11]
st1 {v20.4h}, [x11], x8
st1 {v20.h}[4], [x19], x8 st1 {v20.h}[4], [x19], x8
st1 {v20.h}[5], [x20], x8 st1 {v20.h}[5], [x20], x8
st1 {v20.h}[6], [x10], x8 st1 {v20.h}[6], [x10], x8
cmp x6, #5 cmp x6, #5
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d21, [x11]
st1 {v21.4h}, [x11], x8
st1 {v21.h}[4], [x19], x8 st1 {v21.h}[4], [x19], x8
st1 {v21.h}[5], [x20], x8 st1 {v21.h}[5], [x20], x8
st1 {v21.h}[6], [x10], x8 st1 {v21.h}[6], [x10], x8
cmp x6, #6 cmp x6, #6
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d22, [x11]
st1 {v22.4h}, [x11], x8
st1 {v22.h}[4], [x19], x8 st1 {v22.h}[4], [x19], x8
st1 {v22.h}[5], [x20], x8 st1 {v22.h}[5], [x20], x8
st1 {v22.h}[6], [x10], x8 st1 {v22.h}[6], [x10], x8
cmp x6, #7 cmp x6, #7
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d23, [x11]
st1 {v23.4h}, [x11], x8
st1 {v23.h}[4], [x19], x8 st1 {v23.h}[4], [x19], x8
st1 {v23.h}[5], [x20], x8 st1 {v23.h}[5], [x20], x8
st1 {v23.h}[6], [x10], x8 st1 {v23.h}[6], [x10], x8
cmp x6, #8 cmp x6, #8
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d24, [x11]
st1 {v24.4h}, [x11], x8
st1 {v24.h}[4], [x19], x8 st1 {v24.h}[4], [x19], x8
st1 {v24.h}[5], [x20], x8 st1 {v24.h}[5], [x20], x8
st1 {v24.h}[6], [x10], x8 st1 {v24.h}[6], [x10], x8
cmp x6, #9 cmp x6, #9
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d25, [x11]
st1 {v25.4h}, [x11], x8
st1 {v25.h}[4], [x19], x8 st1 {v25.h}[4], [x19], x8
st1 {v25.h}[5], [x20], x8 st1 {v25.h}[5], [x20], x8
st1 {v25.h}[6], [x10], x8 st1 {v25.h}[6], [x10], x8
cmp x6, #10 cmp x6, #10
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d26, [x11]
st1 {v26.4h}, [x11], x8
st1 {v26.h}[4], [x19], x8 st1 {v26.h}[4], [x19], x8
st1 {v26.h}[5], [x20], x8 st1 {v26.h}[5], [x20], x8
st1 {v26.h}[6], [x10], x8 st1 {v26.h}[6], [x10], x8
cmp x6, #11 cmp x6, #11
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d27, [x11]
st1 {v27.4h}, [x11], x8
st1 {v27.h}[4], [x19], x8 st1 {v27.h}[4], [x19], x8
st1 {v27.h}[5], [x20], x8 st1 {v27.h}[5], [x20], x8
st1 {v27.h}[6], [x10], x8 st1 {v27.h}[6], [x10], x8
cmp x6, #12 cmp x6, #12
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d28, [x11]
st1 {v28.4h}, [x11], x8
st1 {v28.h}[4], [x19], x8 st1 {v28.h}[4], [x19], x8
st1 {v28.h}[5], [x20], x8 st1 {v28.h}[5], [x20], x8
st1 {v28.h}[6], [x10], x8 st1 {v28.h}[6], [x10], x8
cmp x6, #13 cmp x6, #13
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d29, [x11]
st1 {v29.4h}, [x11], x8
st1 {v29.h}[4], [x19], x8 st1 {v29.h}[4], [x19], x8
st1 {v29.h}[5], [x20], x8 st1 {v29.h}[5], [x20], x8
st1 {v29.h}[6], [x10], x8 st1 {v29.h}[6], [x10], x8
cmp x6, #14 cmp x6, #14
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d30, [x11]
st1 {v30.4h}, [x11], x8
st1 {v30.h}[4], [x19], x8 st1 {v30.h}[4], [x19], x8
st1 {v30.h}[5], [x20], x8 st1 {v30.h}[5], [x20], x8
st1 {v30.h}[6], [x10], x8 st1 {v30.h}[6], [x10], x8
cmp x6, #15 cmp x6, #15
beq WriteEnd beq WriteEnd
add x11, x11, x8
str d31, [x11]
st1 {v31.4h}, [x11], x8
st1 {v31.h}[4], [x19] st1 {v31.h}[4], [x19]
st1 {v31.h}[5], [x20] st1 {v31.h}[5], [x20]
st1 {v31.h}[6], [x10] st1 {v31.h}[6], [x10]
add x11, x11, x8
add x11, x11, #14 add x11, x11, #14
b WriteEnd b WriteEnd
WriteC8: WriteC8:


+ 2
- 2
mindspore/lite/src/cxx_api/context.cc View File

@@ -30,8 +30,8 @@ constexpr auto kModelOptionKirinNpuFrequency = "mindspore.option.kirin_npu.frequ


struct Context::Data { struct Context::Data {
std::vector<std::shared_ptr<DeviceInfoContext>> device_info_list; std::vector<std::shared_ptr<DeviceInfoContext>> device_info_list;
int32_t thread_num;
std::shared_ptr<Allocator> allocator;
int32_t thread_num = 2;
std::shared_ptr<Allocator> allocator = nullptr;
}; };


struct DeviceInfoContext::Data { struct DeviceInfoContext::Data {


+ 1
- 1
mindspore/lite/src/tensor.h View File

@@ -74,7 +74,7 @@ class Tensor : public mindspore::tensor::MSTensor {


virtual bool operator==(const Tensor &tensor); virtual bool operator==(const Tensor &tensor);


void set_tensor_name(std::string name) override { tensor_name_ = name; }
void set_tensor_name(const std::string &name) override { tensor_name_ = name; }


std::string tensor_name() const override { return tensor_name_; } std::string tensor_name() const override { return tensor_name_; }




Loading…
Cancel
Save