|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520 |
- // Tencent is pleased to support the open source community by making ncnn available.
- //
- // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
- //
- // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
- // in compliance with the License. You may obtain a copy of the License at
- //
- // https://opensource.org/licenses/BSD-3-Clause
- //
- // Unless required by applicable law or agreed to in writing, software distributed
- // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- // CONDITIONS OF ANY KIND, either express or implied. See the License for the
- // specific language governing permissions and limitations under the License.
-
- #include "mat.h"
-
- #if __ARM_NEON
- #include <arm_neon.h>
- #endif // __ARM_NEON
-
- #include "cpu.h"
-
- namespace ncnn {
-
- void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals)
- {
- int size = w * h;
-
- if (mean_vals && !norm_vals)
- {
- // substract mean only
- #pragma omp parallel for
- for (int q=0; q<c; q++)
- {
- float* ptr = data + cstep * q;
- const float mean = mean_vals[q];
-
- #if __ARM_NEON
- int nn = size >> 2;
- int remain = size - (nn << 2);
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- #if __ARM_NEON
- #if __aarch64__
- float32x4_t _mean = vdupq_n_f32(mean);
- for (; nn>0; nn--)
- {
- float32x4_t _ptr = vld1q_f32(ptr);
- _ptr = vsubq_f32(_ptr, _mean);
- vst1q_f32(ptr, _ptr);
- ptr += 4;
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "vdup.f32 q1, %4 \n"
- "0: \n"
- "pld [%1, #128] \n"
- "vld1.f32 {d0-d1}, [%1 :128] \n"
- "vsub.f32 q0, q0, q1 \n"
- "subs %0, #1 \n"
- "vst1.f32 {d0-d1}, [%1 :128]! \n"
- "bne 0b \n"
- : "=r"(nn), // %0
- "=r"(ptr) // %1
- : "0"(nn),
- "1"(ptr),
- "r"(mean) // %4
- : "cc", "memory", "q0", "q1"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for (; remain>0; remain--)
- {
- *ptr -= mean;
- ptr++;
- }
- }
- }
- else if (!mean_vals && norm_vals)
- {
- // normalize only
- #pragma omp parallel for
- for (int q=0; q<c; q++)
- {
- float* ptr = data + cstep * q;
- const float norm = norm_vals[q];
-
- #if __ARM_NEON
- int nn = size >> 2;
- int remain = size - (nn << 2);
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- #if __ARM_NEON
- #if __aarch64__
- float32x4_t _norm = vdupq_n_f32(norm);
- for (; nn>0; nn--)
- {
- float32x4_t _ptr = vld1q_f32(ptr);
- _ptr = vmulq_f32(_ptr, _norm);
- vst1q_f32(ptr, _ptr);
- ptr += 4;
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "vdup.f32 q1, %4 \n"
- "0: \n"
- "pld [%1, #128] \n"
- "vld1.f32 {d0-d1}, [%1 :128] \n"
- "vmul.f32 q0, q0, q1 \n"
- "subs %0, #1 \n"
- "vst1.f32 {d0-d1}, [%1 :128]! \n"
- "bne 0b \n"
- : "=r"(nn), // %0
- "=r"(ptr) // %1
- : "0"(nn),
- "1"(ptr),
- "r"(norm) // %4
- : "cc", "memory", "q0", "q1"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for (; remain>0; remain--)
- {
- *ptr *= norm;
- ptr++;
- }
- }
- }
- else if (mean_vals && norm_vals)
- {
- // substract mean and normalize
- #pragma omp parallel for
- for (int q=0; q<c; q++)
- {
- float* ptr = data + cstep * q;
- const float mean = mean_vals[q];
- const float norm = norm_vals[q];
-
- #if __ARM_NEON
- int nn = size >> 2;
- int remain = size - (nn << 2);
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- #if __ARM_NEON
- #if __aarch64__
- float32x4_t _mean = vdupq_n_f32(mean);
- float32x4_t _norm = vdupq_n_f32(norm);
- for (; nn>0; nn--)
- {
- float32x4_t _ptr = vld1q_f32(ptr);
- _ptr = vsubq_f32(_ptr, _mean);
- _ptr = vmulq_f32(_ptr, _norm);
- vst1q_f32(ptr, _ptr);
- ptr += 4;
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "vdup.f32 q1, %4 \n"
- "vdup.f32 q2, %5 \n"
- "0: \n"
- "pld [%1, #128] \n"
- "vld1.f32 {d0-d1}, [%1 :128] \n"
- "vsub.f32 q0, q0, q1 \n"
- "vmul.f32 q0, q0, q2 \n"
- "subs %0, #1 \n"
- "vst1.f32 {d0-d1}, [%1 :128]! \n"
- "bne 0b \n"
- : "=r"(nn), // %0
- "=r"(ptr) // %1
- : "0"(nn),
- "1"(ptr),
- "r"(mean), // %4
- "r"(norm) // %5
- : "cc", "memory", "q0", "q1", "q2"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for (; remain>0; remain--)
- {
- *ptr = (*ptr - mean) * norm;
- ptr++;
- }
- }
- }
- }
-
// Convert a half-precision floating point value, given as its raw 16-bit
// pattern, to float.
//
// value    raw bits laid out as 1 sign : 5 exponent : 10 significand
// returns  the float holding the same value. Signed zeros keep their sign,
//          half denormals are renormalized into float normals, and
//          infinity / NaN map to float infinity / NaN with the significand
//          bits carried over.
//
// The float bit pattern (1 : 8 : 23) is assembled in an unsigned int and
// read back through a union, so unsigned int is expected to be 32 bits wide.
static float half2float(unsigned short value)
{
    // Every field is widened to unsigned int before any shifting. An
    // unsigned short operand would be promoted to signed int, and moving
    // the sign bit up by 31 positions in a signed int is not well defined
    // in every language revision.
    const unsigned int sign = (value & 0x8000u) >> 15;
    const unsigned int exponent = (value & 0x7c00u) >> 10;
    unsigned int significand = value & 0x03FFu;

    // exponent rebias from half (bias 15) to float (bias 127)
    const unsigned int rebias = 127u - 15u;

    union
    {
        unsigned int u;
        float f;
    } tmp;

    const unsigned int sign_bits = sign << 31;

    if (exponent == 0)
    {
        if (significand == 0)
        {
            // zero
            tmp.u = sign_bits;
        }
        else
        {
            // denormal: shift the highest set bit up to bit 9, counting
            // the steps, then shift once more so it falls off as the
            // implicit leading one of the float
            unsigned int shift = 0;
            while ((significand & 0x200u) == 0)
            {
                significand <<= 1;
                shift++;
            }
            significand <<= 1;
            significand &= 0x3FFu;
            // shift is at most 9 here, so the subtraction cannot wrap
            tmp.u = sign_bits | ((rebias - shift) << 23) | (significand << 13);
        }
    }
    else if (exponent == 0x1F)
    {
        // infinity or NaN
        tmp.u = sign_bits | (0xFFu << 23) | (significand << 13);
    }
    else
    {
        // normalized
        tmp.u = sign_bits | ((exponent + rebias) << 23) | (significand << 13);
    }

    return tmp.f;
}
-
- Mat Mat::from_float16(const unsigned short* data, int size)
- {
- Mat m(size);
- if (m.empty())
- return m;
-
- float* ptr = m.data;
-
- #if __ARM_NEON && (__ARM_FP & 2)
- int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
- int remain = size - (nn << 2);
- #else
- int remain = size;
- #endif // __ARM_NEON
-
- #if __ARM_NEON && (__ARM_FP & 2)
- #if __aarch64__
- if (nn > 0)
- {
- asm volatile(
- "0: \n"
- "ldr d0, [%1], #8 \n"
- "fcvtl v1.4s, v0.4h \n"
- "subs %w0, %w0, #1 \n"
- "str q1, [%2], #16 \n"
- "bne 0b \n"
- : "=r"(nn), // %0
- "=r"(data), // %1
- "=r"(ptr) // %2
- : "0"(nn),
- "1"(data),
- "2"(ptr)
- : "cc", "memory", "v0", "v1"
- );
- }
- #else
- if (nn > 0)
- {
- asm volatile(
- "0: \n"
- "pld [%1, #64] \n"
- "vld1.s16 {d0}, [%1 :64]! \n"
- "vcvt.f32.f16 q1, d0 \n"
- "subs %0, #1 \n"
- "vst1.f32 {d2-d3}, [%2 :128]! \n"
- "bne 0b \n"
- : "=r"(nn), // %0
- "=r"(data), // %1
- "=r"(ptr) // %2
- : "0"(nn),
- "1"(data),
- "2"(ptr)
- : "cc", "memory", "q0", "q1"
- );
- }
- #endif // __aarch64__
- #endif // __ARM_NEON
- for (; remain>0; remain--)
- {
- *ptr = half2float(*data);
-
- data++;
- ptr++;
- }
-
- return m;
- }
-
- static void copy_make_border_image(const Mat& src, Mat& dst, int top, int left, int type, float v)
- {
- int w = dst.w;
- int h = dst.h;
-
- const float* ptr = src.data;
- float* outptr = dst.data;
-
- if (type == BORDER_CONSTANT)
- {
- int y = 0;
- // fill top
- for (; y < top; y++)
- {
- int x = 0;
- for (; x < w; x++)
- {
- outptr[x] = v;
- }
- outptr += w;
- }
- // fill center
- for (; y < (top + src.h); y++)
- {
- int x = 0;
- for (; x < left; x++)
- {
- outptr[x] = v;
- }
- for (; x < (left + src.w); x++)
- {
- outptr[x] = ptr[x - left];
- }
- for (; x < w; x++)
- {
- outptr[x] = v;
- }
- ptr += src.w;
- outptr += w;
- }
- // fill bottom
- for (; y < h; y++)
- {
- int x = 0;
- for (; x < w; x++)
- {
- outptr[x] = v;
- }
- outptr += w;
- }
- }
- else if (type == BORDER_REPLICATE)
- {
- int y = 0;
- // fill top
- for (; y < top; y++)
- {
- int x = 0;
- for (; x < left; x++)
- {
- outptr[x] = ptr[0];
- }
- for (; x < (left + src.w); x++)
- {
- outptr[x] = ptr[x - left];
- }
- for (; x < w; x++)
- {
- outptr[x] = ptr[src.w - 1];
- }
- outptr += w;
- }
- // fill center
- for (; y < (top + src.h); y++)
- {
- int x = 0;
- for (; x < left; x++)
- {
- outptr[x] = ptr[0];
- }
- for (; x < (left + src.w); x++)
- {
- outptr[x] = ptr[x - left];
- }
- for (; x < w; x++)
- {
- outptr[x] = ptr[src.w - 1];
- }
- ptr += src.w;
- outptr += w;
- }
- // fill bottom
- ptr -= src.w;
- for (; y < h; y++)
- {
- int x = 0;
- for (; x < left; x++)
- {
- outptr[x] = ptr[0];
- }
- for (; x < (left + src.w); x++)
- {
- outptr[x] = ptr[x - left];
- }
- for (; x < w; x++)
- {
- outptr[x] = ptr[src.w - 1];
- }
- outptr += w;
- }
- }
- }
-
- void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v)
- {
- int w = src.w + left + right;
- int h = src.h + top + bottom;
-
- if (src.dims == 2)
- {
- dst.create(w, h);
- if (dst.empty())
- return;
-
- copy_make_border_image(src, dst, top, left, type, v);
- }
- else if (src.dims == 3)
- {
- int channels = src.c;
-
- dst.create(w, h, channels);
- if (dst.empty())
- return;
-
- // unroll image channel
- #pragma omp parallel for
- for (int q=0; q<channels; q++)
- {
- const Mat m = src.channel(q);
- Mat borderm = dst.channel(q);
-
- copy_make_border_image(m, borderm, top, left, type, v);
- }
- }
- }
-
- static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
- {
- int w = dst.w;
- int h = dst.h;
-
- const float* ptr = src.data + src.w * top + left;
- float* outptr = dst.data;
-
- for (int y = 0; y < h; y++)
- {
- for (int x = 0; x < w; x++)
- {
- outptr[x] = ptr[x];
- }
- outptr += w;
- ptr += src.w;
- }
- }
-
- void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
- {
- int w = src.w - left - right;
- int h = src.h - top - bottom;
-
- if (src.dims == 2)
- {
- dst.create(w, h);
- if (dst.empty())
- return;
-
- copy_cut_border_image(src, dst, top, left);
- }
- else if (src.dims == 3)
- {
- int channels = src.c;
-
- dst.create(w, h, channels);
- if (dst.empty())
- return;
-
- // unroll image channel
- #pragma omp parallel for
- for (int q=0; q<channels; q++)
- {
- const Mat m = src.channel(q);
- Mat cutm = dst.channel(q);
-
- copy_cut_border_image(m, cutm, top, left);
- }
- }
- }
-
- } // namespace ncnn
|